Commit 3e8299c7 authored by Pietro Abate

[r2003-05-11 08:14:34 by cvscast] Better localization of characters in strings

Original author: cvscast
Date: 2003-05-11 08:14:34+00:00
parent 099bf50d
......@@ -38,16 +38,15 @@ let cst_nil = Cst (Types.Atom Sequence.nil_atom)
let seq_of_string pos s =
let s = Encodings.Utf8.mk s in
(* What about locations when the input file is not Utf8?
Or when using special characters in a string! *)
let (pos,_) = pos in
let rec aux pos i j =
if Encodings.Utf8.equal_index i j then []
else
let (len,i) = Encodings.Utf8.next s i in
let (c,i) = Encodings.Utf8.next s i in
((pos,pos+1),c)::(aux (pos+1) i j)
((pos,pos+len),c) :: (aux (pos + len) i j)
in
aux (pos + 1) (Encodings.Utf8.start_index s) (Encodings.Utf8.end_index s)
aux (pos+1) (Encodings.Utf8.start_index s) (Encodings.Utf8.end_index s)
exception Error of string
let error (i,j) s = Location.raise_loc i j (Error s)
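Note: the rewritten aux advances the position by the stored byte length instead of by 1, so each character's reported location spans exactly the bytes it occupied in the source. A minimal standalone sketch of that decoding step, assuming the (length, code point) pairs sit in a plain int list rather than in CDuce's Encodings.Utf8 buffer (locate and the sample data are invented for illustration, not part of the commit):

let rec locate pos = function
  | [] -> []
  | len :: code :: rest ->
      (* each character spans the bytes it actually occupied in the source,
         not a fixed width of 1 *)
      ((pos, pos + len), code) :: locate (pos + len) rest
  | [_] -> invalid_arg "odd number of stored code points"

let () =
  (* 'a' is 1 source byte; the escape "\233;" is 5 source bytes but denotes
     the single code point U+00E9 *)
  let stored = [ 1; Char.code 'a'; 5; 0xE9 ] in
  List.iter
    (fun ((i, j), c) -> Printf.printf "U+%04X at bytes %d-%d\n" c i j)
    (locate 1 stored)

Running this prints U+0061 at bytes 1-2 and U+00E9 at bytes 2-7, matching the ((pos, pos+len), c) pairs built above.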
......@@ -58,11 +57,11 @@ let make_record loc r =
let parse_char loc s =
let s = seq_of_string loc s in
match s with
| [_,c] -> c
| [ loc,c ] -> c
| _ -> error loc "Character literal must have length 1"
let char_list pos s =
let s = seq_of_string pos s in
let char_list loc s =
let s = seq_of_string loc s in
List.map (fun (loc,c) -> exp loc (Cst (Types.Char (Chars.mk_int c)))) s
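Note: with per-character locations available, parse_char still rejects multi-character literals at the literal's own location, while char_list attaches each character's location to the expression it builds. A simplified standalone model, where the loc and Char_error types are stand-ins for CDuce's Location machinery rather than the commit's API:

type loc = int * int
exception Char_error of loc * string

(* a one-character literal yields its code point; any other length is
   rejected, reported at the whole literal's location *)
let parse_char (lit_loc : loc) (chars : (loc * int) list) : int =
  match chars with
  | [ (_, c) ] -> c
  | _ -> raise (Char_error (lit_loc, "Character literal must have length 1"))

let () =
  assert (parse_char (0, 3) [ ((1, 2), Char.code 'a') ] = Char.code 'a');
  (try
     ignore (parse_char (0, 9) [ ((1, 2), 97); ((2, 3), 98) ]);
     assert false
   with Char_error ((i, j), msg) -> Printf.printf "%s (bytes %d-%d)\n" msg i j)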
......
......@@ -76,9 +76,17 @@ let nb_classes = 34
exception Unterminated_string_in_comment
(* Buffer for string literals : always encoded in Utf8 *)
(* Buffer for string literals (always encoded in UTF8).
Each character is encoded as two consecutive code points;
the first one gives the number of bytes in the input document;
the second one gives the Unicode representation *)
let string_buff = Buffer.create 1024
let store_len ?(add=0) lexbuf =
let l = add + (Lexing.lexeme_end lexbuf) - (Lexing.lexeme_start lexbuf) in
Encodings.Utf8.store string_buff l
let store_ascii = Buffer.add_char string_buff
let store_char = Buffer.add_string string_buff
let store_code = Encodings.Utf8.store string_buff
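Note: the new store_len helper is what records how many source bytes each stored character consumed; the add argument covers bytes already matched by an earlier rule, as with the "\x" prefix of a hexadecimal escape. A minimal standalone sketch of this producer side (assumptions: an int Queue stands in for string_buff, and the labelled lexeme_start/lexeme_end arguments stand in for the Lexing.lexbuf positions; none of these names are in the commit):

(* the real store_len writes add + (lexeme_end - lexeme_start) into
   string_buff via Encodings.Utf8.store; here an int Queue replaces the
   UTF-8 buffer so the sketch runs without the CDuce library *)
let buff : int Queue.t = Queue.create ()

let store_len ?(add = 0) ~lexeme_start ~lexeme_end () =
  Queue.add (add + lexeme_end - lexeme_start) buff

let store_code code = Queue.add code buff

let () =
  (* for "\x41;" the hexa rule only sees "41;" (3 bytes), so it passes
     ~add:2 to account for the "\x" matched by the previous rule *)
  store_len ~add:2 ~lexeme_start:10 ~lexeme_end:13 ();
  store_code 0x41;
  Queue.iter (Printf.printf "%d ") buff   (* prints "5 65 " *)

The stored pair (5, 0x41) is exactly what seq_of_string later turns into a (pos, pos+5) location for the single character 'A'.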
......@@ -113,52 +121,52 @@ let nb_classes = 34
let lex_tables = {
Lexing.lex_base =
"\000\000\012\000\018\000\004\000\254\255\005\000\017\000\255\255\
\251\255\250\255\255\255\029\000\253\255\020\000\252\255\252\255\
\250\255\249\255\255\255\029\000\253\255\020\000\252\255\252\255\
\251\255\010\000\003\000\253\255\247\255\246\255\019\000\049\000\
\062\000\018\000\035\000\037\000\250\255\060\000\027\000\036\000\
\063\000\024\000\030\000\041\000\041\000\249\255\248\255\079\000\
\082\000\086\000\099\000\065\000\103\000\116\000\120\000\133\000\
\137\000\150\000\071\000";
\063\000\024\000\030\000\041\000\041\000\249\255\250\255\248\255\
\079\000\082\000\086\000\099\000\065\000\103\000\116\000\120\000\
\133\000\137\000\150\000\071\000";
Lexing.lex_backtrk =
"\255\255\255\255\255\255\255\255\255\255\001\000\255\255\255\255\
\255\255\255\255\255\255\005\000\255\255\255\255\255\255\255\255\
\255\255\255\255\255\255\004\000\255\255\255\255\255\255\255\255\
\255\255\004\000\004\000\255\255\255\255\255\255\000\000\001\000\
\002\000\003\000\005\000\005\000\255\255\005\000\005\000\005\000\
\005\000\005\000\005\000\005\000\005\000\255\255\255\255\255\255\
\004\000\255\255\004\000\003\000\002\000\255\255\002\000\001\000\
\255\255\001\000\000\000";
\255\255\004\000\255\255\004\000\003\000\002\000\255\255\002\000\
\001\000\255\255\001\000\000\000";
Lexing.lex_default =
"\028\000\016\000\009\000\004\000\000\000\255\255\255\255\000\000\
\000\000\000\000\000\000\255\255\000\000\255\255\000\000\000\000\
\000\000\255\255\255\255\000\000\000\000\000\000\255\255\255\255\
\255\255\255\255\255\255\255\255\000\000\255\255\255\255\255\255\
\255\255\255\255\255\255\255\255\255\255\000\000\000\000\255\255\
\255\255\255\255\255\255\255\255\255\255\000\000\000\000\000\000\
\255\255\255\255\255\255\255\255\255\255\255\255\255\255\255\255\
\255\255\255\255\255\255";
\255\255\255\255\255\255\255\255";
Lexing.lex_trans =
"\020\000\021\000\021\000\022\000\023\000\024\000\025\000\023\000\
\026\000\027\000\005\000\006\000\015\000\029\000\030\000\031\000\
\032\000\033\000\008\000\007\000\007\000\034\000\050\000\006\000\
\043\000\035\000\013\000\036\000\033\000\017\000\037\000\007\000\
\037\000\012\000\014\000\013\000\004\000\018\000\039\000\040\000\
\040\000\009\000\019\000\009\000\019\000\009\000\009\000\009\000\
\010\000\011\000\010\000\009\000\009\000\047\000\047\000\047\000\
\047\000\009\000\038\000\004\000\004\000\004\000\048\000\009\000\
\009\000\047\000\044\000\044\000\044\000\044\000\009\000\043\000\
\009\000\009\000\050\000\045\000\000\000\000\000\044\000\009\000\
\000\000\047\000\039\000\040\000\040\000\000\000\040\000\040\000\
\040\000\040\000\042\000\042\000\042\000\042\000\044\000\041\000\
\000\000\000\000\040\000\000\000\000\000\000\000\042\000\042\000\
\042\000\042\000\042\000\044\000\044\000\044\000\044\000\000\000\
\041\000\000\000\040\000\042\000\045\000\000\000\042\000\044\000\
\046\000\046\000\046\000\046\000\046\000\046\000\046\000\046\000\
\000\000\000\000\000\000\042\000\046\000\045\000\000\000\044\000\
\046\000\047\000\047\000\047\000\047\000\049\000\049\000\049\000\
\049\000\000\000\048\000\000\000\046\000\047\000\000\000\000\000\
\046\000\049\000\049\000\049\000\049\000\049\000\000\000\000\000\
\000\000\000\000\000\000\048\000\000\000\047\000\049\000\000\000\
\000\000\049\000\000\000\000\000\000\000\000\000\000\000\000\000\
\000\000\000\000\000\000\000\000\000\000\000\000\049\000\000\000\
\032\000\033\000\008\000\007\000\007\000\034\000\051\000\006\000\
\044\000\035\000\013\000\036\000\033\000\017\000\037\000\007\000\
\037\000\012\000\014\000\013\000\004\000\018\000\040\000\041\000\
\041\000\038\000\019\000\038\000\019\000\038\000\038\000\038\000\
\010\000\011\000\010\000\038\000\038\000\048\000\048\000\048\000\
\048\000\038\000\039\000\004\000\004\000\004\000\049\000\038\000\
\038\000\048\000\045\000\045\000\045\000\045\000\038\000\044\000\
\038\000\038\000\051\000\046\000\000\000\000\000\045\000\038\000\
\000\000\048\000\040\000\041\000\041\000\000\000\041\000\041\000\
\041\000\041\000\043\000\043\000\043\000\043\000\045\000\042\000\
\000\000\000\000\041\000\000\000\000\000\000\000\043\000\043\000\
\043\000\043\000\043\000\045\000\045\000\045\000\045\000\000\000\
\042\000\000\000\041\000\043\000\046\000\000\000\043\000\045\000\
\047\000\047\000\047\000\047\000\047\000\047\000\047\000\047\000\
\000\000\000\000\000\000\043\000\047\000\046\000\000\000\045\000\
\047\000\048\000\048\000\048\000\048\000\050\000\050\000\050\000\
\050\000\000\000\049\000\000\000\047\000\048\000\000\000\000\000\
\047\000\050\000\050\000\050\000\050\000\050\000\000\000\000\000\
\000\000\000\000\000\000\049\000\000\000\048\000\050\000\000\000\
\000\000\050\000\000\000\000\000\000\000\000\000\000\000\000\000\
\000\000\000\000\000\000\000\000\000\000\000\000\050\000\000\000\
";
Lexing.lex_check =
"\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
......@@ -169,52 +177,52 @@ let lex_tables = {
\026\000\030\000\001\000\026\000\001\000\026\000\027\000\027\000\
\002\000\002\000\002\000\031\000\033\000\023\000\023\000\023\000\
\023\000\034\000\035\000\011\000\011\000\011\000\023\000\036\000\
\031\000\023\000\024\000\024\000\024\000\024\000\029\000\043\000\
\032\000\029\000\050\000\024\000\255\255\255\255\024\000\032\000\
\255\255\023\000\039\000\039\000\039\000\255\255\040\000\040\000\
\040\000\040\000\041\000\041\000\041\000\041\000\024\000\040\000\
\255\255\255\255\040\000\255\255\255\255\255\255\041\000\042\000\
\042\000\042\000\042\000\044\000\044\000\044\000\044\000\255\255\
\042\000\255\255\040\000\042\000\044\000\255\255\041\000\044\000\
\045\000\045\000\045\000\045\000\046\000\046\000\046\000\046\000\
\255\255\255\255\255\255\042\000\045\000\046\000\255\255\044\000\
\046\000\047\000\047\000\047\000\047\000\048\000\048\000\048\000\
\048\000\255\255\047\000\255\255\045\000\047\000\255\255\255\255\
\046\000\048\000\049\000\049\000\049\000\049\000\255\255\255\255\
\255\255\255\255\255\255\049\000\255\255\047\000\049\000\255\255\
\255\255\048\000\255\255\255\255\255\255\255\255\255\255\255\255\
\255\255\255\255\255\255\255\255\255\255\255\255\049\000\255\255\
\031\000\023\000\024\000\024\000\024\000\024\000\029\000\044\000\
\032\000\029\000\051\000\024\000\255\255\255\255\024\000\032\000\
\255\255\023\000\040\000\040\000\040\000\255\255\041\000\041\000\
\041\000\041\000\042\000\042\000\042\000\042\000\024\000\041\000\
\255\255\255\255\041\000\255\255\255\255\255\255\042\000\043\000\
\043\000\043\000\043\000\045\000\045\000\045\000\045\000\255\255\
\043\000\255\255\041\000\043\000\045\000\255\255\042\000\045\000\
\046\000\046\000\046\000\046\000\047\000\047\000\047\000\047\000\
\255\255\255\255\255\255\043\000\046\000\047\000\255\255\045\000\
\047\000\048\000\048\000\048\000\048\000\049\000\049\000\049\000\
\049\000\255\255\048\000\255\255\046\000\048\000\255\255\255\255\
\047\000\049\000\050\000\050\000\050\000\050\000\255\255\255\255\
\255\255\255\255\255\255\050\000\255\255\048\000\050\000\255\255\
\255\255\049\000\255\255\255\255\255\255\255\255\255\255\255\255\
\255\255\255\255\255\255\255\255\255\255\255\255\050\000\255\255\
"
}
let rec token engine lexbuf =
match engine lex_tables 0 lexbuf with
0 -> (
# 65 "parser/wlexer.mll"
# 71 "parser/wlexer.mll"
token engine lexbuf )
| 1 -> (
# 66 "parser/wlexer.mll"
# 72 "parser/wlexer.mll"
let s = Lexing.lexeme lexbuf in
if Hashtbl.mem keywords s then "",s else "LIDENT",s
)
| 2 -> (
# 70 "parser/wlexer.mll"
# 76 "parser/wlexer.mll"
"UIDENT",Lexing.lexeme lexbuf )
| 3 -> (
# 71 "parser/wlexer.mll"
# 77 "parser/wlexer.mll"
"INT",Lexing.lexeme lexbuf )
| 4 -> (
# 72 "parser/wlexer.mll"
# 78 "parser/wlexer.mll"
let s = Lexing.lexeme lexbuf in
"TAG", tag_of_tag s 1
)
| 5 -> (
# 80 "parser/wlexer.mll"
# 86 "parser/wlexer.mll"
"",Lexing.lexeme lexbuf )
| 6 -> (
# 83 "parser/wlexer.mll"
# 89 "parser/wlexer.mll"
let string_start = Lexing.lexeme_start lexbuf in
string_start_pos := string_start;
let double_quote = Lexing.lexeme_char lexbuf 0 = '"' in
......@@ -224,15 +232,15 @@ let rec token engine lexbuf =
(if double_quote then "STRING2" else "STRING1"),
(get_stored_string()) )
| 7 -> (
# 93 "parser/wlexer.mll"
# 99 "parser/wlexer.mll"
comment_start_pos := [Lexing.lexeme_start lexbuf];
comment engine lexbuf;
token engine lexbuf )
| 8 -> (
# 98 "parser/wlexer.mll"
# 104 "parser/wlexer.mll"
"EOI","" )
| 9 -> (
# 100 "parser/wlexer.mll"
# 106 "parser/wlexer.mll"
error
(Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf)
(Illegal_character ((Lexing.lexeme lexbuf).[0])) )
......@@ -241,18 +249,19 @@ let rec token engine lexbuf =
and comment engine lexbuf =
match engine lex_tables 1 lexbuf with
0 -> (
# 106 "parser/wlexer.mll"
# 112 "parser/wlexer.mll"
comment_start_pos := Lexing.lexeme_start lexbuf :: !comment_start_pos;
comment engine lexbuf;
)
| 1 -> (
# 110 "parser/wlexer.mll"
# 116 "parser/wlexer.mll"
comment_start_pos := List.tl !comment_start_pos;
if !comment_start_pos <> [] then comment engine lexbuf;
)
| 2 -> (
# 114 "parser/wlexer.mll"
# 120 "parser/wlexer.mll"
string_start_pos := Lexing.lexeme_start lexbuf;
Buffer.clear string_buff;
let ender = Lexing.lexeme lexbuf in
(try string ender engine lexbuf
with Location.Location (_,Unterminated_string) ->
......@@ -261,61 +270,73 @@ and comment engine lexbuf =
Buffer.clear string_buff;
comment engine lexbuf )
| 3 -> (
# 123 "parser/wlexer.mll"
# 130 "parser/wlexer.mll"
let st = List.hd !comment_start_pos in
error st (st+2) Unterminated_comment
)
| 4 -> (
# 127 "parser/wlexer.mll"
# 134 "parser/wlexer.mll"
comment engine lexbuf )
| _ -> failwith "lexing: empty token [comment]"
and string ender engine lexbuf =
match engine lex_tables 2 lexbuf with
0 -> (
# 131 "parser/wlexer.mll"
let c = Lexing.lexeme lexbuf in
# 138 "parser/wlexer.mll"
let c = Lexing.lexeme lexbuf in
if c = ender then ()
else (store_char (Lexing.lexeme lexbuf); string ender engine lexbuf)
)
else (store_len lexbuf;
store_char (Lexing.lexeme lexbuf);
string ender engine lexbuf) )
| 1 -> (
# 137 "parser/wlexer.mll"
store_ascii (Lexing.lexeme_char lexbuf 1);
# 144 "parser/wlexer.mll"
store_len lexbuf;
store_ascii (Lexing.lexeme_char lexbuf 1);
string ender engine lexbuf )
| 2 -> (
# 139 "parser/wlexer.mll"
let c = Lexing.lexeme_char lexbuf 1 in
if c = 'x' then parse_hexa_char engine lexbuf else store_special c;
string ender engine lexbuf )
# 148 "parser/wlexer.mll"
let c = Lexing.lexeme_char lexbuf 1 in
if c = 'x'
then parse_hexa_char engine lexbuf
else (store_len lexbuf; store_special c);
string ender engine lexbuf )
| 3 -> (
# 144 "parser/wlexer.mll"
store_code (numeric_char (Lexing.lexeme lexbuf));
# 154 "parser/wlexer.mll"
store_len lexbuf;
store_code (numeric_char (Lexing.lexeme lexbuf));
string ender engine lexbuf )
| 4 -> (
# 147 "parser/wlexer.mll"
error !string_start_pos (!string_start_pos+1) Unterminated_string )
# 158 "parser/wlexer.mll"
error
(Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf)
(Illegal_character '\\') )
| 5 -> (
# 149 "parser/wlexer.mll"
store_code (Char.code (Lexing.lexeme_char lexbuf 0)); (* Adapt when source is UTF8 *)
# 162 "parser/wlexer.mll"
error !string_start_pos (!string_start_pos+1) Unterminated_string )
| 6 -> (
# 164 "parser/wlexer.mll"
store_len lexbuf;
store_code (Char.code (Lexing.lexeme_char lexbuf 0));
(* Adapt when source is UTF8 *)
string ender engine lexbuf )
| _ -> failwith "lexing: empty token [string ender]"
and parse_hexa_char engine lexbuf =
match engine lex_tables 3 lexbuf with
0 -> (
# 154 "parser/wlexer.mll"
store_code (hexa_char (Lexing.lexeme lexbuf)) )
# 171 "parser/wlexer.mll"
store_len ~add:2 lexbuf;
store_code (hexa_char (Lexing.lexeme lexbuf)) )
| 1 -> (
# 156 "parser/wlexer.mll"
store_char "\\x";
store_char (Lexing.lexeme lexbuf); )
# 174 "parser/wlexer.mll"
error
(Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf)
(Illegal_character '\\') )
| _ -> failwith "lexing: empty token [parse_hexa_char]"
;;
# 159 "parser/wlexer.mll"
# 179 "parser/wlexer.mll"
let delta_loc = ref 0
......
......@@ -19,9 +19,17 @@ classes
exception Unterminated_string_in_comment
(* Buffer for string literals : always encoded in Utf8 *)
(* Buffer for string literals (always encoded in UTF8).
Each character is encoded as two consecutive code points;
the first one gives the number of bytes in the input document;
the second one gives the Unicode representation *)
let string_buff = Buffer.create 1024
let store_len ?(add=0) lexbuf =
let l = add + (Lexing.lexeme_end lexbuf) - (Lexing.lexeme_start lexbuf) in
Encodings.Utf8.store string_buff l
let store_ascii = Buffer.add_char string_buff
let store_char = Buffer.add_string string_buff
let store_code = Encodings.Utf8.store string_buff
......@@ -59,8 +67,6 @@ classes
let identchar = lowercase | uppercase | ascii_digit | '_' | '\'' | '-'
let ident = identchar* ( ':' identchar+)*
let numeric_char = '\\' ascii_digit+ ';'
rule token = parse
blank+ { token engine lexbuf }
| (lowercase | '_') ident {
......@@ -112,6 +118,7 @@ and comment = parse
}
| '"' | "'"
{ string_start_pos := Lexing.lexeme_start lexbuf;
Buffer.clear string_buff;
let ender = Lexing.lexeme lexbuf in
(try string ender engine lexbuf
with Location.Location (_,Unterminated_string) ->
......@@ -128,33 +135,46 @@ and comment = parse
and string ender = parse
| '"' | "'"
{
let c = Lexing.lexeme lexbuf in
{ let c = Lexing.lexeme lexbuf in
if c = ender then ()
else (store_char (Lexing.lexeme lexbuf); string ender engine lexbuf)
}
else (store_len lexbuf;
store_char (Lexing.lexeme lexbuf);
string ender engine lexbuf) }
| '\\' ['\\' '"' '\'']
{ store_ascii (Lexing.lexeme_char lexbuf 1);
{ store_len lexbuf;
store_ascii (Lexing.lexeme_char lexbuf 1);
string ender engine lexbuf }
| '\\' lowercase {
let c = Lexing.lexeme_char lexbuf 1 in
if c = 'x' then parse_hexa_char engine lexbuf else store_special c;
string ender engine lexbuf }
| numeric_char
{ store_code (numeric_char (Lexing.lexeme lexbuf));
| '\\' lowercase
{ let c = Lexing.lexeme_char lexbuf 1 in
if c = 'x'
then parse_hexa_char engine lexbuf
else (store_len lexbuf; store_special c);
string ender engine lexbuf }
| '\\' ascii_digit+ ';'
{ store_len lexbuf;
store_code (numeric_char (Lexing.lexeme lexbuf));
string ender engine lexbuf }
| '\\'
{ error
(Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf)
(Illegal_character '\\') }
| eof
{ error !string_start_pos (!string_start_pos+1) Unterminated_string }
| _
{ store_code (Char.code (Lexing.lexeme_char lexbuf 0)); (* Adapt when source is UTF8 *)
{ store_len lexbuf;
store_code (Char.code (Lexing.lexeme_char lexbuf 0));
(* Adapt when source is UTF8 *)
string ender engine lexbuf }
and parse_hexa_char = parse
| ascii_digit+ ';'
{ store_code (hexa_char (Lexing.lexeme lexbuf)) }
{ store_len ~add:2 lexbuf;
store_code (hexa_char (Lexing.lexeme lexbuf)) }
| _
{ store_char "\\x";
store_char (Lexing.lexeme lexbuf); }
{ error
(Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf)
(Illegal_character '\\') }
{
......