Commit 36b83c45 authored by Kim Nguyễn's avatar Kim Nguyễn
Browse files

Fix the handling of polymorphic variables in the lexer. The solution

to use two lexers (depending on whether we are between square brackets
or not) is too brittle (it crudely tries to parse
 ``( [whitespace] 'a  [whitespace] )'' as a variable, to force the user
to write the variable beetween parenthesis. However this does not scale
to types with two arguments (says [ t ('a, 'b) ]).

We use a simpler heuristic (with look ahead)

(1) try to see if the regular expression

' (anything but ', \n)* '(anything but the first letter of an identifier)

can be found. If so, we put back the lexeme in the buffer and parse it as as
a string.

(2) if (1) failed, try to parse it as a variable

(3) if (3) failed, try to parse it again as a string. We are
guaranteed to fail here but it means we have a malformed string, so we
parse as a string to get a proper error message.

The only thing this does not cover are cases like
type t = [ 'abcd'Int ]
which was tokenized before as [, 'abcd', Int, ]
and is now tokenized as [, 'abcd, 'Int, ]
It does not seem to be a problem in practice though (since in the code
I have seen thus far, people were at least putting a space).
it is easy to emmit a warning in this case, suggesting the user to add
a whitespace to get the old behaviour back.
parent 77556e8b
......@@ -325,7 +325,6 @@ let catch_exn ppf_err exn =
Format.fprintf ppf_err "@."
let parse rule input =
Ulexer.toplevel := !toplevel;
try Parser.localize_exn (fun () -> rule input)
with e -> Parser.sync (); raise e
......@@ -408,4 +407,3 @@ let () =
| [ (None,v) ] -> v
| _ -> Value.failwith' "eval: the string must evaluate to a single value"
)
......@@ -110,8 +110,6 @@ let is_fun_decl =
(fun strm ->
match Stream.npeek 3 strm with
| [ KEYWORD "fun", _; IDENT _, _; KEYWORD "(", _ ]
| [ KEYWORD "fun", _; IDENTPAR _, _; _ ]
| [ IDENTPAR _, _; _ ; _ ]
| [ IDENT _, _; KEYWORD "(", _; _ ] -> ()
| _ -> raise Stream.Failure
)
......@@ -159,12 +157,12 @@ EXTEND Gram
args = OPT [ "("; l = LIST1 [ v = PVAR -> U.mk (clean_pvar v) ] SEP ","; ")" -> l ];
"="; t = pat ->
[ mk _loc (TypeDecl (x, opt_to_list args ,t)) ]
| "using"; name = IDENT; "="; cu = [ x = IDENT -> x | x = STRING -> x ] ->
| "using"; name = IDENT; "="; cu = [ x = IDENT -> x | x = STRING2 -> x ] ->
[ mk _loc (Using (U.mk name, U.mk cu)) ]
| "open"; ids = LIST1 ident_or_keyword SEP "." ->
let ids = List.map (fun x -> ident x) ids in
[ mk _loc (Open ids) ]
| "schema"; name = IDENT; "="; uri = STRING ->
| "schema"; name = IDENT; "="; uri = STRING2 ->
protect_op "schema";
[ mk _loc (SchemaDecl (U.mk name, uri)) ]
| n = namespace_binding ->
......@@ -194,7 +192,7 @@ EXTEND Gram
| "#"; IDENT "help" -> [ mk _loc (Directive (`Help None)) ]
| "#"; IDENT "help"; "debug" -> [ mk _loc (Directive (`Help (Some "debug"))) ]
| "#"; IDENT "builtins" -> [ mk _loc (Directive `Builtins) ]
| "include"; s = STRING ->
| "include"; s = STRING2 ->
protect_op "File inclusion";
let s = Cduce_loc.resolve_filename s in
(* avoid looping; should issue an error ? *)
......@@ -392,7 +390,7 @@ EXTEND Gram
a = expr_attrib_spec; ">"; c = expr ->
exp _loc (Xml (t, Pair (a,c)))
| "{"; r = expr_record_spec; "}" -> r
| s = STRING ->
| s = STRING2 ->
let s = U.mk s in
exp _loc (String (U.start_index s, U.end_index s, s, cst_nil))
| a = IDENT -> exp _loc (Var (ident a))
......@@ -414,7 +412,7 @@ EXTEND Gram
];
seq_elem: [
[ x = STRING2 ->
[ x = STRING1 ->
let s = U.mk x in
`String (_loc, U.start_index s, U.end_index s, s)
| e = expr LEVEL "no_appl" -> `Elems (_loc,e)
......@@ -434,7 +432,7 @@ EXTEND Gram
];
ns_expr: [
[ uri = STRING -> `Uri (Ns.Uri.mk (ident uri))
[ uri = STRING2 -> `Uri (Ns.Uri.mk (ident uri))
| ids = LIST1 ident_or_keyword SEP "." ->
let ids = List.map (fun x -> ident x) ids in
`Path ids ]
......@@ -600,11 +598,11 @@ EXTEND Gram
Elem (mk _loc (Constant ((ident a,c))))
| "/"; p = pat LEVEL "simple" -> Guard p
| IDENT "PCDATA" -> string_regexp
| i = STRING2; "--"; j = STRING2 ->
| i = STRING1; "--"; j = STRING1 ->
let i = Chars.V.mk_int (parse_char _loc i)
and j = Chars.V.mk_int (parse_char _loc j) in
Elem (mk _loc (Internal (Types.char (Chars.char_class i j))))
| s = STRING2 ->
| s = STRING1 ->
List.fold_right
(fun c accu ->
let c = Chars.V.mk_int c in
......@@ -693,7 +691,7 @@ EXTEND Gram
| "("; t = pat; ")" -> t ];
a = attrib_spec; ">"; c = pat ->
mk _loc (XmlT (t, multi_prod _loc [a;c]))
| s = STRING ->
| s = STRING2 ->
let s =
List.map
(fun c ->
......@@ -731,10 +729,9 @@ EXTEND Gram
]
];
char: [
[ c = CHAR -> Chars.V.mk_int (parse_char _loc c)
| c = STRING2 -> Chars.V.mk_int (parse_char _loc c) ]
char:
[
[ c = STRING1 -> Chars.V.mk_int (parse_char _loc c) ]
];
......
......@@ -14,7 +14,6 @@ module Loc = struct
let to_tuple _ = assert false
let merge (x1, x2) (y1, y2) = (min x1 y1, max x2 y2)
let smart_merge a b = merge a b
let join (x1, _) = (x1, x1)
let move _ _ _ = assert false
let shift _ _ = assert false
......@@ -49,11 +48,9 @@ type token =
| IDENT of string
| ANY_IN_NS of string
| INT of string
| CHAR of string
| STRING of string
| STRING1 of string
| STRING2 of string
| PVAR of string
| IDENTPAR of string
| EOI
module Token = struct
......@@ -69,13 +66,11 @@ module Token = struct
| KEYWORD s -> sf "KEYWORD %S" s
| IDENT s -> sf "IDENT %S" s
| INT s -> sf "INT %s" s
| CHAR s -> sf "CHAR \'%s\'" s
| STRING s -> sf "STRING \"%s\"" s
| STRING2 s -> sf "STRING \'%s\'" s
| STRING1 s -> sf "STRING \"%s\"" s
| STRING2 s -> sf "STRING \"%s\"" s
(* here it's not %S since the string is already escaped *)
| PVAR s -> sf "PVAR \'%S\'" s
| IDENTPAR s -> sf "IDENTPAR \'%S\'" s
| ANY_IN_NS s -> sf "ANY_IN_NS %S" s
| PVAR s -> sf "PVAR %S" s
| EOI -> sf "EOI"
let print ppf x = pp_print_string ppf (to_string x)
......@@ -87,8 +82,8 @@ module Token = struct
let extract_string =
function
| IDENTPAR s | KEYWORD s | IDENT s | INT s | CHAR s | STRING s | STRING2 s | PVAR s |
ANY_IN_NS s -> s
| KEYWORD s | IDENT s | INT s | STRING1 s | STRING2 s |
ANY_IN_NS s | PVAR s -> s
| tok ->
invalid_arg ("Cannot extract a string from this token: "^
to_string tok)
......@@ -141,8 +136,6 @@ module L = Ulexing
exception Error of int * int * string
let toplevel = ref false
let error i j s = raise (Error (i,j,s))
(* Buffer for string literals *)
......@@ -181,24 +174,99 @@ let parse_char lexbuf base i =
done;
!r
(* this should match the string lexer *)
let regexp utf8_char = [^ '\\' '"' '\'' 9 10 13 ]
let regexp dec_char = '\\' ['0'-'9']+ ';'
let regexp hex_char = "\\x" ['0'-'9''a'-'f''A'-'F']+ ';'
let regexp esc_char = '\\' ['\\' '"' '\'' 'n' 't' 'r']
let regexp single_char = utf8_char | dec_char | hex_char | esc_char | '"'
let regexp ncname_char =
xml_letter | xml_digit | [ '-' '_' ] | xml_combining_char | xml_extender | "\\."
let regexp ncname = ( xml_letter ncname_char* ) | ('_' ncname_char+)
let regexp qname = (ncname ':')? ncname
(* Should be [^ xml_letter ] *)
let regexp not_xml_letter = [^ 'A'-'Z' 'a'-'z' '0'-'9' '_' ]
let regexp character = _ | '\\' ['\\' '"' '\''] | "\\n" | "\\t" | "\\r"
| '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F']+ ';' | '\\' ['0'-'9']+ ';'
(* We want to write _ \ (xml_letter | '_') but we can't due to a limitation in ulex.
we define explicitely this regexp, based on the definition in ulex-1.1
*)
let regexp not_ncname_letter = [^ (* base_char *)
0x0041-0x005A 0x0061-0x007A 0x00C0-0x00D6 0x00D8-0x00F6
0x00F8-0x00FF 0x0100-0x0131 0x0134-0x013E 0x0141-0x0148
0x014A-0x017E 0x0180-0x01C3 0x01CD-0x01F0 0x01F4-0x01F5
0x01FA-0x0217 0x0250-0x02A8 0x02BB-0x02C1 0x0386-0x0386
0x0388-0x038A 0x038C-0x038C 0x038E-0x03A1 0x03A3-0x03CE
0x03D0-0x03D6 0x03DA-0x03DA 0x03DC-0x03DC 0x03DE-0x03DE
0x03E0-0x03E0 0x03E2-0x03F3
0x0401-0x040C 0x040E-0x044F 0x0451-0x045C 0x045E-0x0481
0x0490-0x04C4 0x04C7-0x04C8 0x04CB-0x04CC 0x04D0-0x04EB
0x04EE-0x04F5 0x04F8-0x04F9 0x0531-0x0556 0x0559-0x0559
0x0561-0x0586 0x05D0-0x05EA 0x05F0-0x05F2 0x0621-0x063A
0x0641-0x064A 0x0671-0x06B7 0x06BA-0x06BE 0x06C0-0x06CE
0x06D0-0x06D3 0x06D5-0x06D5 0x06E5-0x06E6 0x0905-0x0939
0x093D-0x093D
0x0958-0x0961 0x0985-0x098C 0x098F-0x0990 0x0993-0x09A8
0x09AA-0x09B0 0x09B2-0x09B2 0x09B6-0x09B9 0x09DC-0x09DD
0x09DF-0x09E1 0x09F0-0x09F1 0x0A05-0x0A0A 0x0A0F-0x0A10
0x0A13-0x0A28 0x0A2A-0x0A30 0x0A32-0x0A33 0x0A35-0x0A36
0x0A38-0x0A39 0x0A59-0x0A5C 0x0A5E-0x0A5E 0x0A72-0x0A74
0x0A85-0x0A8B 0x0A8D-0x0A8D 0x0A8F-0x0A91 0x0A93-0x0AA8
0x0AAA-0x0AB0 0x0AB2-0x0AB3 0x0AB5-0x0AB9 0x0ABD-0x0ABD
0x0AE0-0x0AE0
0x0B05-0x0B0C 0x0B0F-0x0B10 0x0B13-0x0B28 0x0B2A-0x0B30
0x0B32-0x0B33 0x0B36-0x0B39 0x0B3D-0x0B3D 0x0B5C-0x0B5D
0x0B5F-0x0B61 0x0B85-0x0B8A 0x0B8E-0x0B90 0x0B92-0x0B95
0x0B99-0x0B9A 0x0B9C-0x0B9C 0x0B9E-0x0B9F 0x0BA3-0x0BA4
0x0BA8-0x0BAA 0x0BAE-0x0BB5 0x0BB7-0x0BB9 0x0C05-0x0C0C
0x0C0E-0x0C10 0x0C12-0x0C28 0x0C2A-0x0C33 0x0C35-0x0C39
0x0C60-0x0C61 0x0C85-0x0C8C 0x0C8E-0x0C90 0x0C92-0x0CA8
0x0CAA-0x0CB3 0x0CB5-0x0CB9 0x0CDE-0x0CDE 0x0CE0-0x0CE1
0x0D05-0x0D0C 0x0D0E-0x0D10 0x0D12-0x0D28 0x0D2A-0x0D39
0x0D60-0x0D61 0x0E01-0x0E2E 0x0E30-0x0E30 0x0E32-0x0E33
0x0E40-0x0E45 0x0E81-0x0E82 0x0E84-0x0E84 0x0E87-0x0E88
0x0E8A-0x0E8A
0x0E8D-0x0E8D 0x0E94-0x0E97 0x0E99-0x0E9F 0x0EA1-0x0EA3
0x0EA5-0x0EA5
0x0EA7-0x0EA7 0x0EAA-0x0EAB 0x0EAD-0x0EAE 0x0EB0-0x0EB0
0x0EB2-0x0EB3
0x0EBD-0x0EBD 0x0EC0-0x0EC4 0x0F40-0x0F47 0x0F49-0x0F69
0x10A0-0x10C5 0x10D0-0x10F6 0x1100-0x1100 0x1102-0x1103
0x1105-0x1107 0x1109-0x1109 0x110B-0x110C 0x110E-0x1112
0x113C-0x113C
0x113E-0x113E 0x1140-0x1140 0x114C-0x114C 0x114E-0x114E
0x1150-0x1150 0x1154-0x1155 0x1159-0x1159
0x115F-0x1161 0x1163-0x1163 0x1165-0x1165 0x1167-0x1167
0x1169-0x1169 0x116D-0x116E
0x1172-0x1173 0x1175-0x1175 0x119E-0x119E 0x11A8-0x11A8
0x11AB-0x11AB 0x11AE-0x11AF
0x11B7-0x11B8 0x11BA-0x11BA 0x11BC-0x11C2 0x11EB-0x11EB
0x11F0-0x11F0 0x11F9-0x11F9
0x1E00-0x1E9B 0x1EA0-0x1EF9 0x1F00-0x1F15 0x1F18-0x1F1D
0x1F20-0x1F45 0x1F48-0x1F4D 0x1F50-0x1F57 0x1F59-0x1F59
0x1F5B-0x1F5B
0x1F5D-0x1F5D 0x1F5F-0x1F7D 0x1F80-0x1FB4 0x1FB6-0x1FBC
0x1FBE-0x1FBE
0x1FC2-0x1FC4 0x1FC6-0x1FCC 0x1FD0-0x1FD3 0x1FD6-0x1FDB
0x1FE0-0x1FEC 0x1FF2-0x1FF4 0x1FF6-0x1FFC 0x2126-0x2126
0x212A-0x212B 0x212E-0x212E 0x2180-0x2182 0x3041-0x3094
0x30A1-0x30FA 0x3105-0x312C 0xAC00-0xD7A3
(* ideographic *)
0x3007-0x3007 0x3021-0x3029 0x4E00-0x9FA5
(* '_' *)
'_'
]
let illegal lexbuf =
error
(L.lexeme_start lexbuf)
(L.lexeme_end lexbuf)
"Illegal character"
("Illegal character : '" ^ (L.utf8_lexeme lexbuf) ^ "'")
let in_comment = ref false
let in_brackets = ref 0
let return lexbuf tok = (tok, L.loc lexbuf)
let return_loc i j tok = (tok, (i,j))
......@@ -215,183 +283,57 @@ let rec token = lexer
return lexbuf (ANY_IN_NS "")
| '-'? ['0'-'9']+ ->
return lexbuf (INT (L.utf8_lexeme lexbuf))
| [ "<>=.,:;+-*/@&{}()|?`!$" ]
| [ "<>=.,:;+-*/@&{}[]()|?`!" ]
| "->" | "::" | ";;" | "--" | "//" | "/@" | ":=" | "\\" | "++"
| "<=" | ">=" | "<<" | ">>" | "||" | "&&" | "**" | "_"
| ".."
| ["?+*"] "?" | "#" ->
return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
| "[" ->
incr in_brackets;
return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
| "]" ->
decr in_brackets;
return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
| '"' ->
let start = L.lexeme_start lexbuf in
string (L.lexeme_start lexbuf) '"' lexbuf;
let s = get_stored_string () in
return_loc start (L.lexeme_end lexbuf) (STRING s)
| "'" character "'" ->
(* the three followning rules work together and are replicated in the comment lexer *)
| "'" ([^ '\t' '\n' '\r' '\''] | '\\' '\'')* "'" (not_ncname_letter | eof) ->
(* two single quotes not followed by an xml_letter must be a string
we put it back call an auxiliary lexer to consume the first ' and read it as a string.
*)
L.rollback lexbuf;
(fun _ -> lexer
| "'" -> let start = L.lexeme_start lexbuf in
string (L.lexeme_start lexbuf) '\'' lexbuf;
let s = get_stored_string () in
return_loc start (L.lexeme_end lexbuf) (CHAR s)
| _ -> assert false) () lexbuf
do_string lexbuf
| "'" ncname ->
let s = L.utf8_lexeme lexbuf in
let s = String.sub s 1 (String.length s - 1) in
return lexbuf (PVAR s)
| "(*" ->
in_comment := true;
comment (L.lexeme_start lexbuf) lexbuf;
in_comment := false;
token lexbuf
| "/*" ->
in_comment := true;
tcomment (L.lexeme_start lexbuf) lexbuf;
in_comment := false;
token lexbuf
| eof ->
return lexbuf EOI
| _ ->
illegal lexbuf
(* then try to read it as variable *)
let s = L.utf8_sub_lexeme lexbuf 1 (L.lexeme_length lexbuf - 1) in
return lexbuf (PVAR (s))
and token2 = lexer
| xml_blank+ -> token2 lexbuf
| qname ->
let s = L.utf8_lexeme lexbuf in
return lexbuf (IDENT s)
| ncname ":*" ->
let s = L.utf8_sub_lexeme lexbuf 0 (L.lexeme_length lexbuf - 2) in
return lexbuf (ANY_IN_NS s)
| ".:*" ->
return lexbuf (ANY_IN_NS "")
| '-'? ['0'-'9']+ ->
return lexbuf (INT (L.utf8_lexeme lexbuf))
| [ "<>=.,:;+-*/@&{}()|?`!$" ]
| "->" | "::" | ";;" | "--" | "//" | "/@" | ":=" | "\\" | "++"
| "<=" | ">=" | "<<" | ">>" | "||" | "&&" | "**" | "_"
| ".."
| ["?+*"] "?" | "#" ->
return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
| "[" ->
incr in_brackets;
return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
| "]" ->
decr in_brackets;
return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
| '"' ->
let start = L.lexeme_start lexbuf in
string (L.lexeme_start lexbuf) '"' lexbuf;
let s = get_stored_string () in
return_loc start (L.lexeme_end lexbuf) (STRING s)
| "'" character "'--'" character "'"
| "'" [^ '\'']+ "'" not_xml_letter ->
| ('"' | "'") ->
(* otherwise we will fail for sure, but try to read it character by character as a string
to get a decent error message *)
L.rollback lexbuf;
(fun _ -> lexer
| "'" -> let start = L.lexeme_start lexbuf in
string (L.lexeme_start lexbuf) '\'' lexbuf;
let s = get_stored_string () in
return_loc start (L.lexeme_end lexbuf) (STRING2 s)
| _ -> assert false) () lexbuf
| "(" [" \t"]* "'" ncname [" \t"]* ")" ->
let s = L.utf8_lexeme lexbuf in
let idstart = String.index s '\'' + 1 in
let s = String.sub s idstart (String.length s - idstart) in
let len = String.length s in
let idend = min (min (try String.index s ' ' with _ -> len)
(try String.index s '\t' with _ -> len))
(try String.index s ')' with _ -> len) in
let s = String.sub s 0 idend in
return lexbuf (PVAR ("()"^s)) (* UGLY hack to not loose the "("; ")" tokens. *)
| "'" ncname ->
let s = L.utf8_lexeme lexbuf in
let s = String.sub s 1 (String.length s - 1) in
return lexbuf (PVAR s)
do_string lexbuf
| "(*" ->
in_comment := true;
comment (L.lexeme_start lexbuf) lexbuf;
in_comment := false;
token2 lexbuf
token lexbuf
| "/*" ->
in_comment := true;
tcomment (L.lexeme_start lexbuf) lexbuf;
in_comment := false;
token2 lexbuf
token lexbuf
| eof ->
return lexbuf EOI
| _ ->
illegal lexbuf
| _ -> illegal lexbuf
and token2toplevel = lexer
| xml_blank+ -> token2toplevel lexbuf
| qname ->
let s = L.utf8_lexeme lexbuf in
return lexbuf (IDENT s)
| ncname ":*" ->
let s = L.utf8_sub_lexeme lexbuf 0 (L.lexeme_length lexbuf - 2) in
return lexbuf (ANY_IN_NS s)
| ".:*" ->
return lexbuf (ANY_IN_NS "")
| '-'? ['0'-'9']+ ->
return lexbuf (INT (L.utf8_lexeme lexbuf))
| [ "<>=.,:;+-*/@&{}()|?`!$" ]
| "->" | "::" | ";;" | "--" | "//" | "/@" | ":=" | "\\" | "++"
| "<=" | ">=" | "<<" | ">>" | "||" | "&&" | "**" | "_"
| ".."
| ["?+*"] "?" | "#" ->
return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
| "[" ->
incr in_brackets;
return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
| "]" ->
decr in_brackets;
return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
| '"' ->
and do_string = lexer
| "'" | '"' ->
let start = L.lexeme_start lexbuf in
string (L.lexeme_start lexbuf) '"' lexbuf;
let double = (L.latin1_lexeme lexbuf).[0] == '"' in
string start double lexbuf;
let s = get_stored_string () in
return_loc start (L.lexeme_end lexbuf) (STRING s)
| "'" character "'--'" character "'"
| "'" ((";"[^ ";'"]) | [^ ";'"])* ";"? "'" not_xml_letter ->
L.rollback lexbuf;
(fun _ -> lexer
| "'" -> let start = L.lexeme_start lexbuf in
string (L.lexeme_start lexbuf) '\'' lexbuf;
let s = get_stored_string () in
return_loc start (L.lexeme_end lexbuf) (STRING2 s)
| _ -> assert false) () lexbuf
| "(" [" \t"]* "'" ncname [" \t"]* ")" ->
let s = L.utf8_lexeme lexbuf in
let idstart = String.index s '\'' + 1 in
let s = String.sub s idstart (String.length s - idstart) in
let len = String.length s in
let idend = min (min (try String.index s ' ' with _ -> len)
(try String.index s '\t' with _ -> len))
(try String.index s ')' with _ -> len) in
let s = String.sub s 0 idend in
return lexbuf (PVAR ("()" ^ s)) (* UGLY hack to not loose the "("; ")" tokens. *)
| "'" ncname ->
let s = L.utf8_lexeme lexbuf in
let s = String.sub s 1 (String.length s - 1) in
return lexbuf (PVAR s)
| "(*" ->
in_comment := true;
comment (L.lexeme_start lexbuf) lexbuf;
in_comment := false;
token2toplevel lexbuf
| "/*" ->
in_comment := true;
tcomment (L.lexeme_start lexbuf) lexbuf;
in_comment := false;
token2toplevel lexbuf
| eof ->
return lexbuf EOI
| _ ->
illegal lexbuf
return_loc start (L.lexeme_end lexbuf)
(if double then STRING2 s else STRING1 s)
| _ -> assert false
and comment start = lexer
| "(*" ->
......@@ -399,6 +341,24 @@ and comment start = lexer
comment start lexbuf
| "*)" ->
()
| "'" ([^ '\t' '\n' '\r' '\''] | '\\' '\'')* "'" (not_ncname_letter | eof) ->
L.rollback lexbuf;
ignore (do_string lexbuf);
comment start lexbuf
| "'" ncname -> comment start lexbuf
| ('"' | "'") ->
(* otherwise we will fail for sure, but try to read it character by character as a string
to get a decent error message *)
L.rollback lexbuf;
ignore (do_string lexbuf);
comment start lexbuf
| eof ->
error start (start+2) "Unterminated comment"
| _ ->
......@@ -412,32 +372,39 @@ and tcomment start = lexer
| _ ->
tcomment start lexbuf
and string start endchar = lexer
| '"' -> if endchar = '"' then ()
else (store_lexeme lexbuf; string start endchar lexbuf)
| "'" -> if endchar = '\'' then ()
else (store_lexeme lexbuf; string start endchar lexbuf)
and string start double = lexer
| '"' | "'" ->
let d = L.latin1_lexeme_char lexbuf 0 = '"' in
if d != double then (store_lexeme lexbuf; string start double lexbuf)
| '\\' ['\\' '"' '\''] ->
store_ascii (L.latin1_lexeme_char lexbuf 1); string start endchar lexbuf
| "\\n" -> store_ascii '\n'; string start endchar lexbuf
| "\\t" -> store_ascii '\t'; string start endchar lexbuf
| "\\r" -> store_ascii '\r'; string start endchar lexbuf
store_ascii (L.latin1_lexeme_char lexbuf 1);
string start double lexbuf
| "\\n" ->
store_ascii '\n'; string start double lexbuf
| "\\t" ->
store_ascii '\t'; string start double lexbuf
| "\\r" ->
store_ascii '\r'; string start double lexbuf
| '\\' ['0'-'9']+ ';' ->
store_code (parse_char lexbuf 10 1); string start endchar lexbuf
store_code (parse_char lexbuf 10 1);
string start double lexbuf
| '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F']+ ';' ->
store_code (parse_char lexbuf 16 2); string start endchar lexbuf
| '\\' -> illegal lexbuf;
| eof -> error start (start+1) "Unterminated string"
| _ -> store_lexeme lexbuf; string start endchar lexbuf
store_code (parse_char lexbuf 16 2);
string start double lexbuf
| '\\' -> illegal lexbuf
| eof ->
error start (start+1) "Unterminated string"
| _ ->
store_lexeme lexbuf;
string start double lexbuf
let token lexbuf =
if !in_brackets = 0 then token lexbuf
else if !toplevel then token2toplevel lexbuf
else token2 lexbuf
let lexbuf = ref None
let last_tok = ref (KEYWORD "DUMMY")
let rec sync lb =
match !last_tok with
| KEYWORD ";;" | EOI -> ()
......@@ -449,6 +416,7 @@ let raise_clean e =
(* reinit encoding ? *)
raise e
let mk () _FIXME_loc cs =
let lb = L.from_var_enc_stream enc cs in
(lexer ("#!" [^ '\n']* "\n")? -> ()) lb;
......
......@@ -5,17 +5,13 @@ type token =
| IDENT of string
| ANY_IN_NS of string
| INT of string
| CHAR of string
| STRING of string
| STRING1 of string
| STRING2 of string
| PVAR of string
| IDENTPAR of string
| EOI
exception Error of int * int * string
val toplevel : bool ref
module Loc : Loc with type t = int * int
module Token : Token with module Loc = Loc and type t = token
module Error : Error
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment