Commit 36b83c45 authored by Kim Nguyễn

Fix the handling of polymorphic variables in the lexer.

The previous solution of using two lexers (depending on whether we are
between square brackets or not) is too brittle: it crudely tries to parse
``( [whitespace] 'a [whitespace] )'' as a variable, to force the user
to write the variable between parentheses. However, this does not scale
to types with two arguments (say [ t ('a, 'b) ]).

We use a simpler heuristic (with look-ahead):

(1) try to see if the regular expression

' (anything but ', \n)* '(anything but the first letter of an identifier)

can be found. If so, we put the lexeme back in the buffer and parse it as
a string.

(2) if (1) failed, try to parse it as a variable.

(3) if (2) failed, try to parse it again as a string. We are
guaranteed to fail here, but it means we have a malformed string, so we
parse it as a string to get a proper error message.

The only thing this does not cover is a case like
type t = [ 'abcd'Int ]
which was previously tokenized as [, 'abcd', Int, ]
and is now tokenized as [, 'abcd, 'Int, ].
This does not seem to be a problem in practice, though (in the code
I have seen so far, people were at least putting a space).
It would be easy to emit a warning in this case, suggesting that the user
add a whitespace to get the old behaviour back.
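
For reference, here is the heart of the heuristic as it lands in the new
token lexer (a simplified sketch of the rules from the patch below; the
exact not_ncname_letter character class and the do_string helper are
defined in the diff):

    | "'" ([^ '\t' '\n' '\r' '\''] | '\\' '\'')* "'" (not_ncname_letter | eof) ->
        (* (1) a quoted run whose closing quote is followed by something
           that cannot start an identifier: put the lexeme back and read
           it as a string *)
        L.rollback lexbuf;
        do_string lexbuf
    | "'" ncname ->
        (* (2) otherwise, a quote followed by an identifier is a
           polymorphic variable *)
        let s = L.utf8_sub_lexeme lexbuf 1 (L.lexeme_length lexbuf - 1) in
        return lexbuf (PVAR s)
    | ('"' | "'") ->
        (* (3) bound to fail, but going through the string lexer yields a
           proper error message for the malformed string *)
        L.rollback lexbuf;
        do_string lexbuf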
parent 77556e8b
@@ -325,7 +325,6 @@ let catch_exn ppf_err exn =
   Format.fprintf ppf_err "@."
 let parse rule input =
-  Ulexer.toplevel := !toplevel;
   try Parser.localize_exn (fun () -> rule input)
   with e -> Parser.sync (); raise e
@@ -408,4 +407,3 @@ let () =
   | [ (None,v) ] -> v
   | _ -> Value.failwith' "eval: the string must evaluate to a single value"
   )
@@ -110,8 +110,6 @@ let is_fun_decl =
   (fun strm ->
     match Stream.npeek 3 strm with
     | [ KEYWORD "fun", _; IDENT _, _; KEYWORD "(", _ ]
-    | [ KEYWORD "fun", _; IDENTPAR _, _; _ ]
-    | [ IDENTPAR _, _; _ ; _ ]
     | [ IDENT _, _; KEYWORD "(", _; _ ] -> ()
     | _ -> raise Stream.Failure
   )
@@ -159,12 +157,12 @@ EXTEND Gram
     args = OPT [ "("; l = LIST1 [ v = PVAR -> U.mk (clean_pvar v) ] SEP ","; ")" -> l ];
     "="; t = pat ->
       [ mk _loc (TypeDecl (x, opt_to_list args ,t)) ]
-  | "using"; name = IDENT; "="; cu = [ x = IDENT -> x | x = STRING -> x ] ->
+  | "using"; name = IDENT; "="; cu = [ x = IDENT -> x | x = STRING2 -> x ] ->
       [ mk _loc (Using (U.mk name, U.mk cu)) ]
   | "open"; ids = LIST1 ident_or_keyword SEP "." ->
       let ids = List.map (fun x -> ident x) ids in
       [ mk _loc (Open ids) ]
-  | "schema"; name = IDENT; "="; uri = STRING ->
+  | "schema"; name = IDENT; "="; uri = STRING2 ->
       protect_op "schema";
       [ mk _loc (SchemaDecl (U.mk name, uri)) ]
   | n = namespace_binding ->
@@ -194,7 +192,7 @@ EXTEND Gram
   | "#"; IDENT "help" -> [ mk _loc (Directive (`Help None)) ]
   | "#"; IDENT "help"; "debug" -> [ mk _loc (Directive (`Help (Some "debug"))) ]
   | "#"; IDENT "builtins" -> [ mk _loc (Directive `Builtins) ]
-  | "include"; s = STRING ->
+  | "include"; s = STRING2 ->
       protect_op "File inclusion";
       let s = Cduce_loc.resolve_filename s in
       (* avoid looping; should issue an error ? *)
@@ -392,7 +390,7 @@ EXTEND Gram
       a = expr_attrib_spec; ">"; c = expr ->
         exp _loc (Xml (t, Pair (a,c)))
   | "{"; r = expr_record_spec; "}" -> r
-  | s = STRING ->
+  | s = STRING2 ->
       let s = U.mk s in
       exp _loc (String (U.start_index s, U.end_index s, s, cst_nil))
   | a = IDENT -> exp _loc (Var (ident a))
@@ -414,7 +412,7 @@
   ];
   seq_elem: [
-    [ x = STRING2 ->
+    [ x = STRING1 ->
        let s = U.mk x in
        `String (_loc, U.start_index s, U.end_index s, s)
    | e = expr LEVEL "no_appl" -> `Elems (_loc,e)
@@ -434,7 +432,7 @@
   ];
   ns_expr: [
-    [ uri = STRING -> `Uri (Ns.Uri.mk (ident uri))
+    [ uri = STRING2 -> `Uri (Ns.Uri.mk (ident uri))
    | ids = LIST1 ident_or_keyword SEP "." ->
        let ids = List.map (fun x -> ident x) ids in
        `Path ids ]
@@ -600,11 +598,11 @@
       Elem (mk _loc (Constant ((ident a,c))))
   | "/"; p = pat LEVEL "simple" -> Guard p
   | IDENT "PCDATA" -> string_regexp
-  | i = STRING2; "--"; j = STRING2 ->
+  | i = STRING1; "--"; j = STRING1 ->
       let i = Chars.V.mk_int (parse_char _loc i)
       and j = Chars.V.mk_int (parse_char _loc j) in
       Elem (mk _loc (Internal (Types.char (Chars.char_class i j))))
-  | s = STRING2 ->
+  | s = STRING1 ->
       List.fold_right
         (fun c accu ->
           let c = Chars.V.mk_int c in
@@ -693,7 +691,7 @@
       | "("; t = pat; ")" -> t ];
     a = attrib_spec; ">"; c = pat ->
       mk _loc (XmlT (t, multi_prod _loc [a;c]))
-  | s = STRING ->
+  | s = STRING2 ->
      let s =
        List.map
          (fun c ->
@@ -731,10 +729,9 @@
     ]
   ];
-  char: [
-    [ c = CHAR -> Chars.V.mk_int (parse_char _loc c)
-    | c = STRING2 -> Chars.V.mk_int (parse_char _loc c) ]
+  char:
+    [
+      [ c = STRING1 -> Chars.V.mk_int (parse_char _loc c) ]
   ];
@@ -14,7 +14,6 @@ module Loc = struct
   let to_tuple _ = assert false
   let merge (x1, x2) (y1, y2) = (min x1 y1, max x2 y2)
-  let smart_merge a b = merge a b
   let join (x1, _) = (x1, x1)
   let move _ _ _ = assert false
   let shift _ _ = assert false
@@ -49,11 +48,9 @@ type token =
   | IDENT of string
   | ANY_IN_NS of string
   | INT of string
-  | CHAR of string
-  | STRING of string
+  | STRING1 of string
   | STRING2 of string
   | PVAR of string
-  | IDENTPAR of string
   | EOI
 module Token = struct
@@ -69,13 +66,11 @@ module Token = struct
   | KEYWORD s -> sf "KEYWORD %S" s
   | IDENT s -> sf "IDENT %S" s
   | INT s -> sf "INT %s" s
-  | CHAR s -> sf "CHAR \'%s\'" s
-  | STRING s -> sf "STRING \"%s\"" s
-  | STRING2 s -> sf "STRING \'%s\'" s
+  | STRING1 s -> sf "STRING \"%s\"" s
+  | STRING2 s -> sf "STRING \"%s\"" s
   (* here it's not %S since the string is already escaped *)
-  | PVAR s -> sf "PVAR \'%S\'" s
-  | IDENTPAR s -> sf "IDENTPAR \'%S\'" s
   | ANY_IN_NS s -> sf "ANY_IN_NS %S" s
+  | PVAR s -> sf "PVAR %S" s
   | EOI -> sf "EOI"
   let print ppf x = pp_print_string ppf (to_string x)
@@ -87,8 +82,8 @@ module Token = struct
   let extract_string =
     function
-    | IDENTPAR s | KEYWORD s | IDENT s | INT s | CHAR s | STRING s | STRING2 s | PVAR s |
-      ANY_IN_NS s -> s
+    | KEYWORD s | IDENT s | INT s | STRING1 s | STRING2 s |
+      ANY_IN_NS s | PVAR s -> s
     | tok ->
       invalid_arg ("Cannot extract a string from this token: "^
                    to_string tok)
@@ -141,8 +136,6 @@ module L = Ulexing
 exception Error of int * int * string
-let toplevel = ref false
 let error i j s = raise (Error (i,j,s))
 (* Buffer for string literals *)
@@ -181,24 +174,99 @@ let parse_char lexbuf base i =
   done;
   !r
+(* this should match the string lexer *)
+let regexp utf8_char = [^ '\\' '"' '\'' 9 10 13 ]
+let regexp dec_char = '\\' ['0'-'9']+ ';'
+let regexp hex_char = "\\x" ['0'-'9''a'-'f''A'-'F']+ ';'
+let regexp esc_char = '\\' ['\\' '"' '\'' 'n' 't' 'r']
+let regexp single_char = utf8_char | dec_char | hex_char | esc_char | '"'
 let regexp ncname_char =
   xml_letter | xml_digit | [ '-' '_' ] | xml_combining_char | xml_extender | "\\."
 let regexp ncname = ( xml_letter ncname_char* ) | ('_' ncname_char+)
 let regexp qname = (ncname ':')? ncname
-(* Should be [^ xml_letter ] *)
-let regexp not_xml_letter = [^ 'A'-'Z' 'a'-'z' '0'-'9' '_' ]
-let regexp character = _ | '\\' ['\\' '"' '\''] | "\\n" | "\\t" | "\\r"
-  | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F']+ ';' | '\\' ['0'-'9']+ ';'
+(* We want to write _ \ (xml_letter | '_') but we can't due to a limitation in ulex.
+   we define explicitely this regexp, based on the definition in ulex-1.1
+*)
+let regexp not_ncname_letter = [^ (* base_char *)
0x0041-0x005A 0x0061-0x007A 0x00C0-0x00D6 0x00D8-0x00F6
0x00F8-0x00FF 0x0100-0x0131 0x0134-0x013E 0x0141-0x0148
0x014A-0x017E 0x0180-0x01C3 0x01CD-0x01F0 0x01F4-0x01F5
0x01FA-0x0217 0x0250-0x02A8 0x02BB-0x02C1 0x0386-0x0386
0x0388-0x038A 0x038C-0x038C 0x038E-0x03A1 0x03A3-0x03CE
0x03D0-0x03D6 0x03DA-0x03DA 0x03DC-0x03DC 0x03DE-0x03DE
0x03E0-0x03E0 0x03E2-0x03F3
0x0401-0x040C 0x040E-0x044F 0x0451-0x045C 0x045E-0x0481
0x0490-0x04C4 0x04C7-0x04C8 0x04CB-0x04CC 0x04D0-0x04EB
0x04EE-0x04F5 0x04F8-0x04F9 0x0531-0x0556 0x0559-0x0559
0x0561-0x0586 0x05D0-0x05EA 0x05F0-0x05F2 0x0621-0x063A
0x0641-0x064A 0x0671-0x06B7 0x06BA-0x06BE 0x06C0-0x06CE
0x06D0-0x06D3 0x06D5-0x06D5 0x06E5-0x06E6 0x0905-0x0939
0x093D-0x093D
0x0958-0x0961 0x0985-0x098C 0x098F-0x0990 0x0993-0x09A8
0x09AA-0x09B0 0x09B2-0x09B2 0x09B6-0x09B9 0x09DC-0x09DD
0x09DF-0x09E1 0x09F0-0x09F1 0x0A05-0x0A0A 0x0A0F-0x0A10
0x0A13-0x0A28 0x0A2A-0x0A30 0x0A32-0x0A33 0x0A35-0x0A36
0x0A38-0x0A39 0x0A59-0x0A5C 0x0A5E-0x0A5E 0x0A72-0x0A74
0x0A85-0x0A8B 0x0A8D-0x0A8D 0x0A8F-0x0A91 0x0A93-0x0AA8
0x0AAA-0x0AB0 0x0AB2-0x0AB3 0x0AB5-0x0AB9 0x0ABD-0x0ABD
0x0AE0-0x0AE0
0x0B05-0x0B0C 0x0B0F-0x0B10 0x0B13-0x0B28 0x0B2A-0x0B30
0x0B32-0x0B33 0x0B36-0x0B39 0x0B3D-0x0B3D 0x0B5C-0x0B5D
0x0B5F-0x0B61 0x0B85-0x0B8A 0x0B8E-0x0B90 0x0B92-0x0B95
0x0B99-0x0B9A 0x0B9C-0x0B9C 0x0B9E-0x0B9F 0x0BA3-0x0BA4
0x0BA8-0x0BAA 0x0BAE-0x0BB5 0x0BB7-0x0BB9 0x0C05-0x0C0C
0x0C0E-0x0C10 0x0C12-0x0C28 0x0C2A-0x0C33 0x0C35-0x0C39
0x0C60-0x0C61 0x0C85-0x0C8C 0x0C8E-0x0C90 0x0C92-0x0CA8
0x0CAA-0x0CB3 0x0CB5-0x0CB9 0x0CDE-0x0CDE 0x0CE0-0x0CE1
0x0D05-0x0D0C 0x0D0E-0x0D10 0x0D12-0x0D28 0x0D2A-0x0D39
0x0D60-0x0D61 0x0E01-0x0E2E 0x0E30-0x0E30 0x0E32-0x0E33
0x0E40-0x0E45 0x0E81-0x0E82 0x0E84-0x0E84 0x0E87-0x0E88
0x0E8A-0x0E8A
0x0E8D-0x0E8D 0x0E94-0x0E97 0x0E99-0x0E9F 0x0EA1-0x0EA3
0x0EA5-0x0EA5
0x0EA7-0x0EA7 0x0EAA-0x0EAB 0x0EAD-0x0EAE 0x0EB0-0x0EB0
0x0EB2-0x0EB3
0x0EBD-0x0EBD 0x0EC0-0x0EC4 0x0F40-0x0F47 0x0F49-0x0F69
0x10A0-0x10C5 0x10D0-0x10F6 0x1100-0x1100 0x1102-0x1103
0x1105-0x1107 0x1109-0x1109 0x110B-0x110C 0x110E-0x1112
0x113C-0x113C
0x113E-0x113E 0x1140-0x1140 0x114C-0x114C 0x114E-0x114E
0x1150-0x1150 0x1154-0x1155 0x1159-0x1159
0x115F-0x1161 0x1163-0x1163 0x1165-0x1165 0x1167-0x1167
0x1169-0x1169 0x116D-0x116E
0x1172-0x1173 0x1175-0x1175 0x119E-0x119E 0x11A8-0x11A8
0x11AB-0x11AB 0x11AE-0x11AF
0x11B7-0x11B8 0x11BA-0x11BA 0x11BC-0x11C2 0x11EB-0x11EB
0x11F0-0x11F0 0x11F9-0x11F9
0x1E00-0x1E9B 0x1EA0-0x1EF9 0x1F00-0x1F15 0x1F18-0x1F1D
0x1F20-0x1F45 0x1F48-0x1F4D 0x1F50-0x1F57 0x1F59-0x1F59
0x1F5B-0x1F5B
0x1F5D-0x1F5D 0x1F5F-0x1F7D 0x1F80-0x1FB4 0x1FB6-0x1FBC
0x1FBE-0x1FBE
0x1FC2-0x1FC4 0x1FC6-0x1FCC 0x1FD0-0x1FD3 0x1FD6-0x1FDB
0x1FE0-0x1FEC 0x1FF2-0x1FF4 0x1FF6-0x1FFC 0x2126-0x2126
0x212A-0x212B 0x212E-0x212E 0x2180-0x2182 0x3041-0x3094
0x30A1-0x30FA 0x3105-0x312C 0xAC00-0xD7A3
(* ideographic *)
0x3007-0x3007 0x3021-0x3029 0x4E00-0x9FA5
(* '_' *)
'_'
]
 let illegal lexbuf =
   error
     (L.lexeme_start lexbuf)
     (L.lexeme_end lexbuf)
-    "Illegal character"
+    ("Illegal character : '" ^ (L.utf8_lexeme lexbuf) ^ "'")
 let in_comment = ref false
-let in_brackets = ref 0
 let return lexbuf tok = (tok, L.loc lexbuf)
 let return_loc i j tok = (tok, (i,j))
@@ -215,183 +283,57 @@ let rec token = lexer
     return lexbuf (ANY_IN_NS "")
   | '-'? ['0'-'9']+ ->
     return lexbuf (INT (L.utf8_lexeme lexbuf))
-  | [ "<>=.,:;+-*/@&{}()|?`!$" ]
+  | [ "<>=.,:;+-*/@&{}[]()|?`!" ]
   | "->" | "::" | ";;" | "--" | "//" | "/@" | ":=" | "\\" | "++"
   | "<=" | ">=" | "<<" | ">>" | "||" | "&&" | "**" | "_"
   | ".."
   | ["?+*"] "?" | "#" ->
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
-  | "[" ->
-    incr in_brackets;
-    return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
-  | "]" ->
-    decr in_brackets;
-    return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
-  | '"' ->
-    let start = L.lexeme_start lexbuf in
-    string (L.lexeme_start lexbuf) '"' lexbuf;
-    let s = get_stored_string () in
-    return_loc start (L.lexeme_end lexbuf) (STRING s)
-  | "'" character "'" ->
+  (* the three followning rules work together and are replicated in the comment lexer *)
+  | "'" ([^ '\t' '\n' '\r' '\''] | '\\' '\'')* "'" (not_ncname_letter | eof) ->
+    (* two single quotes not followed by an xml_letter must be a string
+       we put it back call an auxiliary lexer to consume the first ' and read it as a string.
+    *)
     L.rollback lexbuf;
-    (fun _ -> lexer
-      | "'" -> let start = L.lexeme_start lexbuf in
-        string (L.lexeme_start lexbuf) '\'' lexbuf;
-        let s = get_stored_string () in
-        return_loc start (L.lexeme_end lexbuf) (CHAR s)
-      | _ -> assert false) () lexbuf
+    do_string lexbuf
   | "'" ncname ->
-    let s = L.utf8_lexeme lexbuf in
-    let s = String.sub s 1 (String.length s - 1) in
-    return lexbuf (PVAR s)
-  | "(*" ->
-    in_comment := true;
-    comment (L.lexeme_start lexbuf) lexbuf;
-    in_comment := false;
-    token lexbuf
-  | "/*" ->
-    in_comment := true;
-    tcomment (L.lexeme_start lexbuf) lexbuf;
-    in_comment := false;
-    token lexbuf
-  | eof ->
-    return lexbuf EOI
-  | _ ->
-    illegal lexbuf
-and token2 = lexer
-  | xml_blank+ -> token2 lexbuf
-  | qname ->
-    let s = L.utf8_lexeme lexbuf in
-    return lexbuf (IDENT s)
-  | ncname ":*" ->
-    let s = L.utf8_sub_lexeme lexbuf 0 (L.lexeme_length lexbuf - 2) in
-    return lexbuf (ANY_IN_NS s)
-  | ".:*" ->
-    return lexbuf (ANY_IN_NS "")
-  | '-'? ['0'-'9']+ ->
-    return lexbuf (INT (L.utf8_lexeme lexbuf))
-  | [ "<>=.,:;+-*/@&{}()|?`!$" ]
-  | "->" | "::" | ";;" | "--" | "//" | "/@" | ":=" | "\\" | "++"
-  | "<=" | ">=" | "<<" | ">>" | "||" | "&&" | "**" | "_"
-  | ".."
-  | ["?+*"] "?" | "#" ->
-    return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
-  | "[" ->
-    incr in_brackets;
-    return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
-  | "]" ->
-    decr in_brackets;
-    return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
-  | '"' ->
-    let start = L.lexeme_start lexbuf in
-    string (L.lexeme_start lexbuf) '"' lexbuf;
-    let s = get_stored_string () in
-    return_loc start (L.lexeme_end lexbuf) (STRING s)
-  | "'" character "'--'" character "'"
-  | "'" [^ '\'']+ "'" not_xml_letter ->
+    (* then try to read it as variable *)
+    let s = L.utf8_sub_lexeme lexbuf 1 (L.lexeme_length lexbuf - 1) in
+    return lexbuf (PVAR (s))
+  | ('"' | "'") ->
+    (* otherwise we will fail for sure, but try to read it character by character as a string
+       to get a decent error message *)
     L.rollback lexbuf;
-    (fun _ -> lexer
-      | "'" -> let start = L.lexeme_start lexbuf in
-        string (L.lexeme_start lexbuf) '\'' lexbuf;
-        let s = get_stored_string () in
-        return_loc start (L.lexeme_end lexbuf) (STRING2 s)
-      | _ -> assert false) () lexbuf
-  | "(" [" \t"]* "'" ncname [" \t"]* ")" ->
-    let s = L.utf8_lexeme lexbuf in
-    let idstart = String.index s '\'' + 1 in
-    let s = String.sub s idstart (String.length s - idstart) in
-    let len = String.length s in
-    let idend = min (min (try String.index s ' ' with _ -> len)
-                         (try String.index s '\t' with _ -> len))
-                    (try String.index s ')' with _ -> len) in
-    let s = String.sub s 0 idend in
-    return lexbuf (PVAR ("()"^s)) (* UGLY hack to not loose the "("; ")" tokens. *)
-  | "'" ncname ->
-    let s = L.utf8_lexeme lexbuf in
-    let s = String.sub s 1 (String.length s - 1) in
-    return lexbuf (PVAR s)
+    do_string lexbuf
   | "(*" ->
     in_comment := true;
     comment (L.lexeme_start lexbuf) lexbuf;
     in_comment := false;
-    token2 lexbuf
+    token lexbuf
   | "/*" ->
     in_comment := true;
     tcomment (L.lexeme_start lexbuf) lexbuf;
     in_comment := false;
-    token2 lexbuf
+    token lexbuf
   | eof ->
     return lexbuf EOI
-  | _ ->
-    illegal lexbuf
-and token2toplevel = lexer
-  | xml_blank+ -> token2toplevel lexbuf
-  | qname ->
-    let s = L.utf8_lexeme lexbuf in
-    return lexbuf (IDENT s)
-  | ncname ":*" ->
-    let s = L.utf8_sub_lexeme lexbuf 0 (L.lexeme_length lexbuf - 2) in
-    return lexbuf (ANY_IN_NS s)
-  | ".:*" ->
-    return lexbuf (ANY_IN_NS "")
-  | '-'? ['0'-'9']+ ->
-    return lexbuf (INT (L.utf8_lexeme lexbuf))
-  | [ "<>=.,:;+-*/@&{}()|?`!$" ]
-  | "->" | "::" | ";;" | "--" | "//" | "/@" | ":=" | "\\" | "++"
-  | "<=" | ">=" | "<<" | ">>" | "||" | "&&" | "**" | "_"
-  | ".."
-  | ["?+*"] "?" | "#" ->
-    return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
-  | "[" ->
-    incr in_brackets;
-    return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
-  | "]" ->
-    decr in_brackets;
-    return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
-  | '"' ->
+  | _ -> illegal lexbuf
+and do_string = lexer
+  | "'" | '"' ->
     let start = L.lexeme_start lexbuf in
-    string (L.lexeme_start lexbuf) '"' lexbuf;
+    let double = (L.latin1_lexeme lexbuf).[0] == '"' in
+    string start double lexbuf;
     let s = get_stored_string () in
-    return_loc start (L.lexeme_end lexbuf) (STRING s)
-  | "'" character "'--'" character "'"
-  | "'" ((";"[^ ";'"]) | [^ ";'"])* ";"? "'" not_xml_letter ->
-    L.rollback lexbuf;
-    (fun _ -> lexer
-      | "'" -> let start = L.lexeme_start lexbuf in
-        string (L.lexeme_start lexbuf) '\'' lexbuf;
-        let s = get_stored_string () in
-        return_loc start (L.lexeme_end lexbuf) (STRING2 s)
-      | _ -> assert false) () lexbuf
-  | "(" [" \t"]* "'" ncname [" \t"]* ")" ->
-    let s = L.utf8_lexeme lexbuf in
-    let idstart = String.index s '\'' + 1 in
-    let s = String.sub s idstart (String.length s - idstart) in
-    let len = String.length s in
-    let idend = min (min (try String.index s ' ' with _ -> len)
-                         (try String.index s '\t' with _ -> len))
-                    (try String.index s ')' with _ -> len) in
-    let s = String.sub s 0 idend in
-    return lexbuf (PVAR ("()" ^ s)) (* UGLY hack to not loose the "("; ")" tokens. *)
-  | "'" ncname ->
-    let s = L.utf8_lexeme lexbuf in
-    let s = String.sub s 1 (String.length s - 1) in
-    return lexbuf (PVAR s)
-  | "(*" ->
-    in_comment := true;
-    comment (L.lexeme_start lexbuf) lexbuf;
-    in_comment := false;
-    token2toplevel lexbuf
-  | "/*" ->
-    in_comment := true;
-    tcomment (L.lexeme_start lexbuf) lexbuf;
-    in_comment := false;
-    token2toplevel lexbuf
-  | eof ->
-    return lexbuf EOI
-  | _ ->
-    illegal lexbuf
+    return_loc start (L.lexeme_end lexbuf)
+      (if double then STRING2 s else STRING1 s)
+  | _ -> assert false
 and comment start = lexer
   | "(*" ->
@@ -399,6 +341,24 @@ and comment start = lexer
     comment start lexbuf
   | "*)" ->
     ()
+  | "'" ([^ '\t' '\n' '\r' '\''] | '\\' '\'')* "'" (not_ncname_letter | eof) ->
+    L.rollback lexbuf;
+    ignore (do_string lexbuf);
+    comment start lexbuf
+  | "'" ncname -> comment start lexbuf
+  | ('"' | "'") ->
+    (* otherwise we will fail for sure, but try to read it character by character as a string
+       to get a decent error message *)
+    L.rollback lexbuf;
+    ignore (do_string lexbuf);
+    comment start lexbuf
   | eof ->