Commit 162b4367 authored by Julien Lopez's avatar Julien Lopez
Browse files

First attempt to get the "'IDENT" syntax for type variables.

At this point, the lexer gives a type if it sees a "'" then IDENT then ' '.
This is not enough, but now simple tests like this one works:
	fun ('A -> Int) _ -> 0
parent 4219614b
......@@ -634,6 +634,7 @@ EXTEND Gram
| i = char ; "--"; j = char ->
mk _loc (Internal (Types.char (Chars.char_class i j)))
| "`"; c = tag_type -> c
| x = PVAR -> mk _loc (TVar x)
| "("; l = LIST1 pat SEP ","; ")" -> multi_prod _loc l
| "["; r = [ r = regexp -> r | -> Epsilon ];
q = [ ";"; q = pat -> Some q
......
......@@ -37,7 +37,7 @@ module Loc = struct
let to_string _ = assert false
exception Exc_located of t * exn
let raise loc exn =
match exn with
match exn with
| Exc_located _ -> raise exn
| _ -> raise (Exc_located (loc, exn))
let name = ref "_loc"
......@@ -46,6 +46,7 @@ end
type token =
| KEYWORD of string
| IDENT of string
| PVAR of string
| ANY_IN_NS of string
| INT of string
| STRING1 of string
......@@ -62,14 +63,15 @@ module Token = struct
let to_string =
function
| KEYWORD s -> sf "KEYWORD %S" s
| IDENT s -> sf "IDENT %S" s
| INT s -> sf "INT %s" s
| STRING1 s -> sf "STRING \"%s\"" s
| STRING2 s -> sf "STRING \"%s\"" s
| KEYWORD s -> sf "KEYWORD %S" s
| IDENT s -> sf "IDENT %S" s
| PVAR s -> sf "PVAR \"%s\"" s
| INT s -> sf "INT %s" s
| STRING1 s -> sf "STRING \"%s\"" s
| STRING2 s -> sf "STRING \"%s\"" s
(* here it's not %S since the string is already escaped *)
| ANY_IN_NS s -> sf "ANY_IN_NS %S" s
| EOI -> sf "EOI"
| ANY_IN_NS s -> sf "ANY_IN_NS %S" s
| EOI -> sf "EOI"
let print ppf x = pp_print_string ppf (to_string x)
......@@ -80,7 +82,7 @@ module Token = struct
let extract_string =
function
| KEYWORD s | IDENT s | INT s | STRING1 s | STRING2 s |
| KEYWORD s | IDENT s | PVAR s | INT s | STRING1 s | STRING2 s |
ANY_IN_NS s -> s
| tok ->
invalid_arg ("Cannot extract a string from this token: "^
......@@ -137,10 +139,10 @@ exception Error of int * int * string
let error i j s = raise (Error (i,j,s))
(* Buffer for string literals *)
let string_buff = Buffer.create 1024
let store_lexeme lexbuf =
let store_lexeme lexbuf =
Buffer.add_string string_buff (Ulexing.utf8_lexeme lexbuf)
let store_ascii = Buffer.add_char string_buff
let store_code = Utf8.store string_buff
......@@ -154,26 +156,26 @@ let get_stored_string () =
let enc = ref L.Latin1
(* Parse characters literals \123; \x123; *)
let hexa_digit = function
| '0'..'9' as c -> (Char.code c) - (Char.code '0')
| 'a'..'f' as c -> (Char.code c) - (Char.code 'a') + 10
| 'A'..'F' as c -> (Char.code c) - (Char.code 'A') + 10
| _ -> -1
let parse_char lexbuf base i =
let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in
let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in
let r = ref 0 in
for i = 0 to String.length s - 1 do
let c = hexa_digit s.[i] in
if (c >= base) || (c < 0) then
if (c >= base) || (c < 0) then
error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "invalid digit";
r := !r * base + c;
done;
!r
let regexp ncname_char =
let regexp ncname_char =
xml_letter | xml_digit | [ '-' '_' ] | xml_combining_char | xml_extender | "\\."
let regexp ncname = ( xml_letter ncname_char* ) | ('_' ncname_char+)
let regexp qname = (ncname ':')? ncname
......@@ -182,7 +184,7 @@ let regexp qname = (ncname ':')? ncname
let illegal lexbuf =
error
(L.lexeme_start lexbuf)
(L.lexeme_end lexbuf)
(L.lexeme_end lexbuf)
"Illegal character"
let in_comment = ref false
......@@ -190,7 +192,7 @@ let in_comment = ref false
let return lexbuf tok = (tok, L.loc lexbuf)
let return_loc i j tok = (tok, (i,j))
let rec token = lexer
let rec token = lexer
| xml_blank+ -> token lexbuf
| qname ->
let s = L.utf8_lexeme lexbuf in
......@@ -198,7 +200,7 @@ let rec token = lexer
| ncname ":*" ->
let s = L.utf8_sub_lexeme lexbuf 0 (L.lexeme_length lexbuf - 2) in
return lexbuf (ANY_IN_NS s)
| ".:*" ->
| ".:*" ->
return lexbuf (ANY_IN_NS "")
| '-'? ['0'-'9']+ ->
return lexbuf (INT (L.utf8_lexeme lexbuf))
......@@ -208,13 +210,16 @@ let rec token = lexer
| ".."
| ["?+*"] "?" | "#" ->
return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
| '"' | "'" ->
| '"' ->
let start = L.lexeme_start lexbuf in
let double_quote = L.latin1_lexeme_char lexbuf 0 = '"' in
string (L.lexeme_start lexbuf) double_quote lexbuf;
string_double (L.lexeme_start lexbuf) lexbuf;
let s = get_stored_string () in
return_loc start (L.lexeme_end lexbuf)
(if double_quote then STRING2 s else STRING1 s)
return_loc start (L.lexeme_end lexbuf) (STRING2 s)
| "'" ->
let start = L.lexeme_start lexbuf in
let b = string_simple (L.lexeme_start lexbuf) 0 lexbuf in
let s = get_stored_string () in
return_loc start (L.lexeme_end lexbuf) (if b then STRING1(s) else PVAR(s))
| "(*" ->
in_comment := true;
comment (L.lexeme_start lexbuf) lexbuf;
......@@ -225,21 +230,23 @@ let rec token = lexer
tcomment (L.lexeme_start lexbuf) lexbuf;
in_comment := false;
token lexbuf
| eof ->
| eof ->
return lexbuf EOI
| _ ->
| _ ->
illegal lexbuf
and comment start = lexer
| "(*" ->
comment (L.lexeme_start lexbuf) lexbuf;
comment start lexbuf
| "*)" ->
()
| '"' | "'" ->
let double_quote = L.latin1_lexeme_char lexbuf 0 = '"' in
string (L.lexeme_start lexbuf) double_quote lexbuf;
| '"' ->
string_double (L.lexeme_start lexbuf) lexbuf;
clear_buff ();
comment start lexbuf
| "'" ->
ignore(string_simple (L.lexeme_start lexbuf) 0 lexbuf);
clear_buff ();
comment start lexbuf
| eof ->
......@@ -255,32 +262,37 @@ and tcomment start = lexer
| _ ->
tcomment start lexbuf
and string start double = lexer
| '"' | "'" ->
let d = L.latin1_lexeme_char lexbuf 0 = '"' in
if d != double then (store_lexeme lexbuf; string start double lexbuf)
and string_double start = lexer
| '"' -> ()
| '\\' ['\\' '"' '\''] ->
store_ascii (L.latin1_lexeme_char lexbuf 1);
string start double lexbuf
| "\\n" ->
store_ascii '\n'; string start double lexbuf
| "\\t" ->
store_ascii '\t'; string start double lexbuf
| "\\r" ->
store_ascii '\r'; string start double lexbuf
store_ascii (L.latin1_lexeme_char lexbuf 1); string_double start lexbuf
| "\\n" -> store_ascii '\n'; string_double start lexbuf
| "\\t" -> store_ascii '\t'; string_double start lexbuf
| "\\r" -> store_ascii '\r'; string_double start lexbuf
| '\\' ['0'-'9']+ ';' ->
store_code (parse_char lexbuf 10 1);
string start double lexbuf
store_code (parse_char lexbuf 10 1); string_double start lexbuf
| '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F']+ ';' ->
store_code (parse_char lexbuf 16 2);
string start double lexbuf
| '\\' ->
illegal lexbuf;
| eof ->
error start (start+1) "Unterminated string"
| _ ->
store_lexeme lexbuf;
string start double lexbuf
store_code (parse_char lexbuf 16 2); string_double start lexbuf
| '\\' -> illegal lexbuf;
| eof -> error start (start+1) "Unterminated string_double"
| _ -> store_lexeme lexbuf; string_double start lexbuf
and string_simple start nb = lexer
| "'" -> true
| ' ' -> if nb != 0 then false
else (store_lexeme lexbuf; string_simple (nb+1) start lexbuf)
| '\\' ['\\' '"' '\''] -> store_ascii (L.latin1_lexeme_char lexbuf 1);
string_simple (nb+1) start lexbuf
| "\\n" -> store_ascii '\n'; string_simple (nb+1) start lexbuf
| "\\t" -> store_ascii '\t'; string_simple (nb+1) start lexbuf
| "\\r" -> store_ascii '\r'; string_simple (nb+1) start lexbuf
| '\\' ['0'-'9']+ ';' ->
store_code (parse_char lexbuf 10 1); string_simple (nb+1) start lexbuf
| '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F']+ ';' ->
store_code (parse_char lexbuf 16 2); string_simple (nb+1) start lexbuf
| '\\' -> illegal lexbuf;
| eof -> error start (start+1) "Unterminated string_simple (nb+1)"
| _ -> store_lexeme lexbuf; string_simple (nb+1) start lexbuf
let lexbuf = ref None
......@@ -302,10 +314,10 @@ let mk () _FIXME_loc cs =
(lexer ("#!" [^ '\n']* "\n")? -> ()) lb;
lexbuf := Some lb;
let next _ =
let tok, loc =
let tok, loc =
try token lb
with
| Ulexing.Error ->
| Ulexing.Error ->
raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb,
"Unexpected character"))
| Ulexing.InvalidCodepoint i ->
......@@ -327,15 +339,15 @@ let dump_file f =
Format.printf "%a@." Token.print tok;
if tok = EOI then exit 0
done
with
| Ulexing.Error ->
Printf.eprintf "Lexing error at offset %i\n:Unexpected character\n"
with
| Ulexing.Error ->
Printf.eprintf "Lexing error at offset %i\n:Unexpected character\n"
(Ulexing.lexeme_end lexbuf)
| Error (i,j,s) ->
Printf.eprintf "Lexing error at offset %i-%i:\n%s\n"
Printf.eprintf "Lexing error at offset %i-%i:\n%s\n"
i j s
| Ulexing.InvalidCodepoint i ->
Printf.eprintf "Lexing error at offset %i\n:Invalid code point for the current encoding\n"
Printf.eprintf "Lexing error at offset %i\n:Invalid code point for the current encoding\n"
(Ulexing.lexeme_end lexbuf)
);
close_in ic
......@@ -3,6 +3,7 @@ open Camlp4.Sig
type token =
| KEYWORD of string
| IDENT of string
| PVAR of string
| ANY_IN_NS of string
| INT of string
| STRING1 of string
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment