Commit 2273103e authored by Kim Nguyễn's avatar Kim Nguyễn
Browse files

WIP New parser:

    lexer, parser and wrapper module compile.
parent 16d016c0
......@@ -54,7 +54,9 @@ compile/cduce_lib__lambda.cmo runtime/cduce_lib__run_dispatch.cmo \
runtime/cduce_lib__explain.cmo runtime/cduce_lib__eval.cmo \
parser/cduce_lib__cduce_loc.cmo parser/cduce_lib__url.cmo \
parser/cduce_lib__sedlexer.cmo parser/cduce_lib__ast.cmo \
parser/cduce_lib__cparser.cmo \
\
parser/cduce_lib__cparser.cmo parser/cduce_lib__csedlexer.cmo parser/cduce_lib__parse.cmo\
\
parser/cduce_lib__parser.cmo typing/cduce_lib__typed.cmo \
typing/cduce_lib__typepat.cmo types/cduce_lib__externals.cmo \
typing/cduce_lib__typer.cmo compile/cduce_lib__compile.cmo \
......
This diff is collapsed.
open Cparser
module L = Sedlexing
exception Error of int * int * string
let error i j s = raise (Error (i, j, s))
(* Buffer for string literals *)
let string_buff = Buffer.create 1024
let store_lexeme lexbuf =
let s = L.Utf8.lexeme lexbuf in
Buffer.add_string string_buff s
let store_ascii = Buffer.add_char string_buff
let store_code = Encodings.Utf8.store string_buff
let clear_buff () = Buffer.clear string_buff
let get_stored_string () =
let s = Buffer.contents string_buff in
clear_buff ();
Buffer.clear string_buff;
s
(* Parse characters literals \123; \x123; *)
let hexa_digit = function
| '0' .. '9' as c -> Char.code c - Char.code '0'
| 'a' .. 'f' as c -> Char.code c - Char.code 'a' + 10
| 'A' .. 'F' as c -> Char.code c - Char.code 'A' + 10
| _ -> -1
let parse_char lexbuf base i =
let s = L.Utf8.sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in
let r = ref 0 in
for i = 0 to String.length s - 1 do
let c = hexa_digit s.[i] in
if c >= base || c < 0 then
error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "invalid digit";
r := (!r * base) + c
done;
!r
let ncname_char =
[%sedlex.regexp?
( xml_letter | xml_digit
| Chars "_-"
| xml_combining_char | xml_extender | "\\." )]
let ncname =
[%sedlex.regexp? xml_letter, Star ncname_char | '_', Plus ncname_char]
let qname = [%sedlex.regexp? Opt (ncname, ':'), ncname]
let illegal lexbuf =
error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "Illegal character"
let in_comment = ref false
let return lexbuf tok = (tok, L.loc lexbuf)
let return_loc i j tok = (tok, (i, j))
let ident_or_keyword =
let l =
[
("and", AND);
("debug", DEBUG);
("div", DIV);
("else", ELSE);
("from", FROM);
("fun", FUN);
("if", IF);
("in", IN);
("include", INCLUDE);
("let", LET);
("map", MAP);
("match", MATCH);
("mod", MOD);
("namespace", NAMESPACE);
("off", OFF);
("on", ON);
("open", OPEN);
("or", OR);
("ref", REF);
("schema", SCHEMA);
("select", SELECT);
("then", THEN);
("transform", TRANSFORM);
("try", TRY);
("type", TYPE);
("using", USING);
("validate", VALIDATE);
("where", WHERE);
("with", WITH);
("xtransform", XTRANSFORM);
]
in
let hash = Hashtbl.create 17 in
List.iter (fun (a, b) -> Hashtbl.add hash a b) l;
function s -> ( try Hashtbl.find hash s with Not_found -> IDENT s )
let rec token lexbuf =
match%sedlex lexbuf with
| Plus xml_blank -> token lexbuf
| qname ->
let s = L.Utf8.lexeme lexbuf in
return lexbuf (ident_or_keyword s)
| "#print_type" -> return lexbuf HASH_PRINT_TYPE
| "#dump_value" -> return lexbuf HASH_DUMP_VALUE
| "#", qname -> return lexbuf
(HASH_DIRECTIVE (L.Utf8.lexeme lexbuf))
| ncname, ":*" ->
let s = L.Utf8.sub_lexeme lexbuf 0 (L.lexeme_length lexbuf - 2) in
return lexbuf (ANY_IN_NS s)
| ".:*" -> return lexbuf (ANY_IN_NS "")
| Opt '-', Plus '0' .. '9' -> return lexbuf (INT (L.Utf8.lexeme lexbuf))
| "=" -> return lexbuf EQ
| "+" -> return lexbuf PLUS
| "-" -> return lexbuf MINUS
| "@" -> return lexbuf AT
| "|" -> return lexbuf BAR
| "." -> return lexbuf DOT
| "`" -> return lexbuf BQUOTE
| "!" -> return lexbuf BANG
| "\\" -> return lexbuf SETMINUS
| "*" -> return lexbuf STAR
| "&" -> return lexbuf AMP
| "/" -> return lexbuf SLASH
| ":=" -> return lexbuf COLEQ
| "->" -> return lexbuf MINUSGT
| "<=" -> return lexbuf LTEQ
| "<<" -> return lexbuf LTLT
| ">>" -> return lexbuf GTGT
| ">=" -> return lexbuf GTEQ
| "!=" -> return lexbuf BANGEQ
| "&&" -> return lexbuf AMPAMP
| "**" -> return lexbuf STARSTAR
| "/@" -> return lexbuf SLASHAT
| "//" -> return lexbuf SLASHSLASH
| "::" -> return lexbuf COLCOL
| ".." -> return lexbuf DOTDOT
| "--" -> return lexbuf MINUSMINUS
| "??" -> return lexbuf QMARKQMARK
| "+?" -> return lexbuf PLUSQMARK
| "*?" -> return lexbuf STARQMARK
| "=?" -> return lexbuf EQQMARK
| "||" -> return lexbuf BARBAR
| '"' | "'" ->
let start = L.lexeme_start lexbuf in
let double_quote = L.Latin1.lexeme_char lexbuf 0 = '"' in
string (L.lexeme_start lexbuf) double_quote lexbuf;
let s = get_stored_string () in
return_loc start (L.lexeme_end lexbuf)
(if double_quote then STRING2 s else STRING1 s)
| "(*" ->
in_comment := true;
comment (L.lexeme_start lexbuf) lexbuf;
in_comment := false;
token lexbuf
| "/*" ->
in_comment := true;
tcomment (L.lexeme_start lexbuf) lexbuf;
in_comment := false;
token lexbuf
| eof -> return lexbuf EOI
| any -> illegal lexbuf
| _ -> assert false
and comment start lexbuf =
match%sedlex lexbuf with
| "(*" ->
comment (L.lexeme_start lexbuf) lexbuf;
comment start lexbuf
| "*)" -> ()
| '"' | "'" ->
let double_quote = L.Latin1.lexeme_char lexbuf 0 = '"' in
string (L.lexeme_start lexbuf) double_quote lexbuf;
clear_buff ();
comment start lexbuf
| eof -> error start (start + 2) "Unterminated comment"
| any -> comment start lexbuf
| _ -> assert false
and tcomment start lexbuf =
match%sedlex lexbuf with
| "*/" -> ()
| eof -> error start (start + 2) "Unterminated comment"
| any -> tcomment start lexbuf
| _ -> assert false
and string start double lexbuf =
match%sedlex lexbuf with
| '"' | "'" ->
let d = L.Latin1.lexeme_char lexbuf 0 = '"' in
if d != double then (
store_lexeme lexbuf;
string start double lexbuf )
| '\\', Chars "\\\"\'" ->
store_ascii (L.Latin1.lexeme_char lexbuf 1);
string start double lexbuf
| "\\n" ->
store_ascii '\n';
string start double lexbuf
| "\\t" ->
store_ascii '\t';
string start double lexbuf
| "\\r" ->
store_ascii '\r';
string start double lexbuf
| '\\', Plus '0' .. '9', ';' ->
store_code (parse_char lexbuf 10 1);
string start double lexbuf
| "\\x", Plus ('0' .. '9' | 'a' .. 'f' | 'A' .. 'F'), ';' ->
store_code (parse_char lexbuf 16 2);
string start double lexbuf
| '\\' -> illegal lexbuf
| eof -> error start (start + 1) "Unterminated string"
| any ->
store_lexeme lexbuf;
string start double lexbuf
| _ -> assert false
let lexbuf = ref None
let last_tok = ref (IDENT "$$$DUMMY")
let rec sync lb =
match !last_tok with
| SEMISEMI | EOI -> ()
| _ ->
last_tok := fst (token lb);
sync lb
let raise_clean e =
clear_buff ();
in_comment := false;
(* reinit encoding ? *)
raise e
let mk () _FIXME_loc cs =
let lb =
(* Workaround the sedlex functions buffer 512 characters before
propagating them, making it unusable with interactive input *)
let uchars = Bytes.make 4 '\000' in
L.create (fun arr pos _num ->
let module U = Encodings.Utf8 in
try
let next cs = Stream.next cs in
Bytes.set uchars 0 '\000';
Bytes.set uchars 1 '\000';
Bytes.set uchars 2 '\000';
Bytes.set uchars 3 '\000';
let c0 = next cs in
let () =
match c0 with
| '\x00' .. '\x7f' -> Bytes.set uchars 0 c0
| '\xc0' .. '\xdf' ->
Bytes.set uchars 0 c0;
Bytes.set uchars 1 (next cs)
| '\xe0' .. '\xef' ->
Bytes.set uchars 0 c0;
Bytes.set uchars 1 (next cs);
Bytes.set uchars 2 (next cs)
| '\xf0' .. '\xf7' ->
Bytes.set uchars 0 c0;
Bytes.set uchars 1 (next cs);
Bytes.set uchars 2 (next cs);
Bytes.set uchars 3 (next cs)
| _ -> raise L.MalFormed
in
let us = U.mk (Bytes.unsafe_to_string uchars) in
let uc = U.get us (U.start_index us) in
arr.(pos) <- Uchar.unsafe_of_int uc;
1
with Stream.Failure -> 0)
in
(match%sedlex lb with Opt ("#!", Star (Compl '\n'), "\n") -> () | _ -> ());
lexbuf := Some lb;
let next _ =
let tok, loc =
try token lb with
| Sedlexing.MalFormed ->
raise_clean
(Error
( Sedlexing.lexeme_end lb,
Sedlexing.lexeme_end lb,
"Unexpected character" ))
| Sedlexing.InvalidCodepoint i ->
raise_clean
(Error
( Sedlexing.lexeme_end lb,
Sedlexing.lexeme_end lb,
"Code point invalid for the current encoding" ))
| e -> raise_clean e
in
last_tok := tok;
Some (tok, loc)
in
Stream.from next
exception Error of string
let lexbuf_of stream =
let token_stream = Csedlexer.mk () () stream in
let buff = Lexing.from_string "" in
buff,
function (_ : Lexing.lexbuf) ->
let t, (i, j) = Stream.next token_stream in
let pos1 = { Lexing.dummy_pos with pos_cnum = i } in
let pos2 = { Lexing.dummy_pos with pos_cnum = j } in
buff.lex_start_p <- pos1;
buff.lex_curr_p <- pos2;
t
let parser gram stream =
let b, f = lexbuf_of stream in
try
gram f b
with
Cparser.Error -> raise (Error "")
let prog = parser Cparser.prog
let top_phrases = parser Cparser.top_phrases
let sync () = ()
let localize_exn f = (* TODO Location *)
f ()
exception Error of string
val top_phrases : char Stream.t -> Ast.pmodule_item list
val prog : char Stream.t -> Ast.pmodule_item list
val sync : unit -> unit
val localize_exn: (unit -> 'a) -> 'a
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment