Commit deb43e17 authored by Kim Nguyễn's avatar Kim Nguyễn
Browse files

Use menhir incremental API and a custom driver to give much better parsing error messages.

parent 89a9232d
......@@ -32,6 +32,7 @@ depends: [
"ocaml" {>= "4.07.0"}
"dune" {build & >= "2.4"}
"menhir" {build & >= "20181026"}
"menhirLib" { >= "20181026" }
"cduce-types"
"odoc" {with-doc}
"sedlex" {>= "2.0"}
......
......@@ -2,6 +2,7 @@ depends: [
"ocaml" {>= "4.07.0"}
"dune" {build & >= "2.4"}
"menhir" {build & >= "20181026"}
"menhirLib" { >= "20181026" }
"cduce-types"
"odoc" {with-doc}
"sedlex" {>= "2.0"}
......
......@@ -3,7 +3,7 @@
(library
(name cduce_core)
(public_name cduce.lib.core)
(libraries cduce-types sedlex)
(libraries cduce-types sedlex menhirLib)
(flags
(-open Cduce_types))
(modules
......
(menhir
(modules parser)
(infer false))
\ No newline at end of file
(flags --table --inspection)
(infer true))
......@@ -21,18 +21,6 @@ let invalid_byte s e =
(* Taken from Menhir/Lib/Convert.ml*)
let for_sedlex parser lexer =
let lexbuf : Lexing.lexbuf = Lexing.from_string "" in
let lexer (lexbuf : Lexing.lexbuf) =
let token, startp, endp = lexer () in
lexbuf.Lexing.lex_start_p <- startp;
lexbuf.Lexing.lex_curr_p <- endp;
token
in
parser lexer lexbuf
(**)
let mk_lexbuf enc cs =
(* Workaround the sedlex functions buffer 512 characters before
propagating them, making it unusable with interactive input *)
......@@ -53,10 +41,11 @@ let mk_lexbuf enc cs =
invalid_byte (Bytes.sub_string uchars 0 (i + 1)) Utf8
in
let lexbuf =
let closed = ref false in
Sedlexing.create (fun arr pos _num ->
if !closed then raise End_of_file;
try
let next cs = Stream.next cs in
Bytes.set uchars 0 '\000';
Bytes.set uchars 1 '\000';
Bytes.set uchars 2 '\000';
......@@ -85,7 +74,9 @@ let mk_lexbuf enc cs =
in
arr.(pos) <- codepoint;
1
with Stream.Failure -> 0)
with Stream.Failure when not !closed ->
closed := true;
0)
in
Sedlexing.set_position lexbuf
Lexing.{ pos_fname = ""; pos_lnum = 1; pos_bol = 0; pos_cnum = 0 };
......@@ -98,8 +89,6 @@ let mk_lexbuf enc cs =
let include_stack = ref []
let pre_prog = for_sedlex Parser.prog
let close_in ic = try close_in ic with _ -> ()
let exit_include ic =
......@@ -161,6 +150,69 @@ let rec token enc lexbuf =
in
f
and incremental parser token =
let open Parser.MenhirInterpreter in
let init =
parser Lexing.{ dummy_pos with pos_lnum = 1; pos_bol = 0; pos_cnum = 0 }
in
let last_token = ref (Parser.EOI, Lexing.dummy_pos, Lexing.dummy_pos) in
let last_checkpoint = ref init in
let par_stack = ref [] in
let rec loop checkpoint =
match checkpoint with
| InputNeeded _ ->
last_checkpoint := checkpoint;
last_token := token ();
begin
match (!last_token, !par_stack) with
| ((LP | LSB | LCB), _, _), _ ->
par_stack := !last_token :: !par_stack
| (((RP | RSB | RCB) as b), _, _), (t, _, _) :: rest when t = b ->
par_stack := rest
| _ -> () (* will yield an error*)
end;
loop (offer checkpoint !last_token)
| Shifting _ | AboutToReduce _ -> loop (resume checkpoint)
| Accepted v -> v
| Rejected -> raise Parser.Error
| HandlingError env ->
let last_token, last_spos, last_epos = !last_token in
let has_open, candidates =
List.fold_left
(fun (cp, acc) (tok, stok) ->
match
(tok, !par_stack, acceptable !last_checkpoint tok last_spos)
with
| RP, (LP, _, _) :: _, true | RCB, (LCB, _, _) :: _, true ->
(true, stok :: acc)
| RSB, (LSB, _, _) :: _, true -> (true, stok :: acc)
| _, _, true -> (cp, stok :: acc)
| _, _, false -> (cp, acc)
| exception _ -> (cp, acc))
(false, []) Parse_util.all_tokens
in
let i, j = (last_spos.Lexing.pos_cnum, last_epos.Lexing.pos_cnum) in
let msg =
Format.asprintf "invalid token ``%s''"
(Parse_util.string_of_token last_token)
^
if has_open then
let op, i, j = List.hd !par_stack in
let i, j = (i.Lexing.pos_cnum, i.Lexing.pos_cnum) in
Format.asprintf
"@\n%aThe opening parenthesis ``%s'' might be unmatched"
(fun fmt l -> Cduce_loc.print_loc fmt (l, `Full))
(Cduce_loc.loc_of_pos (i, j))
(Parse_util.string_of_token op)
else Format.asprintf "%a" Parse_util.expect_message candidates
in
Cduce_loc.raise_loc i j (Ast.Parsing_error msg)
in
loop init
and pre_prog lb = incremental Parser.Incremental.prog lb
let rec sync f =
match !last_tok with
| Parser.EOI | Parser.SEMISEMI -> ()
......@@ -175,7 +227,9 @@ let get_loc lexbuf =
(loc1.Lexing.pos_cnum, loc2.Lexing.pos_cnum)
let protect_parser ?global_enc do_sync gram stream =
let enc = match global_enc with Some e -> e | None -> ref default_encoding in
let enc =
match global_enc with Some e -> e | None -> ref default_encoding
in
let b = mk_lexbuf enc stream in
try
let f = token enc b in
......@@ -203,7 +257,8 @@ let protect_parser ?global_enc do_sync gram stream =
let prog = protect_parser false pre_prog
let top_phrases =
protect_parser ~global_enc:(ref default_encoding) true (for_sedlex Parser.top_phrases)
protect_parser ~global_enc:(ref default_encoding) true
(incremental Parser.Incremental.top_phrases)
let protect_exn f g =
try
......@@ -216,4 +271,17 @@ let protect_exn f g =
let sync () = ()
let () = Printexc.record_backtrace true
let dump_tokens fmt cs =
let enc = ref default_encoding in
let b = mk_lexbuf enc cs in
let f = token enc b in
let f () =
let t, _, _ = f () in
t
in
let token = ref (f ()) in
while !token != Parser.EOI do
Format.fprintf fmt "%a " (Parse_util.pp_token ~content:false) !token;
token := f ()
done;
Format.fprintf fmt "%a\n%!" (Parse_util.pp_token ~content:false) !token
......@@ -3,3 +3,5 @@ val top_phrases : char Stream.t -> Ast.pmodule_item list
val prog : char Stream.t -> Ast.pmodule_item list
val sync : unit -> unit
val dump_tokens : Format.formatter -> char Stream.t -> unit
\ No newline at end of file
let pp_token ?(content = false) fmt t =
let pp = Format.fprintf fmt in
let open Parser in
match t with
| AMP -> pp "%s" "AMP"
| AMPAMP -> pp "%s" "AMPAMP"
| AND -> pp "%s" "AND"
| ANY_IN_NS s ->
pp "%s" "ANY_IN_NS";
if content then pp "(\"%s\")" s
| AT -> pp "%s" "AT"
| BANG -> pp "%s" "BANG"
| BANGEQ -> pp "%s" "BANGEQ"
| BAR -> pp "%s" "BAR"
| BARBAR -> pp "%s" "BARBAR"
| BQUOTE -> pp "%s" "BQUOTE"
| COLCOL -> pp "%s" "COLCOL"
| COLEQ -> pp "%s" "COLEQ"
| COLON -> pp "%s" "COLON"
| COMMA -> pp "%s" "COMMA"
| DEBUG -> pp "%s" "DEBUG"
| DIV -> pp "%s" "DIV"
| DOT -> pp "%s" "DOT"
| DOTDOT -> pp "%s" "DOTDOT"
| ELSE -> pp "%s" "ELSE"
| EOI -> pp "%s" "EOI"
| EQ -> pp "%s" "EQ"
| EQQMARK -> pp "%s" "EQQMARK"
| FROM -> pp "%s" "FROM"
| FUN -> pp "%s" "FUN"
| GT -> pp "%s" "GT"
| GTEQ -> pp "%s" "GTEQ"
| GTGT -> pp "%s" "GTGT"
| HASH_ASCII -> pp "%s" "HASH_ASCII"
| HASH_DIRECTIVE s ->
pp "%s" "HASH_DIRECTIVE";
if content then pp "(\"%s\")" s
| HASH_DUMP_VALUE -> pp "%s" "HASH_DUMP_VALUE"
| HASH_LATIN1 -> pp "%s" "HASH_LATIN1"
| HASH_PRINT_TYPE -> pp "%s" "HASH_PRINT_TYPE"
| HASH_UTF8 -> pp "%s" "HASH_UTF8"
| IDENT s ->
pp "%s" "IDENT";
if content then pp "(\"%s\")" s
| IF -> pp "%s" "IF"
| IN -> pp "%s" "IN"
| INCLUDE -> pp "%s" "INCLUDE"
| INT s ->
pp "%s" "INT";
if content then pp "(\"%s\")" s
| LCB -> pp "%s" "LCB"
| LET -> pp "%s" "LET"
| LP -> pp "%s" "LP"
| LSB -> pp "%s" "LSB"
| LT -> pp "%s" "LT"
| LTEQ -> pp "%s" "LTEQ"
| LTLT -> pp "%s" "LTLT"
| MAP -> pp "%s" "MAP"
| MATCH -> pp "%s" "MATCH"
| MINUS -> pp "%s" "MINUS"
| MINUSGT -> pp "%s" "MINUSGT"
| MINUSMINUS -> pp "%s" "MINUSMINUS"
| MOD -> pp "%s" "MOD"
| NAMESPACE -> pp "%s" "NAMESPACE"
| OFF -> pp "%s" "OFF"
| ON -> pp "%s" "ON"
| OPEN -> pp "%s" "OPEN"
| OR -> pp "%s" "OR"
| PLUS -> pp "%s" "PLUS"
| PLUSQMARK -> pp "%s" "PLUSQMARK"
| QMARK -> pp "%s" "QMARK"
| QMARKQMARK -> pp "%s" "QMARKQMARK"
| RCB -> pp "%s" "RCB"
| REF -> pp "%s" "REF"
| RESOLVED_INCLUDE _ ->
pp "%s" "RESOLVED_INCLUDE";
if content then pp "%s" "([ ... ])"
| RP -> pp "%s" "RP"
| RSB -> pp "%s" "RSB"
| SCHEMA -> pp "%s" "SCHEMA"
| SELECT -> pp "%s" "SELECT"
| SEMI -> pp "%s" "SEMI"
| SEMISEMI -> pp "%s" "SEMISEMI"
| SETMINUS -> pp "%s" "SETMINUS"
| SLASH -> pp "%s" "SLASH"
| SLASHAT -> pp "%s" "SLASHAT"
| SLASHSLASH -> pp "%s" "SLASHSLASH"
| STAR -> pp "%s" "STAR"
| STARQMARK -> pp "%s" "STARQMARK"
| STARSTAR -> pp "%s" "STARSTAR"
| STRING1 s ->
pp "%s" "STRING1";
if content then pp "(\"%s\")" s
| STRING2 s ->
pp "%s" "STRING2";
if content then pp "(\"%s\")" s
| THEN -> pp "%s" "THEN"
| TRANSFORM -> pp "%s" "TRANSFORM"
| TRY -> pp "%s" "TRY"
| TYPE -> pp "%s" "TYPE"
| UNDERSCORE -> pp "%s" "UNDERSCORE"
| USING -> pp "%s" "USING"
| VALIDATE -> pp "%s" "VALIDATE"
| WHERE -> pp "%s" "WHERE"
| WITH -> pp "%s" "WITH"
| XTRANSFORM -> pp "%s" "XTRANSFORM"
let all_tokens =
Parser.
[
(AMP, "&");
(AMPAMP, "&&");
(AND, "and");
(ANY_IN_NS "ns", ".:*");
(AT, "@");
(BANG, "!");
(BANGEQ, "!=");
(BAR, "|");
(BARBAR, "||");
(BQUOTE, "`");
(COLCOL, "::");
(COLEQ, ":=");
(COLON, ":");
(COMMA, ",");
(DEBUG, "debug");
(DIV, "div");
(DOT, ".");
(DOTDOT, "..");
(ELSE, "else");
(EOI, "the end of input");
(EQ, "=");
(EQQMARK, "=?");
(FROM, "from");
(FUN, "fun");
(GT, ">");
(GTEQ, ">=");
(GTGT, ">>");
(HASH_ASCII, "#ascii");
(HASH_DIRECTIVE "#quiet", "#quiet");
(HASH_DUMP_VALUE, "#dump_value");
(HASH_LATIN1, "#latin1");
(HASH_PRINT_TYPE, "#print_type");
(HASH_UTF8, "#utf8");
(IDENT "x", "a variable");
(IF, "if");
(IN, "in");
(INCLUDE, "include");
(INT "42", "an integer");
(LCB, "{");
(LET, "let");
(LP, "(");
(LSB, "[");
(LT, "<");
(LTEQ, "<=");
(LTLT, "<<");
(MAP, "map");
(MATCH, "match");
(MINUS, "-");
(MINUSGT, "->");
(MINUSMINUS, "--");
(MOD, "mod");
(NAMESPACE, "namespace");
(OFF, "off");
(ON, "on");
(OPEN, "open");
(OR, "or");
(PLUS, "+");
(PLUSQMARK, "+?");
(QMARK, "?");
(QMARKQMARK, "??");
(RCB, "]");
(REF, "ref");
(* (this is a pseudo token, don't consider it a candidate)
(RESOLVED_INCLUDE [], "");
*)
(RP, ")");
(RSB, "]");
(SCHEMA, "schema");
(SELECT, "select");
(SEMI, ";");
(SEMISEMI, ";;");
(SETMINUS, "\\");
(SLASH, "/");
(SLASHAT, "/@");
(SLASHSLASH, "//");
(STAR, "*");
(STARQMARK, "?");
(STARSTAR, "**");
(STRING1 "hello", "a character literal");
(STRING2 "hello", "a string literal");
(THEN, "then");
(TRANSFORM, "transform");
(TRY, "try");
(TYPE, "type");
(UNDERSCORE, "_");
(USING, "using");
(VALIDATE, "validate");
(WHERE, "where");
(WITH, "with");
(XTRANSFORM, "xtransform");
]
let escape_string s =
let b = Buffer.create (String.length s) in
let rec loop idx end_ us =
if idx = end_ then Buffer.contents b
else
let cp, nidx = Encodings.Utf8.next us idx in
let () =
match cp with
| 10 -> Buffer.add_string b "\\n"
| 9 -> Buffer.add_string b "\\t"
| 13 -> Buffer.add_string b "\\r"
| _ when cp < 32 || cp > 127 ->
Buffer.add_char b '\\';
Buffer.add_string b (string_of_int cp)
| _ -> Buffer.add_char b (Char.unsafe_chr cp)
in
loop nidx end_ us
in
let us = Encodings.Utf8.mk s in
loop (Encodings.Utf8.start_index us) (Encodings.Utf8.end_index us) us
let string_of_token tok =
let open Parser in
match tok with
| ANY_IN_NS n -> n ^ ":*"
| HASH_DIRECTIVE n -> "n"
| INT i -> i
| STRING1 s -> "'" ^ escape_string s ^ "'"
| STRING2 s -> "\"" ^ escape_string s ^ "\""
| RESOLVED_INCLUDE _ -> ""
| _ -> List.assoc tok all_tokens
let text_of_token tok = try List.assoc tok all_tokens with Not_found -> ""
let expect_message fmt l =
let l = List.sort compare l in
match l with
| [] -> ()
| [ t ] -> Format.fprintf fmt "@\nExpecting token ``%s''" t
| f :: rest ->
let rec loop l =
match l with
| [] -> ()
| [ t ] -> Format.fprintf fmt " or ``%s''" t
| t :: ll ->
Format.fprintf fmt ", ``%s''" t;
loop ll
in
Format.fprintf fmt "@\nExpecting token ``%s''" f;
loop rest
......@@ -57,7 +57,7 @@ let seq_of_string s =
let parse_char loc s =
match seq_of_string s with
| [ c ] -> c
| _ -> parsing_error loc (Format.sprintf "Invalid character litteral '%s'" s)
| _ -> parsing_error loc (Format.sprintf "invalid character litteral '%s'" s)
let mk_rec_field loc lab def =
let o, x, y =
......@@ -186,7 +186,11 @@ let id_dummy = U.mk "$$$"
%start <Ast.pprog> prog
%start <Ast.pprog> top_phrases
%type <Ast.ppat> pat
%type <Ast.ppat> pat
%start <Ast.ppat> parse_pat
%start <Ast.pexpr> parse_expr
%start <Ast.pmodule_item> parse_pmodule_item
%%
/* Macros */
......@@ -201,6 +205,16 @@ x=X { mk $sloc x }
;
/* Toplevel definitions */
parse_pat:
p = pat EOI { p }
;
parse_expr:
e = expr EOI { e }
;
parse_pmodule_item:
pi = loc(prog_item_) EOI { pi }
;
top_phrases:
......@@ -265,7 +279,7 @@ match n with
| "sample", [t] -> `Sample t
| "subtype", [t1; t2] -> `Subtype (t1, t2)
| "single", [t] -> `Single t
| _ -> parsing_error $loc(d) (Format.sprintf "Invalid debug directive %s" d)
| _ -> parsing_error $loc(d) (Format.sprintf "invalid debug directive %s" d)
in Directive (`Debug dir)
}
| d = HASH_DIRECTIVE {
......@@ -277,7 +291,7 @@ match n with
| "#reinit_ns" -> `Reinit_ns
| "#help" -> `Help
| "#builtins" -> `Builtins
| _ -> parsing_error $loc(d) (Format.sprintf "Invalid toplevel directive %s" d)
| _ -> parsing_error $loc(d) (Format.sprintf "invalid toplevel directive %s" d)
in Directive dir
}
| HASH_PRINT_TYPE t = pat { Directive(`Print_type t) }
......@@ -348,7 +362,7 @@ constr_pat:
let i = V.mk i and j = V.mk j in
mk $sloc (Internal (Types.interval (bounded i j)))
| None, None -> parsing_error $sloc
(Format.sprintf "Invalid interval *--*")
(Format.sprintf "invalid interval *--*")
}
| i = INT {
let open Intervals in
......@@ -536,13 +550,13 @@ regexp_and:
match x, y with
Elem x, Elem y -> Elem (mk $sloc (And (x, y)))
| _ -> parsing_error $sloc
"Conjunction not allowed in regular expression"
"conjunction not allowed in regular expression"
}
| x = regexp_and SETMINUS y = regexp_acc {
match x, y with
Elem x, Elem y -> Elem (mk $sloc (Diff (x, y)))
| _ -> parsing_error $sloc
"Difference not allowed in regular expression"
"difference not allowed in regular expression"
}
| r = regexp_acc { r }
......@@ -570,16 +584,16 @@ regexp_simple:
try
let i = int_of_string i in
if (i > 1024) then raise Exit else i
with Failure _ | Exit -> parsing_error $loc(i) "Repetition number too large"
with Failure _ | Exit -> parsing_error $loc(i) "repetition number too large"
in
if i <= 0 then parsing_error $sloc "Repetition number must be a positive integer";
if i <= 0 then parsing_error $sloc "repetition number must be a positive integer";
aux i Epsilon
}
| "(" x = separated_nonempty_list(",", regexp) ")" {
match x with
[ x ] -> x
| _ -> let x = List.map (function Elem x -> x | _ ->
parsing_error $sloc "Mixing regular expressions and products") x
parsing_error $sloc "mixing regular expressions and products is not allowed") x
in
Elem (multi_prod $sloc x)
}
......@@ -632,7 +646,7 @@ namespace_binding:
| _, (`Path _ as path) ->`Prefix(ident name, path)
| "on", `Empty -> `Keep true
| "off", `Empty -> `Keep false
| _ -> parsing_error $sloc "Invalid namespace specification"
| _ -> parsing_error $sloc "invalid namespace specification"
}
;
......
......@@ -11,7 +11,7 @@ let string_buff = Buffer.create 1024
let store_lexeme lexbuf =
let s = L.lexeme lexbuf in
for i = 0 to Array.length s - 1 do
Encodings.Utf8.store string_buff (Uchar.to_int s.(i));
Encodings.Utf8.store string_buff (Uchar.to_int s.(i))
done
let store_ascii = Buffer.add_char string_buff
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment