Commit 40775718 authored by Kim Nguyễn's avatar Kim Nguyễn
Browse files

Restore handling of various input file encoding (#ascii, #latin1, #utf8) with

default to Latin1 for compatibility.
parent 002f9580
type encoding = Ascii | Latin1 | Utf8
let str_encoding = function
| Ascii -> "ascii"
| Latin1 -> "latin-1"
| Utf8 -> "utf-8"
exception Invalid_byte of string * encoding
let invalid_byte_c c e =
raise (Invalid_byte (Format.sprintf "\\%x" (Char.code c), e))
let invalid_byte s e =
let acc = ref "" in
for i = 0 to String.length s - 1 do
acc := Format.sprintf "\\%x%s" (Char.code s.[i]) !acc
done;
raise (Invalid_byte (!acc, e))
(* Taken from Menhir/Lib/Convert.ml*)
let for_sedlex parser lexer =
......@@ -15,10 +36,24 @@ let for_sedlex parser lexer =
let mk_lexbuf cs =
(* Workaround the sedlex functions buffer 512 characters before
propagating them, making it unusable with interactive input *)
let module U = Encodings.Utf8 in
let uchars = Bytes.make 4 '\000' in
let read_uchar () =
let us = U.mk (Bytes.unsafe_to_string uchars) in
let uc = U.get us (U.start_index us) in
Uchar.unsafe_of_int uc
in
let set_continuation_byte i c =
(* assumes i = 1, 2 or 3 *)
Bytes.set uchars i c; (* we set it anyway, and test after *)
let cc = Char.code c in
if cc lsr 6 != 0b10 then
(* throw exception with invalid byte in the buffer *)
invalid_byte (Bytes.sub_string uchars 0 (i+1)) Utf8
in
let enc = ref Latin1 in
let lexbuf =
Sedlexing.create (fun arr pos _num ->
let module U = Encodings.Utf8 in
try
let next cs = Stream.next cs in
Bytes.set uchars 0 '\000';
......@@ -26,33 +61,35 @@ let mk_lexbuf cs =
Bytes.set uchars 2 '\000';
Bytes.set uchars 3 '\000';
let c0 = next cs in
let () =
match c0 with
| '\x00' .. '\x7f' -> Bytes.set uchars 0 c0
| '\xc0' .. '\xdf' ->
let codepoint =
match (c0, !enc) with
| '\x00' .. '\x7f', _ -> Uchar.of_char c0
| '\x80' .. '\xff', Latin1 -> Uchar.of_char c0
| '\xc0' .. '\xdf', Utf8 ->
Bytes.set uchars 0 c0;
Bytes.set uchars 1 (next cs)
| '\xe0' .. '\xef' ->
set_continuation_byte 1 (next cs);
read_uchar ()
| '\xe0' .. '\xef', Utf8 ->
Bytes.set uchars 0 c0;
Bytes.set uchars 1 (next cs);
Bytes.set uchars 2 (next cs)
| '\xf0' .. '\xf7' ->
set_continuation_byte 1 (next cs);
set_continuation_byte 2 (next cs);
read_uchar ()
| '\xf0' .. '\xf7', Utf8 ->
Bytes.set uchars 0 c0;
Bytes.set uchars 1 (next cs);
Bytes.set uchars 2 (next cs);
Bytes.set uchars 3 (next cs)
| _ -> raise Sedlexing.MalFormed
set_continuation_byte 1 (next cs);
set_continuation_byte 2 (next cs);
set_continuation_byte 3 (next cs);
read_uchar ()
| c, e -> invalid_byte_c c e
in
let us = U.mk (Bytes.unsafe_to_string uchars) in
let uc = U.get us (U.start_index us) in
arr.(pos) <- Uchar.unsafe_of_int uc;
arr.(pos) <- codepoint;
1
with Stream.Failure -> 0)
in
Sedlexing.set_position lexbuf
Lexing.{ pos_fname = ""; pos_lnum = 1; pos_bol = 0; pos_cnum = 0 };
Sedlexer.eat_shebang lexbuf;
lexbuf
lexbuf, enc
let include_stack = ref []
......@@ -69,12 +106,15 @@ let last_tok = ref Parser.EOI
let last_tok_pos = ref (Lexing.dummy_pos, Lexing.dummy_pos)
let rec token lexbuf =
let rec token enc lexbuf =
let f = Sedlexing.with_tokenizer Sedlexer.token lexbuf in
let f () =
let tok, p1, p2 = f () in
let tok =
match (!last_tok, tok) with
| _, HASH_ASCII -> enc := Ascii; tok
| _, HASH_LATIN1 -> enc := Latin1; tok
| _, HASH_UTF8 -> enc := Utf8; tok
| Parser.INCLUDE, Parser.STRING2 path -> (
Cduce_loc.protect_op "File inclusion";
let path = Cduce_loc.resolve_filename path in
......@@ -92,8 +132,8 @@ let rec token lexbuf =
Cduce_loc.push_source (`File path);
try
let cs = Stream.of_channel ic in
let newlb = mk_lexbuf cs in
let past = pre_prog (token newlb) in
let newlb, enc = mk_lexbuf cs in
let past = pre_prog (token enc newlb) in
exit_include ic;
Parser.RESOLVED_INCLUDE past
with e ->
......@@ -116,19 +156,29 @@ let rec sync f =
last_tok_pos := (p1, p2);
sync f
let get_loc lexbuf =
let loc1, loc2 = Sedlexing.lexing_positions lexbuf in
(loc1.Lexing.pos_cnum, loc2.Lexing.pos_cnum)
let protect_parser do_sync gram stream =
let b = mk_lexbuf stream in
let b, enc = mk_lexbuf stream in
try
let f = token b in
let f = token enc b in
try gram f
with e ->
if do_sync then sync f;
raise e
with
| Parser.Error ->
let loc1, loc2 = Sedlexing.lexing_positions b in
Cduce_loc.raise_loc loc1.Lexing.pos_cnum loc2.Lexing.pos_cnum
(Ast.Parsing_error "")
let i, j = get_loc b in
Cduce_loc.raise_loc i j (Ast.Parsing_error "")
| Invalid_byte (c, e) ->
let i, j = get_loc b in
let msg = if String.length c > 1 then " sequence" else "" in
Cduce_loc.raise_loc i j
(Ast.Parsing_error
(Format.sprintf "Invalid byte%s %s for %s encoding" msg c
(str_encoding e)))
| Sedlexer.Error (i, j, msg) ->
Cduce_loc.raise_loc i j (Ast.Parsing_error msg)
......
......@@ -90,6 +90,7 @@ let id_dummy = U.mk "$$$"
/* Keywords */
%token HASH_PRINT_TYPE "#print_type"
%token HASH_DUMP_VALUE "#dump_value"
%token HASH_ASCII HASH_LATIN1 HASH_UTF8
%token AND "and"
%token DEBUG "debug"
%token DIV "div"
......@@ -224,9 +225,11 @@ prog_items:
;
prog_item:
| item = loc(prog_item_) { [ item ] }
| item = loc(prog_item_) { [item] }
| "include" items = RESOLVED_INCLUDE { items }
| "include" _s = STRING2 { [] }
| HASH_ASCII | HASH_LATIN1 | HASH_UTF8
| "include" STRING2 { [] }
;
%inline prog_item_:
......
......@@ -106,6 +106,9 @@ let rec token lexbuf =
| "_" -> UNDERSCORE
| "#print_type" -> HASH_PRINT_TYPE
| "#dump_value" -> HASH_DUMP_VALUE
| "#ascii" -> HASH_ASCII
| "#latin1" -> HASH_LATIN1
| "#utf8" -> HASH_UTF8
| "#", qname -> HASH_DIRECTIVE (L.Utf8.lexeme lexbuf)
| ncname, ":*" ->
let s = L.Utf8.sub_lexeme lexbuf 0 (L.lexeme_length lexbuf - 2) in
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment