Commit 4cd1b774 authored by Kim Nguyễn's avatar Kim Nguyễn
Browse files

Fix handling of latin1/utf8 files both for reading and writing.

parent 05591e95
...@@ -75,13 +75,13 @@ let get_line_start lb i = ...@@ -75,13 +75,13 @@ let get_line_start lb i =
let get_line_number src i = let get_line_number src i =
let ic = open_in_bin src in let ic = open_in_bin src in
let lb = Sedlexing.Utf8.from_channel ic in let lb = Sedlexing.Latin1.from_channel ic in
let r = get_line_start lb i in let r = get_line_start lb i in
close_in ic; close_in ic;
r r
let get_line_number_str src i = let get_line_number_str src i =
let lb = Sedlexing.Utf8.from_string src in let lb = Sedlexing.Latin1.from_string src in
get_line_start lb i get_line_start lb i
let print_precise ppf = function let print_precise ppf = function
......
...@@ -12,13 +12,11 @@ let invalid_byte_c c e = ...@@ -12,13 +12,11 @@ let invalid_byte_c c e =
let invalid_byte s e = let invalid_byte s e =
let acc = ref "" in let acc = ref "" in
for i = 0 to String.length s - 1 do for i = 0 to String.length s - 1 do
acc := Format.sprintf "\\%x%s" (Char.code s.[i]) !acc acc := Format.sprintf "\\%x%s" (Char.code s.[i]) !acc
done; done;
raise (Invalid_byte (!acc, e)) raise (Invalid_byte (!acc, e))
(* Taken from Menhir/Lib/Convert.ml*) (* Taken from Menhir/Lib/Convert.ml*)
let for_sedlex parser lexer = let for_sedlex parser lexer =
...@@ -33,7 +31,7 @@ let for_sedlex parser lexer = ...@@ -33,7 +31,7 @@ let for_sedlex parser lexer =
(**) (**)
let mk_lexbuf cs = let mk_lexbuf enc cs =
(* Workaround the sedlex functions buffer 512 characters before (* Workaround the sedlex functions buffer 512 characters before
propagating them, making it unusable with interactive input *) propagating them, making it unusable with interactive input *)
let module U = Encodings.Utf8 in let module U = Encodings.Utf8 in
...@@ -45,17 +43,18 @@ let mk_lexbuf cs = ...@@ -45,17 +43,18 @@ let mk_lexbuf cs =
in in
let set_continuation_byte i c = let set_continuation_byte i c =
(* assumes i = 1, 2 or 3 *) (* assumes i = 1, 2 or 3 *)
Bytes.set uchars i c; (* we set it anyway, and test after *) Bytes.set uchars i c;
(* we set it anyway, and test after *)
let cc = Char.code c in let cc = Char.code c in
if cc lsr 6 != 0b10 then if cc lsr 6 != 0b10 then
(* throw exception with invalid byte in the buffer *) (* throw exception with invalid byte in the buffer *)
invalid_byte (Bytes.sub_string uchars 0 (i+1)) Utf8 invalid_byte (Bytes.sub_string uchars 0 (i + 1)) Utf8
in in
let enc = ref Latin1 in
let lexbuf = let lexbuf =
Sedlexing.create (fun arr pos _num -> Sedlexing.create (fun arr pos _num ->
try try
let next cs = Stream.next cs in let next cs = Stream.next cs in
Bytes.set uchars 0 '\000'; Bytes.set uchars 0 '\000';
Bytes.set uchars 1 '\000'; Bytes.set uchars 1 '\000';
Bytes.set uchars 2 '\000'; Bytes.set uchars 2 '\000';
...@@ -88,8 +87,12 @@ let mk_lexbuf cs = ...@@ -88,8 +87,12 @@ let mk_lexbuf cs =
in in
Sedlexing.set_position lexbuf Sedlexing.set_position lexbuf
Lexing.{ pos_fname = ""; pos_lnum = 1; pos_bol = 0; pos_cnum = 0 }; Lexing.{ pos_fname = ""; pos_lnum = 1; pos_bol = 0; pos_cnum = 0 };
Sedlexer.eat_shebang lexbuf; let () =
lexbuf, enc try Sedlexer.eat_shebang lexbuf
with Sedlexing.MalFormed ->
invalid_byte (Sedlexing.Latin1.lexeme lexbuf) !enc
in
lexbuf
let include_stack = ref [] let include_stack = ref []
...@@ -107,14 +110,21 @@ let last_tok = ref Parser.EOI ...@@ -107,14 +110,21 @@ let last_tok = ref Parser.EOI
let last_tok_pos = ref (Lexing.dummy_pos, Lexing.dummy_pos) let last_tok_pos = ref (Lexing.dummy_pos, Lexing.dummy_pos)
let rec token enc lexbuf = let rec token enc lexbuf =
let set_enc e = enc := e in
let f = Sedlexing.with_tokenizer Sedlexer.token lexbuf in let f = Sedlexing.with_tokenizer Sedlexer.token lexbuf in
let f () = let f () =
let tok, p1, p2 = f () in let tok, p1, p2 = f () in
let tok = let tok =
match (!last_tok, tok) with match (!last_tok, tok) with
| _, HASH_ASCII -> enc := Ascii; tok | _, HASH_ASCII ->
| _, HASH_LATIN1 -> enc := Latin1; tok set_enc Ascii;
| _, HASH_UTF8 -> enc := Utf8; tok tok
| _, HASH_LATIN1 ->
set_enc Latin1;
tok
| _, HASH_UTF8 ->
set_enc Utf8;
tok
| Parser.INCLUDE, Parser.STRING2 path -> ( | Parser.INCLUDE, Parser.STRING2 path -> (
Cduce_loc.protect_op "File inclusion"; Cduce_loc.protect_op "File inclusion";
let path = Cduce_loc.resolve_filename path in let path = Cduce_loc.resolve_filename path in
...@@ -132,8 +142,10 @@ let rec token enc lexbuf = ...@@ -132,8 +142,10 @@ let rec token enc lexbuf =
Cduce_loc.push_source (`File path); Cduce_loc.push_source (`File path);
try try
let cs = Stream.of_channel ic in let cs = Stream.of_channel ic in
let newlb, enc = mk_lexbuf cs in let newenc = ref Utf8 in
let past = pre_prog (token enc newlb) in (* or ref !enc ? *)
let newlb = mk_lexbuf newenc cs in
let past = pre_prog (token newenc newlb) in
exit_include ic; exit_include ic;
Parser.RESOLVED_INCLUDE past Parser.RESOLVED_INCLUDE past
with e -> with e ->
...@@ -160,8 +172,9 @@ let get_loc lexbuf = ...@@ -160,8 +172,9 @@ let get_loc lexbuf =
let loc1, loc2 = Sedlexing.lexing_positions lexbuf in let loc1, loc2 = Sedlexing.lexing_positions lexbuf in
(loc1.Lexing.pos_cnum, loc2.Lexing.pos_cnum) (loc1.Lexing.pos_cnum, loc2.Lexing.pos_cnum)
let protect_parser do_sync gram stream = let protect_parser ?global_enc do_sync gram stream =
let b, enc = mk_lexbuf stream in let enc = match global_enc with Some e -> e | None -> ref Utf8 in
let b = mk_lexbuf enc stream in
try try
let f = token enc b in let f = token enc b in
try gram f try gram f
...@@ -179,12 +192,16 @@ let protect_parser do_sync gram stream = ...@@ -179,12 +192,16 @@ let protect_parser do_sync gram stream =
(Ast.Parsing_error (Ast.Parsing_error
(Format.sprintf "Invalid byte%s %s for %s encoding" msg c (Format.sprintf "Invalid byte%s %s for %s encoding" msg c
(str_encoding e))) (str_encoding e)))
| Sedlexing.MalFormed ->
let i, j = get_loc b in
Cduce_loc.raise_loc i j (Ast.Parsing_error "MalFormed")
| Sedlexer.Error (i, j, msg) -> | Sedlexer.Error (i, j, msg) ->
Cduce_loc.raise_loc i j (Ast.Parsing_error msg) Cduce_loc.raise_loc i j (Ast.Parsing_error msg)
let prog cs = protect_parser false pre_prog cs let prog = protect_parser false pre_prog
let top_phrases cs = protect_parser true (for_sedlex Parser.top_phrases) cs let top_phrases =
protect_parser ~global_enc:(ref Utf8) true (for_sedlex Parser.top_phrases)
let protect_exn f g = let protect_exn f g =
try try
...@@ -196,3 +213,5 @@ let protect_exn f g = ...@@ -196,3 +213,5 @@ let protect_exn f g =
raise e raise e
let sync () = () let sync () = ()
let () = Printexc.record_backtrace true
...@@ -699,6 +699,7 @@ let_binding: ...@@ -699,6 +699,7 @@ let_binding:
ident_or_let_pat: ident_or_let_pat:
| id = located_ident { mk $sloc (PatVar [ (snd id) ]) } | id = located_ident { mk $sloc (PatVar [ (snd id) ]) }
| p = constr_pat { p } | p = constr_pat { p }
| p1 = ident_or_let_pat "&" p2 =constr_pat { mk $sloc (And(p1, p2)) }
; ;
branches_: branches_:
......
...@@ -9,8 +9,10 @@ let error i j s = raise (Error (i, j, s)) ...@@ -9,8 +9,10 @@ let error i j s = raise (Error (i, j, s))
let string_buff = Buffer.create 1024 let string_buff = Buffer.create 1024
let store_lexeme lexbuf = let store_lexeme lexbuf =
let s = L.Utf8.lexeme lexbuf in let s = L.lexeme lexbuf in
Buffer.add_string string_buff s for i = 0 to Array.length s - 1 do
Encodings.Utf8.store string_buff (Uchar.to_int s.(i));
done
let store_ascii = Buffer.add_char string_buff let store_ascii = Buffer.add_char string_buff
......
...@@ -160,10 +160,9 @@ let to_buf ~utf8 buffer ns_table v subst = ...@@ -160,10 +160,9 @@ let to_buf ~utf8 buffer ns_table v subst =
wms "/>" wms "/>"
and element_end q = wms ("</" ^ Ns.Printer.tag printer (Atoms.V.value q) ^ ">") and element_end q = wms ("</" ^ Ns.Printer.tag printer (Atoms.V.value q) ^ ">")
and document_start () = and document_start () =
(* wms ("<?xml version='1.0' encoding='" ^ wms ("<?xml version='1.0' encoding='" ^
Netconversion.string_of_encoding to_enc ^ (match to_enc with `Enc_utf8 -> "UTF-8" | `Enc_iso88591 -> "ISO-8859-1")^
"'?>\n") *) "'?>\n")
()
in in
let rec register_elt = function let rec register_elt = function
......
...@@ -27,7 +27,7 @@ module V = struct ...@@ -27,7 +27,7 @@ module V = struct
| 39 -> Format.fprintf ppf "\\'" | 39 -> Format.fprintf ppf "\\'"
| 34 -> Format.fprintf ppf "\\\"" | 34 -> Format.fprintf ppf "\\\""
| c -> | c ->
if c < 32 || (c >= 128 && c < 192) || c > 255 then if c < 32 || c >= 128 then
Format.fprintf ppf "\\%i;" c Format.fprintf ppf "\\%i;" c
else Format.fprintf ppf "%c" (Char.chr c) else Format.fprintf ppf "%c" (Char.chr c)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment