Commit 4cd1b774 authored by Kim Nguyễn's avatar Kim Nguyễn
Browse files

Fix handling of latin1/utf8 files both for reading and writing.

parent 05591e95
......@@ -75,13 +75,13 @@ let get_line_start lb i =
let get_line_number src i =
let ic = open_in_bin src in
let lb = Sedlexing.Utf8.from_channel ic in
let lb = Sedlexing.Latin1.from_channel ic in
let r = get_line_start lb i in
close_in ic;
r
let get_line_number_str src i =
let lb = Sedlexing.Utf8.from_string src in
let lb = Sedlexing.Latin1.from_string src in
get_line_start lb i
let print_precise ppf = function
......
......@@ -17,8 +17,6 @@ let invalid_byte s e =
done;
raise (Invalid_byte (!acc, e))
(* Taken from Menhir/Lib/Convert.ml*)
let for_sedlex parser lexer =
......@@ -33,7 +31,7 @@ let for_sedlex parser lexer =
(**)
let mk_lexbuf cs =
let mk_lexbuf enc cs =
(* Workaround the sedlex functions buffer 512 characters before
propagating them, making it unusable with interactive input *)
let module U = Encodings.Utf8 in
......@@ -45,17 +43,18 @@ let mk_lexbuf cs =
in
let set_continuation_byte i c =
(* assumes i = 1, 2 or 3 *)
Bytes.set uchars i c; (* we set it anyway, and test after *)
Bytes.set uchars i c;
(* we set it anyway, and test after *)
let cc = Char.code c in
if cc lsr 6 != 0b10 then
(* throw exception with invalid byte in the buffer *)
invalid_byte (Bytes.sub_string uchars 0 (i+1)) Utf8
invalid_byte (Bytes.sub_string uchars 0 (i + 1)) Utf8
in
let enc = ref Latin1 in
let lexbuf =
Sedlexing.create (fun arr pos _num ->
try
let next cs = Stream.next cs in
Bytes.set uchars 0 '\000';
Bytes.set uchars 1 '\000';
Bytes.set uchars 2 '\000';
......@@ -88,8 +87,12 @@ let mk_lexbuf cs =
in
Sedlexing.set_position lexbuf
Lexing.{ pos_fname = ""; pos_lnum = 1; pos_bol = 0; pos_cnum = 0 };
Sedlexer.eat_shebang lexbuf;
lexbuf, enc
let () =
try Sedlexer.eat_shebang lexbuf
with Sedlexing.MalFormed ->
invalid_byte (Sedlexing.Latin1.lexeme lexbuf) !enc
in
lexbuf
let include_stack = ref []
......@@ -107,14 +110,21 @@ let last_tok = ref Parser.EOI
let last_tok_pos = ref (Lexing.dummy_pos, Lexing.dummy_pos)
let rec token enc lexbuf =
let set_enc e = enc := e in
let f = Sedlexing.with_tokenizer Sedlexer.token lexbuf in
let f () =
let tok, p1, p2 = f () in
let tok =
match (!last_tok, tok) with
| _, HASH_ASCII -> enc := Ascii; tok
| _, HASH_LATIN1 -> enc := Latin1; tok
| _, HASH_UTF8 -> enc := Utf8; tok
| _, HASH_ASCII ->
set_enc Ascii;
tok
| _, HASH_LATIN1 ->
set_enc Latin1;
tok
| _, HASH_UTF8 ->
set_enc Utf8;
tok
| Parser.INCLUDE, Parser.STRING2 path -> (
Cduce_loc.protect_op "File inclusion";
let path = Cduce_loc.resolve_filename path in
......@@ -132,8 +142,10 @@ let rec token enc lexbuf =
Cduce_loc.push_source (`File path);
try
let cs = Stream.of_channel ic in
let newlb, enc = mk_lexbuf cs in
let past = pre_prog (token enc newlb) in
let newenc = ref Utf8 in
(* or ref !enc ? *)
let newlb = mk_lexbuf newenc cs in
let past = pre_prog (token newenc newlb) in
exit_include ic;
Parser.RESOLVED_INCLUDE past
with e ->
......@@ -160,8 +172,9 @@ let get_loc lexbuf =
let loc1, loc2 = Sedlexing.lexing_positions lexbuf in
(loc1.Lexing.pos_cnum, loc2.Lexing.pos_cnum)
let protect_parser do_sync gram stream =
let b, enc = mk_lexbuf stream in
let protect_parser ?global_enc do_sync gram stream =
let enc = match global_enc with Some e -> e | None -> ref Utf8 in
let b = mk_lexbuf enc stream in
try
let f = token enc b in
try gram f
......@@ -179,12 +192,16 @@ let protect_parser do_sync gram stream =
(Ast.Parsing_error
(Format.sprintf "Invalid byte%s %s for %s encoding" msg c
(str_encoding e)))
| Sedlexing.MalFormed ->
let i, j = get_loc b in
Cduce_loc.raise_loc i j (Ast.Parsing_error "MalFormed")
| Sedlexer.Error (i, j, msg) ->
Cduce_loc.raise_loc i j (Ast.Parsing_error msg)
let prog cs = protect_parser false pre_prog cs
let prog = protect_parser false pre_prog
let top_phrases cs = protect_parser true (for_sedlex Parser.top_phrases) cs
let top_phrases =
protect_parser ~global_enc:(ref Utf8) true (for_sedlex Parser.top_phrases)
let protect_exn f g =
try
......@@ -196,3 +213,5 @@ let protect_exn f g =
raise e
let sync () = ()
let () = Printexc.record_backtrace true
......@@ -699,6 +699,7 @@ let_binding:
ident_or_let_pat:
| id = located_ident { mk $sloc (PatVar [ (snd id) ]) }
| p = constr_pat { p }
| p1 = ident_or_let_pat "&" p2 =constr_pat { mk $sloc (And(p1, p2)) }
;
branches_:
......
......@@ -9,8 +9,10 @@ let error i j s = raise (Error (i, j, s))
let string_buff = Buffer.create 1024
let store_lexeme lexbuf =
let s = L.Utf8.lexeme lexbuf in
Buffer.add_string string_buff s
let s = L.lexeme lexbuf in
for i = 0 to Array.length s - 1 do
Encodings.Utf8.store string_buff (Uchar.to_int s.(i));
done
let store_ascii = Buffer.add_char string_buff
......
......@@ -160,10 +160,9 @@ let to_buf ~utf8 buffer ns_table v subst =
wms "/>"
and element_end q = wms ("</" ^ Ns.Printer.tag printer (Atoms.V.value q) ^ ">")
and document_start () =
(* wms ("<?xml version='1.0' encoding='" ^
Netconversion.string_of_encoding to_enc ^
"'?>\n") *)
()
wms ("<?xml version='1.0' encoding='" ^
(match to_enc with `Enc_utf8 -> "UTF-8" | `Enc_iso88591 -> "ISO-8859-1")^
"'?>\n")
in
let rec register_elt = function
......
......@@ -27,7 +27,7 @@ module V = struct
| 39 -> Format.fprintf ppf "\\'"
| 34 -> Format.fprintf ppf "\\\""
| c ->
if c < 32 || (c >= 128 && c < 192) || c > 255 then
if c < 32 || c >= 128 then
Format.fprintf ppf "\\%i;" c
else Format.fprintf ppf "%c" (Char.chr c)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment