Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
cduce
cduce
Commits
4cd1b774
Commit
4cd1b774
authored
Mar 31, 2021
by
Kim Nguyễn
Browse files
Fix handling of latin1/utf8 files both for reading and writing.
parent
05591e95
Changes
6
Hide whitespace changes
Inline
Side-by-side
lang/parser/cduce_loc.ml
View file @
4cd1b774
...
...
@@ -75,13 +75,13 @@ let get_line_start lb i =
let
get_line_number
src
i
=
let
ic
=
open_in_bin
src
in
let
lb
=
Sedlexing
.
Utf8
.
from_channel
ic
in
let
lb
=
Sedlexing
.
Latin1
.
from_channel
ic
in
let
r
=
get_line_start
lb
i
in
close_in
ic
;
r
let
get_line_number_str
src
i
=
let
lb
=
Sedlexing
.
Utf8
.
from_string
src
in
let
lb
=
Sedlexing
.
Latin1
.
from_string
src
in
get_line_start
lb
i
let
print_precise
ppf
=
function
...
...
lang/parser/parse.ml
View file @
4cd1b774
...
...
@@ -12,13 +12,11 @@ let invalid_byte_c c e =
let
invalid_byte
s
e
=
let
acc
=
ref
""
in
for
i
=
0
to
String
.
length
s
-
1
do
for
i
=
0
to
String
.
length
s
-
1
do
acc
:=
Format
.
sprintf
"
\\
%x%s"
(
Char
.
code
s
.
[
i
])
!
acc
done
;
raise
(
Invalid_byte
(
!
acc
,
e
))
(* Taken from Menhir/Lib/Convert.ml*)
let
for_sedlex
parser
lexer
=
...
...
@@ -33,7 +31,7 @@ let for_sedlex parser lexer =
(**)
let
mk_lexbuf
cs
=
let
mk_lexbuf
enc
cs
=
(* Workaround the sedlex functions buffer 512 characters before
propagating them, making it unusable with interactive input *)
let
module
U
=
Encodings
.
Utf8
in
...
...
@@ -45,17 +43,18 @@ let mk_lexbuf cs =
in
let
set_continuation_byte
i
c
=
(* assumes i = 1, 2 or 3 *)
Bytes
.
set
uchars
i
c
;
(* we set it anyway, and test after *)
Bytes
.
set
uchars
i
c
;
(* we set it anyway, and test after *)
let
cc
=
Char
.
code
c
in
if
cc
lsr
6
!=
0b10
then
(* throw exception with invalid byte in the buffer *)
invalid_byte
(
Bytes
.
sub_string
uchars
0
(
i
+
1
))
Utf8
invalid_byte
(
Bytes
.
sub_string
uchars
0
(
i
+
1
))
Utf8
in
let
enc
=
ref
Latin1
in
let
lexbuf
=
Sedlexing
.
create
(
fun
arr
pos
_num
->
try
let
next
cs
=
Stream
.
next
cs
in
Bytes
.
set
uchars
0
'\000'
;
Bytes
.
set
uchars
1
'\000'
;
Bytes
.
set
uchars
2
'\000'
;
...
...
@@ -88,8 +87,12 @@ let mk_lexbuf cs =
in
Sedlexing
.
set_position
lexbuf
Lexing
.{
pos_fname
=
""
;
pos_lnum
=
1
;
pos_bol
=
0
;
pos_cnum
=
0
};
Sedlexer
.
eat_shebang
lexbuf
;
lexbuf
,
enc
let
()
=
try
Sedlexer
.
eat_shebang
lexbuf
with
Sedlexing
.
MalFormed
->
invalid_byte
(
Sedlexing
.
Latin1
.
lexeme
lexbuf
)
!
enc
in
lexbuf
let
include_stack
=
ref
[]
...
...
@@ -107,14 +110,21 @@ let last_tok = ref Parser.EOI
let
last_tok_pos
=
ref
(
Lexing
.
dummy_pos
,
Lexing
.
dummy_pos
)
let
rec
token
enc
lexbuf
=
let
set_enc
e
=
enc
:=
e
in
let
f
=
Sedlexing
.
with_tokenizer
Sedlexer
.
token
lexbuf
in
let
f
()
=
let
tok
,
p1
,
p2
=
f
()
in
let
tok
=
match
(
!
last_tok
,
tok
)
with
|
_
,
HASH_ASCII
->
enc
:=
Ascii
;
tok
|
_
,
HASH_LATIN1
->
enc
:=
Latin1
;
tok
|
_
,
HASH_UTF8
->
enc
:=
Utf8
;
tok
|
_
,
HASH_ASCII
->
set_enc
Ascii
;
tok
|
_
,
HASH_LATIN1
->
set_enc
Latin1
;
tok
|
_
,
HASH_UTF8
->
set_enc
Utf8
;
tok
|
Parser
.
INCLUDE
,
Parser
.
STRING2
path
->
(
Cduce_loc
.
protect_op
"File inclusion"
;
let
path
=
Cduce_loc
.
resolve_filename
path
in
...
...
@@ -132,8 +142,10 @@ let rec token enc lexbuf =
Cduce_loc
.
push_source
(
`File
path
);
try
let
cs
=
Stream
.
of_channel
ic
in
let
newlb
,
enc
=
mk_lexbuf
cs
in
let
past
=
pre_prog
(
token
enc
newlb
)
in
let
newenc
=
ref
Utf8
in
(* or ref !enc ? *)
let
newlb
=
mk_lexbuf
newenc
cs
in
let
past
=
pre_prog
(
token
newenc
newlb
)
in
exit_include
ic
;
Parser
.
RESOLVED_INCLUDE
past
with
e
->
...
...
@@ -160,8 +172,9 @@ let get_loc lexbuf =
let
loc1
,
loc2
=
Sedlexing
.
lexing_positions
lexbuf
in
(
loc1
.
Lexing
.
pos_cnum
,
loc2
.
Lexing
.
pos_cnum
)
let
protect_parser
do_sync
gram
stream
=
let
b
,
enc
=
mk_lexbuf
stream
in
let
protect_parser
?
global_enc
do_sync
gram
stream
=
let
enc
=
match
global_enc
with
Some
e
->
e
|
None
->
ref
Utf8
in
let
b
=
mk_lexbuf
enc
stream
in
try
let
f
=
token
enc
b
in
try
gram
f
...
...
@@ -179,12 +192,16 @@ let protect_parser do_sync gram stream =
(
Ast
.
Parsing_error
(
Format
.
sprintf
"Invalid byte%s %s for %s encoding"
msg
c
(
str_encoding
e
)))
|
Sedlexing
.
MalFormed
->
let
i
,
j
=
get_loc
b
in
Cduce_loc
.
raise_loc
i
j
(
Ast
.
Parsing_error
"MalFormed"
)
|
Sedlexer
.
Error
(
i
,
j
,
msg
)
->
Cduce_loc
.
raise_loc
i
j
(
Ast
.
Parsing_error
msg
)
let
prog
cs
=
protect_parser
false
pre_prog
cs
let
prog
=
protect_parser
false
pre_prog
let
top_phrases
cs
=
protect_parser
true
(
for_sedlex
Parser
.
top_phrases
)
cs
let
top_phrases
=
protect_parser
~
global_enc
:
(
ref
Utf8
)
true
(
for_sedlex
Parser
.
top_phrases
)
let
protect_exn
f
g
=
try
...
...
@@ -196,3 +213,5 @@ let protect_exn f g =
raise
e
let
sync
()
=
()
let
()
=
Printexc
.
record_backtrace
true
lang/parser/parser.mly
View file @
4cd1b774
...
...
@@ -699,6 +699,7 @@ let_binding:
ident_or_let_pat
:
|
id
=
located_ident
{
mk
$
sloc
(
PatVar
[
(
snd
id
)
])
}
|
p
=
constr_pat
{
p
}
|
p1
=
ident_or_let_pat
"&"
p2
=
constr_pat
{
mk
$
sloc
(
And
(
p1
,
p2
))
}
;
branches_
:
...
...
lang/parser/sedlexer.ml
View file @
4cd1b774
...
...
@@ -9,8 +9,10 @@ let error i j s = raise (Error (i, j, s))
let
string_buff
=
Buffer
.
create
1024
let
store_lexeme
lexbuf
=
let
s
=
L
.
Utf8
.
lexeme
lexbuf
in
Buffer
.
add_string
string_buff
s
let
s
=
L
.
lexeme
lexbuf
in
for
i
=
0
to
Array
.
length
s
-
1
do
Encodings
.
Utf8
.
store
string_buff
(
Uchar
.
to_int
s
.
(
i
));
done
let
store_ascii
=
Buffer
.
add_char
string_buff
...
...
lang/runtime/print_xml.ml
View file @
4cd1b774
...
...
@@ -160,10 +160,9 @@ let to_buf ~utf8 buffer ns_table v subst =
wms
"/>"
and
element_end
q
=
wms
(
"</"
^
Ns
.
Printer
.
tag
printer
(
Atoms
.
V
.
value
q
)
^
">"
)
and
document_start
()
=
(* wms ("<?xml version='1.0' encoding='" ^
Netconversion.string_of_encoding to_enc ^
"'?>\n") *)
()
wms
(
"<?xml version='1.0' encoding='"
^
(
match
to_enc
with
`Enc_utf8
->
"UTF-8"
|
`Enc_iso88591
->
"ISO-8859-1"
)
^
"'?>
\n
"
)
in
let
rec
register_elt
=
function
...
...
types/chars.ml
View file @
4cd1b774
...
...
@@ -27,7 +27,7 @@ module V = struct
|
39
->
Format
.
fprintf
ppf
"
\\
'"
|
34
->
Format
.
fprintf
ppf
"
\\\"
"
|
c
->
if
c
<
32
||
(
c
>=
128
&&
c
<
192
)
||
c
>
255
then
if
c
<
32
||
c
>=
128
then
Format
.
fprintf
ppf
"
\\
%i;"
c
else
Format
.
fprintf
ppf
"%c"
(
Char
.
chr
c
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment