Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
cduce
cduce
Commits
40775718
Commit
40775718
authored
Mar 31, 2021
by
Kim Nguyễn
Browse files
Restore handling of various input file encoding (#ascii, #latin1, #utf8) with
default to Latin1 for compatibility.
parent
002f9580
Changes
3
Hide whitespace changes
Inline
Side-by-side
lang/parser/parse.ml
View file @
40775718
type
encoding
=
Ascii
|
Latin1
|
Utf8
let
str_encoding
=
function
|
Ascii
->
"ascii"
|
Latin1
->
"latin-1"
|
Utf8
->
"utf-8"
exception
Invalid_byte
of
string
*
encoding
let
invalid_byte_c
c
e
=
raise
(
Invalid_byte
(
Format
.
sprintf
"
\\
%x"
(
Char
.
code
c
)
,
e
))
let
invalid_byte
s
e
=
let
acc
=
ref
""
in
for
i
=
0
to
String
.
length
s
-
1
do
acc
:=
Format
.
sprintf
"
\\
%x%s"
(
Char
.
code
s
.
[
i
])
!
acc
done
;
raise
(
Invalid_byte
(
!
acc
,
e
))
(* Taken from Menhir/Lib/Convert.ml*)
let
for_sedlex
parser
lexer
=
...
...
@@ -15,10 +36,24 @@ let for_sedlex parser lexer =
let
mk_lexbuf
cs
=
(* Workaround the sedlex functions buffer 512 characters before
propagating them, making it unusable with interactive input *)
let
module
U
=
Encodings
.
Utf8
in
let
uchars
=
Bytes
.
make
4
'\000'
in
let
read_uchar
()
=
let
us
=
U
.
mk
(
Bytes
.
unsafe_to_string
uchars
)
in
let
uc
=
U
.
get
us
(
U
.
start_index
us
)
in
Uchar
.
unsafe_of_int
uc
in
let
set_continuation_byte
i
c
=
(* assumes i = 1, 2 or 3 *)
Bytes
.
set
uchars
i
c
;
(* we set it anyway, and test after *)
let
cc
=
Char
.
code
c
in
if
cc
lsr
6
!=
0b10
then
(* throw exception with invalid byte in the buffer *)
invalid_byte
(
Bytes
.
sub_string
uchars
0
(
i
+
1
))
Utf8
in
let
enc
=
ref
Latin1
in
let
lexbuf
=
Sedlexing
.
create
(
fun
arr
pos
_num
->
let
module
U
=
Encodings
.
Utf8
in
try
let
next
cs
=
Stream
.
next
cs
in
Bytes
.
set
uchars
0
'\000'
;
...
...
@@ -26,33 +61,35 @@ let mk_lexbuf cs =
Bytes
.
set
uchars
2
'\000'
;
Bytes
.
set
uchars
3
'\000'
;
let
c0
=
next
cs
in
let
()
=
match
c0
with
|
'\x00'
..
'\x7f'
->
Bytes
.
set
uchars
0
c0
|
'\xc0'
..
'\xdf'
->
let
codepoint
=
match
(
c0
,
!
enc
)
with
|
'\x00'
..
'\x7f'
,
_
->
Uchar
.
of_char
c0
|
'\x80'
..
'\xff'
,
Latin1
->
Uchar
.
of_char
c0
|
'\xc0'
..
'\xdf'
,
Utf8
->
Bytes
.
set
uchars
0
c0
;
Bytes
.
set
uchars
1
(
next
cs
)
|
'\xe0'
..
'\xef'
->
set_continuation_byte
1
(
next
cs
);
read_uchar
()
|
'\xe0'
..
'\xef'
,
Utf8
->
Bytes
.
set
uchars
0
c0
;
Bytes
.
set
uchars
1
(
next
cs
);
Bytes
.
set
uchars
2
(
next
cs
)
|
'\xf0'
..
'\xf7'
->
set_continuation_byte
1
(
next
cs
);
set_continuation_byte
2
(
next
cs
);
read_uchar
()
|
'\xf0'
..
'\xf7'
,
Utf8
->
Bytes
.
set
uchars
0
c0
;
Bytes
.
set
uchars
1
(
next
cs
);
Bytes
.
set
uchars
2
(
next
cs
);
Bytes
.
set
uchars
3
(
next
cs
)
|
_
->
raise
Sedlexing
.
MalFormed
set_continuation_byte
1
(
next
cs
);
set_continuation_byte
2
(
next
cs
);
set_continuation_byte
3
(
next
cs
);
read_uchar
()
|
c
,
e
->
invalid_byte_c
c
e
in
let
us
=
U
.
mk
(
Bytes
.
unsafe_to_string
uchars
)
in
let
uc
=
U
.
get
us
(
U
.
start_index
us
)
in
arr
.
(
pos
)
<-
Uchar
.
unsafe_of_int
uc
;
arr
.
(
pos
)
<-
codepoint
;
1
with
Stream
.
Failure
->
0
)
in
Sedlexing
.
set_position
lexbuf
Lexing
.{
pos_fname
=
""
;
pos_lnum
=
1
;
pos_bol
=
0
;
pos_cnum
=
0
};
Sedlexer
.
eat_shebang
lexbuf
;
lexbuf
lexbuf
,
enc
let
include_stack
=
ref
[]
...
...
@@ -69,12 +106,15 @@ let last_tok = ref Parser.EOI
let
last_tok_pos
=
ref
(
Lexing
.
dummy_pos
,
Lexing
.
dummy_pos
)
let
rec
token
lexbuf
=
let
rec
token
enc
lexbuf
=
let
f
=
Sedlexing
.
with_tokenizer
Sedlexer
.
token
lexbuf
in
let
f
()
=
let
tok
,
p1
,
p2
=
f
()
in
let
tok
=
match
(
!
last_tok
,
tok
)
with
|
_
,
HASH_ASCII
->
enc
:=
Ascii
;
tok
|
_
,
HASH_LATIN1
->
enc
:=
Latin1
;
tok
|
_
,
HASH_UTF8
->
enc
:=
Utf8
;
tok
|
Parser
.
INCLUDE
,
Parser
.
STRING2
path
->
(
Cduce_loc
.
protect_op
"File inclusion"
;
let
path
=
Cduce_loc
.
resolve_filename
path
in
...
...
@@ -92,8 +132,8 @@ let rec token lexbuf =
Cduce_loc
.
push_source
(
`File
path
);
try
let
cs
=
Stream
.
of_channel
ic
in
let
newlb
=
mk_lexbuf
cs
in
let
past
=
pre_prog
(
token
newlb
)
in
let
newlb
,
enc
=
mk_lexbuf
cs
in
let
past
=
pre_prog
(
token
enc
newlb
)
in
exit_include
ic
;
Parser
.
RESOLVED_INCLUDE
past
with
e
->
...
...
@@ -116,19 +156,29 @@ let rec sync f =
last_tok_pos
:=
(
p1
,
p2
);
sync
f
let
get_loc
lexbuf
=
let
loc1
,
loc2
=
Sedlexing
.
lexing_positions
lexbuf
in
(
loc1
.
Lexing
.
pos_cnum
,
loc2
.
Lexing
.
pos_cnum
)
let
protect_parser
do_sync
gram
stream
=
let
b
=
mk_lexbuf
stream
in
let
b
,
enc
=
mk_lexbuf
stream
in
try
let
f
=
token
b
in
let
f
=
token
enc
b
in
try
gram
f
with
e
->
if
do_sync
then
sync
f
;
raise
e
with
|
Parser
.
Error
->
let
loc1
,
loc2
=
Sedlexing
.
lexing_positions
b
in
Cduce_loc
.
raise_loc
loc1
.
Lexing
.
pos_cnum
loc2
.
Lexing
.
pos_cnum
(
Ast
.
Parsing_error
""
)
let
i
,
j
=
get_loc
b
in
Cduce_loc
.
raise_loc
i
j
(
Ast
.
Parsing_error
""
)
|
Invalid_byte
(
c
,
e
)
->
let
i
,
j
=
get_loc
b
in
let
msg
=
if
String
.
length
c
>
1
then
" sequence"
else
""
in
Cduce_loc
.
raise_loc
i
j
(
Ast
.
Parsing_error
(
Format
.
sprintf
"Invalid byte%s %s for %s encoding"
msg
c
(
str_encoding
e
)))
|
Sedlexer
.
Error
(
i
,
j
,
msg
)
->
Cduce_loc
.
raise_loc
i
j
(
Ast
.
Parsing_error
msg
)
...
...
lang/parser/parser.mly
View file @
40775718
...
...
@@ -90,6 +90,7 @@ let id_dummy = U.mk "$$$"
/*
Keywords
*/
%
token
HASH_PRINT_TYPE
"#print_type"
%
token
HASH_DUMP_VALUE
"#dump_value"
%
token
HASH_ASCII
HASH_LATIN1
HASH_UTF8
%
token
AND
"and"
%
token
DEBUG
"debug"
%
token
DIV
"div"
...
...
@@ -224,9 +225,11 @@ prog_items:
;
prog_item
:
|
item
=
loc
(
prog_item_
)
{
[
item
]
}
|
item
=
loc
(
prog_item_
)
{
[
item
]
}
|
"include"
items
=
RESOLVED_INCLUDE
{
items
}
|
"include"
_s
=
STRING2
{
[]
}
|
HASH_ASCII
|
HASH_LATIN1
|
HASH_UTF8
|
"include"
STRING2
{
[]
}
;
%
inline
prog_item_
:
...
...
lang/parser/sedlexer.ml
View file @
40775718
...
...
@@ -106,6 +106,9 @@ let rec token lexbuf =
|
"_"
->
UNDERSCORE
|
"#print_type"
->
HASH_PRINT_TYPE
|
"#dump_value"
->
HASH_DUMP_VALUE
|
"#ascii"
->
HASH_ASCII
|
"#latin1"
->
HASH_LATIN1
|
"#utf8"
->
HASH_UTF8
|
"#"
,
qname
->
HASH_DIRECTIVE
(
L
.
Utf8
.
lexeme
lexbuf
)
|
ncname
,
":*"
->
let
s
=
L
.
Utf8
.
sub_lexeme
lexbuf
0
(
L
.
lexeme_length
lexbuf
-
2
)
in
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment