Commit 8573c0c8 authored by Pietro Abate's avatar Pietro Abate

[r2003-09-23 00:15:07 by cvscast] Passage lexer

Original author: cvscast
Date: 2003-09-23 00:16:26+00:00
parent a29e1e2c
......@@ -3,8 +3,9 @@ include Makefile.distrib
# We put this rule here to avoid re-building wlexer.ml on
# user installation (wlex may not be available)
parser/wlexer.ml: parser/wlexer.mll
wlex parser/wlexer.mll
#parser/wlexer.ml: parser/wlexer.mll
# wlex parser/wlexer.mll
# For development
......@@ -25,7 +26,7 @@ profile:
fi; \
done; \
done
cp parser/wlexer.mll prepro/parser/
#cp parser/wlexer.mll prepro/parser/
cp Makefile depend prepro/
(cd prepro; $(MAKE) cduce PROFILE=true SYNTAX_PARSER= NATIVE=false)
......
include Makefile.conf
VERSION = 0.2.0
PACKAGES = pxp-engine pxp-lex-iso88591 wlexing camlp4 num cgi pcre netstring
PACKAGES = pxp-engine pxp-lex-iso88591 ulex camlp4 num cgi pcre netstring
ifeq ($(PXP_WLEX), true)
PACKAGES += pxp-wlex-utf8
else
PACKAGES += pxp-lex-utf8
endif
ULEX_PATH = `ocamlfind query ulex`
SYNTAX = camlp4o -I misc/ pa_extend.cmo \
q_symbol.cmo \
$(shell ocamlfind query ulex)/pa_ulex.cma \
-symbol cduce_version=\"$(VERSION)\" \
-symbol build_date=\"$(shell date +%Y-%m-%d)\" \
-symbol session_dir=\"$(SESSION_DIR)\"
......@@ -86,7 +88,7 @@ OBJECTS = \
schema/schema_types.cmo schema/schema_xml.cmo schema/schema_builtin.cmo \
schema/schema_validator.cmo schema/schema_parser.cmo \
\
parser/location.cmo parser/wlexer.cmo parser/ast.cmo parser/parser.cmo \
parser/location.cmo parser/ulexer.cmo parser/ast.cmo parser/parser.cmo \
\
typing/typed.cmo typing/typer.cmo \
\
......
......@@ -26,10 +26,10 @@ types/intervals.cmo: misc/q_symbol.cmo misc/custom.cmo misc/serialize.cmi types/
types/intervals.cmx: misc/q_symbol.cmo misc/custom.cmx misc/serialize.cmx types/intervals.cmi
types/chars.cmo: misc/q_symbol.cmo misc/custom.cmo types/chars.cmi
types/chars.cmx: misc/q_symbol.cmo misc/custom.cmx types/chars.cmi
types/atoms.cmo: misc/q_symbol.cmo misc/encodings.cmi misc/ns.cmi misc/pool.cmi \
types/sortedList.cmi types/atoms.cmi
types/atoms.cmx: misc/q_symbol.cmo misc/encodings.cmx misc/ns.cmx misc/pool.cmx \
types/sortedList.cmx types/atoms.cmi
types/atoms.cmo: misc/q_symbol.cmo misc/custom.cmo misc/encodings.cmi misc/ns.cmi misc/pool.cmi \
misc/serialize.cmi types/sortedList.cmi types/atoms.cmi
types/atoms.cmx: misc/q_symbol.cmo misc/custom.cmx misc/encodings.cmx misc/ns.cmx misc/pool.cmx \
misc/serialize.cmx types/sortedList.cmx types/atoms.cmi
types/normal.cmo: misc/q_symbol.cmo types/normal.cmi
types/normal.cmx: misc/q_symbol.cmo types/normal.cmi
types/types.cmo: misc/q_symbol.cmo types/atoms.cmi misc/bool.cmi types/chars.cmi \
......@@ -88,8 +88,8 @@ schema/schema_parser.cmx: misc/q_symbol.cmo schema/schema_builtin.cmx schema/sch
schema/schema_parser.cmi
parser/location.cmo: misc/q_symbol.cmo parser/location.cmi
parser/location.cmx: misc/q_symbol.cmo parser/location.cmi
parser/wlexer.cmo: misc/q_symbol.cmo misc/encodings.cmi parser/location.cmi
parser/wlexer.cmx: misc/q_symbol.cmo misc/encodings.cmx parser/location.cmx
parser/ulexer.cmo: misc/q_symbol.cmo parser/ulexer.cmi
parser/ulexer.cmx: misc/q_symbol.cmo parser/ulexer.cmi
parser/ast.cmo: misc/q_symbol.cmo types/ident.cmo parser/location.cmi misc/ns.cmi \
schema/schema_types.cmi types/types.cmi
parser/ast.cmx: misc/q_symbol.cmo types/ident.cmx parser/location.cmx misc/ns.cmx \
......@@ -98,12 +98,12 @@ parser/parser.cmo: misc/q_symbol.cmo parser/ast.cmo types/atoms.cmi types/chars.
misc/encodings.cmi types/ident.cmo types/intervals.cmi \
parser/location.cmi misc/ns.cmi schema/schema_parser.cmi \
schema/schema_xml.cmi types/sequence.cmi types/types.cmi \
parser/wlexer.cmo parser/parser.cmi
parser/ulexer.cmi parser/parser.cmi
parser/parser.cmx: misc/q_symbol.cmo parser/ast.cmx types/atoms.cmx types/chars.cmx \
misc/encodings.cmx types/ident.cmx types/intervals.cmx \
parser/location.cmx misc/ns.cmx schema/schema_parser.cmx \
schema/schema_xml.cmx types/sequence.cmx types/types.cmx \
parser/wlexer.cmx parser/parser.cmi
parser/ulexer.cmx parser/parser.cmi
typing/typed.cmo: misc/q_symbol.cmo types/ident.cmo parser/location.cmi types/patterns.cmi \
types/types.cmi runtime/value.cmi
typing/typed.cmx: misc/q_symbol.cmo types/ident.cmx parser/location.cmx types/patterns.cmx \
......@@ -111,13 +111,13 @@ typing/typed.cmx: misc/q_symbol.cmo types/ident.cmx parser/location.cmx types/pa
typing/typer.cmo: misc/q_symbol.cmo parser/ast.cmo types/atoms.cmi types/builtin_defs.cmi \
types/chars.cmi types/ident.cmo parser/location.cmi misc/ns.cmi \
types/patterns.cmi schema/schema_builtin.cmi schema/schema_types.cmi \
schema/schema_validator.cmi types/sequence.cmi misc/serialize.cmi \
misc/state.cmi typing/typed.cmo types/types.cmi typing/typer.cmi
schema/schema_validator.cmi types/sequence.cmi misc/state.cmi \
typing/typed.cmo types/types.cmi typing/typer.cmi
typing/typer.cmx: misc/q_symbol.cmo parser/ast.cmx types/atoms.cmx types/builtin_defs.cmx \
types/chars.cmx types/ident.cmx parser/location.cmx misc/ns.cmx \
types/patterns.cmx schema/schema_builtin.cmx schema/schema_types.cmx \
schema/schema_validator.cmx types/sequence.cmx misc/serialize.cmx \
misc/state.cmx typing/typed.cmx types/types.cmx typing/typer.cmi
schema/schema_validator.cmx types/sequence.cmx misc/state.cmx \
typing/typed.cmx types/types.cmx typing/typer.cmi
runtime/load_xml.cmo: misc/q_symbol.cmo types/atoms.cmi misc/encodings.cmi types/ident.cmo \
parser/location.cmi misc/ns.cmi runtime/value.cmi runtime/load_xml.cmi
runtime/load_xml.cmx: misc/q_symbol.cmo types/atoms.cmx misc/encodings.cmx types/ident.cmx \
......@@ -153,17 +153,17 @@ types/builtin.cmx: misc/q_symbol.cmo parser/ast.cmx types/atoms.cmx types/builti
driver/cduce.cmo: misc/q_symbol.cmo parser/ast.cmo runtime/eval.cmi types/ident.cmo \
parser/location.cmi misc/ns.cmi parser/parser.cmi types/patterns.cmi \
types/sample.cmi misc/state.cmi typing/typed.cmo typing/typer.cmi \
types/types.cmi runtime/value.cmi parser/wlexer.cmo driver/cduce.cmi
types/types.cmi parser/ulexer.cmi runtime/value.cmi driver/cduce.cmi
driver/cduce.cmx: misc/q_symbol.cmo parser/ast.cmx runtime/eval.cmx types/ident.cmx \
parser/location.cmx misc/ns.cmx parser/parser.cmx types/patterns.cmx \
types/sample.cmx misc/state.cmx typing/typed.cmx typing/typer.cmx \
types/types.cmx runtime/value.cmx parser/wlexer.cmx driver/cduce.cmi
types/types.cmx parser/ulexer.cmx runtime/value.cmx driver/cduce.cmi
driver/run.cmo: misc/q_symbol.cmo driver/cduce.cmi types/ident.cmo runtime/load_xml.cmi \
parser/location.cmi types/sequence.cmi misc/state.cmi runtime/value.cmi \
parser/wlexer.cmo
parser/location.cmi types/sequence.cmi misc/state.cmi parser/ulexer.cmi \
runtime/value.cmi
driver/run.cmx: misc/q_symbol.cmo driver/cduce.cmx types/ident.cmx runtime/load_xml.cmx \
parser/location.cmx types/sequence.cmx misc/state.cmx runtime/value.cmx \
parser/wlexer.cmx
parser/location.cmx types/sequence.cmx misc/state.cmx parser/ulexer.cmx \
runtime/value.cmx
driver/webiface.cmo: misc/q_symbol.cmo driver/cduce.cmi driver/examples.cmo parser/location.cmi \
misc/state.cmi
driver/webiface.cmx: misc/q_symbol.cmo driver/cduce.cmx driver/examples.cmx parser/location.cmx \
......@@ -175,7 +175,7 @@ types/sortedList.cmi: misc/q_symbol.cmo misc/custom.cmo
types/boolean.cmi: misc/q_symbol.cmo misc/custom.cmo
types/intervals.cmi: misc/q_symbol.cmo misc/custom.cmo
types/chars.cmi: misc/q_symbol.cmo misc/custom.cmo
types/atoms.cmi: misc/q_symbol.cmo misc/encodings.cmi misc/ns.cmi
types/atoms.cmi: misc/q_symbol.cmo misc/custom.cmo misc/encodings.cmi misc/ns.cmi
types/types.cmi: misc/q_symbol.cmo types/atoms.cmi types/chars.cmi misc/custom.cmo \
types/ident.cmo types/intervals.cmi
types/patterns.cmi: misc/q_symbol.cmo types/atoms.cmi types/chars.cmi types/ident.cmo \
......
......@@ -87,14 +87,10 @@ let rec print_exn ppf = function
| Typer.UnboundId (x,tn) ->
Format.fprintf ppf "Unbound identifier %a%s@." U.print (Id.value x)
(if tn then " (it is a type name)" else "")
| Wlexer.Illegal_character c ->
Format.fprintf ppf "Illegal character (%a)@." print_protect (Char.escaped c)
| Wlexer.Unterminated_comment ->
Format.fprintf ppf "Comment not terminated@."
| Wlexer.Unterminated_string ->
Format.fprintf ppf "String literal not terminated@."
| Wlexer.Unterminated_string_in_comment ->
Format.fprintf ppf "This comment contains an unterminated string literal@."
| Ulexer.Error (i,j,s) ->
let loc = Location.loc_of_pos (i,j), `Full in
Format.fprintf ppf "Error %a:@." Location.print_loc loc;
Format.fprintf ppf "%a%s" Location.html_hilight loc s
| Parser.Error s | Stream.Error s ->
Format.fprintf ppf "Parsing error: %a@." print_protect s
| Location.Generic s ->
......
......@@ -99,7 +99,7 @@ let toploop () =
Location.push_source `Stream;
let read i =
if !bol then
if !Wlexer.in_comment then outflush "* " else outflush "> ";
if !Ulexer.in_comment then outflush "* " else outflush "> ";
try
let c = input_char stdin in
bol := c = '\n';
......@@ -120,13 +120,13 @@ let do_file s =
let chan = open_in s in
Location.push_source (`File s);
let input = Stream.of_channel chan in
if Stream.peek input = Some '#' then
if Stream.npeek 2 input = ['#';'!'] then
(
let rec count n =
match Stream.next input with
| '\n' -> n
| _ -> count (n + 1) in
Wlexer.set_delta_loc (count 1)
Ulexer.set_delta_loc (count 1)
);
let ok = Cduce.script ppf ppf_err input in
close_in chan;
......
......@@ -10,14 +10,14 @@ let () = Grammar.error_verbose := true
exception Error of string
let error (i,j) s = Location.raise_loc i j (Error s)
let gram = Grammar.gcreate (Wlexer.lexer Wlexer.token Wlexer.latin1_engine)
let gram = Grammar.gcreate Ulexer.lex
let true_atom = Atoms.V.mk_ascii "true"
let false_atom = Atoms.V.mk_ascii "false"
let true_type = Types.atom (Atoms.atom true_atom)
let false_type = Types.atom (Atoms.atom false_atom)
let parse_ident = U.mk_latin1
let parse_ident = U.mk
let id_dummy = ident (U.mk "$$$")
......@@ -113,6 +113,9 @@ EXTEND
let e = exp loc (NamespaceIn (name, ns, e2)) in
[ mk loc (EvalStatement (exp loc e)) ]
| "debug"; d = debug_directive -> [ mk loc (Debug d) ]
| DIRECTIVE "#utf8" -> Ulexer.enc := Ulexing.Utf8; [ ]
| DIRECTIVE "#latin1" -> Ulexer.enc := Ulexing.Latin1; [ ]
| DIRECTIVE "#ascii" -> Ulexer.enc := Ulexing.Ascii; [ ]
| DIRECTIVE "#quit" -> [ mk loc (Directive `Quit) ]
| DIRECTIVE "#env" -> [ mk loc (Directive `Env) ]
| DIRECTIVE "#reinit_ns" -> [ mk loc (Directive `Reinit_ns) ]
......@@ -129,6 +132,8 @@ EXTEND
else (
include_stack := s :: !include_stack;
Location.push_source (`File s);
let saved_enc = !Ulexer.enc in
Ulexer.enc := Ulexing.Latin1;
protect_exn
(fun () ->
let chan = open_in s in
......@@ -138,6 +143,7 @@ EXTEND
Grammar.Entry.parse prog input)
(fun () -> close_in chan))
(fun () ->
Ulexer.enc := saved_enc;
Location.pop_source ();
include_stack := List.tl !include_stack)
)
......@@ -476,7 +482,6 @@ EXTEND
[ x = tag_type -> x
| "("; t = pat; ")" -> t ];
a = attrib_spec; ">"; c = pat ->
(* let t = mk loc (Prod (mk loc (Internal Sequence.nil_type), t)) in *)
mk loc (XmlT (t, multi_prod loc [a;c]))
| s = STRING2 ->
let s =
......@@ -556,14 +561,14 @@ and prog = Grammar.Entry.parse prog
and top_phrases = Grammar.Entry.parse top_phrases
let sync () =
match !Wlexer.lexbuf with
match !Ulexer.lexbuf with
| None -> ()
| Some lb ->
let rec aux () =
match !Wlexer.last_tok with
match !Ulexer.last_tok with
| ("",";;") | ("EOI","") -> ()
| _ ->
Wlexer.last_tok := Wlexer.token Wlexer.latin1_engine lb;
Ulexer.last_tok := Ulexer.token lb;
aux ()
in
aux ()
module L = Ulexing
let keywords : (string,unit) Hashtbl.t = Hashtbl.create 17
exception Error of int * int * string
let error i j s = raise (Error (i,j,s))
(* Buffer for string literals *)
let string_buff = Buffer.create 1024
let store_lexeme lexbuf =
Buffer.add_string string_buff (Ulexing.utf8_lexeme lexbuf)
let store_ascii = Buffer.add_char string_buff
let store_code = Utf8.store string_buff
let clear_buff () = Buffer.clear string_buff
let get_stored_string () =
let s = Buffer.contents string_buff in
clear_buff ();
Buffer.clear string_buff;
s
let enc = ref L.Latin1
(* Parse characters literals \123; \x123; *)
let hexa_digit = function
| '0'..'9' as c -> (Char.code c) - (Char.code '0')
| 'a'..'f' as c -> (Char.code c) - (Char.code 'a') + 10
| 'A'..'F' as c -> (Char.code c) - (Char.code 'A') + 10
| _ -> -1
let parse_char lexbuf base i =
let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in
let r = ref 0 in
for i = 0 to String.length s - 1 do
let c = hexa_digit s.[i] in
if (c >= base) || (c < 0) then
error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "invalid digit";
r := !r * base + c;
done;
!r
let regexp ncname_char =
xml_letter | xml_digit | [ '.' '-' '_' ] | xml_combining_char | xml_extender
let regexp ncname = (xml_letter | '_' ) ncname_char*
let regexp qname = (ncname ':')? ncname
let illegal lexbuf =
error
(L.lexeme_start lexbuf)
(L.lexeme_end lexbuf)
"Illegal character"
let in_comment = ref false
let rec token = lexer
| xml_blank+ -> token lexbuf
| qname ->
let s = L.utf8_lexeme lexbuf in
if Hashtbl.mem keywords s then "",s else "IDENT",s
| ncname ":*" ->
let s = L.utf8_sub_lexeme lexbuf 0 (L.lexeme_length lexbuf - 2) in
"ANY_IN_NS", s
| ".:*" ->
"ANY_IN_NS", ""
| '-'? ['0'-'9']+ ->
"INT", L.utf8_lexeme lexbuf
| [ "<>=.,:;+-*/@&{}[]()|?`!" ]
| "->" | "::" | ";;" | "--" | ":=" | "\\" | "++"
| "{|" | "|}" | "<=" | ">=" | "<<" | ">>"
| ["?+*"] "?" | "#" ->
"", L.utf8_lexeme lexbuf
| "#" ncname ->
"DIRECTIVE", L.utf8_lexeme lexbuf
| '"' | "'" ->
let double_quote = L.latin1_lexeme_char lexbuf 0 = '"' in
string (L.lexeme_start lexbuf) double_quote lexbuf;
(if double_quote then "STRING2" else "STRING1"),
(get_stored_string())
| "(*" ->
in_comment := true;
comment (L.lexeme_start lexbuf) lexbuf;
in_comment := false;
token lexbuf
| eof ->
"EOI",""
| _ ->
illegal lexbuf
and comment start = lexer
| "(*" ->
comment (L.lexeme_start lexbuf) lexbuf;
comment start lexbuf
| "*)" ->
()
| '"' | "'" ->
let double_quote = L.latin1_lexeme_char lexbuf 0 = '"' in
string (L.lexeme_start lexbuf) double_quote lexbuf;
clear_buff ();
comment start lexbuf
| eof ->
error start (start+2) "Unterminated comment"
| _ ->
comment start lexbuf
and string start double = lexer
| '"' | "'" ->
let d = L.latin1_lexeme_char lexbuf 0 = '"' in
if d != double then (store_lexeme lexbuf; string start double lexbuf)
| '\\' ['\\' '"' '\''] ->
store_ascii (L.latin1_lexeme_char lexbuf 1);
string start double lexbuf
| "\\n" ->
store_ascii '\n'; string start double lexbuf
| "\\t" ->
store_ascii '\t'; string start double lexbuf
| "\\r" ->
store_ascii '\r'; string start double lexbuf
| '\\' ['0'-'9']+ ';' ->
store_code (parse_char lexbuf 10 1);
string start double lexbuf
| '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F']+ ';' ->
store_code (parse_char lexbuf 16 2);
string start double lexbuf
| '\\' ->
illegal lexbuf;
| eof ->
error start (start+1) "Unterminated string"
| _ ->
store_lexeme lexbuf;
string start double lexbuf
let delta_loc = ref 0
let set_delta_loc dl = delta_loc := dl
let lexbuf = ref None
let last_tok = ref ("","")
let tok_func cs =
let dl = !delta_loc in
delta_loc := 0;
let lb = L.from_var_enc_stream enc cs in
lexbuf := Some lb;
let next () =
let tok =
try token lb
with
| Ulexing.Error ->
raise (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb,
"Unexpected character"))
| Ulexing.InvalidCodepoint i ->
raise (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb,
"Code point invalid for the current encoding")) in
(* TODO: translate Error exn with offset dl ? *)
let loc = (L.lexeme_start lb + dl, L.lexeme_end lb + dl) in
last_tok := tok;
(tok, loc)
in
Token.make_stream_and_location next
let register_kw (s1,s2) =
if s1 = "" then
match s2.[0] with
| 'a' .. 'z' when not (Hashtbl.mem keywords s2) ->
Hashtbl.add keywords s2 ()
| _ -> ()
let lex =
{
Token.tok_func = tok_func;
Token.tok_using = register_kw;
Token.tok_removing = (fun _ -> ());
Token.tok_match = Token.default_match;
Token.tok_text = Token.lexer_text;
Token.tok_comm = None;
}
let dump_file f =
let ic = open_in f in
let lexbuf = L.from_var_enc_channel enc ic in
(try
while true do
let (a,b) = token lexbuf in
Printf.printf "%s: \"%s\"\n" a b;
if a = "EOI" then exit 0
done
with
| Ulexing.Error ->
Printf.eprintf "Lexing error at offset %i\n:Unexpected character\n"
(Ulexing.lexeme_end lexbuf)
| Error (i,j,s) ->
Printf.eprintf "Lexing error at offset %i-%i:\n%s\n"
i j s
| Ulexing.InvalidCodepoint i ->
Printf.eprintf "Lexing error at offset %i\n:Invalid code point for the current encoding\n"
(Ulexing.lexeme_end lexbuf)
);
close_in ic
exception Error of int * int * string
val token: Ulexing.lexbuf -> string * string
val lex: (string * string) Token.glexer
val in_comment: bool ref
val set_delta_loc: int -> unit
val lexbuf: Ulexing.lexbuf option ref
val enc: Ulexing.enc ref
val last_tok: (string * string) ref
val dump_file: string -> unit
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment