ulexer.ml 14 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39
open Camlp4.PreCast

module Loc = struct
  type t = int * int

  let mk _ = (0,0)
  let ghost = (-1,-1)

  let of_lexing_position _ = assert false
  let to_ocaml_location _ = assert false
  let of_ocaml_location _ = assert false
  let of_lexbuf _ = assert false
  let of_tuple _ = assert false
  let to_tuple _ = assert false

  let merge (x1, x2) (y1, y2) = (min x1 y1, max x2 y2)
  let join (x1, _) = (x1, x1)
  let move _ _ _ = assert false
  let shift _ _ = assert false
  let move_line _ _ = assert false
  let file_name  _ = assert false
  let start_line _ = assert false
  let stop_line  _ = assert false
  let start_bol  _ = assert false
  let stop_bol   _ = assert false
  let start_off  = fst
  let stop_off   = snd
  let start_pos  _ = assert false
  let stop_pos   _ = assert false
  let is_ghost   _ = assert false
  let ghostify   _ = assert false
  let set_file_name _ = assert false
  let strictly_before _ = assert false
  let make_absolute _ = assert false
  let print _ = assert false
  let dump  _ = assert false
  let to_string _ = assert false
  exception Exc_located of t * exn
  let raise loc exn =
40
    match exn with
41 42 43 44 45 46 47 48 49 50
    | Exc_located _ -> raise exn
    | _ -> raise (Exc_located (loc, exn))
  let name = ref "_loc"
end

type token =
  | KEYWORD of string
  | IDENT of string
  | ANY_IN_NS of string
  | INT of string
51
  | CHAR of string
52
  | STRING of string
53
  | STRING2 of string
54
  | PTYPE of string
55 56 57 58 59 60 61 62 63 64 65 66
  | EOI

module Token = struct
  open Format
  module Loc = Loc
  type t = token
  type token = t

  let sf = Printf.sprintf

  let to_string =
    function
67 68 69
    | KEYWORD s -> sf "KEYWORD %S" s
    | IDENT s -> sf "IDENT %S" s
    | INT s -> sf "INT %s" s
70
    | CHAR s -> sf "CHAR \'%s\'" s
71
    | STRING s -> sf "STRING \"%s\"" s
72
    | STRING2 s -> sf "STRING \'%s\'" s
73
                      (* here it's not %S since the string is already escaped *)
74
    | PTYPE s -> sf "PTYPE \'%S\'" s
75 76
    | ANY_IN_NS s -> sf "ANY_IN_NS %S" s
    | EOI -> sf "EOI"
77 78 79 80 81 82 83 84 85 86

  let print ppf x = pp_print_string ppf (to_string x)

  let match_keyword kwd =
    function
    | KEYWORD kwd' when kwd = kwd' -> true
    | _ -> false

  let extract_string =
    function
87
      | KEYWORD s | IDENT s | INT s | CHAR s | STRING s | STRING2 s | PTYPE s |
88
	  ANY_IN_NS s -> s
89
      | tok ->
90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105
        invalid_arg ("Cannot extract a string from this token: "^
                     to_string tok)

  module Error = struct
    type t = string
    exception E of string
    let print = pp_print_string
    let to_string x = x
  end

  module Filter = struct
    type token_filter = (t, Loc.t) Camlp4.Sig.stream_filter

    type t =
      { is_kwd : string -> bool;
        mutable filter : token_filter }
106

107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
    let keyword_conversion tok is_kwd =
      match tok with
      | IDENT s when is_kwd s -> KEYWORD s
      | _ -> tok

    let mk is_kwd =
      { is_kwd = is_kwd;
        filter = (fun s -> s) }

    let filter x =
      let f tok loc =
        let tok' = keyword_conversion tok x.is_kwd in
        (tok', loc)
      in
      let rec filter =
        parser
        | [< '(tok, loc); s >] -> [< ' f tok loc; filter s >]
        | [< >] -> [< >]
      in
      fun strm -> x.filter (filter strm)

    let define_filter x f = x.filter <- f x.filter

    let keyword_added _ _ _ = ()
    let keyword_removed _ _ = ()
  end

end
module Error = Camlp4.Struct.EmptyError

module L = Ulexing
138 139 140

exception Error of int * int * string

141 142
let toplevel = ref false

143 144 145
let error i j s = raise (Error (i,j,s))

(* Buffer for string literals *)
146

147 148
let string_buff = Buffer.create 1024

149
let store_lexeme lexbuf =
150 151 152 153 154 155 156 157 158 159 160 161 162
  Buffer.add_string string_buff (Ulexing.utf8_lexeme lexbuf)
let store_ascii = Buffer.add_char string_buff
let store_code  = Utf8.store string_buff
let clear_buff () = Buffer.clear string_buff
let get_stored_string () =
  let s = Buffer.contents string_buff in
  clear_buff ();
  Buffer.clear string_buff;
  s

let enc = ref L.Latin1

(* Parse characters literals \123; \x123; *)
163

164 165 166 167 168
let hexa_digit = function
  | '0'..'9' as c -> (Char.code c) - (Char.code '0')
  | 'a'..'f' as c -> (Char.code c) - (Char.code 'a') + 10
  | 'A'..'F' as c -> (Char.code c) - (Char.code 'A') + 10
  | _ -> -1
169

170
let parse_char lexbuf base i =
171
  let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in
172 173 174
  let r = ref 0 in
  for i = 0 to String.length s - 1 do
    let c = hexa_digit s.[i] in
175
    if (c >= base) || (c < 0) then
176 177 178 179 180 181
      error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "invalid digit";
    r := !r * base + c;
  done;
  !r


182
let regexp ncname_char =
183
  xml_letter | xml_digit | [ '-' '_' ] | xml_combining_char | xml_extender | "\\."
184
let regexp ncname = ( xml_letter ncname_char* ) | ('_' ncname_char+)
185 186
let regexp qname = (ncname ':')? ncname

187 188
(* Should be [^ xml_letter ] *)
let regexp not_xml_letter = [^ 'A'-'Z' 'a'-'z' '0'-'9' '_' ]
189 190
let regexp character = _ | '\\' ['\\' '"' '\''] | "\\n" | "\\t" | "\\r"
  | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F']+ ';' | '\\' ['0'-'9']+ ';'
191 192 193 194

let illegal lexbuf =
  error
    (L.lexeme_start lexbuf)
195
    (L.lexeme_end lexbuf)
196 197 198
    "Illegal character"

let in_comment = ref false
199
let in_brackets = ref 0
200

201 202 203
let return lexbuf tok = (tok, L.loc lexbuf)
let return_loc i j tok = (tok, (i,j))

204
let rec token = lexer
205 206 207
 | xml_blank+ -> token lexbuf
 | qname ->
     let s = L.utf8_lexeme lexbuf in
208
     return lexbuf (IDENT s)
209 210
 | ncname ":*" ->
     let s = L.utf8_sub_lexeme lexbuf 0 (L.lexeme_length lexbuf - 2) in
211
     return lexbuf (ANY_IN_NS s)
212
 | ".:*" ->
213
     return lexbuf (ANY_IN_NS "")
214
 | '-'? ['0'-'9']+ ->
215
     return lexbuf (INT (L.utf8_lexeme lexbuf))
216
 | [ "<>=.,:;+-*/@&{}()|?`!$" ]
217
 | "->" | "::" | ";;" | "--" | "//" | "/@" | ":=" | "\\" | "++"
218
 | "<=" | ">=" | "<<" | ">>" | "||" | "&&" | "**" | "_"
219
 | ".."
220
 | ["?+*"] "?" | "#" ->
221
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
222 223 224 225
 | "[" -> in_brackets := !in_brackets + 1;
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
 | "]" -> in_brackets := !in_brackets - 1;
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
226
 | '"' ->
227
     let start = L.lexeme_start lexbuf in
Julien Lopez's avatar
Julien Lopez committed
228
     string (L.lexeme_start lexbuf) '"' lexbuf;
229
     let s = get_stored_string () in
230
     return_loc start (L.lexeme_end lexbuf) (STRING s)
231
 | "'" character "'" ->
Julien Lopez's avatar
Julien Lopez committed
232 233 234 235 236 237 238 239
     L.rollback lexbuf;
     (fun _ -> lexer
	 | "'" -> let start = L.lexeme_start lexbuf in
		  string (L.lexeme_start lexbuf) '\'' lexbuf;
		  let s = get_stored_string () in
		  return_loc start (L.lexeme_end lexbuf) (CHAR s)
	 | _ -> assert false) () lexbuf
 | "'" ncname ->
240 241 242
     let s = L.utf8_lexeme lexbuf in
     let s = String.sub s 1 (String.length s - 1) in
     return lexbuf (PTYPE s)
243 244 245 246 247
 | "(*" ->
     in_comment := true;
     comment (L.lexeme_start lexbuf) lexbuf;
     in_comment := false;
     token lexbuf
248 249 250 251 252
 | "/*" ->
     in_comment := true;
     tcomment (L.lexeme_start lexbuf) lexbuf;
     in_comment := false;
     token lexbuf
253
 | eof ->
254
     return lexbuf EOI
255
 | _ ->
256 257
     illegal lexbuf

258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284
and token2 = lexer
 | xml_blank+ -> token2 lexbuf
 | qname ->
     let s = L.utf8_lexeme lexbuf in
     return lexbuf (IDENT s)
 | ncname ":*" ->
     let s = L.utf8_sub_lexeme lexbuf 0 (L.lexeme_length lexbuf - 2) in
     return lexbuf (ANY_IN_NS s)
 | ".:*" ->
     return lexbuf (ANY_IN_NS "")
 | '-'? ['0'-'9']+ ->
     return lexbuf (INT (L.utf8_lexeme lexbuf))
 | [ "<>=.,:;+-*/@&{}()|?`!$" ]
 | "->" | "::" | ";;" | "--" | "//" | "/@" | ":=" | "\\" | "++"
 | "<=" | ">=" | "<<" | ">>" | "||" | "&&" | "**" | "_"
 | ".."
 | ["?+*"] "?" | "#" ->
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
 | "[" -> in_brackets := !in_brackets + 1;
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
 | "]" -> in_brackets := !in_brackets - 1;
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
 | '"' ->
     let start = L.lexeme_start lexbuf in
     string (L.lexeme_start lexbuf) '"' lexbuf;
     let s = get_stored_string () in
     return_loc start (L.lexeme_end lexbuf) (STRING s)
285
 | "'" character "'--'" character "'"
286
 | "'" [^ '\'']+ "'" not_xml_letter ->
Julien Lopez's avatar
Julien Lopez committed
287 288 289 290 291 292 293
     L.rollback lexbuf;
     (fun _ -> lexer
	 | "'" -> let start = L.lexeme_start lexbuf in
		  string (L.lexeme_start lexbuf) '\'' lexbuf;
		  let s = get_stored_string () in
		  return_loc start (L.lexeme_end lexbuf) (STRING2 s)
	 | _ -> assert false) () lexbuf
Julien Lopez's avatar
Julien Lopez committed
294
 | "(" [" \t"]* "'" ncname [" \t"]* ")" ->
295
     let s = L.utf8_lexeme lexbuf in
296 297 298 299 300 301 302
     let idstart = String.index s '\'' + 1 in
     let s = String.sub s idstart (String.length s - idstart) in
     let len = String.length s in
     let idend = min (min (try String.index s ' ' with _ -> len)
			(try String.index s '\t' with _ -> len))
       (try String.index s ')' with _ -> len) in
     let s = String.sub s 0 idend in
303
     return lexbuf (PTYPE s)
304
 | "'" ncname ->
305
     let s = L.utf8_lexeme lexbuf in
306
     let s = String.sub s 1 (String.length s - 1) in
307
     return lexbuf (PTYPE s)
308 309 310 311 312 313 314 315 316 317 318 319 320 321 322
 | "(*" ->
     in_comment := true;
     comment (L.lexeme_start lexbuf) lexbuf;
     in_comment := false;
     token2 lexbuf
 | "/*" ->
     in_comment := true;
     tcomment (L.lexeme_start lexbuf) lexbuf;
     in_comment := false;
     token2 lexbuf
 | eof ->
     return lexbuf EOI
 | _ ->
     illegal lexbuf

323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349
and token2toplevel = lexer
 | xml_blank+ -> token2toplevel lexbuf
 | qname ->
     let s = L.utf8_lexeme lexbuf in
     return lexbuf (IDENT s)
 | ncname ":*" ->
     let s = L.utf8_sub_lexeme lexbuf 0 (L.lexeme_length lexbuf - 2) in
     return lexbuf (ANY_IN_NS s)
 | ".:*" ->
     return lexbuf (ANY_IN_NS "")
 | '-'? ['0'-'9']+ ->
     return lexbuf (INT (L.utf8_lexeme lexbuf))
 | [ "<>=.,:;+-*/@&{}()|?`!$" ]
 | "->" | "::" | ";;" | "--" | "//" | "/@" | ":=" | "\\" | "++"
 | "<=" | ">=" | "<<" | ">>" | "||" | "&&" | "**" | "_"
 | ".."
 | ["?+*"] "?" | "#" ->
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
 | "[" -> in_brackets := !in_brackets + 1;
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
 | "]" -> in_brackets := !in_brackets - 1;
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
 | '"' ->
     let start = L.lexeme_start lexbuf in
     string (L.lexeme_start lexbuf) '"' lexbuf;
     let s = get_stored_string () in
     return_loc start (L.lexeme_end lexbuf) (STRING s)
350
 | "'" character "'--'" character "'"
351
 | "'" ((";"[^ ";'"]) | [^ ";'"])* ";"? "'" not_xml_letter ->
352 353 354 355 356 357 358
     L.rollback lexbuf;
     (fun _ -> lexer
	 | "'" -> let start = L.lexeme_start lexbuf in
		  string (L.lexeme_start lexbuf) '\'' lexbuf;
		  let s = get_stored_string () in
		  return_loc start (L.lexeme_end lexbuf) (STRING2 s)
	 | _ -> assert false) () lexbuf
Julien Lopez's avatar
Julien Lopez committed
359
 | "(" [" \t"]* "'" ncname [" \t"]* ")" ->
360
     let s = L.utf8_lexeme lexbuf in
361 362 363 364 365 366 367
     let idstart = String.index s '\'' + 1 in
     let s = String.sub s idstart (String.length s - idstart) in
     let len = String.length s in
     let idend = min (min (try String.index s ' ' with _ -> len)
			(try String.index s '\t' with _ -> len))
       (try String.index s ')' with _ -> len) in
     let s = String.sub s 0 idend in
368
     return lexbuf (PTYPE s)
369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386
 | "'" ncname ->
     let s = L.utf8_lexeme lexbuf in
     let s = String.sub s 1 (String.length s - 1) in
     return lexbuf (PTYPE s)
 | "(*" ->
     in_comment := true;
     comment (L.lexeme_start lexbuf) lexbuf;
     in_comment := false;
     token2toplevel lexbuf
 | "/*" ->
     in_comment := true;
     tcomment (L.lexeme_start lexbuf) lexbuf;
     in_comment := false;
     token2toplevel lexbuf
 | eof ->
     return lexbuf EOI
 | _ ->
     illegal lexbuf
387

388 389 390 391 392 393 394 395 396 397 398
and comment start = lexer
  | "(*" ->
      comment (L.lexeme_start lexbuf) lexbuf;
      comment start lexbuf
  | "*)" ->
      ()
  | eof ->
      error start (start+2) "Unterminated comment"
  | _ ->
      comment start lexbuf

399 400 401 402 403 404 405 406
and tcomment start = lexer
  | "*/" ->
      ()
  | eof ->
      error start (start+2) "Unterminated comment"
  | _ ->
      tcomment start lexbuf

Julien Lopez's avatar
Julien Lopez committed
407 408 409 410 411
and string start endchar = lexer
  | '"' -> if endchar = '"' then ()
    else (store_lexeme lexbuf; string start endchar lexbuf)
  | "'" -> if endchar = '\'' then ()
    else (store_lexeme lexbuf; string start endchar lexbuf)
412
  | '\\' ['\\' '"' '\''] ->
Julien Lopez's avatar
Julien Lopez committed
413 414 415 416
    store_ascii (L.latin1_lexeme_char lexbuf 1); string start endchar lexbuf
  | "\\n" -> store_ascii '\n'; string start endchar lexbuf
  | "\\t" -> store_ascii '\t'; string start endchar lexbuf
  | "\\r" -> store_ascii '\r'; string start endchar lexbuf
417
  | '\\' ['0'-'9']+ ';' ->
Julien Lopez's avatar
Julien Lopez committed
418
    store_code (parse_char lexbuf 10 1); string start endchar lexbuf
419
  | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F']+ ';' ->
Julien Lopez's avatar
Julien Lopez committed
420
      store_code (parse_char lexbuf 16 2); string start endchar lexbuf
421
  | '\\' -> illegal lexbuf;
422
  | eof -> error start (start+1) "Unterminated string"
Julien Lopez's avatar
Julien Lopez committed
423
  | _ -> store_lexeme lexbuf; string start endchar lexbuf
424

425 426
let token lexbuf = if !in_brackets = 0 then token lexbuf
  else if !toplevel then token2toplevel lexbuf else token2 lexbuf
427

428
let lexbuf = ref None
429 430 431 432 433 434
let last_tok = ref (KEYWORD "DUMMY")

let rec sync lb =
  match !last_tok with
  | KEYWORD ";;" | EOI -> ()
  | _ -> last_tok := fst (token lb); sync lb
435

436 437 438 439 440 441
let raise_clean e =
  clear_buff ();
  in_comment := false;
  (* reinit encoding ? *)
  raise e

442
let mk () _FIXME_loc cs =
443
  let lb = L.from_var_enc_stream enc cs in
444
  (lexer ("#!" [^ '\n']* "\n")? -> ()) lb;
445
  lexbuf := Some lb;
446
  let next _ =
447
    let tok, loc =
448 449
      try token lb
      with
450
	| Ulexing.Error ->
451
	    raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb,
452 453
			  "Unexpected character"))
	| Ulexing.InvalidCodepoint i ->
454 455 456 457
	    raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb,
			  "Code point invalid for the current encoding"))
	| e -> raise_clean e
    in
458 459
    last_tok := tok;
    Some (tok, loc)
460
  in
461
  Stream.from next
462 463 464 465 466 467

let dump_file f =
  let ic = open_in f in
  let lexbuf = L.from_var_enc_channel enc ic in
  (try
     while true do
468 469 470
       let (tok,_) = token lexbuf in
       Format.printf "%a@." Token.print tok;
       if tok = EOI then exit 0
471
     done
472 473 474
   with
     | Ulexing.Error ->
	 Printf.eprintf "Lexing error at offset %i\n:Unexpected character\n"
475 476
	   (Ulexing.lexeme_end lexbuf)
     | Error (i,j,s) ->
477
	 Printf.eprintf "Lexing error at offset %i-%i:\n%s\n"
478 479
	   i j s
     | Ulexing.InvalidCodepoint i ->
480
	 Printf.eprintf "Lexing error at offset %i\n:Invalid code point for the current encoding\n"
481 482 483
	   (Ulexing.lexeme_end lexbuf)
  );
  close_in ic