ulexer.ml 13.4 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39
open Camlp4.PreCast

module Loc = struct
  type t = int * int

  let mk _ = (0,0)
  let ghost = (-1,-1)

  let of_lexing_position _ = assert false
  let to_ocaml_location _ = assert false
  let of_ocaml_location _ = assert false
  let of_lexbuf _ = assert false
  let of_tuple _ = assert false
  let to_tuple _ = assert false

  let merge (x1, x2) (y1, y2) = (min x1 y1, max x2 y2)
  let join (x1, _) = (x1, x1)
  let move _ _ _ = assert false
  let shift _ _ = assert false
  let move_line _ _ = assert false
  let file_name  _ = assert false
  let start_line _ = assert false
  let stop_line  _ = assert false
  let start_bol  _ = assert false
  let stop_bol   _ = assert false
  let start_off  = fst
  let stop_off   = snd
  let start_pos  _ = assert false
  let stop_pos   _ = assert false
  let is_ghost   _ = assert false
  let ghostify   _ = assert false
  let set_file_name _ = assert false
  let strictly_before _ = assert false
  let make_absolute _ = assert false
  let print _ = assert false
  let dump  _ = assert false
  let to_string _ = assert false
  exception Exc_located of t * exn
  let raise loc exn =
40
    match exn with
41 42 43 44 45 46 47 48 49 50
    | Exc_located _ -> raise exn
    | _ -> raise (Exc_located (loc, exn))
  let name = ref "_loc"
end

type token =
  | KEYWORD of string
  | IDENT of string
  | ANY_IN_NS of string
  | INT of string
51
  | STRING1 of string
52
  | STRING2 of string
Pietro Abate's avatar
Pietro Abate committed
53
  | PVAR of string
54 55 56 57 58 59 60 61 62 63 64 65
  | EOI

module Token = struct
  open Format
  module Loc = Loc
  type t = token
  type token = t

  let sf = Printf.sprintf

  let to_string =
    function
66 67 68
    | KEYWORD s -> sf "KEYWORD %S" s
    | IDENT s -> sf "IDENT %S" s
    | INT s -> sf "INT %s" s
69 70
    | STRING1 s    -> sf "STRING \"%s\"" s
    | STRING2 s    -> sf "STRING \"%s\"" s
71
                      (* here it's not %S since the string is already escaped *)
72
    | ANY_IN_NS s -> sf "ANY_IN_NS %S" s
73
    | PVAR s -> sf "PVAR %S" s
74
    | EOI -> sf "EOI"
75 76 77 78 79 80 81 82 83 84

  let print ppf x = pp_print_string ppf (to_string x)

  let match_keyword kwd =
    function
    | KEYWORD kwd' when kwd = kwd' -> true
    | _ -> false

  let extract_string =
    function
85 86
      | KEYWORD s | IDENT s | INT s | STRING1 s | STRING2 s |
	  ANY_IN_NS s | PVAR s -> s
87
      | tok ->
88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103
        invalid_arg ("Cannot extract a string from this token: "^
                     to_string tok)

  module Error = struct
    type t = string
    exception E of string
    let print = pp_print_string
    let to_string x = x
  end

  module Filter = struct
    type token_filter = (t, Loc.t) Camlp4.Sig.stream_filter

    type t =
      { is_kwd : string -> bool;
        mutable filter : token_filter }
104

105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
    let keyword_conversion tok is_kwd =
      match tok with
      | IDENT s when is_kwd s -> KEYWORD s
      | _ -> tok

    let mk is_kwd =
      { is_kwd = is_kwd;
        filter = (fun s -> s) }

    let filter x =
      let f tok loc =
        let tok' = keyword_conversion tok x.is_kwd in
        (tok', loc)
      in
      let rec filter =
        parser
        | [< '(tok, loc); s >] -> [< ' f tok loc; filter s >]
        | [< >] -> [< >]
      in
      fun strm -> x.filter (filter strm)

    let define_filter x f = x.filter <- f x.filter

    let keyword_added _ _ _ = ()
    let keyword_removed _ _ = ()
  end

end
module Error = Camlp4.Struct.EmptyError

module L = Ulexing
136 137 138 139 140 141

exception Error of int * int * string

let error i j s = raise (Error (i,j,s))

(* Buffer for string literals *)
142

143 144
let string_buff = Buffer.create 1024

145
let store_lexeme lexbuf =
146 147 148 149 150 151 152 153 154 155 156 157 158
  Buffer.add_string string_buff (Ulexing.utf8_lexeme lexbuf)
let store_ascii = Buffer.add_char string_buff
let store_code  = Utf8.store string_buff
let clear_buff () = Buffer.clear string_buff
let get_stored_string () =
  let s = Buffer.contents string_buff in
  clear_buff ();
  Buffer.clear string_buff;
  s

let enc = ref L.Latin1

(* Parse characters literals \123; \x123; *)
159

160 161 162 163 164
let hexa_digit = function
  | '0'..'9' as c -> (Char.code c) - (Char.code '0')
  | 'a'..'f' as c -> (Char.code c) - (Char.code 'a') + 10
  | 'A'..'F' as c -> (Char.code c) - (Char.code 'A') + 10
  | _ -> -1
165

166
let parse_char lexbuf base i =
167
  let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in
168 169 170
  let r = ref 0 in
  for i = 0 to String.length s - 1 do
    let c = hexa_digit s.[i] in
171
    if (c >= base) || (c < 0) then
172 173 174 175 176
      error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "invalid digit";
    r := !r * base + c;
  done;
  !r

177 178 179 180 181 182 183 184 185
(* this should match the string lexer *)
let regexp utf8_char =  [^ '\\' '"' '\'' 9 10 13 ]

let regexp dec_char = '\\' ['0'-'9']+ ';'

let regexp hex_char = "\\x" ['0'-'9''a'-'f''A'-'F']+ ';'
let regexp esc_char = '\\' ['\\' '"' '\'' 'n' 't' 'r']
let regexp single_char = utf8_char | dec_char | hex_char | esc_char | '"'

186
let regexp ncname_char =
187
  xml_letter | xml_digit | [ '-' '_' ] | xml_combining_char | xml_extender | "\\."
188
let regexp ncname = ( xml_letter ncname_char* ) | ('_' ncname_char+)
189 190
let regexp qname = (ncname ':')? ncname

191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261

(* We want to write _ \ (xml_letter | '_') but we can't due to a limitation in ulex.
   we define explicitely this regexp, based on the definition in ulex-1.1
*)
let regexp not_ncname_letter = [^ (* base_char *)
    0x0041-0x005A 0x0061-0x007A 0x00C0-0x00D6 0x00D8-0x00F6
    0x00F8-0x00FF 0x0100-0x0131 0x0134-0x013E 0x0141-0x0148
    0x014A-0x017E 0x0180-0x01C3 0x01CD-0x01F0 0x01F4-0x01F5
    0x01FA-0x0217 0x0250-0x02A8 0x02BB-0x02C1 0x0386-0x0386
    0x0388-0x038A 0x038C-0x038C 0x038E-0x03A1 0x03A3-0x03CE
    0x03D0-0x03D6 0x03DA-0x03DA 0x03DC-0x03DC 0x03DE-0x03DE
    0x03E0-0x03E0 0x03E2-0x03F3
    0x0401-0x040C 0x040E-0x044F 0x0451-0x045C 0x045E-0x0481
    0x0490-0x04C4 0x04C7-0x04C8 0x04CB-0x04CC 0x04D0-0x04EB
    0x04EE-0x04F5 0x04F8-0x04F9 0x0531-0x0556 0x0559-0x0559
    0x0561-0x0586 0x05D0-0x05EA 0x05F0-0x05F2 0x0621-0x063A
    0x0641-0x064A 0x0671-0x06B7 0x06BA-0x06BE 0x06C0-0x06CE
    0x06D0-0x06D3 0x06D5-0x06D5 0x06E5-0x06E6 0x0905-0x0939
    0x093D-0x093D
    0x0958-0x0961 0x0985-0x098C 0x098F-0x0990 0x0993-0x09A8
    0x09AA-0x09B0 0x09B2-0x09B2 0x09B6-0x09B9 0x09DC-0x09DD
    0x09DF-0x09E1 0x09F0-0x09F1 0x0A05-0x0A0A 0x0A0F-0x0A10
    0x0A13-0x0A28 0x0A2A-0x0A30 0x0A32-0x0A33 0x0A35-0x0A36
    0x0A38-0x0A39 0x0A59-0x0A5C 0x0A5E-0x0A5E 0x0A72-0x0A74
    0x0A85-0x0A8B 0x0A8D-0x0A8D 0x0A8F-0x0A91 0x0A93-0x0AA8
    0x0AAA-0x0AB0 0x0AB2-0x0AB3 0x0AB5-0x0AB9 0x0ABD-0x0ABD
    0x0AE0-0x0AE0
    0x0B05-0x0B0C 0x0B0F-0x0B10 0x0B13-0x0B28 0x0B2A-0x0B30
    0x0B32-0x0B33 0x0B36-0x0B39 0x0B3D-0x0B3D 0x0B5C-0x0B5D
    0x0B5F-0x0B61 0x0B85-0x0B8A 0x0B8E-0x0B90 0x0B92-0x0B95
    0x0B99-0x0B9A 0x0B9C-0x0B9C 0x0B9E-0x0B9F 0x0BA3-0x0BA4
    0x0BA8-0x0BAA 0x0BAE-0x0BB5 0x0BB7-0x0BB9 0x0C05-0x0C0C
    0x0C0E-0x0C10 0x0C12-0x0C28 0x0C2A-0x0C33 0x0C35-0x0C39
    0x0C60-0x0C61 0x0C85-0x0C8C 0x0C8E-0x0C90 0x0C92-0x0CA8
    0x0CAA-0x0CB3 0x0CB5-0x0CB9 0x0CDE-0x0CDE 0x0CE0-0x0CE1
    0x0D05-0x0D0C 0x0D0E-0x0D10 0x0D12-0x0D28 0x0D2A-0x0D39
    0x0D60-0x0D61 0x0E01-0x0E2E 0x0E30-0x0E30 0x0E32-0x0E33
    0x0E40-0x0E45 0x0E81-0x0E82 0x0E84-0x0E84 0x0E87-0x0E88
    0x0E8A-0x0E8A
    0x0E8D-0x0E8D 0x0E94-0x0E97 0x0E99-0x0E9F 0x0EA1-0x0EA3
    0x0EA5-0x0EA5
    0x0EA7-0x0EA7 0x0EAA-0x0EAB 0x0EAD-0x0EAE 0x0EB0-0x0EB0
    0x0EB2-0x0EB3
    0x0EBD-0x0EBD 0x0EC0-0x0EC4 0x0F40-0x0F47 0x0F49-0x0F69
    0x10A0-0x10C5 0x10D0-0x10F6 0x1100-0x1100 0x1102-0x1103
    0x1105-0x1107 0x1109-0x1109 0x110B-0x110C 0x110E-0x1112
    0x113C-0x113C
    0x113E-0x113E 0x1140-0x1140 0x114C-0x114C 0x114E-0x114E
    0x1150-0x1150 0x1154-0x1155 0x1159-0x1159
    0x115F-0x1161 0x1163-0x1163 0x1165-0x1165 0x1167-0x1167
    0x1169-0x1169 0x116D-0x116E
    0x1172-0x1173 0x1175-0x1175 0x119E-0x119E 0x11A8-0x11A8
    0x11AB-0x11AB 0x11AE-0x11AF
    0x11B7-0x11B8 0x11BA-0x11BA 0x11BC-0x11C2 0x11EB-0x11EB
    0x11F0-0x11F0 0x11F9-0x11F9
    0x1E00-0x1E9B 0x1EA0-0x1EF9 0x1F00-0x1F15 0x1F18-0x1F1D
    0x1F20-0x1F45 0x1F48-0x1F4D 0x1F50-0x1F57 0x1F59-0x1F59
    0x1F5B-0x1F5B
    0x1F5D-0x1F5D 0x1F5F-0x1F7D 0x1F80-0x1FB4 0x1FB6-0x1FBC
    0x1FBE-0x1FBE
    0x1FC2-0x1FC4 0x1FC6-0x1FCC 0x1FD0-0x1FD3 0x1FD6-0x1FDB
    0x1FE0-0x1FEC 0x1FF2-0x1FF4 0x1FF6-0x1FFC 0x2126-0x2126
    0x212A-0x212B 0x212E-0x212E 0x2180-0x2182 0x3041-0x3094
    0x30A1-0x30FA 0x3105-0x312C 0xAC00-0xD7A3

  (* ideographic *)
   0x3007-0x3007 0x3021-0x3029 0x4E00-0x9FA5
  (* '_' *)
   '_'
                    ]

262 263 264 265

let illegal lexbuf =
  error
    (L.lexeme_start lexbuf)
266
    (L.lexeme_end lexbuf)
267
    ("Illegal character : '" ^ (L.utf8_lexeme lexbuf) ^ "'")
268 269 270

let in_comment = ref false

271 272 273
let return lexbuf tok = (tok, L.loc lexbuf)
let return_loc i j tok = (tok, (i,j))

274
let rec token = lexer
275 276 277
 | xml_blank+ -> token lexbuf
 | qname ->
     let s = L.utf8_lexeme lexbuf in
278
     return lexbuf (IDENT s)
279 280
 | ncname ":*" ->
     let s = L.utf8_sub_lexeme lexbuf 0 (L.lexeme_length lexbuf - 2) in
281
     return lexbuf (ANY_IN_NS s)
282
 | ".:*" ->
283
     return lexbuf (ANY_IN_NS "")
284
 | '-'? ['0'-'9']+ ->
285
     return lexbuf (INT (L.utf8_lexeme lexbuf))
286
 | [ "<>=.,:;+-*/@&{}[]()|?`!" ]
Pietro Abate's avatar
Pietro Abate committed
287
 | "->" | "::" | ";;" | "--" | "//" | "/@" | ":=" | "\\" | "++"
288
 | "<=" | ">=" | "<<" | ">>" | "||" | "&&" | "**" | "_"
289
 | ".."
290
 | ["?+*"] "?" | "#" ->
291
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
292 293 294 295 296 297 298

 (* the three followning rules work together and are replicated in the comment lexer *)

 | "'" ([^ '\t' '\n' '\r' '\''] | '\\' '\'')* "'" (not_ncname_letter | eof) ->
   (* two single quotes not followed by an xml_letter must be a string
      we put it back call an auxiliary lexer to consume the first ' and read it as a string.
   *)
Julien Lopez's avatar
Julien Lopez committed
299
     L.rollback lexbuf;
300 301 302
   do_string lexbuf


Julien Lopez's avatar
Julien Lopez committed
303
 | "'" ncname ->
304 305 306
   (* then try to read it as variable *)
   let s = L.utf8_sub_lexeme lexbuf 1 (L.lexeme_length lexbuf - 1) in
   return lexbuf (PVAR (s))
307

308 309 310
 | ('"' | "'")  ->
   (* otherwise we will fail for sure, but try to read it character by character as a string
      to get a decent error message *)
Julien Lopez's avatar
Julien Lopez committed
311
     L.rollback lexbuf;
312 313
   do_string lexbuf

314 315 316 317
 | "(*" ->
     in_comment := true;
     comment (L.lexeme_start lexbuf) lexbuf;
     in_comment := false;
318
     token lexbuf
319 320 321 322
 | "/*" ->
     in_comment := true;
     tcomment (L.lexeme_start lexbuf) lexbuf;
     in_comment := false;
323
     token lexbuf
324 325
 | eof ->
     return lexbuf EOI
326
 | _ ->  illegal lexbuf
327

328 329
and do_string = lexer
    | "'" | '"' ->
330
     let start = L.lexeme_start lexbuf in
331 332
      let double = (L.latin1_lexeme lexbuf).[0] == '"' in
      string start double lexbuf;
333
     let s = get_stored_string () in
334 335 336
      return_loc start (L.lexeme_end lexbuf)
        (if double then STRING2 s else STRING1 s)
    | _ -> assert false
337

338 339 340 341 342 343
and comment start = lexer
  | "(*" ->
      comment (L.lexeme_start lexbuf) lexbuf;
      comment start lexbuf
  | "*)" ->
      ()
344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361


  | "'" ([^ '\t' '\n' '\r' '\''] | '\\' '\'')* "'" (not_ncname_letter | eof) ->
    L.rollback lexbuf;
    ignore (do_string lexbuf);
    comment start lexbuf


 | "'" ncname -> comment start lexbuf


 | ('"' | "'")  ->
   (* otherwise we will fail for sure, but try to read it character by character as a string
      to get a decent error message *)
   L.rollback lexbuf;
   ignore (do_string lexbuf);
   comment start lexbuf

362 363 364 365 366
  | eof ->
      error start (start+2) "Unterminated comment"
  | _ ->
      comment start lexbuf

367 368 369 370 371 372 373 374
and tcomment start = lexer
  | "*/" ->
      ()
  | eof ->
      error start (start+2) "Unterminated comment"
  | _ ->
      tcomment start lexbuf

375 376 377 378
and string start double = lexer
  | '"' | "'" ->
      let d = L.latin1_lexeme_char lexbuf 0 = '"' in
      if d != double then (store_lexeme lexbuf; string start double lexbuf)
379
  | '\\' ['\\' '"' '\''] ->
380 381 382 383 384 385 386 387
      store_ascii (L.latin1_lexeme_char lexbuf 1);
      string start double lexbuf
  | "\\n" ->
      store_ascii '\n';	string start double lexbuf
  | "\\t" ->
      store_ascii '\t';	string start double lexbuf
  | "\\r" ->
      store_ascii '\r';	string start double lexbuf
388
  | '\\' ['0'-'9']+ ';' ->
389 390
      store_code (parse_char lexbuf 10 1);
      string start double lexbuf
391
  | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F']+ ';' ->
392 393 394 395 396 397 398 399 400
      store_code (parse_char lexbuf 16 2);
      string start double lexbuf
  | '\\' ->  illegal lexbuf
  | eof ->
      error start (start+1) "Unterminated string"
  | _ ->
      store_lexeme lexbuf;
      string start double lexbuf

401

402

403
let lexbuf = ref None
404 405
let last_tok = ref (KEYWORD "DUMMY")

406 407


408 409 410 411
let rec sync lb =
  match !last_tok with
  | KEYWORD ";;" | EOI -> ()
  | _ -> last_tok := fst (token lb); sync lb
412

413 414 415 416 417 418
let raise_clean e =
  clear_buff ();
  in_comment := false;
  (* reinit encoding ? *)
  raise e

419

420
let mk () _FIXME_loc cs =
421
  let lb = L.from_var_enc_stream enc cs in
422
  (lexer ("#!" [^ '\n']* "\n")? -> ()) lb;
423
  lexbuf := Some lb;
424
  let next _ =
425
    let tok, loc =
426 427
      try token lb
      with
428
	| Ulexing.Error ->
429
	    raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb,
430 431
			  "Unexpected character"))
	| Ulexing.InvalidCodepoint i ->
432 433 434 435
	    raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb,
			  "Code point invalid for the current encoding"))
	| e -> raise_clean e
    in
436 437
    last_tok := tok;
    Some (tok, loc)
438
  in
439
  Stream.from next
440 441 442 443 444 445

let dump_file f =
  let ic = open_in f in
  let lexbuf = L.from_var_enc_channel enc ic in
  (try
     while true do
446 447 448
       let (tok,_) = token lexbuf in
       Format.printf "%a@." Token.print tok;
       if tok = EOI then exit 0
449
     done
450 451 452
   with
     | Ulexing.Error ->
	 Printf.eprintf "Lexing error at offset %i\n:Unexpected character\n"
453 454
	   (Ulexing.lexeme_end lexbuf)
     | Error (i,j,s) ->
455
	 Printf.eprintf "Lexing error at offset %i-%i:\n%s\n"
456 457
	   i j s
     | Ulexing.InvalidCodepoint i ->
458
	 Printf.eprintf "Lexing error at offset %i\n:Invalid code point for the current encoding\n"
459 460 461
	   (Ulexing.lexeme_end lexbuf)
  );
  close_in ic