ulexer.ml 14.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
open Camlp4.PreCast

module Loc = struct
  type t = int * int

  let mk _ = (0,0)
  let ghost = (-1,-1)

  let of_lexing_position _ = assert false
  let to_ocaml_location _ = assert false
  let of_ocaml_location _ = assert false
  let of_lexbuf _ = assert false
  let of_tuple _ = assert false
  let to_tuple _ = assert false

  let merge (x1, x2) (y1, y2) = (min x1 y1, max x2 y2)
  let join (x1, _) = (x1, x1)
  let move _ _ _ = assert false
  let shift _ _ = assert false
  let move_line _ _ = assert false
  let file_name  _ = assert false
  let start_line _ = assert false
  let stop_line  _ = assert false
  let start_bol  _ = assert false
  let stop_bol   _ = assert false
  let start_off  = fst
  let stop_off   = snd
  let start_pos  _ = assert false
  let stop_pos   _ = assert false
  let is_ghost   _ = assert false
  let ghostify   _ = assert false
  let set_file_name _ = assert false
  let strictly_before _ = assert false
  let make_absolute _ = assert false
  let print _ = assert false
  let dump  _ = assert false
  let to_string _ = assert false
  exception Exc_located of t * exn
  let raise loc exn =
40
    match exn with
41
42
43
44
45
46
47
48
49
50
    | Exc_located _ -> raise exn
    | _ -> raise (Exc_located (loc, exn))
  let name = ref "_loc"
end

type token =
  | KEYWORD of string
  | IDENT of string
  | ANY_IN_NS of string
  | INT of string
51
  | CHAR of string
52
  | STRING of string
53
  | STRING2 of string
Pietro Abate's avatar
Pietro Abate committed
54
  | PVAR of string
55
  | IDENTPAR of string
56
57
58
59
60
61
62
63
64
65
66
67
  | EOI

module Token = struct
  open Format
  module Loc = Loc
  type t = token
  type token = t

  let sf = Printf.sprintf

  let to_string =
    function
68
69
70
    | KEYWORD s -> sf "KEYWORD %S" s
    | IDENT s -> sf "IDENT %S" s
    | INT s -> sf "INT %s" s
71
    | CHAR s -> sf "CHAR \'%s\'" s
72
    | STRING s -> sf "STRING \"%s\"" s
73
    | STRING2 s -> sf "STRING \'%s\'" s
74
                      (* here it's not %S since the string is already escaped *)
Pietro Abate's avatar
Pietro Abate committed
75
    | PVAR s -> sf "PVAR \'%S\'" s
76
    | IDENTPAR s -> sf "IDENTPAR \'%S\'" s
77
78
    | ANY_IN_NS s -> sf "ANY_IN_NS %S" s
    | EOI -> sf "EOI"
79
80
81
82
83
84
85
86
87
88

  let print ppf x = pp_print_string ppf (to_string x)

  let match_keyword kwd =
    function
    | KEYWORD kwd' when kwd = kwd' -> true
    | _ -> false

  let extract_string =
    function
89
      | IDENTPAR s | KEYWORD s | IDENT s | INT s | CHAR s | STRING s | STRING2 s | PVAR s |
90
	  ANY_IN_NS s -> s
91
      | tok ->
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
        invalid_arg ("Cannot extract a string from this token: "^
                     to_string tok)

  module Error = struct
    type t = string
    exception E of string
    let print = pp_print_string
    let to_string x = x
  end

  module Filter = struct
    type token_filter = (t, Loc.t) Camlp4.Sig.stream_filter

    type t =
      { is_kwd : string -> bool;
        mutable filter : token_filter }
108

109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
    let keyword_conversion tok is_kwd =
      match tok with
      | IDENT s when is_kwd s -> KEYWORD s
      | _ -> tok

    let mk is_kwd =
      { is_kwd = is_kwd;
        filter = (fun s -> s) }

    let filter x =
      let f tok loc =
        let tok' = keyword_conversion tok x.is_kwd in
        (tok', loc)
      in
      let rec filter =
        parser
        | [< '(tok, loc); s >] -> [< ' f tok loc; filter s >]
        | [< >] -> [< >]
      in
      fun strm -> x.filter (filter strm)

    let define_filter x f = x.filter <- f x.filter

    let keyword_added _ _ _ = ()
    let keyword_removed _ _ = ()
  end

end
module Error = Camlp4.Struct.EmptyError

module L = Ulexing
140
141
142

exception Error of int * int * string

143
144
let toplevel = ref false

145
146
147
let error i j s = raise (Error (i,j,s))

(* Buffer for string literals *)
148

149
150
let string_buff = Buffer.create 1024

151
let store_lexeme lexbuf =
152
153
154
155
156
157
158
159
160
161
162
163
164
  Buffer.add_string string_buff (Ulexing.utf8_lexeme lexbuf)
let store_ascii = Buffer.add_char string_buff
let store_code  = Utf8.store string_buff
let clear_buff () = Buffer.clear string_buff
let get_stored_string () =
  let s = Buffer.contents string_buff in
  clear_buff ();
  Buffer.clear string_buff;
  s

let enc = ref L.Latin1

(* Parse characters literals \123; \x123; *)
165

166
167
168
169
170
let hexa_digit = function
  | '0'..'9' as c -> (Char.code c) - (Char.code '0')
  | 'a'..'f' as c -> (Char.code c) - (Char.code 'a') + 10
  | 'A'..'F' as c -> (Char.code c) - (Char.code 'A') + 10
  | _ -> -1
171

172
let parse_char lexbuf base i =
173
  let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in
174
175
176
  let r = ref 0 in
  for i = 0 to String.length s - 1 do
    let c = hexa_digit s.[i] in
177
    if (c >= base) || (c < 0) then
178
179
180
181
182
      error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "invalid digit";
    r := !r * base + c;
  done;
  !r

183
let regexp ncname_char =
184
  xml_letter | xml_digit | [ '-' '_' ] | xml_combining_char | xml_extender | "\\."
185
let regexp ncname = ( xml_letter ncname_char* ) | ('_' ncname_char+)
186
187
let regexp qname = (ncname ':')? ncname

188
189
(* Should be [^ xml_letter ] *)
let regexp not_xml_letter = [^ 'A'-'Z' 'a'-'z' '0'-'9' '_' ]
190
191
let regexp character = _ | '\\' ['\\' '"' '\''] | "\\n" | "\\t" | "\\r"
  | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F']+ ';' | '\\' ['0'-'9']+ ';'
192
193
194
195

let illegal lexbuf =
  error
    (L.lexeme_start lexbuf)
196
    (L.lexeme_end lexbuf)
197
198
199
    "Illegal character"

let in_comment = ref false
200
let in_brackets = ref 0
201

202
203
204
let return lexbuf tok = (tok, L.loc lexbuf)
let return_loc i j tok = (tok, (i,j))

205
let rec token = lexer
206
 | xml_blank+ -> token lexbuf
Pietro Abate's avatar
Pietro Abate committed
207
208
 | qname "(" ->
     let s = L.utf8_sub_lexeme lexbuf 0 (L.lexeme_length lexbuf - 1) in
209
     return lexbuf (IDENTPAR s)
210
211
 | qname ->
     let s = L.utf8_lexeme lexbuf in
212
     return lexbuf (IDENT s)
213
214
 | ncname ":*" ->
     let s = L.utf8_sub_lexeme lexbuf 0 (L.lexeme_length lexbuf - 2) in
215
     return lexbuf (ANY_IN_NS s)
216
 | ".:*" ->
217
     return lexbuf (ANY_IN_NS "")
218
 | '-'? ['0'-'9']+ ->
219
     return lexbuf (INT (L.utf8_lexeme lexbuf))
220
 | [ "<>=.,:;+-*/@&{}()|?`!$" ]
Pietro Abate's avatar
Pietro Abate committed
221
 | "->" | "::" | ";;" | "--" | "//" | "/@" | ":=" | "\\" | "++"
222
 | "<=" | ">=" | "<<" | ">>" | "||" | "&&" | "**" | "_"
223
 | ".."
224
 | ["?+*"] "?" | "#" ->
225
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
226
227
228
229
 | "[" -> in_brackets := !in_brackets + 1;
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
 | "]" -> in_brackets := !in_brackets - 1;
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
230
 | '"' ->
231
     let start = L.lexeme_start lexbuf in
Julien Lopez's avatar
Julien Lopez committed
232
     string (L.lexeme_start lexbuf) '"' lexbuf;
233
     let s = get_stored_string () in
234
     return_loc start (L.lexeme_end lexbuf) (STRING s)
235
 | "'" character "'" ->
Julien Lopez's avatar
Julien Lopez committed
236
237
238
239
240
241
242
243
     L.rollback lexbuf;
     (fun _ -> lexer
	 | "'" -> let start = L.lexeme_start lexbuf in
		  string (L.lexeme_start lexbuf) '\'' lexbuf;
		  let s = get_stored_string () in
		  return_loc start (L.lexeme_end lexbuf) (CHAR s)
	 | _ -> assert false) () lexbuf
 | "'" ncname ->
244
245
     let s = L.utf8_lexeme lexbuf in
     let s = String.sub s 1 (String.length s - 1) in
Pietro Abate's avatar
Pietro Abate committed
246
     return lexbuf (PVAR s)
247
248
249
250
251
 | "(*" ->
     in_comment := true;
     comment (L.lexeme_start lexbuf) lexbuf;
     in_comment := false;
     token lexbuf
252
253
254
255
256
 | "/*" ->
     in_comment := true;
     tcomment (L.lexeme_start lexbuf) lexbuf;
     in_comment := false;
     token lexbuf
257
 | eof ->
258
     return lexbuf EOI
259
 | _ ->
260
261
     illegal lexbuf

262
263
and token2 = lexer
 | xml_blank+ -> token2 lexbuf
Pietro Abate's avatar
Pietro Abate committed
264
265
 | qname "(" ->
     let s = L.utf8_sub_lexeme lexbuf 0 (L.lexeme_length lexbuf - 1) in
266
     return lexbuf (IDENTPAR s)
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
 | qname ->
     let s = L.utf8_lexeme lexbuf in
     return lexbuf (IDENT s)
 | ncname ":*" ->
     let s = L.utf8_sub_lexeme lexbuf 0 (L.lexeme_length lexbuf - 2) in
     return lexbuf (ANY_IN_NS s)
 | ".:*" ->
     return lexbuf (ANY_IN_NS "")
 | '-'? ['0'-'9']+ ->
     return lexbuf (INT (L.utf8_lexeme lexbuf))
 | [ "<>=.,:;+-*/@&{}()|?`!$" ]
 | "->" | "::" | ";;" | "--" | "//" | "/@" | ":=" | "\\" | "++"
 | "<=" | ">=" | "<<" | ">>" | "||" | "&&" | "**" | "_"
 | ".."
 | ["?+*"] "?" | "#" ->
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
 | "[" -> in_brackets := !in_brackets + 1;
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
 | "]" -> in_brackets := !in_brackets - 1;
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
 | '"' ->
     let start = L.lexeme_start lexbuf in
     string (L.lexeme_start lexbuf) '"' lexbuf;
     let s = get_stored_string () in
     return_loc start (L.lexeme_end lexbuf) (STRING s)
292
 | "'" character "'--'" character "'"
293
 | "'" [^ '\'']+ "'" not_xml_letter ->
Julien Lopez's avatar
Julien Lopez committed
294
295
296
297
298
299
300
     L.rollback lexbuf;
     (fun _ -> lexer
	 | "'" -> let start = L.lexeme_start lexbuf in
		  string (L.lexeme_start lexbuf) '\'' lexbuf;
		  let s = get_stored_string () in
		  return_loc start (L.lexeme_end lexbuf) (STRING2 s)
	 | _ -> assert false) () lexbuf
Julien Lopez's avatar
Julien Lopez committed
301
 | "(" [" \t"]* "'" ncname [" \t"]* ")" ->
302
     let s = L.utf8_lexeme lexbuf in
303
304
305
306
307
308
309
     let idstart = String.index s '\'' + 1 in
     let s = String.sub s idstart (String.length s - idstart) in
     let len = String.length s in
     let idend = min (min (try String.index s ' ' with _ -> len)
			(try String.index s '\t' with _ -> len))
       (try String.index s ')' with _ -> len) in
     let s = String.sub s 0 idend in
Pietro Abate's avatar
Pietro Abate committed
310
     return lexbuf (PVAR s)
311
 | "'" ncname ->
312
     let s = L.utf8_lexeme lexbuf in
313
     let s = String.sub s 1 (String.length s - 1) in
Pietro Abate's avatar
Pietro Abate committed
314
     return lexbuf (PVAR s)
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
 | "(*" ->
     in_comment := true;
     comment (L.lexeme_start lexbuf) lexbuf;
     in_comment := false;
     token2 lexbuf
 | "/*" ->
     in_comment := true;
     tcomment (L.lexeme_start lexbuf) lexbuf;
     in_comment := false;
     token2 lexbuf
 | eof ->
     return lexbuf EOI
 | _ ->
     illegal lexbuf

330
331
and token2toplevel = lexer
 | xml_blank+ -> token2toplevel lexbuf
Pietro Abate's avatar
Pietro Abate committed
332
333
 | qname "(" ->
     let s = L.utf8_sub_lexeme lexbuf 0 (L.lexeme_length lexbuf - 1) in
334
     return lexbuf (IDENTPAR s)
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
 | qname ->
     let s = L.utf8_lexeme lexbuf in
     return lexbuf (IDENT s)
 | ncname ":*" ->
     let s = L.utf8_sub_lexeme lexbuf 0 (L.lexeme_length lexbuf - 2) in
     return lexbuf (ANY_IN_NS s)
 | ".:*" ->
     return lexbuf (ANY_IN_NS "")
 | '-'? ['0'-'9']+ ->
     return lexbuf (INT (L.utf8_lexeme lexbuf))
 | [ "<>=.,:;+-*/@&{}()|?`!$" ]
 | "->" | "::" | ";;" | "--" | "//" | "/@" | ":=" | "\\" | "++"
 | "<=" | ">=" | "<<" | ">>" | "||" | "&&" | "**" | "_"
 | ".."
 | ["?+*"] "?" | "#" ->
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
 | "[" -> in_brackets := !in_brackets + 1;
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
 | "]" -> in_brackets := !in_brackets - 1;
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
 | '"' ->
     let start = L.lexeme_start lexbuf in
     string (L.lexeme_start lexbuf) '"' lexbuf;
     let s = get_stored_string () in
     return_loc start (L.lexeme_end lexbuf) (STRING s)
360
 | "'" character "'--'" character "'"
361
 | "'" ((";"[^ ";'"]) | [^ ";'"])* ";"? "'" not_xml_letter ->
362
363
364
365
366
367
368
     L.rollback lexbuf;
     (fun _ -> lexer
	 | "'" -> let start = L.lexeme_start lexbuf in
		  string (L.lexeme_start lexbuf) '\'' lexbuf;
		  let s = get_stored_string () in
		  return_loc start (L.lexeme_end lexbuf) (STRING2 s)
	 | _ -> assert false) () lexbuf
Julien Lopez's avatar
Julien Lopez committed
369
 | "(" [" \t"]* "'" ncname [" \t"]* ")" ->
370
     let s = L.utf8_lexeme lexbuf in
371
372
373
374
375
376
377
     let idstart = String.index s '\'' + 1 in
     let s = String.sub s idstart (String.length s - idstart) in
     let len = String.length s in
     let idend = min (min (try String.index s ' ' with _ -> len)
			(try String.index s '\t' with _ -> len))
       (try String.index s ')' with _ -> len) in
     let s = String.sub s 0 idend in
Pietro Abate's avatar
Pietro Abate committed
378
     return lexbuf (PVAR s)
379
380
381
 | "'" ncname ->
     let s = L.utf8_lexeme lexbuf in
     let s = String.sub s 1 (String.length s - 1) in
Pietro Abate's avatar
Pietro Abate committed
382
     return lexbuf (PVAR s)
383
384
385
386
387
388
389
390
391
392
393
394
395
396
 | "(*" ->
     in_comment := true;
     comment (L.lexeme_start lexbuf) lexbuf;
     in_comment := false;
     token2toplevel lexbuf
 | "/*" ->
     in_comment := true;
     tcomment (L.lexeme_start lexbuf) lexbuf;
     in_comment := false;
     token2toplevel lexbuf
 | eof ->
     return lexbuf EOI
 | _ ->
     illegal lexbuf
397

398
399
400
401
402
403
404
405
406
407
408
and comment start = lexer
  | "(*" ->
      comment (L.lexeme_start lexbuf) lexbuf;
      comment start lexbuf
  | "*)" ->
      ()
  | eof ->
      error start (start+2) "Unterminated comment"
  | _ ->
      comment start lexbuf

409
410
411
412
413
414
415
416
and tcomment start = lexer
  | "*/" ->
      ()
  | eof ->
      error start (start+2) "Unterminated comment"
  | _ ->
      tcomment start lexbuf

Julien Lopez's avatar
Julien Lopez committed
417
418
419
420
421
and string start endchar = lexer
  | '"' -> if endchar = '"' then ()
    else (store_lexeme lexbuf; string start endchar lexbuf)
  | "'" -> if endchar = '\'' then ()
    else (store_lexeme lexbuf; string start endchar lexbuf)
422
  | '\\' ['\\' '"' '\''] ->
Julien Lopez's avatar
Julien Lopez committed
423
424
425
426
    store_ascii (L.latin1_lexeme_char lexbuf 1); string start endchar lexbuf
  | "\\n" -> store_ascii '\n'; string start endchar lexbuf
  | "\\t" -> store_ascii '\t'; string start endchar lexbuf
  | "\\r" -> store_ascii '\r'; string start endchar lexbuf
427
  | '\\' ['0'-'9']+ ';' ->
Julien Lopez's avatar
Julien Lopez committed
428
    store_code (parse_char lexbuf 10 1); string start endchar lexbuf
429
  | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F']+ ';' ->
Julien Lopez's avatar
Julien Lopez committed
430
      store_code (parse_char lexbuf 16 2); string start endchar lexbuf
431
  | '\\' -> illegal lexbuf;
432
  | eof -> error start (start+1) "Unterminated string"
Julien Lopez's avatar
Julien Lopez committed
433
  | _ -> store_lexeme lexbuf; string start endchar lexbuf
434

435
436
let token lexbuf = if !in_brackets = 0 then token lexbuf
  else if !toplevel then token2toplevel lexbuf else token2 lexbuf
437

438
let lexbuf = ref None
439
440
441
442
443
444
let last_tok = ref (KEYWORD "DUMMY")

let rec sync lb =
  match !last_tok with
  | KEYWORD ";;" | EOI -> ()
  | _ -> last_tok := fst (token lb); sync lb
445

446
447
448
449
450
451
let raise_clean e =
  clear_buff ();
  in_comment := false;
  (* reinit encoding ? *)
  raise e

452
let mk () _FIXME_loc cs =
453
  let lb = L.from_var_enc_stream enc cs in
454
  (lexer ("#!" [^ '\n']* "\n")? -> ()) lb;
455
  lexbuf := Some lb;
456
  let next _ =
457
    let tok, loc =
458
459
      try token lb
      with
460
	| Ulexing.Error ->
461
	    raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb,
462
463
			  "Unexpected character"))
	| Ulexing.InvalidCodepoint i ->
464
465
466
467
	    raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb,
			  "Code point invalid for the current encoding"))
	| e -> raise_clean e
    in
468
469
    last_tok := tok;
    Some (tok, loc)
470
  in
471
  Stream.from next
472
473
474
475
476
477

let dump_file f =
  let ic = open_in f in
  let lexbuf = L.from_var_enc_channel enc ic in
  (try
     while true do
478
479
480
       let (tok,_) = token lexbuf in
       Format.printf "%a@." Token.print tok;
       if tok = EOI then exit 0
481
     done
482
483
484
   with
     | Ulexing.Error ->
	 Printf.eprintf "Lexing error at offset %i\n:Unexpected character\n"
485
486
	   (Ulexing.lexeme_end lexbuf)
     | Error (i,j,s) ->
487
	 Printf.eprintf "Lexing error at offset %i-%i:\n%s\n"
488
489
	   i j s
     | Ulexing.InvalidCodepoint i ->
490
	 Printf.eprintf "Lexing error at offset %i\n:Invalid code point for the current encoding\n"
491
492
493
	   (Ulexing.lexeme_end lexbuf)
  );
  close_in ic