ulexer.ml 12.9 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
open Camlp4.PreCast

module Loc = struct
  type t = int * int

  let mk _ = (0,0)
  let ghost = (-1,-1)

  let of_lexing_position _ = assert false
  let to_ocaml_location _ = assert false
  let of_ocaml_location _ = assert false
  let of_lexbuf _ = assert false
  let of_tuple _ = assert false
  let to_tuple _ = assert false

  let merge (x1, x2) (y1, y2) = (min x1 y1, max x2 y2)
  let join (x1, _) = (x1, x1)
  let move _ _ _ = assert false
  let shift _ _ = assert false
  let move_line _ _ = assert false
  let file_name  _ = assert false
  let start_line _ = assert false
  let stop_line  _ = assert false
  let start_bol  _ = assert false
  let stop_bol   _ = assert false
  let start_off  = fst
  let stop_off   = snd
  let start_pos  _ = assert false
  let stop_pos   _ = assert false
  let is_ghost   _ = assert false
  let ghostify   _ = assert false
  let set_file_name _ = assert false
  let strictly_before _ = assert false
  let make_absolute _ = assert false
  let print _ = assert false
  let dump  _ = assert false
  let to_string _ = assert false
  exception Exc_located of t * exn
  let raise loc exn =
40
    match exn with
41
42
43
44
45
46
47
48
49
50
    | Exc_located _ -> raise exn
    | _ -> raise (Exc_located (loc, exn))
  let name = ref "_loc"
end

type token =
  | KEYWORD of string
  | IDENT of string
  | ANY_IN_NS of string
  | INT of string
51
  | CHAR of string
52
  | STRING of string
53
  | STRING2 of string
54
  | PTYPE of string
55
56
57
58
59
60
61
62
63
64
65
66
  | EOI

module Token = struct
  open Format
  module Loc = Loc
  type t = token
  type token = t

  let sf = Printf.sprintf

  let to_string =
    function
67
68
69
    | KEYWORD s -> sf "KEYWORD %S" s
    | IDENT s -> sf "IDENT %S" s
    | INT s -> sf "INT %s" s
70
    | CHAR s -> sf "CHAR \'%s\'" s
71
    | STRING s -> sf "STRING \"%s\"" s
72
    | STRING2 s -> sf "STRING \'%s\'" s
73
                      (* here it's not %S since the string is already escaped *)
74
    | PTYPE s -> sf "PTYPE \'%S\'" s
75
76
    | ANY_IN_NS s -> sf "ANY_IN_NS %S" s
    | EOI -> sf "EOI"
77
78
79
80
81
82
83
84
85
86

  let print ppf x = pp_print_string ppf (to_string x)

  let match_keyword kwd =
    function
    | KEYWORD kwd' when kwd = kwd' -> true
    | _ -> false

  let extract_string =
    function
87
      | KEYWORD s | IDENT s | INT s | CHAR s | STRING s | STRING2 s | PTYPE s |
88
	  ANY_IN_NS s -> s
89
      | tok ->
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
        invalid_arg ("Cannot extract a string from this token: "^
                     to_string tok)

  module Error = struct
    type t = string
    exception E of string
    let print = pp_print_string
    let to_string x = x
  end

  module Filter = struct
    type token_filter = (t, Loc.t) Camlp4.Sig.stream_filter

    type t =
      { is_kwd : string -> bool;
        mutable filter : token_filter }
106

107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
    let keyword_conversion tok is_kwd =
      match tok with
      | IDENT s when is_kwd s -> KEYWORD s
      | _ -> tok

    let mk is_kwd =
      { is_kwd = is_kwd;
        filter = (fun s -> s) }

    let filter x =
      let f tok loc =
        let tok' = keyword_conversion tok x.is_kwd in
        (tok', loc)
      in
      let rec filter =
        parser
        | [< '(tok, loc); s >] -> [< ' f tok loc; filter s >]
        | [< >] -> [< >]
      in
      fun strm -> x.filter (filter strm)

    let define_filter x f = x.filter <- f x.filter

    let keyword_added _ _ _ = ()
    let keyword_removed _ _ = ()
  end

end
module Error = Camlp4.Struct.EmptyError

module L = Ulexing
138
139
140

exception Error of int * int * string

141
142
let toplevel = ref false

143
144
145
let error i j s = raise (Error (i,j,s))

(* Buffer for string literals *)
146

147
148
let string_buff = Buffer.create 1024

149
let store_lexeme lexbuf =
150
151
152
153
154
155
156
157
158
159
160
161
162
  Buffer.add_string string_buff (Ulexing.utf8_lexeme lexbuf)
let store_ascii = Buffer.add_char string_buff
let store_code  = Utf8.store string_buff
let clear_buff () = Buffer.clear string_buff
let get_stored_string () =
  let s = Buffer.contents string_buff in
  clear_buff ();
  Buffer.clear string_buff;
  s

let enc = ref L.Latin1

(* Parse characters literals \123; \x123; *)
163

164
165
166
167
168
let hexa_digit = function
  | '0'..'9' as c -> (Char.code c) - (Char.code '0')
  | 'a'..'f' as c -> (Char.code c) - (Char.code 'a') + 10
  | 'A'..'F' as c -> (Char.code c) - (Char.code 'A') + 10
  | _ -> -1
169

170
let parse_char lexbuf base i =
171
  let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in
172
173
174
  let r = ref 0 in
  for i = 0 to String.length s - 1 do
    let c = hexa_digit s.[i] in
175
    if (c >= base) || (c < 0) then
176
177
178
179
180
181
      error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "invalid digit";
    r := !r * base + c;
  done;
  !r


182
let regexp ncname_char =
183
  xml_letter | xml_digit | [ '-' '_' ] | xml_combining_char | xml_extender | "\\."
184
let regexp ncname = ( xml_letter ncname_char* ) | ('_' ncname_char+)
185
186
let regexp qname = (ncname ':')? ncname

187
188
(* Should be [^ xml_letter ] *)
let regexp not_xml_letter = [^ 'A'-'Z' 'a'-'z' '0'-'9' '_' ]
189
190
191
192

let illegal lexbuf =
  error
    (L.lexeme_start lexbuf)
193
    (L.lexeme_end lexbuf)
194
195
196
    "Illegal character"

let in_comment = ref false
197
let in_brackets = ref 0
198

199
200
201
let return lexbuf tok = (tok, L.loc lexbuf)
let return_loc i j tok = (tok, (i,j))

202
let rec token = lexer
203
204
205
 | xml_blank+ -> token lexbuf
 | qname ->
     let s = L.utf8_lexeme lexbuf in
206
     return lexbuf (IDENT s)
207
208
 | ncname ":*" ->
     let s = L.utf8_sub_lexeme lexbuf 0 (L.lexeme_length lexbuf - 2) in
209
     return lexbuf (ANY_IN_NS s)
210
 | ".:*" ->
211
     return lexbuf (ANY_IN_NS "")
212
 | '-'? ['0'-'9']+ ->
213
     return lexbuf (INT (L.utf8_lexeme lexbuf))
214
 | [ "<>=.,:;+-*/@&{}()|?`!$" ]
215
 | "->" | "::" | ";;" | "--" | "//" | "/@" | ":=" | "\\" | "++"
216
 | "<=" | ">=" | "<<" | ">>" | "||" | "&&" | "**" | "_"
217
 | ".."
218
 | ["?+*"] "?" | "#" ->
219
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
220
221
222
223
 | "[" -> in_brackets := !in_brackets + 1;
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
 | "]" -> in_brackets := !in_brackets - 1;
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
224
 | '"' ->
225
     let start = L.lexeme_start lexbuf in
Julien Lopez's avatar
Julien Lopez committed
226
     string (L.lexeme_start lexbuf) '"' lexbuf;
227
     let s = get_stored_string () in
228
     return_loc start (L.lexeme_end lexbuf) (STRING s)
229
 | "'" "\\"? _ "'" ->
Julien Lopez's avatar
Julien Lopez committed
230
231
232
233
234
235
236
237
     L.rollback lexbuf;
     (fun _ -> lexer
	 | "'" -> let start = L.lexeme_start lexbuf in
		  string (L.lexeme_start lexbuf) '\'' lexbuf;
		  let s = get_stored_string () in
		  return_loc start (L.lexeme_end lexbuf) (CHAR s)
	 | _ -> assert false) () lexbuf
 | "'" ncname ->
238
239
240
     let s = L.utf8_lexeme lexbuf in
     let s = String.sub s 1 (String.length s - 1) in
     return lexbuf (PTYPE s)
241
242
243
244
245
 | "(*" ->
     in_comment := true;
     comment (L.lexeme_start lexbuf) lexbuf;
     in_comment := false;
     token lexbuf
246
247
248
249
250
 | "/*" ->
     in_comment := true;
     tcomment (L.lexeme_start lexbuf) lexbuf;
     in_comment := false;
     token lexbuf
251
 | eof ->
252
     return lexbuf EOI
253
 | _ ->
254
255
     illegal lexbuf

256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
and token2 = lexer
 | xml_blank+ -> token2 lexbuf
 | qname ->
     let s = L.utf8_lexeme lexbuf in
     return lexbuf (IDENT s)
 | ncname ":*" ->
     let s = L.utf8_sub_lexeme lexbuf 0 (L.lexeme_length lexbuf - 2) in
     return lexbuf (ANY_IN_NS s)
 | ".:*" ->
     return lexbuf (ANY_IN_NS "")
 | '-'? ['0'-'9']+ ->
     return lexbuf (INT (L.utf8_lexeme lexbuf))
 | [ "<>=.,:;+-*/@&{}()|?`!$" ]
 | "->" | "::" | ";;" | "--" | "//" | "/@" | ":=" | "\\" | "++"
 | "<=" | ">=" | "<<" | ">>" | "||" | "&&" | "**" | "_"
 | ".."
 | ["?+*"] "?" | "#" ->
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
 | "[" -> in_brackets := !in_brackets + 1;
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
 | "]" -> in_brackets := !in_brackets - 1;
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
 | '"' ->
     let start = L.lexeme_start lexbuf in
     string (L.lexeme_start lexbuf) '"' lexbuf;
     let s = get_stored_string () in
     return_loc start (L.lexeme_end lexbuf) (STRING s)
283
284
 | "'" "\\"? _ "'--'" "\\"? _ "'"
 | "'" [^ '\'']+ "'" not_xml_letter ->
Julien Lopez's avatar
Julien Lopez committed
285
286
287
288
289
290
291
     L.rollback lexbuf;
     (fun _ -> lexer
	 | "'" -> let start = L.lexeme_start lexbuf in
		  string (L.lexeme_start lexbuf) '\'' lexbuf;
		  let s = get_stored_string () in
		  return_loc start (L.lexeme_end lexbuf) (STRING2 s)
	 | _ -> assert false) () lexbuf
292
 | "'" ncname ->
293
     let s = L.utf8_lexeme lexbuf in
294
     let s = String.sub s 1 (String.length s - 1) in
295
     return lexbuf (PTYPE s)
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
 | "(*" ->
     in_comment := true;
     comment (L.lexeme_start lexbuf) lexbuf;
     in_comment := false;
     token2 lexbuf
 | "/*" ->
     in_comment := true;
     tcomment (L.lexeme_start lexbuf) lexbuf;
     in_comment := false;
     token2 lexbuf
 | eof ->
     return lexbuf EOI
 | _ ->
     illegal lexbuf

311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
and token2toplevel = lexer
 | xml_blank+ -> token2toplevel lexbuf
 | qname ->
     let s = L.utf8_lexeme lexbuf in
     return lexbuf (IDENT s)
 | ncname ":*" ->
     let s = L.utf8_sub_lexeme lexbuf 0 (L.lexeme_length lexbuf - 2) in
     return lexbuf (ANY_IN_NS s)
 | ".:*" ->
     return lexbuf (ANY_IN_NS "")
 | '-'? ['0'-'9']+ ->
     return lexbuf (INT (L.utf8_lexeme lexbuf))
 | [ "<>=.,:;+-*/@&{}()|?`!$" ]
 | "->" | "::" | ";;" | "--" | "//" | "/@" | ":=" | "\\" | "++"
 | "<=" | ">=" | "<<" | ">>" | "||" | "&&" | "**" | "_"
 | ".."
 | ["?+*"] "?" | "#" ->
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
 | "[" -> in_brackets := !in_brackets + 1;
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
 | "]" -> in_brackets := !in_brackets - 1;
     return lexbuf (KEYWORD (L.utf8_lexeme lexbuf))
 | '"' ->
     let start = L.lexeme_start lexbuf in
     string (L.lexeme_start lexbuf) '"' lexbuf;
     let s = get_stored_string () in
     return_loc start (L.lexeme_end lexbuf) (STRING s)
 | "'" "\\"? _ "'--'" "\\"? _ "'"
339
 | "'" ((";"[^ ";'"]) | [^ ";'"])* ";"? "'" not_xml_letter ->
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
     L.rollback lexbuf;
     (fun _ -> lexer
	 | "'" -> let start = L.lexeme_start lexbuf in
		  string (L.lexeme_start lexbuf) '\'' lexbuf;
		  let s = get_stored_string () in
		  return_loc start (L.lexeme_end lexbuf) (STRING2 s)
	 | _ -> assert false) () lexbuf
 | "'" ncname ->
     let s = L.utf8_lexeme lexbuf in
     let s = String.sub s 1 (String.length s - 1) in
     return lexbuf (PTYPE s)
 | "(*" ->
     in_comment := true;
     comment (L.lexeme_start lexbuf) lexbuf;
     in_comment := false;
     token2toplevel lexbuf
 | "/*" ->
     in_comment := true;
     tcomment (L.lexeme_start lexbuf) lexbuf;
     in_comment := false;
     token2toplevel lexbuf
 | eof ->
     return lexbuf EOI
 | _ ->
     illegal lexbuf
365

366
367
368
369
370
371
372
373
374
375
376
and comment start = lexer
  | "(*" ->
      comment (L.lexeme_start lexbuf) lexbuf;
      comment start lexbuf
  | "*)" ->
      ()
  | eof ->
      error start (start+2) "Unterminated comment"
  | _ ->
      comment start lexbuf

377
378
379
380
381
382
383
384
and tcomment start = lexer
  | "*/" ->
      ()
  | eof ->
      error start (start+2) "Unterminated comment"
  | _ ->
      tcomment start lexbuf

Julien Lopez's avatar
Julien Lopez committed
385
386
387
388
389
and string start endchar = lexer
  | '"' -> if endchar = '"' then ()
    else (store_lexeme lexbuf; string start endchar lexbuf)
  | "'" -> if endchar = '\'' then ()
    else (store_lexeme lexbuf; string start endchar lexbuf)
390
  | '\\' ['\\' '"' '\''] ->
Julien Lopez's avatar
Julien Lopez committed
391
392
393
394
    store_ascii (L.latin1_lexeme_char lexbuf 1); string start endchar lexbuf
  | "\\n" -> store_ascii '\n'; string start endchar lexbuf
  | "\\t" -> store_ascii '\t'; string start endchar lexbuf
  | "\\r" -> store_ascii '\r'; string start endchar lexbuf
395
  | '\\' ['0'-'9']+ ';' ->
Julien Lopez's avatar
Julien Lopez committed
396
    store_code (parse_char lexbuf 10 1); string start endchar lexbuf
397
  | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F']+ ';' ->
Julien Lopez's avatar
Julien Lopez committed
398
      store_code (parse_char lexbuf 16 2); string start endchar lexbuf
399
  | '\\' -> illegal lexbuf;
400
  | eof -> error start (start+1) "Unterminated string"
Julien Lopez's avatar
Julien Lopez committed
401
  | _ -> store_lexeme lexbuf; string start endchar lexbuf
402

403
404
let token lexbuf = if !in_brackets = 0 then token lexbuf
  else if !toplevel then token2toplevel lexbuf else token2 lexbuf
405

406
let lexbuf = ref None
407
408
409
410
411
412
let last_tok = ref (KEYWORD "DUMMY")

let rec sync lb =
  match !last_tok with
  | KEYWORD ";;" | EOI -> ()
  | _ -> last_tok := fst (token lb); sync lb
413

414
415
416
417
418
419
let raise_clean e =
  clear_buff ();
  in_comment := false;
  (* reinit encoding ? *)
  raise e

420
let mk () _FIXME_loc cs =
421
  let lb = L.from_var_enc_stream enc cs in
422
  (lexer ("#!" [^ '\n']* "\n")? -> ()) lb;
423
  lexbuf := Some lb;
424
  let next _ =
425
    let tok, loc =
426
427
      try token lb
      with
428
	| Ulexing.Error ->
429
	    raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb,
430
431
			  "Unexpected character"))
	| Ulexing.InvalidCodepoint i ->
432
433
434
435
	    raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb,
			  "Code point invalid for the current encoding"))
	| e -> raise_clean e
    in
436
437
    last_tok := tok;
    Some (tok, loc)
438
  in
439
  Stream.from next
440
441
442
443
444
445

let dump_file f =
  let ic = open_in f in
  let lexbuf = L.from_var_enc_channel enc ic in
  (try
     while true do
446
447
448
       let (tok,_) = token lexbuf in
       Format.printf "%a@." Token.print tok;
       if tok = EOI then exit 0
449
     done
450
451
452
   with
     | Ulexing.Error ->
	 Printf.eprintf "Lexing error at offset %i\n:Unexpected character\n"
453
454
	   (Ulexing.lexeme_end lexbuf)
     | Error (i,j,s) ->
455
	 Printf.eprintf "Lexing error at offset %i-%i:\n%s\n"
456
457
	   i j s
     | Ulexing.InvalidCodepoint i ->
458
	 Printf.eprintf "Lexing error at offset %i\n:Invalid code point for the current encoding\n"
459
460
461
	   (Ulexing.lexeme_end lexbuf)
  );
  close_in ic