wlexer.mll 12.6 KB
Newer Older
1
2
3
4
5
(* File to be processed by wlex, not ocamllex ! *)

classes 
   encoding_error
   xml_char
6
   blank
7
   lowercase uppercase ascii_digit 
8
   "#_<>=.,:;+-*/@&{}[]()|?`\"\\\'!"
9
10
11
12
13
14
15

   unicode_base_char
   unicode_ideographic
   unicode_combining_char
   unicode_digit
   unicode_extender

16
17
18
19

{
  let keywords = Hashtbl.create 17

20
21
  let in_comment = ref false

22
  let error = Location.raise_loc
23
24
25
26
27
  exception Illegal_character of char
  exception Unterminated_comment
  exception Unterminated_string
  exception Unterminated_string_in_comment

28

29
  (* Buffer for string literals (always encoded in UTF8). *)
30
31
    
  let string_buff = Buffer.create 1024
32

33
34
35
  let store_ascii = Buffer.add_char string_buff
  let store_char  = Buffer.add_string string_buff
  let store_code  = Encodings.Utf8.store string_buff
36
37
38
39
  let get_stored_string () =
    let s = Buffer.contents string_buff in
    Buffer.clear string_buff;
    s
40
  let store_special = function
41
42
43
    | 'n' ->  store_ascii '\n' 
    | 'r' ->  store_ascii '\r' 
    | 't' ->  store_ascii '\t' 
44
    | c -> raise (Illegal_character '\\')
45
46
47
48

  let string_start_pos = ref 0;;
  let comment_start_pos : int list ref = ref [];;

49
  let decimal_char s =
50
51
    int_of_string (String.sub s 1 (String.length s - 2))

52
53
54
55
56
57
58

  let hexa_digit = function
    | '0'..'9' as c -> (Char.code c) - (Char.code '0')
    | 'a'..'f' as c -> (Char.code c) - (Char.code 'a') + 10
    | _ -> failwith "Invalid hexadecimal digit" (* TODO: error loc *)


59
60
61
  let hexa_char s =
    let rec aux i accu =
      if i = String.length s - 1 then accu
62
      else aux (succ i) (accu * 16 + hexa_digit s.[i])
63
64
    in
    aux 0 0
65

66
67
}

68
69
70
71
72
73
74
75
76
let letter = lowercase | uppercase | unicode_base_char | unicode_ideographic
let digit = ascii_digit | unicode_digit
let character = [ ^ encoding_error ]

let ncname_char = 
  letter | digit | [ ".-_" ] | unicode_combining_char | unicode_extender
let ncname = (letter | '_' ) ncname_char*
let qname = (ncname ':')? ncname

77
78
rule token = parse
    blank+    { token engine lexbuf }
79
80
81
  | qname 
      { 
	let s = Lexing.lexeme lexbuf in
82
	if Hashtbl.mem keywords s then "",s else "IDENT",s
83
      }
84
85
86
87
88
89
90
  | ncname ":*"
      { 
	let s = Lexing.lexeme lexbuf in
	let s = String.sub s 0 (String.length s - 2) in
	"ANY_IN_NS", s
      }
  | ".:*" { "ANY_IN_NS", "" }
91
92
  | '-'? ascii_digit+ 
    { "INT",Lexing.lexeme lexbuf }
93
  | [ "<>=.,:;+-*/@&{}[]()|?`!" ]
94
  | "->" | "::" | ";;" | "--" | ":=" | "\\" | "++"
95
  | "{|" | "|}" | "<=" | ">=" | "<<" | ">>"
96
  | ["?+*"] "?" | "#"
97
	{ "",Lexing.lexeme lexbuf }
98
  | "#" ncname { "DIRECTIVE",Lexing.lexeme lexbuf }
99
100
101
102
  | '"' | "'"
      { let string_start = Lexing.lexeme_start lexbuf in
        string_start_pos := string_start;
	let double_quote = Lexing.lexeme_char lexbuf 0 = '"' in
103
        string (Lexing.lexeme lexbuf) engine lexbuf;
104
105
106
107
108
109
110
        lexbuf.Lexing.lex_start_pos <-
          string_start - lexbuf.Lexing.lex_abs_pos;
        (if double_quote then "STRING2" else "STRING1"), 
	(get_stored_string()) }

  | "(*"
      { comment_start_pos := [Lexing.lexeme_start lexbuf];
111
	in_comment := true;
112
        comment engine lexbuf;
113
	in_comment := false;
114
115
116
117
118
        token engine lexbuf }

  | eof       
      { "EOI","" }
  | _
119
      { error 
120
121
122
123
124
125
126
127
128
129
130
131
132
133
	  (Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf)
	  (Illegal_character ((Lexing.lexeme lexbuf).[0])) }

and comment = parse
    "(*"
      { comment_start_pos := Lexing.lexeme_start lexbuf :: !comment_start_pos;
        comment engine lexbuf;
      }
  | "*)"
      { comment_start_pos := List.tl !comment_start_pos;
	if !comment_start_pos <> [] then comment engine lexbuf;
      }
  | '"' | "'"
      { string_start_pos := Lexing.lexeme_start lexbuf;
134
	Buffer.clear string_buff;
135
136
	let ender = Lexing.lexeme lexbuf in
        (try string ender engine lexbuf
137
         with Location.Location (_,_,Unterminated_string) ->
138
           let st = List.hd !comment_start_pos in
139
           error st (st+2) Unterminated_string_in_comment);
140
141
142
143
144
145
146
147
148
	Buffer.clear string_buff;
        comment engine lexbuf }
  | eof
      { let st = List.hd !comment_start_pos in
        error st (st+2) Unterminated_comment
      }
  | _
      { comment engine lexbuf }

149
150
and string ender = parse
  | '"' | "'"
151
      { let c = Lexing.lexeme lexbuf in
152
	if c = ender then ()
153
	else (store_char (Lexing.lexeme lexbuf); 
154
	      string ender engine lexbuf) }
155
  | '\\' ['\\' '"' '\'']
156
      { store_ascii (Lexing.lexeme_char lexbuf 1);
157
        string ender engine lexbuf }
158
159
160
161
  | '\\' lowercase 
      { let c = Lexing.lexeme_char lexbuf 1 in
	if c = 'x' 
	then parse_hexa_char engine lexbuf 
162
	else store_special c;
163
164
	string ender engine lexbuf }
  | '\\' ascii_digit+ ';'
165
      { store_code (decimal_char (Lexing.lexeme lexbuf));
166
        string ender engine lexbuf }
167
168
169
170
  | '\\' 
      { error 
	  (Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf)
	  (Illegal_character '\\') }
171
172
173
  | eof
      { error !string_start_pos (!string_start_pos+1) Unterminated_string }
  | _
174
      { store_code (Char.code (Lexing.lexeme_char lexbuf 0));  
175
	(* Adapt when source is UTF8 *)
176
177
178
        string ender engine lexbuf }

and parse_hexa_char = parse
179
  | (ascii_digit|lowercase)+ ';'
180
      { store_code (hexa_char (Lexing.lexeme lexbuf)) }
181
  | _
182
183
184
185
      { error 
	  (Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf)
	  (Illegal_character '\\') }

186

187

188
189
{

190
191
192
  let delta_loc = ref 0
  let set_delta_loc dl = delta_loc := dl

193
194
195
196
197
(* For synchronization on errors in the toplevel ... *)
(* Issue: file inclusion *)
  let lexbuf = ref None
  let last_tok = ref ("","")

198
199
200
201
202
203

  let lexeme_start lexbuf = 
    lexbuf.Lexing.lex_abs_pos + lexbuf.Lexing.lex_start_pos
  let lexeme_end lexbuf = 
    lexbuf.Lexing.lex_abs_pos + lexbuf.Lexing.lex_curr_pos

204
205
206
207
208
209
  let lexer_func_of_wlex lexfun lexengine cs =
    let dl = !delta_loc in
    delta_loc := 0;
    let lb =
      Lexing.from_function
	(fun s n ->
210
211
           try s.[0] <- Stream.next cs; 1 
	   with Stream.Failure -> 0)
212
    in
213
    lexbuf := Some lb;
214
215
    let next () =
      let tok = lexfun lexengine lb in
216
      let loc = (lexeme_start lb + dl, lexeme_end lb + dl) in
217
      last_tok := tok;
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
      (tok, loc) 
    in
    Token.make_stream_and_location next
      
  let register_kw (s1,s2) =
    if s1 = "" then 
      match s2.[0] with 
	| 'a' .. 'z' when not (Hashtbl.mem keywords s2) -> 
	    Hashtbl.add keywords s2 ()      
	| _ -> ()


  let lexer lexfun lexengine =
    { 
      Token.tok_func = lexer_func_of_wlex lexfun lexengine; 
      Token.tok_using = register_kw;
      Token.tok_removing = (fun _ -> ()); 
      Token.tok_match = Token.default_match;
236
237
      Token.tok_text = Token.lexer_text;
      Token.tok_comm = None;
238
    }
239
240
241
242

  let classes = 
    let c i = (i,i) in
    let i ch1 ch2 = (Char.code ch1, Char.code ch2) in
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
    [ unicode_base_char,
      [ 0x00C0,0x00D6; 0x00D8,0x00F6; 
	0x00F8,0x00FF; 0x0100,0x0131; 0x0134,0x013E; 0x0141,0x0148; 
	0x014A,0x017E; 0x0180,0x01C3; 0x01CD,0x01F0; 0x01F4,0x01F5; 
	0x01FA,0x0217; 0x0250,0x02A8; 0x02BB,0x02C1; 0x0386,0x0386;
	0x0388,0x038A; 0x038C,0x038C; 0x038E,0x03A1; 0x03A3,0x03CE; 
	0x03D0,0x03D6; 0x03DA,0x03DA; 0x03DC,0x03DC; 0x03DE,0x03DE; 
	0x03E0,0x03E0; 0x03E2,0x03F3; 
	0x0401,0x040C; 0x040E,0x044F; 0x0451,0x045C; 0x045E,0x0481; 
	0x0490,0x04C4; 0x04C7,0x04C8; 0x04CB,0x04CC; 0x04D0,0x04EB; 
	0x04EE,0x04F5; 0x04F8,0x04F9; 0x0531,0x0556; 0x0559,0x0559;
	0x0561,0x0586; 0x05D0,0x05EA; 0x05F0,0x05F2; 0x0621,0x063A; 
	0x0641,0x064A; 0x0671,0x06B7; 0x06BA,0x06BE; 0x06C0,0x06CE; 
	0x06D0,0x06D3; 0x06D5,0x06D5; 0x06E5,0x06E6; 0x0905,0x0939; 
	0x093D,0x093D;
	0x0958,0x0961; 0x0985,0x098C; 0x098F,0x0990; 0x0993,0x09A8; 
	0x09AA,0x09B0; 0x09B2,0x09B2; 0x09B6,0x09B9; 0x09DC,0x09DD; 
	0x09DF,0x09E1; 0x09F0,0x09F1; 0x0A05,0x0A0A; 0x0A0F,0x0A10; 
	0x0A13,0x0A28; 0x0A2A,0x0A30; 0x0A32,0x0A33; 0x0A35,0x0A36; 
	0x0A38,0x0A39; 0x0A59,0x0A5C; 0x0A5E,0x0A5E; 0x0A72,0x0A74; 
	0x0A85,0x0A8B; 0x0A8D,0x0A8D; 0x0A8F,0x0A91; 0x0A93,0x0AA8; 
	0x0AAA,0x0AB0; 0x0AB2,0x0AB3; 0x0AB5,0x0AB9; 0x0ABD,0x0ABD; 
	0x0AE0,0x0AE0;
	0x0B05,0x0B0C; 0x0B0F,0x0B10; 0x0B13,0x0B28; 0x0B2A,0x0B30; 
	0x0B32,0x0B33; 0x0B36,0x0B39; 0x0B3D,0x0B3D; 0x0B5C,0x0B5D; 
	0x0B5F,0x0B61; 0x0B85,0x0B8A; 0x0B8E,0x0B90; 0x0B92,0x0B95; 
	0x0B99,0x0B9A; 0x0B9C,0x0B9C; 0x0B9E,0x0B9F; 0x0BA3,0x0BA4; 
	0x0BA8,0x0BAA; 0x0BAE,0x0BB5; 0x0BB7,0x0BB9; 0x0C05,0x0C0C; 
	0x0C0E,0x0C10; 0x0C12,0x0C28; 0x0C2A,0x0C33; 0x0C35,0x0C39; 
	0x0C60,0x0C61; 0x0C85,0x0C8C; 0x0C8E,0x0C90; 0x0C92,0x0CA8; 
	0x0CAA,0x0CB3; 0x0CB5,0x0CB9; 0x0CDE,0x0CDE; 0x0CE0,0x0CE1; 
	0x0D05,0x0D0C; 0x0D0E,0x0D10; 0x0D12,0x0D28; 0x0D2A,0x0D39; 
	0x0D60,0x0D61; 0x0E01,0x0E2E; 0x0E30,0x0E30; 0x0E32,0x0E33; 
	0x0E40,0x0E45; 0x0E81,0x0E82; 0x0E84,0x0E84; 0x0E87,0x0E88; 
	0x0E8A,0x0E8A;
	0x0E8D,0x0E8D; 0x0E94,0x0E97; 0x0E99,0x0E9F; 0x0EA1,0x0EA3; 
	0x0EA5,0x0EA5;
	0x0EA7,0x0EA7; 0x0EAA,0x0EAB; 0x0EAD,0x0EAE; 0x0EB0,0x0EB0; 
	0x0EB2,0x0EB3;
	0x0EBD,0x0EBD; 0x0EC0,0x0EC4; 0x0F40,0x0F47; 0x0F49,0x0F69; 
	0x10A0,0x10C5; 0x10D0,0x10F6; 0x1100,0x1100; 0x1102,0x1103; 
	0x1105,0x1107; 0x1109,0x1109; 0x110B,0x110C; 0x110E,0x1112; 
	0x113C,0x113C;
	0x113E,0x113E; 0x1140,0x1140; 0x114C,0x114C; 0x114E,0x114E; 
	0x1150,0x1150; 0x1154,0x1155; 0x1159,0x1159;
	0x115F,0x1161; 0x1163,0x1163; 0x1165,0x1165; 0x1167,0x1167; 
	0x1169,0x1169; 0x116D,0x116E; 
	0x1172,0x1173; 0x1175,0x1175; 0x119E,0x119E; 0x11A8,0x11A8; 
	0x11AB,0x11AB; 0x11AE,0x11AF; 
	0x11B7,0x11B8; 0x11BA,0x11BA; 0x11BC,0x11C2; 0x11EB,0x11EB; 
	0x11F0,0x11F0; 0x11F9,0x11F9;
	0x1E00,0x1E9B; 0x1EA0,0x1EF9; 0x1F00,0x1F15; 0x1F18,0x1F1D; 
	0x1F20,0x1F45; 0x1F48,0x1F4D; 0x1F50,0x1F57; 0x1F59,0x1F59; 
	0x1F5B,0x1F5B;
	0x1F5D,0x1F5D; 0x1F5F,0x1F7D; 0x1F80,0x1FB4; 0x1FB6,0x1FBC; 
	0x1FBE,0x1FBE;
	0x1FC2,0x1FC4; 0x1FC6,0x1FCC; 0x1FD0,0x1FD3; 0x1FD6,0x1FDB; 
	0x1FE0,0x1FEC; 0x1FF2,0x1FF4; 0x1FF6,0x1FFC; 0x2126,0x2126;
	0x212A,0x212B; 0x212E,0x212E; 0x2180,0x2182; 0x3041,0x3094; 
	0x30A1,0x30FA; 0x3105,0x312C; (* 0xAC00,0xD7A3 *) ];

      unicode_ideographic,
      [ 0x3007,0x3007; 0x3021,0x3029 (* 0x4E00-0x9FA5 *) ];

      unicode_combining_char,
      [ 0x0300,0x0345; 0x0360,0x0361; 0x0483,0x0486; 0x0591,0x05A1;
	0x05A3,0x05B9; 0x05BB,0x05BD; 0x05BF,0x05BF; 0x05C1,0x05C2;
	0x05C4,0x05C4; 0x064B,0x0652; 0x0670,0x0670; 0x06D6,0x06DC;
	0x06DD,0x06DF; 0x06E0,0x06E4; 0x06E7,0x06E8; 0x06EA,0x06ED;
	0x0901,0x0903; 0x093C,0x093C; 0x093E,0x094C; 0x094D,0x094D;
	0x0951,0x0954; 0x0962,0x0963; 0x0981,0x0983; 0x09BC,0x09BC;
	0x09BE,0x09BE; 0x09BF,0x09BF; 0x09C0,0x09C4; 0x09C7,0x09C8;
	0x09CB,0x09CD; 0x09D7,0x09D7; 0x09E2,0x09E3; 0x0A02,0x0A02;
	0x0A3C,0x0A3C; 0x0A3E,0x0A3E; 0x0A3F,0x0A3F; 0x0A40,0x0A42;
	0x0A47,0x0A48; 0x0A4B,0x0A4D; 0x0A70,0x0A71; 0x0A81,0x0A83;
	0x0ABC,0x0ABC; 0x0ABE,0x0AC5; 0x0AC7,0x0AC9; 0x0ACB,0x0ACD;
	0x0B01,0x0B03; 0x0B3C,0x0B3C; 0x0B3E,0x0B43; 0x0B47,0x0B48;
	0x0B4B,0x0B4D; 0x0B56,0x0B57; 0x0B82,0x0B83; 0x0BBE,0x0BC2;
	0x0BC6,0x0BC8; 0x0BCA,0x0BCD; 0x0BD7,0x0BD7; 0x0C01,0x0C03;
	0x0C3E,0x0C44; 0x0C46,0x0C48; 0x0C4A,0x0C4D; 0x0C55,0x0C56;
	0x0C82,0x0C83; 0x0CBE,0x0CC4; 0x0CC6,0x0CC8; 0x0CCA,0x0CCD;
	0x0CD5,0x0CD6; 0x0D02,0x0D03; 0x0D3E,0x0D43; 0x0D46,0x0D48;
	0x0D4A,0x0D4D; 0x0D57,0x0D57; 0x0E31,0x0E31; 0x0E34,0x0E3A;
	0x0E47,0x0E4E; 0x0EB1,0x0EB1; 0x0EB4,0x0EB9; 0x0EBB,0x0EBC;
	0x0EC8,0x0ECD; 0x0F18,0x0F19; 0x0F35,0x0F35; 0x0F37,0x0F37;
	0x0F39,0x0F39; 0x0F3E,0x0F3E; 0x0F3F,0x0F3F; 0x0F71,0x0F84;
	0x0F86,0x0F8B; 0x0F90,0x0F95; 0x0F97,0x0F97; 0x0F99,0x0FAD;
	0x0FB1,0x0FB7; 0x0FB9,0x0FB9; 0x20D0,0x20DC; 0x20E1,0x20E1;
	0x302A,0x302F; 0x3099,0x3099; 0x309A,0x309A ];

      unicode_digit,
      [ 0x0660,0x0669; 0x06F0,0x06F9; 0x0966,0x096F; 0x09E6,0x09EF;
	0x0A66,0x0A6F; 0x0AE6,0x0AEF; 0x0B66,0x0B6F; 0x0BE7,0x0BEF;
	0x0C66,0x0C6F; 0x0CE6,0x0CEF; 0x0D66,0x0D6F; 0x0E50,0x0E59;
	0x0ED0,0x0ED9; 0x0F20,0x0F29 ];


      unicode_extender,
      [ 0x00B7,0x00B7; 0x02D0,0x02D1; 0x0387,0x0387; 0x0640,0x0640;
	0x0E46,0x0E46; 0x0EC6,0x0EC6; 0x3005,0x3005; 0x3031,0x3035;
	0x309D,0x309E; 0x30FC,0x30FE ];

      ascii_digit, 
      [ i '0' '9'];

      lowercase, 
      [i 'a' 'z'];

      uppercase, 
      [i 'A' 'Z'];
      
      blank, 
      [c 8; c 9; c 10; c 13; c 32]
356
357
358
359
    ]

  let table =
    assert(nb_classes <= 256);
360
    let v = String.make 0x312d (Char.chr encoding_error) in
361
362
363
364
365
366
367
    let fill_int c (i, j) = String.fill v i (j-i+1) c in
    let fill_class (c, l) = List.iter (fill_int (Char.chr c)) l in
    let fill_char  (ch, cl) = v.[ch] <- Char.chr cl in
    List.iter fill_class classes;
    List.iter fill_char one_char_classes;
    v

368
369
370
371
372
373
374
375
376
377
378
  let utf8_engine = 
    Lex_engines.engine_tiny_utf8 table
      (fun code -> 
	 if code >= 0x4E00 && code <= 0x9FA5 then
           unicode_ideographic
	 else if code >= 0xAC00 && code <= 0xD7A3 then
           unicode_base_char
	 else if code <= 0xD7FF || (code >= 0xE000 && code <= 0xFFFD) ||
           (code >= 0x10000 && code <= 0x10FFFF) then
             xml_char
	 else encoding_error)
379
380
381

  let latin1_engine = Lex_engines.engine_tiny_8bit table
}