schema_builtin.ml 19 KB
Newer Older
1

2
open Printf
3

4
5
open Encodings
open Encodings.Utf8.Pcre
6
open Schema_common
7
open Schema_types
8
9
10
11
12

(* TODO dates: boundary checks (e.g. 95/26/2003) *)
(* TODO a lot of almost cut-and-paste code, especially in gFoo types validation
*)

13
14
  (** {2 Aux/Misc stuff} *)

15
16
(* convert an OCaml string to UTF-8 and qualify it through
   Schema_xml.add_xsd_prefix *)
let add_xsd_prefix s =
  let utf8_name = Utf8.mk s in
  Schema_xml.add_xsd_prefix utf8_name

17
18
19
20
(* qualified names of the built-in types which are registered but not really
   supported (they are registered as aliases of "string" in [fill]) *)
let unsupported =
  let local_names = [ "decimal"; "float"; "double"; "NOTATION"; "QName" ] in
  List.map add_xsd_prefix local_names

21
22
(* true iff the given UTF-8 string is the empty string *)
let is_empty s =
  let empty = Utf8.mk "" in
  Utf8.equal s empty

23
24
25
26
27
28
29
30
31
32
33
34
(* integer constants: 0, 1, -1 and the inclusive bounds of the fixed-size
   integer types (long, int, short, byte) *)
let zero = Intervals.V.zero
let one = Intervals.V.succ zero
let minus_one = Intervals.V.pred zero
let long_l = Intervals.V.mk "-9223372036854775808"
let long_r = Intervals.V.mk "9223372036854775807"
let int_l = Intervals.V.mk "-2147483648"
let int_r = Intervals.V.mk "2147483647"
let short_l = Intervals.V.mk "-32768"
let short_r = Intervals.V.mk "32767"
let byte_l = Intervals.V.mk "-128"
let byte_r = Intervals.V.mk "127"

35
let xml_S_RE = pcre_regexp "[ \\t\\r\\n]+"
36
  (* split a string at XML recommendation "S" production boundaries *)
37
38
let split_xml_S s = pcre_split ~rex:xml_S_RE s
let norm_RE = pcre_regexp "[\\t\\r\\n]"
39
40
41
42
43
44
45
46
47
48
49

(* [char_of_hex ms ls] builds the character whose code has [ms] as most
   significant hexadecimal digit and [ls] as least significant one. Both
   arguments must be hexadecimal digits (either case): anything else trips the
   [assert false] below. *)
let char_of_hex =
  let nibble c =
    match c with
    | '0' .. '9' -> Char.code c - Char.code '0'
    | 'a' .. 'f' -> Char.code c - Char.code 'a' + 10
    | 'A' .. 'F' -> Char.code c - Char.code 'A' + 10
    | _ -> assert false
  in
  fun ms ls ->
    let high = nibble ms in
    let low = nibble ls in
    (* two nibbles always fit in 0..255, hence the unchecked conversion *)
    Char.unsafe_chr ((high lsl 4) lor low)

50
51
(* remove every "(" and ")" character from a regular expression source, so
   that it can be embedded in a larger pattern without adding capture groups *)
let strip_parens regex_src = Pcre.replace ~pat:"[()]" regex_src
(* anchor a regular expression source by prefixing "^" and appending "$" *)
let add_limits s = String.concat "" [ "^"; s; "$" ]
52

53
54
(* raised when a string isn't a valid value of a built-in type; the argument is
   the qualified name of that type *)
exception Schema_builtin_error of Utf8.t

(* raise [Schema_builtin_error] for the built-in type with local name [name] *)
let simple_type_error name =
  let qualified_name = add_xsd_prefix name in
  raise (Schema_builtin_error qualified_name)
55

56
(* pair the UTF-8 version of [s] with the empty namespace *)
let qualify s =
  let local_name = Encodings.Utf8.mk s in
  (Ns.empty, local_name)
57

58
59
  (* regular expressions used to validate built-in types *)

60
61
62
63
64
65
66
67
68
69
(* timezone: either the literal "Z" or an "hh:mm" offset with an optional sign.
   NOTE: the two alternatives are not wrapped in a group of their own *)
let timezone_RE_raw = "(Z)|(([+-])?(\\d{2}):(\\d{2}))"
(* date: year (four digits or more), month and day, separated by dashes *)
let date_RE_raw = "(\\d{4,})-(\\d{2})-(\\d{2})"
(* time: hours, minutes and seconds (two digits each), separated by colons *)
let time_RE_raw = "(\\d{2}):(\\d{2}):(\\d{2})"

(* each of the patterns below ends with [timezone_RE_raw] embedded as an
   optional capture group *)
let gYearMonth_RE_raw = sprintf "(-)?(\\d{4,})-(\\d{2})(%s)?" timezone_RE_raw
let gYear_RE_raw = sprintf "(-)?(\\d{4,})(%s)?" timezone_RE_raw
let gMonthDay_RE_raw = sprintf "--(\\d{2})-(\\d{2})(%s)?" timezone_RE_raw
let gDay_RE_raw = sprintf "---(\\d{2})(%s)?" timezone_RE_raw
(* gMonth: "--MM--" followed by an optional timezone. The timezone pattern has
   to be spliced in with [sprintf], like in the other gFoo patterns: a plain
   string literal would leave a literal "%s" in the regular expression, and no
   gMonth value carrying a timezone would ever match. *)
let gMonth_RE_raw = sprintf "--(\\d{2})--(%s)?" timezone_RE_raw

70
71
  (** {2 CDuce types} *)

72
73
74
75
76
77
78
(* record fields shared by the date and time types; the leading [false] is
   presumably the "optional" flag expected by [Types.rec_of_list'] (TODO
   confirm) *)
let positive_field = (false, qualify "positive", Builtin_defs.bool)
let year_field = (false, qualify "year", Builtin_defs.int)
let month_field = (false, qualify "month", Builtin_defs.int)
let day_field = (false, qualify "day", Builtin_defs.int)
let hour_field = (false, qualify "hour", Builtin_defs.int)
let minute_field = (false, qualify "minute", Builtin_defs.int)
let second_field = (false, qualify "second", Builtin_defs.int)
  (* TODO this should be a decimal *)
let time_type_fields = hour_field :: minute_field :: [ second_field ]
let date_type_fields = year_field :: month_field :: [ day_field ]
82
83
84
85
86

  (* TODO the constraint that at least one part should be present isn't easily
  expressible with CDuce types *)
(* CDuce type of durations: a "positive" flag plus six integer components,
   each of them flagged [true] instead of the [false] used by the shared field
   definitions *)
let duration_type =
  let component label = (true, qualify label, Builtin_defs.int) in
  (* TODO "second" should be a decimal *)
  let components =
    List.map component
      [ "year"; "month"; "day"; "hour"; "minute"; "second" ]
  in
  Types.rec_of_list' (positive_field :: components)
94
(* CDuce type of timezones: sign, hours and minutes *)
let timezone_type =
  Types.rec_of_list' (positive_field :: hour_field :: [ minute_field ])

(* the "timezone" field, flagged [true], appended to the types which may carry
   a timezone *)
let timezone_type_fields = [ (true, qualify "timezone", timezone_type) ]
99
100
(* CDuce type of times: hour, minute, second plus the "timezone" field *)
let time_type = Types.rec_of_list' (time_type_fields @ timezone_type_fields)
(* CDuce type of dates: sign, year, month and day.
   NOTE(review): no "timezone" field here, whereas [validate_date] may emit
   one — confirm whether [Types.rec_of_list'] builds open records *)
let date_type = Types.rec_of_list' (positive_field :: date_type_fields)
101
102
103
104
105
(* CDuce record types of dateTime and of the gFoo types, built from the shared
   field definitions *)
let dateTime_type =
  let fields =
    List.concat [ date_type_fields; time_type_fields; timezone_type_fields ]
  in
  Types.rec_of_list' (positive_field :: fields)
let gYearMonth_type =
  Types.rec_of_list' (positive_field :: year_field :: [ month_field ])
let gYear_type = Types.rec_of_list' (positive_field :: [ year_field ])
let gMonthDay_type = Types.rec_of_list' (month_field :: [ day_field ])
let gDay_type = Types.rec_of_list' (day_field :: [])
let gMonth_type = Types.rec_of_list' (month_field :: [])
111

112
113
114
115
(* CDuce types of the sign-constrained integer types, taken as they are from
   Builtin_defs *)
let nonPositiveInteger_type = Builtin_defs.non_pos_int
let negativeInteger_type = Builtin_defs.neg_int
let nonNegativeInteger_type = Builtin_defs.non_neg_int
let positiveInteger_type = Builtin_defs.pos_int
116
117
118
119
(* CDuce types of the fixed-size integer types, taken as they are from
   Builtin_defs *)
let long_type = Builtin_defs.long_int
let int_type = Builtin_defs.int_int
let short_type = Builtin_defs.short_int
let byte_type = Builtin_defs.byte_int
120

121
122
123
(* CDuce type of the list types (registered for NMTOKENS in [fill]): the
   [Sequence.star] of the string type *)
let string_list_type = Sequence.star Builtin_defs.string

  (** {2 Validation functions (string -> Value.t)} *)
124

125
126
127
128
129
(* value of the "positive" field: true for "+" or for a missing (empty) sign,
   false for anything else *)
let parse_sign s =
  let is_plus = Utf8.equal s (Utf8.mk "+") in
  if is_plus || is_empty s then Value.vtrue else Value.vfalse
130
131
132

(* validate an "integer": a [Failure] raised by [Intervals.V.mk] on the string
   is reported as a Schema_builtin_error for "integer" *)
let validate_integer s =
  let number =
    try Intervals.V.mk (Utf8.get_str s)
    with Failure _ -> simple_type_error "integer"
  in
  Value.Integer number

(* matches from the first "." up to the end of the string.
   NOTE(review): no use of this value is visible in this part of the file *)
let strip_decimal_RE = Pcre.regexp "\\..*$"
137
138
139
140

(* parse a "yyyy-mm-dd" string into the list of its <field name, integer
   value> pairs (year, month, day); raises Schema_builtin_error for "date" if
   the string has not that shape *)
let parse_date =
  let date_rex = Pcre.regexp (add_limits date_RE_raw) in
  fun s ->
    let groups =
      try pcre_extract ~rex:date_rex s
      with Not_found -> simple_type_error "date"
    in
    let component label idx = (qualify label, validate_integer groups.(idx)) in
    [ component "year" 1; component "month" 2; component "day" 3 ]
146
147
148
149

(* parse a "hh:mm:ss" string into the list of its <field name, integer value>
   pairs (hour, minute, second); raises Schema_builtin_error for "time" if the
   string has not that shape *)
let parse_time =
  let time_rex = Pcre.regexp (add_limits time_RE_raw) in
  fun s ->
    let groups =
      try pcre_extract ~rex:time_rex s
      with Not_found -> simple_type_error "time"
    in
    let component label idx = (qualify label, validate_integer groups.(idx)) in
    [ component "hour" 1; component "minute" 2; component "second" 3 ]
155
156
157
158

(* parse a timezone ("Z" or an optionally signed "hh:mm" offset) into the list
   of its <field name, value> pairs (positive, hour, minute); "Z" is returned
   as a positive offset of 0 hours and 0 minutes. Raises Schema_builtin_error
   for "timezone" if the string isn't a timezone. *)
let parse_timezone =
  (* [timezone_RE_raw] is a top-level alternation: anchoring it as it is would
     give "^(Z)|(...)$", where each anchor applies to one alternative only (so
     that e.g. "Zfoo" would match). A non-capturing group makes both anchors
     apply to the whole alternation and leaves the numbering of the capture
     groups used below untouched. *)
  let rex = Pcre.regexp (add_limits ("(?:" ^ timezone_RE_raw ^ ")")) in
  fun s ->
    let abort () = simple_type_error "timezone" in
    let subs = try pcre_extract ~rex s with Not_found -> abort () in
    (* groups: 1 = "Z", 3 = sign, 4 = hours, 5 = minutes *)
    if Utf8.equal subs.(1) (Utf8.mk "Z") then
      [qualify "positive", Value.vtrue;
       qualify "hour", validate_integer (Utf8.mk "0");
       qualify "minute", validate_integer (Utf8.mk "0")]
    else
      [qualify "positive", parse_sign subs.(3);
       qualify "hour", validate_integer subs.(4);
       qualify "minute", validate_integer subs.(5)]
169
170
  (* parse a timezone from a string, if it's empty return the empty list,
  otherwise return a list containing a pair <"timezone", timezone value> *)
171
172
173
174
175
(* optional timezone: nothing for the empty string, otherwise the single
   "timezone" field holding the record built from [parse_timezone] *)
let parse_timezone' s =
  if not (is_empty s) then
    let timezone = Value.vrecord (parse_timezone s) in
    [ qualify "timezone", timezone ]
  else
    []
176

177
(* every string is accepted: the value is built with Value.string_utf8 *)
let validate_string s = Value.string_utf8 s
178
179
180
181
182
183
184
185
186
187
(* normalizedString: white space is normalized in `Replace mode *)
let validate_normalizedString s =
  let replaced = normalize_white_space `Replace s in
  validate_string replaced

(* token: white space is normalized in `Collapse mode *)
let validate_token s =
  let collapsed = normalize_white_space `Collapse s in
  validate_string collapsed

(* list of tokens: the string is split at white space boundaries and each item
   is validated as a token *)
let validate_token_list s =
  let items = split_xml_S s in
  Value.sequence (List.map validate_token items)

(* validate an integer which has to belong to [interval]; [type_name] is the
   name of the type reported in the Schema_builtin_error raised when the string
   can't be converted by [Intervals.V.mk] or the integer is outside of the
   interval *)
let validate_interval interval type_name s =
  let reject () = simple_type_error type_name in
  let number =
    try Intervals.V.mk (Utf8.get_str s) with Failure _ -> reject ()
  in
  if not (Intervals.contains number interval) then reject ()
  else Value.Integer number
195
196
197
198
199
200
201
202
(* validators of the sign-constrained integer types: each one checks
   membership in an interval built by [Intervals.left] or [Intervals.right]
   from 0, -1 or 1 *)
let validate_nonPositiveInteger =
  let range = Intervals.left Intervals.V.zero in
  validate_interval range "nonPositiveInteger"
let validate_negativeInteger =
  let range = Intervals.left Intervals.V.minus_one in
  validate_interval range "negativeInteger"
let validate_nonNegativeInteger =
  let range = Intervals.right Intervals.V.zero in
  validate_interval range "nonNegativeInteger"
let validate_positiveInteger =
  let range = Intervals.right Intervals.V.one in
  validate_interval range "positiveInteger"
203
204
205
206
207
(* validators of the fixed-size integer types: each one checks membership in
   the interval bounded by the corresponding *_l and *_r constants *)
let validate_long =
  let range = Intervals.bounded long_l long_r in
  validate_interval range "long"
let validate_int =
  let range = Intervals.bounded int_l int_r in
  validate_interval range "int"
let validate_short =
  let range = Intervals.bounded short_l short_r in
  validate_interval range "short"
let validate_byte =
  let range = Intervals.bounded byte_l byte_r in
  validate_interval range "byte"
208

209
210
211
212
213
214
215
(* boolean: "true" and "1" are true, "false" and "0" are false, anything else
   raises Schema_builtin_error for "boolean" *)
let validate_bool s =
  let is_literal lit = Utf8.equal s (Utf8.mk lit) in
  if is_literal "true" || is_literal "1" then Value.vtrue
  else if is_literal "false" || is_literal "0" then Value.vfalse
  else simple_type_error "boolean"
216

217
(* validate a duration ("PnYnMnDTnHnMnS" with an optional leading sign). The
   result is a record with the "positive" field followed by one integer field
   for each component actually present in the string. Raises
   Schema_builtin_error for "duration" on invalid input, which includes:
   - a duration without any component (e.g. "P")
   - a "T" designator not followed by any time component (e.g. "P1YT")
   since XML Schema requires at least one component, and "T" to be present if
   and only if a time component is.
   TODO seconds should be a decimal *)
let validate_duration =
  let rex = pcre_regexp
  "^([+-])?P((\\d+)Y)?((\\d+)M)?((\\d+)D)?(T((\\d+)H)?((\\d+)M)?((\\d+)S)?)?$"
  in
  (* <field name, capture group holding its digits>, in output order *)
  let components =
    [ "year", 3; "month", 5; "day", 7; "hour", 10; "minute", 12; "second", 14 ]
  in
  fun s ->
    let abort () = simple_type_error "duration" in
    let subs = try pcre_extract ~rex s with Not_found -> abort () in
    let given =
      List.filter (fun (_, idx) -> not (is_empty subs.(idx))) components
    in
    (* the regular expression alone accepts a bare "P" *)
    (match given with [] -> abort () | _ :: _ -> ());
    (* group 8 is the whole "T..." part: when it is exactly "T" no time
       component follows the designator *)
    if Utf8.equal subs.(8) (Utf8.mk "T") then abort ();
    try
      let fields =
        (qualify "positive", parse_sign subs.(1)) ::
        List.map
          (fun (label, idx) -> qualify label, validate_integer subs.(idx))
          given
      in
      Value.vrecord fields
    with Schema_builtin_error _ -> abort ()
242
243
244
245
246
247
248

(* validate a dateTime: optional sign, date, "T", time, optional timezone. The
   result is a record with "positive", the date fields, the time fields and,
   if present, "timezone". Every failure is reported as a Schema_builtin_error
   for "dateTime". *)
let validate_dateTime =
  (* the component patterns are stripped of their own groups, so that the only
     groups left are: 1 = sign, 2 = date, 3 = time, 4 = timezone *)
  let date_src = strip_parens date_RE_raw in
  let time_src = strip_parens time_RE_raw in
  let timezone_src = strip_parens timezone_RE_raw in
  let rex =
    Pcre.regexp
      (sprintf "^([+-])?(%s)T(%s)(%s)?$" date_src time_src timezone_src)
  in
  fun s ->
    let abort () = simple_type_error "dateTime" in
    let subs = try pcre_extract ~rex s with Not_found -> abort () in
    try
      Value.vrecord
        (List.concat
           [ [ qualify "positive", parse_sign subs.(1) ];
             parse_date subs.(2);
             parse_time subs.(3);
             parse_timezone' subs.(4) ])
    with Schema_builtin_error _ -> abort ()
260
261
262
263

(* validate a gYearMonth: the result is a record with "positive", "year",
   "month" and, if present, "timezone". Every failure is reported as a
   Schema_builtin_error for "gYearMonth". *)
let validate_gYearMonth =
  let rex = Pcre.regexp (add_limits gYearMonth_RE_raw) in
  fun s ->
    let abort () = simple_type_error "gYearMonth" in
    let subs = try pcre_extract ~rex s with Not_found -> abort () in
    try
      (* groups: 1 = sign, 2 = year, 3 = month, 4 = whole timezone *)
      Value.vrecord
        ((qualify "positive", parse_sign subs.(1))
         :: (qualify "year", validate_integer subs.(2))
         :: (qualify "month", validate_integer subs.(3))
         :: parse_timezone' subs.(4))
    with Schema_builtin_error _ -> abort ()
275
276
277
278

(* validate a gYear: the result is a record with "positive", "year" and, if
   present, "timezone". Every failure is reported as a Schema_builtin_error
   for "gYear". *)
let validate_gYear =
  let rex = Pcre.regexp (add_limits gYear_RE_raw) in
  fun s ->
    let abort () = simple_type_error "gYear" in
    let subs = try pcre_extract ~rex s with Not_found -> abort () in
    try
      (* groups: 1 = sign, 2 = year, 3 = whole timezone *)
      Value.vrecord
        ((qualify "positive", parse_sign subs.(1))
         :: (qualify "year", validate_integer subs.(2))
         :: parse_timezone' subs.(3))
    with Schema_builtin_error _ -> abort ()
289
290
291
292

(* validate a gMonthDay: the result is a record with "month", "day" and, if
   present, "timezone". Every failure is reported as a Schema_builtin_error
   for "gMonthDay". *)
let validate_gMonthDay =
  let rex = Pcre.regexp (add_limits gMonthDay_RE_raw) in
  fun s ->
    let abort () = simple_type_error "gMonthDay" in
    let subs = try pcre_extract ~rex s with Not_found -> abort () in
    try
      (* groups: 1 = month, 2 = day, 3 = whole timezone *)
      Value.vrecord
        ((qualify "month", validate_integer subs.(1))
         :: (qualify "day", validate_integer subs.(2))
         :: parse_timezone' subs.(3))
    with Schema_builtin_error _ -> abort ()
303
304
305
306

(* validate a gDay: the result is a record with "day" and, if present,
   "timezone". Every failure is reported as a Schema_builtin_error for
   "gDay". *)
let validate_gDay =
  let rex = Pcre.regexp (add_limits gDay_RE_raw) in
  fun s ->
    let abort () = simple_type_error "gDay" in
    let subs = try pcre_extract ~rex s with Not_found -> abort () in
    try
      (* groups: 1 = day, 2 = whole timezone *)
      let day_field = [ qualify "day", validate_integer subs.(1) ] in
      Value.vrecord (day_field @ parse_timezone' subs.(2))
    with Schema_builtin_error _ -> abort ()
316
317
318
319

(* validate a gMonth: the result is a record with "month" and, if present,
   "timezone". Every failure is reported as a Schema_builtin_error for
   "gMonth". *)
let validate_gMonth =
  let rex = Pcre.regexp (add_limits gMonth_RE_raw) in
  fun s ->
    let abort () = simple_type_error "gMonth" in
    let subs = try pcre_extract ~rex s with Not_found -> abort () in
    try
      (* groups: 1 = month, 2 = timezone *)
      let month_field = [ qualify "month", validate_integer subs.(1) ] in
      Value.vrecord (month_field @ parse_timezone' subs.(2))
    with Schema_builtin_error _ -> abort ()
329
330
331
332
333
334

(* validate a time: "hh:mm:ss" followed by an optional timezone. The result is
   a record with the time fields and, if present, "timezone". Every failure is
   reported as a Schema_builtin_error for "time". *)
let validate_time =
  (* groups left after stripping: 1 = time, 2 = timezone *)
  let time_src = strip_parens time_RE_raw in
  let timezone_src = strip_parens timezone_RE_raw in
  let rex = Pcre.regexp (sprintf "^(%s)(%s)?$" time_src timezone_src) in
  fun s ->
    let abort () = simple_type_error "time" in
    let subs = try pcre_extract ~rex s with Not_found -> abort () in
    try
      Value.vrecord (parse_time subs.(1) @ parse_timezone' subs.(2))
    with Schema_builtin_error _ -> abort ()
345
346
347
348
349
350

(* validate a date: optional "-", "yyyy-mm-dd" and an optional timezone. The
   result is a record with "positive", the date fields and, if present,
   "timezone". Every failure is reported as a Schema_builtin_error for
   "date". *)
let validate_date =
  (* groups left after stripping: 1 = sign, 2 = date, 3 = timezone *)
  let date_src = strip_parens date_RE_raw in
  let timezone_src = strip_parens timezone_RE_raw in
  let rex = Pcre.regexp (sprintf "^(-)?(%s)(%s)?$" date_src timezone_src) in
  fun s ->
    let abort () = simple_type_error "date" in
    let subs = try pcre_extract ~rex s with Not_found -> abort () in
    try
      Value.vrecord
        (List.concat
           [ [ qualify "positive", parse_sign subs.(1) ];
             parse_date subs.(2);
             parse_timezone' subs.(3) ])
    with Schema_builtin_error _ -> abort ()
362
363

(* validate a hexBinary: the string must contain an even number of hexadecimal
   digits; each pair of digits is decoded to one character of the resulting
   string. Raises Schema_builtin_error for "hexBinary" on an odd length or on
   any character which isn't a hexadecimal digit.
   NOTE(review): the decoded bytes are handed to [Utf8.mk] as they are, with no
   check that they form valid UTF-8 — confirm this is what callers expect *)
let validate_hexBinary s =
  let s = Utf8.get_str s in
  let len = String.length s in
  let is_hex_digit = function
    | '0' .. '9' | 'a' .. 'f' | 'A' .. 'F' -> true
    | _ -> false
  in
  if len mod 2 <> 0 then
    simple_type_error "hexBinary";
  let decoded = Buffer.create (len / 2) in
  let rec aux idx =
    if idx < len then begin
      (* [len] is even here, so [idx + 1] is always a valid index *)
      let ms = s.[idx] and ls = s.[idx + 1] in
      (* [char_of_hex] asserts on non hexadecimal input: reject it here so
         that invalid data is reported as a validation error *)
      if not (is_hex_digit ms && is_hex_digit ls) then
        simple_type_error "hexBinary";
      Buffer.add_char decoded (char_of_hex ms ls);
      aux (idx + 2)
    end
  in
  aux 0;
  validate_string (Utf8.mk (Buffer.contents decoded))
378

379
380
381
(* validate a base64Binary: the string is decoded with
   [Netencoding.Base64.decode] and the result is returned as a string value.
   Raises Schema_builtin_error for "base64Binary" when the decoder rejects the
   input with [Invalid_argument], so that callers only have to deal with the
   validation exception. *)
let validate_base64Binary s =
  let s = Utf8.get_str s in
  let decoded =
    try Netencoding.Base64.decode s
    with Invalid_argument _ -> simple_type_error "base64Binary"
  in
  validate_string (Utf8.mk decoded)
382
383

(* validate an anyURI: the string is parsed with [Neturl.url_of_string] (using
   [Neturl.ip_url_syntax]) and printed back with [Neturl.string_of_url];
   [Neturl.Malformed_URL] is reported as a Schema_builtin_error for "anyURI" *)
let validate_anyURI s =
  let raw = Utf8.get_str s in
  let printed =
    try Neturl.string_of_url (Neturl.url_of_string Neturl.ip_url_syntax raw)
    with Neturl.Malformed_URL -> simple_type_error "anyURI"
  in
  validate_string (Utf8.mk printed)
389

390
391
392
  (** {2 API backend} *)

(* table of the built-in types: maps a qualified type name to a triple
   <simple type definition, CDuce type, validation function>; filled once by
   [fill] *)
let builtins = Hashtbl.create 50
393
(* register [spec] in [builtins] under the qualified version of [name] *)
let reg name spec =
  let key = add_xsd_prefix name in
  Hashtbl.add builtins key spec
394
(* register the type [alias] as a copy of the already registered type [name]:
   same CDuce type and validator, same definition but for the name which
   becomes [alias]. Raises Not_found if [name] isn't registered. *)
let alias alias name =
  let alias_qname = add_xsd_prefix alias in
  let target_qname = add_xsd_prefix name in
  let (st_def, descr, validator) = Hashtbl.find builtins target_qname in
  let renamed_def =
    match st_def with
    | Primitive _ -> Primitive alias_qname
    | Derived (_, variety, facets, base) ->
        Derived (Some alias_qname, variety, facets, base)
  in
  Hashtbl.add builtins alias_qname (renamed_def, descr, validator)
(* build (without registering it) the definition of the type [name], derived
   from the already registered type [basename]: the variety is the one of the
   base, the facets are the base ones merged with [new_facets]. Raises
   Not_found if [basename] isn't registered. *)
let restrict' name basename new_facets =
  let derived_qname = add_xsd_prefix name in
  let base_qname = add_xsd_prefix basename in
  let (base_def, _, _) = Hashtbl.find builtins base_qname in
  let variety = variety_of_simple_type_definition base_def in
  let inherited_facets = facets_of_simple_type_definition base_def in
  let facets = merge_facets inherited_facets new_facets in
  Derived (Some derived_qname, variety, facets, base_def)
(* build (without registering it) the definition of the list type [name] whose
   items have the already registered type [itemname]. Raises Not_found if
   [itemname] isn't registered. *)
let list' name itemname =
  let list_qname = add_xsd_prefix name in
  let item_qname = add_xsd_prefix itemname in
  let (item_def, _, _) = Hashtbl.find builtins item_qname in
  Derived (Some list_qname, List item_def, no_facets, item_def)

(* fill the "builtins" hashtbl with all the built-in types. The order of the
   registrations matters: [alias], [restrict'] and [list'] look their base
   type up in the table and raise Not_found if it hasn't been registered
   yet. *)
let fill () =
  let primitive name = Primitive (add_xsd_prefix name) in

  (* primitive builtins *)

  reg "anySimpleType"
    (primitive "anySimpleType", Builtin_defs.string, validate_string);
  alias "anyType" "anySimpleType";  (* TODO BUG HERE *)
  reg "string"
    (primitive "string", Builtin_defs.string, validate_string);

    (* TODO following types not yet supported (see "unsupported" above) *)
  alias "decimal" "string";
  alias "float" "string";
  alias "double" "string";
  alias "NOTATION" "string";
  alias "QName" "string";

  reg "boolean"
    (primitive "boolean", Builtin_defs.bool, validate_bool);
  reg "hexBinary"
    (primitive "hexBinary", Builtin_defs.string, validate_hexBinary);
  reg "base64Binary"
    (primitive "base64Binary", Builtin_defs.string, validate_base64Binary);
  reg "anyURI"
    (primitive "anyURI", Builtin_defs.string, validate_anyURI);
  reg "duration"
    (primitive "duration", duration_type, validate_duration);
  reg "dateTime"
    (primitive "dateTime", dateTime_type, validate_dateTime);
  reg "time"
    (primitive "time", time_type, validate_time);
  reg "date"
    (primitive "date", date_type, validate_date);
  reg "gYearMonth"
    (primitive "gYearMonth", gYearMonth_type, validate_gYearMonth);
  reg "gYear"
    (primitive "gYear", gYear_type, validate_gYear);
  reg "gMonthDay"
    (primitive "gMonthDay", gMonthDay_type, validate_gMonthDay);
  reg "gDay"
    (primitive "gDay", gDay_type, validate_gDay);
  reg "gMonth"
    (primitive "gMonth", gMonth_type, validate_gMonth);

  (* derived builtins *)

  reg "integer"
    (restrict' "integer" "decimal" no_facets, (* fake restriction *)
    Builtin_defs.int, validate_integer);
  reg "nonPositiveInteger"
    (restrict' "nonPositiveInteger" "integer"
      { no_facets with maxInclusive = Some (Value.Integer zero, false) },
    nonPositiveInteger_type, validate_nonPositiveInteger);
  reg "negativeInteger"
    (restrict' "negativeInteger" "nonPositiveInteger"
      { no_facets with maxInclusive = Some (Value.Integer minus_one, false) },
    negativeInteger_type, validate_negativeInteger);
  reg "nonNegativeInteger"
    (restrict' "nonNegativeInteger" "integer"
      { no_facets with minInclusive = Some (Value.Integer zero, false) },
    nonNegativeInteger_type, validate_nonNegativeInteger);
  reg "positiveInteger"
    (restrict' "positiveInteger" "nonNegativeInteger"
      { no_facets with minInclusive = Some (Value.Integer one, false) },
    positiveInteger_type, validate_positiveInteger);
  reg "long"
    (restrict' "long" "integer"
      { no_facets with
          minInclusive = Some (Value.Integer long_l, false);
          maxInclusive = Some (Value.Integer long_r, false)},
    long_type, validate_long);
  reg "int"
    (restrict' "int" "long"
      { no_facets with
          minInclusive = Some (Value.Integer int_l, false);
          maxInclusive = Some (Value.Integer int_r, false)},
    int_type, validate_int);
  reg "short"
    (restrict' "short" "int"
      { no_facets with
          minInclusive = Some (Value.Integer short_l, false);
          maxInclusive = Some (Value.Integer short_r, false)},
    short_type, validate_short);
  reg "byte"
    (restrict' "byte" "short"
      { no_facets with
          minInclusive = Some (Value.Integer byte_l, false);
          maxInclusive = Some (Value.Integer byte_r, false)},
    (* bytes must be checked against the byte bounds: with the validator of
       "short" any value between -32768 and 32767 would be accepted *)
    byte_type, validate_byte);
  reg "normalizedString"
    (restrict' "normalizedString" "string"
      { no_facets with whiteSpace = `Replace, false },
    Builtin_defs.string, validate_normalizedString);
  reg "token"
    (restrict' "token" "normalizedString"
      { no_facets with whiteSpace = `Collapse, false },
    Builtin_defs.string, validate_token);
  alias "language" "token";
  alias "Name" "token";
  alias "NMTOKEN" "token";
  alias "NCName" "token";
  alias "ID" "token";
  alias "IDREF" "token";
  alias "ENTITY" "token";
  reg "NMTOKENS"
    (list' "NMTOKENS" "token",
    string_list_type, validate_token_list);
  alias "IDREFS" "NMTOKENS";
  alias "ENTITIES" "NMTOKENS"
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543

(* fill the table of builtins when the module is initialized; a Not_found here
   means that [fill] refers to a type before registering it *)
let () =
  try fill ()
  with Not_found -> assert false

  (** {2 API} *)

(* true iff a built-in type is registered under the given qualified name *)
let is_builtin name = Hashtbl.mem builtins name

(* apply [f] to the definition of each registered built-in type *)
let iter_builtin f =
  let visit _ (type_def, _, _) = f type_def in
  Hashtbl.iter visit builtins

(* entry registered under the given qualified name; raises Not_found if there
   is none *)
let lookup name = Hashtbl.find builtins name

(* projections over triples; from here on [fst] and [snd] shadow the standard
   ones, which work on pairs *)
let fst (x,_,_) = x
let snd (_,y,_) = y
let trd (_,_,z) = z

(* simple type definition of the built-in type [name]; raises Not_found if
   [name] isn't registered *)
let get_builtin name          = fst (lookup name)
544
545
546
547
548
549
(* CDuce type of the built-in type [name]; raises Not_found if [name] isn't
   registered. For the types listed in [unsupported] a warning is printed on
   standard error, since they are registered as aliases of "string". *)
let cd_type_of_builtin name   =
  if List.mem name unsupported then
    Format.fprintf Format.err_formatter
      "Warning: %s isn't properly supported and is treated as a string by CDuce@."
      (Utf8.get_str name);
  snd (lookup name)
550
(* validation function of the built-in type [name]; raises Not_found if [name]
   isn't registered *)
let validate_builtin name     = trd (lookup name)
551