schema_builtin.ml 18.8 KB
Newer Older
1

2
open Printf
3

4
5
open Encodings
open Encodings.Utf8.Pcre
6
open Schema_common
7
open Schema_types
8
9
10
11
12

(* TODO dates: boundary checks (e.g. 95/26/2003) *)
(* TODO a lot of almost cut-and-paste code, expecially in gFoo types validation
*)

13
14
  (** {2 Aux/Misc stuff} *)

15
16
17
18
let add_xsd_prefix s = Schema_xml.add_xsd_prefix (Utf8.mk s)

let is_empty s = Utf8.equal s (Utf8.mk "")

19
20
21
22
23
24
25
26
27
28
29
30
let zero = Intervals.V.zero
let one = (Intervals.V.succ Intervals.V.zero)
let minus_one = (Intervals.V.pred Intervals.V.zero)
let long_l = (Intervals.V.mk "-9223372036854775808")
let long_r = (Intervals.V.mk "9223372036854775807")
let int_l = (Intervals.V.mk "-2147483648")
let int_r = (Intervals.V.mk "2147483647")
let short_l = (Intervals.V.mk "-32768")
let short_r = (Intervals.V.mk "32767")
let byte_l = (Intervals.V.mk "-128")
let byte_r = (Intervals.V.mk "127")

31
let xml_S_RE = pcre_regexp "[ \\t\\r\\n]+"
32
  (* split a string at XML recommendation "S" production boundaries *)
33
34
let split_xml_S s = pcre_split ~rex:xml_S_RE s
let norm_RE = pcre_regexp "[\\t\\r\\n]"
35
36
37
38
39
40
41
42
43
44
45

let char_of_hex =
  let int_of_hex_char = function
    | '0' -> 0 | '1' -> 1 | '2' -> 2 | '3' -> 3 | '4' -> 4 | '5' -> 5 | '6' -> 6
    | '7' -> 7 | '8' -> 8 | '9' -> 9 | 'a' | 'A' -> 10 | 'b' | 'B' -> 11
    | 'c' | 'C' -> 12 | 'd' | 'D' -> 13 | 'e' | 'E' -> 14 | 'f' | 'F' -> 15
    | _ -> assert false
  in
    (* most significative, least significative *)
  fun ms ls -> Char.unsafe_chr (int_of_hex_char ms * 16 + int_of_hex_char ls)

46
47
let strip_parens s = Pcre.replace ~pat:"[()]" s
let add_limits s = "^" ^ s ^ "$"
48

49
50
exception Schema_builtin_error of Utf8.t
let simple_type_error name = raise (Schema_builtin_error (add_xsd_prefix name))
51

52
let qualify s = (Ns.empty, Encodings.Utf8.mk s)
53

54
55
  (* regular expressions used to validate built-in types *)

56
57
58
59
60
61
62
63
64
65
let timezone_RE_raw = "(Z)|(([+-])?(\\d{2}):(\\d{2}))"
let date_RE_raw = "(\\d{4,})-(\\d{2})-(\\d{2})"
let time_RE_raw = "(\\d{2}):(\\d{2}):(\\d{2})"

let gYearMonth_RE_raw = sprintf "(-)?(\\d{4,})-(\\d{2})(%s)?" timezone_RE_raw
let gYear_RE_raw = sprintf "(-)?(\\d{4,})(%s)?" timezone_RE_raw
let gMonthDay_RE_raw = sprintf "--(\\d{2})-(\\d{2})(%s)?" timezone_RE_raw
let gDay_RE_raw = sprintf "---(\\d{2})(%s)?" timezone_RE_raw
let gMonth_RE_raw = "--(\\d{2})--(%s)?"

66
67
  (** {2 CDuce types} *)

68
69
70
71
72
73
74
let positive_field = false, qualify "positive", Builtin_defs.bool
let year_field = false, qualify "year", Builtin_defs.int
let month_field = false, qualify "month", Builtin_defs.int
let day_field = false, qualify "day", Builtin_defs.int
let hour_field = false, qualify "hour", Builtin_defs.int
let minute_field = false, qualify "minute", Builtin_defs.int
let second_field = false, qualify "second", Builtin_defs.int
75
76
77
  (* TODO this should be a decimal *)
let time_type_fields = [ hour_field; minute_field; second_field ]
let date_type_fields = [ year_field; month_field; day_field ]
78
79
80
81
82

  (* TODO the constraint that at least one part should be present isn't easily
  expressible with CDuce types *)
let duration_type = Types.rec_of_list' [
  positive_field;
83
84
85
86
87
88
  true, qualify "year", Builtin_defs.int;
  true, qualify "month", Builtin_defs.int;
  true, qualify "day", Builtin_defs.int;
  true, qualify "hour", Builtin_defs.int;
  true, qualify "minute", Builtin_defs.int;
  true, qualify "second", Builtin_defs.int; (* TODO this should be a decimal *)
89
]
90
let timezone_type = Types.rec_of_list' [
91
  positive_field;
92
  hour_field; minute_field
93
]
94
let timezone_type_fields = [ true, qualify "timezone", timezone_type ]
95
96
let time_type = Types.rec_of_list' (time_type_fields @ timezone_type_fields)
let date_type = Types.rec_of_list' (positive_field :: date_type_fields)
97
98
99
100
101
let dateTime_type =
  Types.rec_of_list' (positive_field ::
    (date_type_fields @ time_type_fields @ timezone_type_fields))
let gYearMonth_type = Types.rec_of_list' [
  positive_field; year_field; month_field
102
103
104
105
106
]
let gYear_type = Types.rec_of_list' [ positive_field; year_field ]
let gMonthDay_type = Types.rec_of_list' [ month_field; day_field ]
let gDay_type = Types.rec_of_list' [ day_field ]
let gMonth_type = Types.rec_of_list' [ month_field ]
107

108
109
110
111
let nonPositiveInteger_type = Builtin_defs.non_pos_int
let negativeInteger_type = Builtin_defs.neg_int
let nonNegativeInteger_type = Builtin_defs.non_neg_int
let positiveInteger_type = Builtin_defs.pos_int
112
113
114
115
let long_type = Builtin_defs.long_int
let int_type = Builtin_defs.int_int
let short_type = Builtin_defs.short_int
let byte_type = Builtin_defs.byte_int
116

117
118
119
let string_list_type = Sequence.star Builtin_defs.string

  (** {2 Validation functions (string -> Value.t)} *)
120

121
122
123
124
125
let parse_sign s =
  if Utf8.equal s (Utf8.mk "+") || Utf8.equal s (Utf8.mk "") then
    Value.vtrue
  else
    Value.vfalse
126
127
128

let validate_integer s =
  try
129
    Value.Integer (Intervals.V.mk (Utf8.get_str s))
130
131
132
  with Failure _ -> simple_type_error "integer"

let strip_decimal_RE = Pcre.regexp "\\..*$"
133
134
let validate_decimal s =
  validate_integer (pcre_replace ~rex:strip_decimal_RE s)
135
136
137
138

let parse_date =
  let rex = Pcre.regexp (add_limits date_RE_raw) in
  fun s ->
139
  let abort () = simple_type_error "date" in
140
141
142
143
  let subs = try pcre_extract ~rex s with Not_found -> abort () in
  [ qualify "year", validate_integer subs.(1);
    qualify "month", validate_integer subs.(2);
    qualify "day", validate_integer subs.(3) ]
144
145
146
147

let parse_time =
  let rex = Pcre.regexp (add_limits time_RE_raw) in
  fun s ->
148
  let abort () = simple_type_error "time" in
149
150
151
152
  let subs = try pcre_extract ~rex s with Not_found -> abort () in
  [ qualify "hour", validate_integer subs.(1);
    qualify "minute", validate_integer subs.(2);
    qualify "second", validate_integer subs.(3) ]
153
154
155
156

let parse_timezone =
  let rex = Pcre.regexp (add_limits timezone_RE_raw) in
  fun s ->
157
158
159
160
161
162
163
164
165
166
  let abort () = simple_type_error "timezone" in
  let subs = try pcre_extract ~rex s with Not_found -> abort () in
  if Utf8.equal subs.(1) (Utf8.mk "Z") then
    [qualify "positive", Value.vtrue;
     qualify "hour", validate_integer (Utf8.mk "0");
     qualify "minute", validate_integer (Utf8.mk "0")]
  else
    [qualify "positive", parse_sign subs.(3);
     qualify "hour", validate_integer subs.(4);
     qualify "minute", validate_integer subs.(5)]
167
168
  (* parse a timezone from a string, if it's empty return the empty list,
  otherwise return a list containing a pair <"timezone", timezone value> *)
169
170
171
172
173
let parse_timezone' s =
  if is_empty s then
    []
  else
    [ qualify "timezone", Value.vrecord (parse_timezone s) ]
174

175
let validate_string s = Value.string_utf8 s
176
177
178
179
180
181
182
183
184
185
let validate_normalizedString s =
  validate_string (normalize_white_space `Replace s)
let validate_token s =
  validate_string (normalize_white_space `Collapse s)
let validate_token_list s =
  Value.sequence (List.map validate_token (split_xml_S s))

let validate_interval interval type_name s =
  let integer =
    try
186
      Intervals.V.mk (Utf8.get_str s)
187
    with Failure _ -> simple_type_error type_name
188
189
190
191
  in
  if Intervals.contains integer interval then
    Value.Integer integer
  else
192
    simple_type_error type_name
193
194
195
196
197
198
199
200
let validate_nonPositiveInteger =
  validate_interval (Intervals.left Intervals.V.zero) "nonPositiveInteger"
let validate_negativeInteger =
  validate_interval (Intervals.left Intervals.V.minus_one) "negativeInteger"
let validate_nonNegativeInteger =
  validate_interval (Intervals.right Intervals.V.zero) "nonNegativeInteger"
let validate_positiveInteger =
  validate_interval (Intervals.right Intervals.V.one) "positiveInteger"
201
202
203
204
205
let validate_long = validate_interval (Intervals.bounded long_l long_r) "long"
let validate_int = validate_interval (Intervals.bounded int_l int_r) "int"
let validate_short =
  validate_interval (Intervals.bounded short_l short_r) "short"
let validate_byte = validate_interval (Intervals.bounded byte_l byte_r) "byte"
206

207
208
209
210
211
212
213
let validate_bool s =
  if Utf8.equal s (Utf8.mk "true") || Utf8.equal s (Utf8.mk "1") then
    Value.vtrue
  else if Utf8.equal s (Utf8.mk "false") || Utf8.equal s (Utf8.mk "0") then
    Value.vfalse
  else
    simple_type_error "boolean"
214

215
let validate_duration =
216
  let rex = pcre_regexp
217
218
219
  "^([+-])?P((\\d+)Y)?((\\d+)M)?((\\d+)D)?(T((\\d+)H)?((\\d+)M)?((\\d+)S)?)?$"
  in
  fun s ->
220
  let abort () = simple_type_error "duration" in
221
  let subs = try pcre_extract ~rex s with Not_found -> abort () in
222
223
  try
    let fields =
224
225
226
227
228
229
230
231
232
233
234
235
236
      [qualify "positive", parse_sign subs.(1) ] @
      (if is_empty subs.(3) then []
      else [qualify "year", validate_integer subs.(3)]) @
      (if is_empty subs.(5) then []
      else [qualify "month", validate_integer subs.(5)]) @
      (if is_empty subs.(7) then []
      else [qualify "day", validate_integer subs.(7)]) @
      (if is_empty subs.(10) then []
      else [qualify "hour", validate_integer subs.(10)]) @
      (if is_empty subs.(12) then []
      else [qualify "minute", validate_integer subs.(12)]) @
      (if is_empty subs.(14) then []
      else [qualify "second", validate_integer subs.(14)])
237
238
    in
    Value.vrecord fields
239
  with Schema_builtin_error _ -> abort ()
240
241
242
243
244
245
246

let validate_dateTime =
  let rex = Pcre.regexp (sprintf "^([+-])?(%s)T(%s)(%s)?$"
    (strip_parens date_RE_raw) (strip_parens time_RE_raw)
    (strip_parens timezone_RE_raw))
  in
  fun s ->
247
  let abort () = simple_type_error "dateTime" in
248
  let subs = try pcre_extract ~rex s with Not_found -> abort () in
249
250
  try
    let fields =
251
      [ qualify "positive", parse_sign subs.(1) ] @
252
253
254
255
256
      parse_date subs.(2) @
      parse_time subs.(3) @
      parse_timezone' subs.(4)
    in
    Value.vrecord fields
257
  with Schema_builtin_error _ -> abort ()
258
259
260
261

let validate_gYearMonth =
  let rex = Pcre.regexp (add_limits gYearMonth_RE_raw) in
  fun s ->
262
    let abort () = simple_type_error "gYearMonth" in
263
    let subs = try pcre_extract ~rex s with Not_found -> abort () in
264
265
    try
      let fields = [
266
267
268
        qualify "positive", parse_sign subs.(1);
        qualify "year", validate_integer subs.(2);
        qualify "month", validate_integer subs.(3)
269
270
271
      ] @ parse_timezone' subs.(4)
      in
      Value.vrecord fields
272
    with Schema_builtin_error _ -> abort ()
273
274
275
276

let validate_gYear =
  let rex = Pcre.regexp (add_limits gYear_RE_raw) in
  fun s ->
277
    let abort () = simple_type_error "gYear" in
278
    let subs = try pcre_extract ~rex s with Not_found -> abort () in
279
280
    try
      let fields = [
281
282
        qualify "positive", parse_sign subs.(1);
        qualify "year", validate_integer subs.(2);
283
284
285
      ] @ parse_timezone' subs.(3)
      in
      Value.vrecord fields
286
    with Schema_builtin_error _ -> abort ()
287
288
289
290

let validate_gMonthDay =
  let rex = Pcre.regexp (add_limits gMonthDay_RE_raw) in
  fun s ->
291
    let abort () = simple_type_error "gMonthDay" in
292
    let subs = try pcre_extract ~rex s with Not_found -> abort () in
293
294
    try
      let fields = [
295
296
        qualify "month", validate_integer subs.(1);
        qualify "day", validate_integer subs.(2);
297
298
299
      ] @ parse_timezone' subs.(3)
      in
      Value.vrecord fields
300
    with Schema_builtin_error _ -> abort ()
301
302
303
304

let validate_gDay =
  let rex = Pcre.regexp (add_limits gDay_RE_raw) in
  fun s ->
305
    let abort () = simple_type_error "gDay" in
306
    let subs = try pcre_extract ~rex s with Not_found -> abort () in
307
308
    try
      let fields =
309
310
        (qualify "day", validate_integer subs.(1)) ::
          (parse_timezone' subs.(2))
311
312
      in
      Value.vrecord fields
313
    with Schema_builtin_error _ -> abort ()
314
315
316
317

let validate_gMonth =
  let rex = Pcre.regexp (add_limits gMonth_RE_raw) in
  fun s ->
318
    let abort () = simple_type_error "gMonth" in
319
    let subs = try pcre_extract ~rex s with Not_found -> abort () in
320
321
    try
      let fields =
322
323
        (qualify "month", validate_integer subs.(1)) ::
          (parse_timezone' subs.(2))
324
325
      in
      Value.vrecord fields
326
    with Schema_builtin_error _ -> abort ()
327
328
329
330
331
332

let validate_time =
  let rex = Pcre.regexp (sprintf "^(%s)(%s)?$" (strip_parens time_RE_raw)
    (strip_parens timezone_RE_raw))
  in
  fun s ->
333
  let abort () = simple_type_error "time" in
334
  let subs = try pcre_extract ~rex s with Not_found -> abort () in
335
336
337
  try
    let fields =
      parse_time subs.(1) @
338
339
      (if is_empty subs.(2) then []
      else [ qualify "timezone", Value.vrecord (parse_timezone subs.(2)) ])
340
341
    in
    Value.vrecord fields
342
  with Schema_builtin_error _ -> abort ()
343
344
345
346
347
348

let validate_date =
  let rex = Pcre.regexp (sprintf "^(-)?(%s)(%s)?$" (strip_parens date_RE_raw)
    (strip_parens timezone_RE_raw))
  in
  fun s ->
349
  let abort () = simple_type_error "date" in
350
  let subs = try pcre_extract ~rex s with Not_found -> abort () in
351
352
  try
    let fields =
353
      [ qualify "positive", parse_sign subs.(1) ] @
354
      parse_date subs.(2) @
355
356
      (if is_empty subs.(3) then []
      else [ qualify "timezone", Value.vrecord (parse_timezone subs.(3)) ])
357
358
    in
    Value.vrecord fields
359
  with Schema_builtin_error _ -> abort ()
360
361

let validate_hexBinary s =
362
  let s = Utf8.get_str s in
363
364
  let len = String.length s in
  if len mod 2 <> 0 then
365
    simple_type_error "hexBinary";
366
367
368
369
370
371
372
373
374
  let res = String.create (len / 2) in
  let rec aux idx =
    if idx < len then begin
      String.unsafe_set res (idx / 2)
        (char_of_hex (String.unsafe_get s idx) (String.unsafe_get s (idx + 1)));
      aux (idx + 2)
    end
  in
  aux 0;
375
  validate_string (Utf8.mk res)
376

377
378
379
let validate_base64Binary s =
  let s = Utf8.get_str s in
  validate_string (Utf8.mk (Netencoding.Base64.decode s))
380
381

let validate_anyURI s =
382
  let s = Utf8.get_str s in
383
  try
384
385
    validate_string (Utf8.mk (Neturl.string_of_url (Neturl.url_of_string
      Neturl.ip_url_syntax s)))
386
  with Neturl.Malformed_URL -> simple_type_error "anyURI"
387

388
389
390
  (** {2 API backend} *)

let builtins = Hashtbl.create 50
391
let reg name spec = Hashtbl.add builtins (add_xsd_prefix name) spec
392
let alias alias name =
393
  let (alias, name) = (add_xsd_prefix alias, add_xsd_prefix name) in
394
395
396
397
398
399
400
401
402
403
  Hashtbl.add builtins alias
    (let (st_def, descr, validator) = Hashtbl.find builtins name in
    let new_def =
      match st_def with
      | Primitive _ -> Primitive alias
      | Derived (_, variety, facets, base) ->
          Derived (Some alias, variety, facets, base)
    in
    (new_def, descr, validator))
let restrict' name basename new_facets =
404
  let (name, basename) = (add_xsd_prefix name, add_xsd_prefix basename) in
405
406
407
408
409
410
411
  let (base, _, _) = Hashtbl.find builtins basename in
  let variety = variety_of_simple_type_definition base in
  let facets =
    merge_facets (facets_of_simple_type_definition base) new_facets
  in
  Derived (Some name, variety, facets, base)
let list' name itemname =
412
  let (name, itemname) = (add_xsd_prefix name, add_xsd_prefix itemname) in
413
414
415
416
  let (base, _, _) = Hashtbl.find builtins itemname in
  Derived (Some name, List base, no_facets, base)

let fill () = (* fill "builtins" hashtbl *)
417
  let primitive name = Primitive (add_xsd_prefix name) in
418

419
  (* TODO missing built-in simple types: float, double, QName, NOTATION *)
420
421
422

  (* primitive builtins *)

423
424
425
426
427
428
  reg "anySimpleType"
    (primitive "anySimpleType", Builtin_defs.string, validate_string);
  alias "anyType" "anySimpleType";
  reg "string"
    (primitive "string", Builtin_defs.string, validate_string);
  reg "decimal"
429
    (* collapsed in CDuce to an integer, since CDuce has no decimal numbers *)
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
    (primitive "decimal", Builtin_defs.int, validate_decimal);
  reg "boolean"
    (primitive "boolean", Builtin_defs.bool, validate_bool);
  reg "hexBinary"
    (primitive "hexBinary", Builtin_defs.string, validate_hexBinary);
  reg "base64Binary"
    (primitive "base64Binary", Builtin_defs.string, validate_base64Binary);
  reg "anyURI"
    (primitive "anyURI", Builtin_defs.string, validate_anyURI);
  reg "duration"
    (primitive "duration", duration_type, validate_duration);
  reg "dateTime"
    (primitive "dateTime", dateTime_type, validate_dateTime);
  reg "time"
    (primitive "time", time_type, validate_time);
  reg "date"
    (primitive "date", date_type, validate_date);
  reg "gYearMonth"
    (primitive "gYearMonth", gYearMonth_type, validate_gYearMonth);
  reg "gYear"
    (primitive "gYear", gYear_type, validate_gYear);
  reg "gMonthDay"
    (primitive "gMonthDay", gMonthDay_type, validate_gMonthDay);
  reg "gDay"
    (primitive "gDay", gDay_type, validate_gDay);
  reg "gMonth"
    (primitive "gMonth", gMonth_type, validate_gMonth);
457
458
459

  (* derived builtins *)

460
461
  reg "integer"
    (restrict' "integer" "decimal" no_facets, (* fake restriction *)
462
    Builtin_defs.int, validate_integer);
463
464
  reg "nonPositiveInteger"
    (restrict' "nonPositiveInteger" "integer"
465
      { no_facets with maxInclusive = Some (Value.Integer zero, false) },
466
    nonPositiveInteger_type, validate_nonPositiveInteger);
467
468
  reg "negativeInteger"
    (restrict' "negativeInteger" "nonPositiveInteger"
469
      { no_facets with maxInclusive = Some (Value.Integer minus_one, false) },
470
    negativeInteger_type, validate_negativeInteger);
471
472
  reg "nonNegativeInteger"
    (restrict' "nonNegativeInteger" "integer"
473
      { no_facets with minInclusive = Some (Value.Integer zero, false) },
474
    nonNegativeInteger_type, validate_nonNegativeInteger);
475
476
  reg "positiveInteger"
    (restrict' "positiveInteger" "nonNegativeInteger"
477
      { no_facets with minInclusive = Some (Value.Integer one, false) },
478
    positiveInteger_type, validate_positiveInteger);
479
480
  reg "long"
    (restrict' "long" "integer"
481
482
483
484
      { no_facets with
          minInclusive = Some (Value.Integer long_l, false);
          maxInclusive = Some (Value.Integer long_r, false)},
    long_type, validate_long);
485
486
  reg "int"
    (restrict' "int" "long"
487
488
489
490
      { no_facets with
          minInclusive = Some (Value.Integer int_l, false);
          maxInclusive = Some (Value.Integer int_r, false)},
    int_type, validate_int);
491
492
  reg "short"
    (restrict' "short" "int"
493
494
495
496
      { no_facets with
          minInclusive = Some (Value.Integer short_l, false);
          maxInclusive = Some (Value.Integer short_r, false)},
    short_type, validate_short);
497
498
  reg "byte"
    (restrict' "byte" "short"
499
500
501
502
      { no_facets with
          minInclusive = Some (Value.Integer byte_l, false);
          maxInclusive = Some (Value.Integer byte_r, false)},
    byte_type, validate_short);
503
504
  reg "normalizedString"
    (restrict' "normalizedString" "string"
505
506
      { no_facets with whiteSpace = `Replace, false },
    Builtin_defs.string, validate_normalizedString);
507
508
  reg "token"
    (restrict' "token" "normalizedString"
509
510
      { no_facets with whiteSpace = `Collapse, false },
    Builtin_defs.string, validate_token);
511
512
513
514
515
516
517
518
519
  alias "language" "token";
  alias "Name" "token";
  alias "NMTOKEN" "token";
  alias "NCName" "token";
  alias "ID" "token";
  alias "IDREF" "token";
  alias "ENTITY" "token";
  reg "NMTOKENS"
    (list' "NMTOKENS" "token",
520
    string_list_type, validate_token_list);
521
522
  alias "IDREFS" "NMTOKENS";
  alias "ENTITIES" "NMTOKENS"
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539

let _ = try fill () with Not_found -> assert false

  (** {2 API} *)

let is_builtin = Hashtbl.mem builtins
let iter_builtin f =
  Hashtbl.iter (fun _ (type_def, _, _) -> f type_def) builtins

let lookup name = Hashtbl.find builtins name

let fst (x,_,_) = x
let snd (_,y,_) = y
let trd (_,_,z) = z

let get_builtin name          = fst (lookup name)
let cd_type_of_builtin name   = snd (lookup name)
540
let validate_builtin name     = trd (lookup name)
541