schema_builtin.ml 18.2 KB
Newer Older
1

2
open Printf
3

4
open Schema_types
5
open Schema_common
6
7
8
9
10

(* TODO dates: boundary checks (e.g. 95/26/2003) *)
(* TODO a lot of almost cut-and-paste code, especially in the validation of
   the gFoo types *)

11
12
  (** {2 Aux/Misc stuff} *)

13
14
15
16
17
18
19
20
21
22
23
24
(* Small integer constants shared by the facet definitions below. *)
let zero = Intervals.V.zero
let one = Intervals.V.succ Intervals.V.zero
let minus_one = Intervals.V.pred Intervals.V.zero

(* Inclusive lower (_l) and upper (_r) bounds of the fixed-width XSD integer
   types long, int, short and byte. *)
let long_l = Intervals.V.mk "-9223372036854775808"
let long_r = Intervals.V.mk "9223372036854775807"
let int_l = Intervals.V.mk "-2147483648"
let int_r = Intervals.V.mk "2147483647"
let short_l = Intervals.V.mk "-32768"
let short_r = Intervals.V.mk "32767"
let byte_l = Intervals.V.mk "-128"
let byte_r = Intervals.V.mk "127"

25
26
27
28
29
(* Compile [pattern] with the PCRE UTF8 flag enabled. *)
let regexp' pattern = Pcre.regexp ~flags:[`UTF8] pattern

(* One or more space, tab, carriage return or newline characters. *)
let xml_S_RE = regexp' "[ \\t\\r\\n]+"

(* Split a string at XML recommendation S production boundaries. *)
let split_xml_S text = Pcre.split ~rex:xml_S_RE text

(* A single tab, carriage return or newline character. *)
let norm_RE = regexp' "[\\t\\r\\n]"
30
31
32
33
34
35
36
37
38
39
40

(* [char_of_hex ms ls] returns the character whose code has [ms] as its most
   significant hexadecimal digit and [ls] as its least significant one.
   Upper and lower case digits are both accepted; any other character hits
   [assert false]. *)
let char_of_hex =
  let nibble c =
    match c with
    | '0' .. '9' -> Char.code c - Char.code '0'
    | 'a' .. 'f' -> Char.code c - Char.code 'a' + 10
    | 'A' .. 'F' -> Char.code c - Char.code 'A' + 10
    | _ -> assert false
  in
  fun ms ls ->
    let high = nibble ms in
    let low = nibble ls in
    (* high and low are both in 0..15, so the result is a valid char code *)
    Char.unsafe_chr ((high lsl 4) lor low)

41
42
(* Remove every opening and closing parenthesis from [s].  Used to turn the
   raw patterns of this module into group-free patterns before they are
   embedded in a larger regular expression. *)
let strip_parens =
  let parens = Pcre.regexp "[()]" in
  fun s -> Pcre.replace ~rex:parens s
(* Anchor the regular expression [s] so that it has to match a whole string.
   The pattern is wrapped in a non-capturing group: without it a top-level
   alternation such as [a|b] would be anchored as [^a] or [b$] instead of
   [^(a|b)$].  Being non-capturing, the wrapper leaves the numbering of the
   capture groups of [s] untouched. *)
let add_limits s = "^(?:" ^ s ^ ")$"
43

44
45
46
(* Raised by the validators of this module when a lexical value is rejected;
   the payload names what failed to validate. *)
exception Schema_builtin_error of string

(* Abort validation of the built-in simple type [name]: the type name is
   prefixed with [Schema_xml.xsd_prefix] and raised as a
   [Schema_builtin_error].  Never returns. *)
let simple_type_error name =
  let qualified_name = Schema_xml.xsd_prefix ^ name in
  raise (Schema_builtin_error qualified_name)
47

48
49
(* Pair the local name [s], converted with [Encodings.Utf8.mk], with the XML
   Schema namespace. *)
let qualify s =
  let local_name = Encodings.Utf8.mk s in
  (Schema_xml.xsd_namespace, local_name)

50
51
  (* regular expressions used to validate built-in types *)

52
53
54
55
56
57
58
59
60
61
(* Raw (unanchored) patterns for the lexical forms of the date and time
   types.  Capture groups matter: the validators below read them by index. *)

(* groups: 1 = Z, 2 = whole offset, 3 = sign, 4 = hours, 5 = minutes *)
let timezone_RE_raw = "(Z)|(([+-])?(\\d{2}):(\\d{2}))"
(* groups: 1 = year, 2 = month, 3 = day *)
let date_RE_raw = "(\\d{4,})-(\\d{2})-(\\d{2})"
(* groups: 1 = hours, 2 = minutes, 3 = seconds *)
let time_RE_raw = "(\\d{2}):(\\d{2}):(\\d{2})"

(* In the patterns below the optional trailing group holds the whole
   timezone; the groups of timezone_RE_raw follow it. *)
let gYearMonth_RE_raw = sprintf "(-)?(\\d{4,})-(\\d{2})(%s)?" timezone_RE_raw
let gYear_RE_raw = sprintf "(-)?(\\d{4,})(%s)?" timezone_RE_raw
let gMonthDay_RE_raw = sprintf "--(\\d{2})-(\\d{2})(%s)?" timezone_RE_raw
let gDay_RE_raw = sprintf "---(\\d{2})(%s)?" timezone_RE_raw
(* The timezone pattern has to be spliced in with sprintf like in the other
   gFoo patterns; a bare string literal would keep the format directive as
   literal pattern text and no timezone could ever match. *)
let gMonth_RE_raw = sprintf "--(\\d{2})--(%s)?" timezone_RE_raw

62
63
  (** {2 CDuce types} *)

64
65
66
67
68
69
70
(* Field descriptors shared by the record types below.  Each one is a triple
   made of a boolean flag, the field label qualified with the XML Schema
   namespace and the CDuce type of the field content.
   NOTE(review): the flag is false here and true for the fields of
   duration_type that may be absent, so it presumably marks a field as
   optional; confirm against Types.rec_of_list'. *)
let positive_field = false, qualify "positive", Builtin_defs.bool
let year_field = false, qualify "year", Builtin_defs.int
let month_field = false, qualify "month", Builtin_defs.int
let day_field = false, qualify "day", Builtin_defs.int
let hour_field = false, qualify "hour", Builtin_defs.int
let minute_field = false, qualify "minute", Builtin_defs.int
let second_field = false, qualify "second", Builtin_defs.int
71
72
73
  (* TODO the content of the second field should be a decimal, not an int *)
(* Field groups reused when assembling the time, date and dateTime record
   types. *)
let time_type_fields = [ hour_field; minute_field; second_field ]
let date_type_fields = [ year_field; month_field; day_field ]
74
75
76
77
78

  (* TODO the constraint that at least one part should be present isn't easily
  expressible with CDuce types *)
(* Record type of a validated xsd:duration: the sign field followed by one
   integer field per duration component, each built with the flag set to
   true. *)
let duration_type =
  let component label = (true, qualify label, Builtin_defs.int) in
  let components =
    (* TODO the second component should be a decimal *)
    List.map component [ "year"; "month"; "day"; "hour"; "minute"; "second" ]
  in
  Types.rec_of_list' (positive_field :: components)
86
(* Record type of a timezone: sign, hour and minute. *)
let timezone_type =
  Types.rec_of_list' (positive_field :: hour_field :: [ minute_field ])

(* The single timezone field (flag set to true) appended to the types that
   may carry a timezone. *)
let timezone_type_fields = [ (true, qualify "timezone", timezone_type) ]
91
92
(* CDuce record types of the validated date and time values.
   NOTE(review): only time_type and dateTime_type list the timezone field,
   while the validators of date and of the gFoo types also emit one when the
   lexical value carries a timezone; check whether these record types are
   meant to accept it. *)
let time_type =
  Types.rec_of_list' (List.append time_type_fields timezone_type_fields)

let date_type = Types.rec_of_list' (positive_field :: date_type_fields)

let dateTime_type =
  let components =
    List.concat [ date_type_fields; time_type_fields; timezone_type_fields ]
  in
  Types.rec_of_list' (positive_field :: components)

let gYearMonth_type =
  Types.rec_of_list' (positive_field :: year_field :: [ month_field ])

let gYear_type = Types.rec_of_list' (positive_field :: [ year_field ])
let gMonthDay_type = Types.rec_of_list' (month_field :: [ day_field ])
let gDay_type = Types.rec_of_list' (day_field :: [])
let gMonth_type = Types.rec_of_list' (month_field :: [])
103

104
105
106
107
(* CDuce types of the sign-restricted XSD integer types, taken as they are
   from Builtin_defs. *)
let nonPositiveInteger_type = Builtin_defs.non_pos_int
let negativeInteger_type = Builtin_defs.neg_int
let nonNegativeInteger_type = Builtin_defs.non_neg_int
let positiveInteger_type = Builtin_defs.pos_int
108
109
110
111
(* CDuce types of the fixed-width XSD integer types, taken as they are from
   Builtin_defs. *)
let long_type = Builtin_defs.long_int
let int_type = Builtin_defs.int_int
let short_type = Builtin_defs.short_int
let byte_type = Builtin_defs.byte_int
112

113
114
115
(* Type built by applying Sequence.star to the string type; it is the CDuce
   type registered for the list built-in xsd:NMTOKENS. *)
let string_list_type = Sequence.star Builtin_defs.string

  (** {2 Validation functions (string -> Value.t)} *)
116

117
(* Map a captured sign to a CDuce boolean: an absent (empty) sign or a plus
   sign gives true, anything else gives false. *)
let parse_sign sign =
  match sign with
  | "" | "+" -> Value.vtrue
  | _ -> Value.vfalse
118
119
120

(* Validate [s] as an xsd:integer and return it as a CDuce integer value.
   A [Failure] raised by [Intervals.V.mk] is reported through
   [simple_type_error]. *)
let validate_integer s =
  let parsed =
    try Intervals.V.mk s with Failure _ -> simple_type_error "integer"
  in
  Value.Integer parsed

(* Matches from the first dot up to the end of the string. *)
let strip_decimal_RE = Pcre.regexp "\\..*$"

(* Validate [s] as a decimal by dropping everything from the first dot
   onwards and validating what is left as an integer. *)
let validate_decimal s =
  let integral_part = Pcre.replace ~rex:strip_decimal_RE s in
  validate_integer integral_part
126
127
128
129

(* Parse a string matching the whole of date_RE_raw into the association
   list of its year, month and day fields.  A string that does not match is
   reported through [simple_type_error]. *)
let parse_date =
  let pattern = Pcre.regexp (add_limits date_RE_raw) in
  fun lexical ->
    let groups =
      try Pcre.extract ~rex:pattern lexical
      with Not_found -> simple_type_error "date"
    in
    let field label idx = (label, validate_integer groups.(idx)) in
    [ field "year" 1; field "month" 2; field "day" 3 ]

(* Parse a string matching the whole of time_RE_raw into the association
   list of its hour, minute and second fields.  A string that does not match
   is reported through [simple_type_error]. *)
let parse_time =
  let pattern = Pcre.regexp (add_limits time_RE_raw) in
  fun lexical ->
    let groups =
      try Pcre.extract ~rex:pattern lexical
      with Not_found -> simple_type_error "time"
    in
    let field label idx = (label, validate_integer groups.(idx)) in
    [ field "hour" 1; field "minute" 2; field "second" 3 ]

(* Parse a timezone into the association list of its sign, hour and minute
   fields.  The Z form yields a positive zero offset; otherwise the sign,
   hours and minutes come from capture groups 3, 4 and 5.  A string that
   does not match raises [Schema_builtin_error] with the bare payload
   timezone (no xsd prefix). *)
let parse_timezone =
  let pattern = Pcre.regexp (add_limits timezone_RE_raw) in
  fun lexical ->
    let groups =
      try Pcre.extract ~rex:pattern lexical
      with Not_found -> raise (Schema_builtin_error "timezone")
    in
    if groups.(1) = "Z" then
      [ ("positive", Value.vtrue);
        ("hour", validate_integer "0");
        ("minute", validate_integer "0") ]
    else
      [ ("positive", parse_sign groups.(3));
        ("hour", validate_integer groups.(4));
        ("minute", validate_integer groups.(5)) ]
  (* parse a timezone from a string, if it's empty return the empty list,
  otherwise return a list containing a pair <"timezone", timezone value> *)
let parse_timezone' lexical =
  if lexical = "" then []
  else [ ("timezone", Value.vrecord (parse_timezone lexical)) ]

166
(* Wrap [s] as a CDuce string value, going through [Encodings.Utf8.mk]. *)
let validate_string s =
  let utf8 = Encodings.Utf8.mk s in
  Value.string_utf8 utf8
167
168
169
170
171
172
173
174
175
176
177
(* xsd:normalizedString: white space is normalized in `Replace mode before
   the value is wrapped as a string. *)
let validate_normalizedString s =
  let replaced = normalize_white_space `Replace s in
  validate_string replaced

(* xsd:token: same as above, in `Collapse mode. *)
let validate_token s =
  let collapsed = normalize_white_space `Collapse s in
  validate_string collapsed

(* Split [s] at white space and validate every piece as a token, returning
   the CDuce sequence of the results. *)
let validate_token_list s =
  let pieces = split_xml_S s in
  Value.sequence (List.map validate_token pieces)

(* Validate [s] as an integer that belongs to [interval].  Both a malformed
   integer and a value outside the interval are reported through
   [simple_type_error] under [type_name]. *)
let validate_interval interval type_name s =
  let reject () = simple_type_error type_name in
  let n = try Intervals.V.mk s with Failure _ -> reject () in
  if not (Intervals.contains n interval) then reject ()
  else Value.Integer n
184
185
186
187
188
189
190
191
(* Validators of the sign-restricted integer types.  Each interval is built
   once, when the module is initialised, and shared by all calls. *)
let validate_nonPositiveInteger =
  let range = Intervals.left Intervals.V.zero in
  validate_interval range "nonPositiveInteger"

let validate_negativeInteger =
  let range = Intervals.left Intervals.V.minus_one in
  validate_interval range "negativeInteger"

let validate_nonNegativeInteger =
  let range = Intervals.right Intervals.V.zero in
  validate_interval range "nonNegativeInteger"

let validate_positiveInteger =
  let range = Intervals.right Intervals.V.one in
  validate_interval range "positiveInteger"
192
193
194
195
196
(* Validators of the fixed-width integer types, each bounded by the matching
   pair of _l and _r constants. *)
let validate_long =
  let range = Intervals.bounded long_l long_r in
  validate_interval range "long"

let validate_int =
  let range = Intervals.bounded int_l int_r in
  validate_interval range "int"

let validate_short =
  let range = Intervals.bounded short_l short_r in
  validate_interval range "short"

let validate_byte =
  let range = Intervals.bounded byte_l byte_r in
  validate_interval range "byte"
197
198
199
200

(* Validate an xsd:boolean: true and 1 give the CDuce true value, false and
   0 the false one; anything else is reported through
   [simple_type_error]. *)
let validate_bool lexical =
  match lexical with
  | "1" | "true" -> Value.vtrue
  | "0" | "false" -> Value.vfalse
  | _ -> simple_type_error "boolean"
202

203
204
205
206
207
(* Validate an xsd:duration and return it as a record: the sign field plus
   one integer field for each component present in the lexical value.  The
   digits of the components are capture groups 3, 5, 7 (date part) and 10,
   12, 14 (time part).  Any failure is reported as a duration error. *)
let validate_duration =
  let rex = Pcre.regexp
    "^([+-])?P((\\d+)Y)?((\\d+)M)?((\\d+)D)?(T((\\d+)H)?((\\d+)M)?((\\d+)S)?)?$"
  in
  let reject () = simple_type_error "duration" in
  (* a component is emitted only when its digits were captured *)
  let component label digits =
    if digits = "" then [] else [ (label, validate_integer digits) ]
  in
  fun s ->
    let groups = try Pcre.extract ~rex s with Not_found -> reject () in
    try
      Value.vrecord
        (List.concat
           [ [ ("positive", parse_sign groups.(1)) ];
             component "year" groups.(3);
             component "month" groups.(5);
             component "day" groups.(7);
             component "hour" groups.(10);
             component "minute" groups.(12);
             component "second" groups.(14) ])
    with Schema_builtin_error _ -> reject ()
222
223
224
225
226
227
228

(* Validate an xsd:dateTime.  The date, time and timezone parts are captured
   as whole substrings (their own groups are stripped) and handed to the
   dedicated parsers.  Any failure is reported as a dateTime error. *)
let validate_dateTime =
  let rex =
    let date = strip_parens date_RE_raw in
    let time = strip_parens time_RE_raw in
    let tz = strip_parens timezone_RE_raw in
    Pcre.regexp (sprintf "^([+-])?(%s)T(%s)(%s)?$" date time tz)
  in
  let reject () = simple_type_error "dateTime" in
  fun s ->
    let groups = try Pcre.extract ~rex s with Not_found -> reject () in
    try
      let sign = ("positive", parse_sign groups.(1)) in
      Value.vrecord
        (sign
         :: List.concat
              [ parse_date groups.(2);
                parse_time groups.(3);
                parse_timezone' groups.(4) ])
    with Schema_builtin_error _ -> reject ()
240
241
242
243

(* Validate an xsd:gYearMonth into a record with the sign, year and month
   fields plus the timezone field when one is present.  Any failure is
   reported as a gYearMonth error. *)
let validate_gYearMonth =
  let pattern = Pcre.regexp (add_limits gYearMonth_RE_raw) in
  let reject () = simple_type_error "gYearMonth" in
  fun lexical ->
    let groups =
      try Pcre.extract ~rex:pattern lexical with Not_found -> reject ()
    in
    try
      let sign = parse_sign groups.(1) in
      let year = validate_integer groups.(2) in
      let month = validate_integer groups.(3) in
      let timezone = parse_timezone' groups.(4) in
      Value.vrecord
        (("positive", sign) :: ("year", year) :: ("month", month) :: timezone)
    with Schema_builtin_error _ -> reject ()
255
256
257
258

(* Validate an xsd:gYear into a record with the sign and year fields plus
   the timezone field when one is present.  Any failure is reported as a
   gYear error. *)
let validate_gYear =
  let pattern = Pcre.regexp (add_limits gYear_RE_raw) in
  let reject () = simple_type_error "gYear" in
  fun lexical ->
    let groups =
      try Pcre.extract ~rex:pattern lexical with Not_found -> reject ()
    in
    try
      let sign = parse_sign groups.(1) in
      let year = validate_integer groups.(2) in
      let timezone = parse_timezone' groups.(3) in
      Value.vrecord (("positive", sign) :: ("year", year) :: timezone)
    with Schema_builtin_error _ -> reject ()
269
270
271
272

(* Validate an xsd:gMonthDay into a record with the month and day fields
   plus the timezone field when one is present.  Any failure is reported as
   a gMonthDay error. *)
let validate_gMonthDay =
  let pattern = Pcre.regexp (add_limits gMonthDay_RE_raw) in
  let reject () = simple_type_error "gMonthDay" in
  fun lexical ->
    let groups =
      try Pcre.extract ~rex:pattern lexical with Not_found -> reject ()
    in
    try
      let month = validate_integer groups.(1) in
      let day = validate_integer groups.(2) in
      let timezone = parse_timezone' groups.(3) in
      Value.vrecord (("month", month) :: ("day", day) :: timezone)
    with Schema_builtin_error _ -> reject ()
283
284
285
286

(* Validate an xsd:gDay into a record with the day field plus the timezone
   field when one is present.  Any failure is reported as a gDay error. *)
let validate_gDay =
  let pattern = Pcre.regexp (add_limits gDay_RE_raw) in
  let reject () = simple_type_error "gDay" in
  fun lexical ->
    let groups =
      try Pcre.extract ~rex:pattern lexical with Not_found -> reject ()
    in
    try
      let day = validate_integer groups.(1) in
      let timezone = parse_timezone' groups.(2) in
      Value.vrecord (("day", day) :: timezone)
    with Schema_builtin_error _ -> reject ()
295
296
297
298

(* Validate an xsd:gMonth into a record with the month field plus the
   timezone field when one is present.  Any failure is reported as a gMonth
   error. *)
let validate_gMonth =
  let pattern = Pcre.regexp (add_limits gMonth_RE_raw) in
  let reject () = simple_type_error "gMonth" in
  fun lexical ->
    let groups =
      try Pcre.extract ~rex:pattern lexical with Not_found -> reject ()
    in
    try
      let month = validate_integer groups.(1) in
      let timezone = parse_timezone' groups.(2) in
      Value.vrecord (("month", month) :: timezone)
    with Schema_builtin_error _ -> reject ()
307
308
309
310
311
312

(* Validate an xsd:time.  The time and the optional timezone are captured as
   whole substrings and handed to the dedicated parsers.  Any failure is
   reported as a time error. *)
let validate_time =
  let rex =
    let time = strip_parens time_RE_raw in
    let tz = strip_parens timezone_RE_raw in
    Pcre.regexp (sprintf "^(%s)(%s)?$" time tz)
  in
  let reject () = simple_type_error "time" in
  fun s ->
    let groups = try Pcre.extract ~rex s with Not_found -> reject () in
    try
      let time_fields = parse_time groups.(1) in
      let timezone = parse_timezone' groups.(2) in
      Value.vrecord (List.append time_fields timezone)
    with Schema_builtin_error _ -> reject ()
324
325
326
327
328
329

(* Validate an xsd:date.  An optional leading minus sign, the date and the
   optional timezone are captured as whole substrings; the last two are
   handed to the dedicated parsers.  Any failure is reported as a date
   error. *)
let validate_date =
  let rex =
    let date = strip_parens date_RE_raw in
    let tz = strip_parens timezone_RE_raw in
    Pcre.regexp (sprintf "^(-)?(%s)(%s)?$" date tz)
  in
  let reject () = simple_type_error "date" in
  fun s ->
    let groups = try Pcre.extract ~rex s with Not_found -> reject () in
    try
      let sign = ("positive", parse_sign groups.(1)) in
      let date_fields = parse_date groups.(2) in
      let timezone = parse_timezone' groups.(3) in
      Value.vrecord (sign :: List.append date_fields timezone)
    with Schema_builtin_error _ -> reject ()
342
343
344
345

(* Validate an xsd:hexBinary: decode every pair of hexadecimal digits of [s]
   into one character and return the result as a string value.
   A string of odd length, or one containing a character that is not a
   hexadecimal digit, is reported through [simple_type_error]. *)
let validate_hexBinary s =
  let reject () = simple_type_error "hexBinary" in
  let is_hex_digit = function
    | '0' .. '9' | 'a' .. 'f' | 'A' .. 'F' -> true
    | _ -> false
  in
  let len = String.length s in
  if len mod 2 <> 0 then reject ();
  (* Check the digits up front: char_of_hex asserts on anything else, which
     would turn invalid input into an Assert_failure instead of a
     validation error. *)
  String.iter (fun c -> if not (is_hex_digit c) then reject ()) s;
  let res = String.create (len / 2) in
  let rec aux idx =
    if idx < len then begin
      (* unsafe accesses are fine: len is even, so idx + 1 < len here *)
      String.unsafe_set res (idx / 2)
        (char_of_hex (String.unsafe_get s idx) (String.unsafe_get s (idx + 1)));
      aux (idx + 2)
    end
  in
  aux 0;
  validate_string res

358
(* Validate an xsd:base64Binary: decode [s] and return the decoded bytes as
   a string value.  A decoding failure is reported through
   [simple_type_error], like every other validator of this module does,
   instead of leaking the exception of the decoder.
   NOTE(review): the decoder is expected to signal malformed input with
   Invalid_argument (Failure is caught as well to be safe); confirm against
   the Ocamlnet version in use. *)
let validate_base64Binary s =
  let decoded =
    try Netencoding.Base64.decode s
    with Invalid_argument _ | Failure _ -> simple_type_error "base64Binary"
  in
  validate_string decoded
359
360
361
362
363

(* Validate an xsd:anyURI: parse [s] with the [Neturl.ip_url_syntax] syntax,
   print it back and return the result as a string value.  A
   [Neturl.Malformed_URL] is reported through [simple_type_error]. *)
let validate_anyURI s =
  let printed =
    try
      let url = Neturl.url_of_string Neturl.ip_url_syntax s in
      Neturl.string_of_url url
    with Neturl.Malformed_URL -> simple_type_error "anyURI"
  in
  validate_string printed
365

366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
  (** {2 API backend} *)

(* Table of the built-in simple types, keyed by type name.  Every entry is a
   triple: simple type definition, CDuce type, validation function. *)
let builtins = Hashtbl.create 50

(* Register [entry] in the table under [type_name]. *)
let reg type_name entry = Hashtbl.add builtins type_name entry
(* Register [alias] as another name of the already registered type [name].
   The CDuce type and the validator are shared; the type definition is
   copied with its name replaced by [alias].
   Raises [Not_found] when [name] is not registered. *)
let alias alias name =
  let (st_def, descr, validator) = Hashtbl.find builtins name in
  let renamed =
    match st_def with
    | Primitive _ -> Primitive alias
    | Derived (_, variety, facets, base) ->
        Derived (Some alias, variety, facets, base)
  in
  Hashtbl.add builtins alias (renamed, descr, validator)
(* Build the definition of [name] as a derivation of the registered type
   [basename]: the variety is that of the base and the facets of the base
   are merged with [new_facets].
   Raises [Not_found] when [basename] is not registered. *)
let restrict' name basename new_facets =
  let (base_def, _, _) = Hashtbl.find builtins basename in
  let base_variety = variety_of_simple_type_definition base_def in
  let base_facets = facets_of_simple_type_definition base_def in
  let merged = merge_facets base_facets new_facets in
  Derived (Some name, base_variety, merged, base_def)
(* Build the definition of [name] as a list of the registered type
   [itemname], with no facets.
   Raises [Not_found] when [itemname] is not registered. *)
let list' name itemname =
  let (item_def, _, _) = Hashtbl.find builtins itemname in
  Derived (Some name, List item_def, no_facets, item_def)

(* Populate the builtins table with every supported built-in simple type.
   Registration order matters: restrict', list' and alias look their base
   type up in the table, so a base has to be registered before the types
   derived from it (a missing base raises Not_found). *)
let fill () =

  (* TODO missing built-in simple types: xsd:float, xsd:double, xsd:QName,
   * xsd:NOTATION, xsd:decimal *)

  (* primitive builtins *)

  reg "xsd:anySimpleType"
    (Primitive "xsd:anySimpleType", Builtin_defs.string, validate_string);
  alias "xsd:anyType" "xsd:anySimpleType";
  reg "xsd:string"
    (Primitive "xsd:string", Builtin_defs.string, validate_string);
  reg "xsd:decimal"
    (* collapsed in CDuce to an integer, since CDuce has no decimal numbers *)
    (Primitive "xsd:decimal", Builtin_defs.int, validate_decimal);
  reg "xsd:boolean"
    (Primitive "xsd:boolean", Builtin_defs.bool, validate_bool);
  reg "xsd:hexBinary"
    (Primitive "xsd:hexBinary", Builtin_defs.string, validate_hexBinary);
  reg "xsd:base64Binary"
    (Primitive "xsd:base64Binary", Builtin_defs.string, validate_base64Binary);
  reg "xsd:anyURI"
    (Primitive "xsd:anyURI", Builtin_defs.string, validate_anyURI);
  reg "xsd:duration"
    (Primitive "xsd:duration", duration_type, validate_duration);
  reg "xsd:dateTime"
    (Primitive "xsd:dateTime", dateTime_type, validate_dateTime);
  reg "xsd:time"
    (Primitive "xsd:time", time_type, validate_time);
  reg "xsd:date"
    (Primitive "xsd:date", date_type, validate_date);
  reg "xsd:gYearMonth"
    (Primitive "xsd:gYearMonth", gYearMonth_type, validate_gYearMonth);
  reg "xsd:gYear"
    (Primitive "xsd:gYear", gYear_type, validate_gYear);
  reg "xsd:gMonthDay"
    (Primitive "xsd:gMonthDay", gMonthDay_type, validate_gMonthDay);
  reg "xsd:gDay"
    (Primitive "xsd:gDay", gDay_type, validate_gDay);
  reg "xsd:gMonth"
    (Primitive "xsd:gMonth", gMonth_type, validate_gMonth);

  (* derived builtins *)

  reg "xsd:integer"
    (restrict' "xsd:integer" "xsd:decimal" no_facets, (* fake restriction *)
    Builtin_defs.int, validate_integer);
  reg "xsd:nonPositiveInteger"
    (restrict' "xsd:nonPositiveInteger" "xsd:integer"
      { no_facets with maxInclusive = Some (Value.Integer zero, false) },
    nonPositiveInteger_type, validate_nonPositiveInteger);
  reg "xsd:negativeInteger"
    (restrict' "xsd:negativeInteger" "xsd:nonPositiveInteger"
      { no_facets with maxInclusive = Some (Value.Integer minus_one, false) },
    negativeInteger_type, validate_negativeInteger);
  reg "xsd:nonNegativeInteger"
    (restrict' "xsd:nonNegativeInteger" "xsd:integer"
      { no_facets with minInclusive = Some (Value.Integer zero, false) },
    nonNegativeInteger_type, validate_nonNegativeInteger);
  reg "xsd:positiveInteger"
    (restrict' "xsd:positiveInteger" "xsd:nonNegativeInteger"
      { no_facets with minInclusive = Some (Value.Integer one, false) },
    positiveInteger_type, validate_positiveInteger);
  reg "xsd:long"
    (restrict' "xsd:long" "xsd:integer"
      { no_facets with
          minInclusive = Some (Value.Integer long_l, false);
          maxInclusive = Some (Value.Integer long_r, false)},
    long_type, validate_long);
  reg "xsd:int"
    (restrict' "xsd:int" "xsd:long"
      { no_facets with
          minInclusive = Some (Value.Integer int_l, false);
          maxInclusive = Some (Value.Integer int_r, false)},
    int_type, validate_int);
  reg "xsd:short"
    (restrict' "xsd:short" "xsd:int"
      { no_facets with
          minInclusive = Some (Value.Integer short_l, false);
          maxInclusive = Some (Value.Integer short_r, false)},
    short_type, validate_short);
  reg "xsd:byte"
    (restrict' "xsd:byte" "xsd:short"
      { no_facets with
          minInclusive = Some (Value.Integer byte_l, false);
          maxInclusive = Some (Value.Integer byte_r, false)},
    (* the validator has to enforce the byte bounds, not the short ones *)
    byte_type, validate_byte);
  reg "xsd:normalizedString"
    (restrict' "xsd:normalizedString" "xsd:string"
      { no_facets with whiteSpace = `Replace, false },
    Builtin_defs.string, validate_normalizedString);
  reg "xsd:token"
    (restrict' "xsd:token" "xsd:normalizedString"
      { no_facets with whiteSpace = `Collapse, false },
    Builtin_defs.string, validate_token);
  (* the types below are validated exactly like xsd:token: no specific
     lexical check is performed on them *)
  alias "xsd:language" "xsd:token";
  alias "xsd:Name" "xsd:token";
  alias "xsd:NMTOKEN" "xsd:token";
  alias "xsd:NCName" "xsd:token";
  alias "xsd:ID" "xsd:token";
  alias "xsd:IDREF" "xsd:token";
  alias "xsd:ENTITY" "xsd:token";
  reg "xsd:NMTOKENS"
    (list' "xsd:NMTOKENS" "xsd:token",
    string_list_type, validate_token_list);
  alias "xsd:IDREFS" "xsd:NMTOKENS";
  alias "xsd:ENTITIES" "xsd:NMTOKENS"

(* Fill the table when the module is initialised.  A Not_found here means
   that fill registered a type before its base, which is a programming
   error. *)
let () =
  try fill ()
  with Not_found -> assert false

  (** {2 API} *)

(* [is_builtin name] tells whether [name] is a registered built-in type. *)
let is_builtin name = Hashtbl.mem builtins name

(* Apply [f] to the type definition of every registered built-in type. *)
let iter_builtin f =
  let visit _ (type_def, _, _) = f type_def in
  Hashtbl.iter visit builtins

(* Entry registered under [name]; raises [Not_found] when there is none. *)
let lookup name = Hashtbl.find builtins name

(* Projections over triples.  Note that fst and snd shadow the standard pair
   projections from here to the end of the file. *)
let fst (x, _, _) = x
let snd (_, y, _) = y
let trd (_, _, z) = z

(* Accessors to the components of a registered entry: the simple type
   definition, the CDuce type and the validation function.  All of them
   raise [Not_found] on an unknown name. *)
let get_builtin name =
  let entry = lookup name in
  fst entry

let cd_type_of_builtin name =
  let entry = lookup name in
  snd entry

let validate_builtin name =
  let entry = lookup name in
  trd entry
516