schema_builtin.ml 19 KB
Newer Older
1

2
open Printf
3

4 5
open Encodings
open Encodings.Utf8.Pcre
6
open Schema_common
7
open Schema_types
8 9 10 11 12

(* TODO dates: boundary checks (e.g. 95/26/2003) *)
(* TODO a lot of almost cut-and-paste code, especially in gFoo types validation
*)

13 14
  (** {2 Aux/Misc stuff} *)

15 16
(** [add_xsd_prefix s] converts the OCaml string [s] to UTF-8 with [Utf8.mk]
    and passes it to [Schema_xml.add_xsd_prefix]. *)
let add_xsd_prefix s =
  let utf8_name = Utf8.mk s in
  Schema_xml.add_xsd_prefix utf8_name

17 18 19 20
(** Prefixed names of the built-in types that are not properly supported.
    [fill] registers them as aliases of [string] and [cd_type_of_builtin]
    prints a warning when one of them is requested. *)
let unsupported =
  let names = [ "decimal"; "float"; "double"; "NOTATION"; "QName" ] in
  List.map add_xsd_prefix names

21 22
(** [is_empty s] is true when the UTF-8 string [s] equals the empty string. *)
let is_empty s =
  let empty = Utf8.mk "" in
  Utf8.equal s empty

23 24 25 26 27 28 29 30 31 32 33 34
(* Integer constants of type Intervals.V.t.  zero, one and minus_one are used
   as facet bounds in fill; the *_l / *_r pairs are the lower and upper bounds
   used for the long, int, short and byte types (facets in fill and the
   intervals of validate_long .. validate_byte). *)
let zero = Intervals.V.zero
let one = (Intervals.V.succ Intervals.V.zero)
let minus_one = (Intervals.V.pred Intervals.V.zero)
let long_l = (Intervals.V.mk "-9223372036854775808")
let long_r = (Intervals.V.mk "9223372036854775807")
let int_l = (Intervals.V.mk "-2147483648")
let int_r = (Intervals.V.mk "2147483647")
let short_l = (Intervals.V.mk "-32768")
let short_r = (Intervals.V.mk "32767")
let byte_l = (Intervals.V.mk "-128")
let byte_r = (Intervals.V.mk "127")

35
(* A run of one or more space, tab, carriage return or line feed characters
   (the "S" production of the XML recommendation). *)
let xml_S_RE = pcre_regexp "[ \\t\\r\\n]+"

(* Split a string at XML recommendation "S" production boundaries. *)
let split_xml_S text = pcre_split ~rex:xml_S_RE text

(* A single tab, carriage return or line feed.  Not referenced in the visible
   part of this file. *)
let norm_RE = pcre_regexp "[\\t\\r\\n]"
39 40 41 42 43 44 45 46 47 48 49

(** [char_of_hex ms ls] is the character whose code is written in hexadecimal
    by the two digits [ms] (most significant) and [ls] (least significant).
    Letters a-f are accepted in both cases; any other character hits
    [assert false]. *)
let char_of_hex =
  let nibble digit =
    match digit with
    | '0' .. '9' -> Char.code digit - Char.code '0'
    | 'a' .. 'f' -> Char.code digit - Char.code 'a' + 10
    | 'A' .. 'F' -> Char.code digit - Char.code 'A' + 10
    | _ -> assert false
  in
  fun ms ls ->
    let high = nibble ms in
    let low = nibble ls in
    (* high and low are both in 0..15, so the result always fits in a byte *)
    Char.unsafe_chr ((high lsl 4) lor low)

50 51
(** [strip_parens s] runs [Pcre.replace] on [s] with a pattern matching every
    opening or closing parenthesis.  No replacement template is given, so the
    parentheses are presumably deleted -- confirm against the Pcre default.
    Used to turn a raw regular expression into one without capture groups. *)
let strip_parens pattern = Pcre.replace ~pat:"[()]" pattern
(** [add_limits s] anchors the regular expression [s] so that it has to match
    a whole string.

    The body is wrapped in a non-capturing group before the anchors are added.
    Alternation has the lowest precedence, so without the group a pattern
    whose top level is an alternation (such as [timezone_RE_raw]) would get
    the start anchor on its first alternative only and the end anchor on its
    last one only.  A non-capturing group leaves the numbering of the capture
    groups inside [s] unchanged. *)
let add_limits s = "^(?:" ^ s ^ ")$"
52

53 54
(** Raised by the validators of this module; the payload is the prefixed name
    of the simple type whose validation failed. *)
exception Schema_builtin_error of Utf8.t

(** [simple_type_error name] always raises [Schema_builtin_error] carrying
    [add_xsd_prefix name]. *)
let simple_type_error name =
  let qualified = add_xsd_prefix name in
  raise (Schema_builtin_error qualified)
55

56
(** [qualify s] pairs [Ns.empty] with the UTF-8 version of [s]; the result is
    used as a record label throughout this file. *)
let qualify s =
  let local_name = Encodings.Utf8.mk s in
  (Ns.empty, local_name)
57

58 59
  (* regular expressions used to validate built-in types *)

60 61 62 63 64 65 66 67 68 69
(* Un-anchored pattern sources; callers add anchors with add_limits or embed
   them in a larger pattern.

   timezone: either the letter Z (group 1), or an optional sign (group 3)
   followed by two-digit hours (group 4) and two-digit minutes (group 5);
   group 2 is the whole signed alternative. *)
let timezone_RE_raw = "(Z)|(([+-])?(\\d{2}):(\\d{2}))"
(* date: year of at least four digits, two-digit month, two-digit day *)
let date_RE_raw = "(\\d{4,})-(\\d{2})-(\\d{2})"
(* time: two-digit hours, minutes and seconds *)
let time_RE_raw = "(\\d{2}):(\\d{2}):(\\d{2})"

(* The gFoo patterns below end with an optional group holding a whole
   timezone, built by splicing timezone_RE_raw in. *)
let gYearMonth_RE_raw = sprintf "(-)?(\\d{4,})-(\\d{2})(%s)?" timezone_RE_raw
let gYear_RE_raw = sprintf "(-)?(\\d{4,})(%s)?" timezone_RE_raw
let gMonthDay_RE_raw = sprintf "--(\\d{2})-(\\d{2})(%s)?" timezone_RE_raw
let gDay_RE_raw = sprintf "---(\\d{2})(%s)?" timezone_RE_raw
(* gMonth: two dashes, a two-digit month, two more dashes, then an optional
   group holding a whole timezone.  The timezone pattern has to be spliced in
   with sprintf like in the other gFoo patterns; as a bare literal the format
   directive would end up verbatim inside the regular expression. *)
let gMonth_RE_raw = sprintf "--(\\d{2})--(%s)?" timezone_RE_raw

70 71
  (** {2 CDuce types} *)

72 73 74 75 76 77 78
(* Field descriptors handed to Types.rec_of_list': a boolean, a qualified
   label and the CDuce type of the field.  The boolean is false here and true
   for the fields of duration_type and for the timezone field; presumably it
   marks a field as optional -- confirm against Types.rec_of_list'. *)
let positive_field = (false, qualify "positive", Builtin_defs.bool)
let year_field = (false, qualify "year", Builtin_defs.int)
let month_field = (false, qualify "month", Builtin_defs.int)
let day_field = (false, qualify "day", Builtin_defs.int)
let hour_field = (false, qualify "hour", Builtin_defs.int)
let minute_field = (false, qualify "minute", Builtin_defs.int)
let second_field = (false, qualify "second", Builtin_defs.int)
  (* TODO this should be a decimal *)

(* Field groups shared by the time, date and dateTime record types. *)
let time_type_fields = hour_field :: minute_field :: [ second_field ]
let date_type_fields = year_field :: month_field :: [ day_field ]
82 83 84 85 86

  (* CDuce record type of a duration: the positive field followed by the
  year, month, day, hour, minute and second fields, each an int declared with
  the boolean flag set to true.
  TODO the constraint that at least one part should be present isn't easily
  expressible with CDuce types
  TODO second should be a decimal *)
let duration_type =
  let flagged_int label = (true, qualify label, Builtin_defs.int) in
  let parts = [ "year"; "month"; "day"; "hour"; "minute"; "second" ] in
  Types.rec_of_list' (positive_field :: List.map flagged_int parts)
94
(* CDuce record type of a timezone: positive, hour and minute fields. *)
let timezone_type =
  let fields = positive_field :: hour_field :: [ minute_field ] in
  Types.rec_of_list' fields
98
(* The timezone field (declared with the boolean flag set to true), kept as a
   one-element list so it can be appended to other field lists. *)
let timezone_type_fields = [ (true, qualify "timezone", timezone_type) ]

(* time: hour, minute, second and the timezone field. *)
let time_type =
  Types.rec_of_list' (List.append time_type_fields timezone_type_fields)

(* date: positive, year, month, day. *)
let date_type =
  let fields = positive_field :: date_type_fields in
  Types.rec_of_list' fields
101 102 103 104 105
(* dateTime: positive, then the date fields, the time fields and the timezone
   field, in that order. *)
let dateTime_type =
  let fields =
    List.concat
      [ [ positive_field ]; date_type_fields; time_type_fields;
        timezone_type_fields ]
  in
  Types.rec_of_list' fields

(* Record types of the gFoo family; each lists only the parts that the
   corresponding lexical form carries. *)
let gYearMonth_type =
  Types.rec_of_list' (positive_field :: year_field :: [ month_field ])
let gYear_type = Types.rec_of_list' (positive_field :: [ year_field ])
let gMonthDay_type = Types.rec_of_list' (month_field :: [ day_field ])
let gDay_type = Types.rec_of_list' (day_field :: [])
let gMonth_type = Types.rec_of_list' (month_field :: [])
111

112 113 114 115
(* CDuce types registered in fill for the sign-restricted integer built-ins;
   each one is an alias of a definition from Builtin_defs. *)
let nonPositiveInteger_type = Builtin_defs.non_pos_int
let negativeInteger_type = Builtin_defs.neg_int
let nonNegativeInteger_type = Builtin_defs.non_neg_int
let positiveInteger_type = Builtin_defs.pos_int
116 117 118 119
(* CDuce types registered in fill for the long, int, short and byte
   built-ins; each one is an alias of a definition from Builtin_defs. *)
let long_type = Builtin_defs.long_int
let int_type = Builtin_defs.int_int
let short_type = Builtin_defs.short_int
let byte_type = Builtin_defs.byte_int
120

121 122 123
(* CDuce type registered in fill for the list built-in NMTOKENS (and its
   aliases): Sequence.star applied to the string type. *)
let string_list_type = Sequence.star Builtin_defs.string

  (** {2 Validation functions (string -> Value.t)} *)
124

125 126 127 128 129
(** [parse_sign s] is [Value.vtrue] when [s] is a plus sign or the empty
    string (no sign written) and [Value.vfalse] for anything else. *)
let parse_sign s =
  let is text = Utf8.equal s (Utf8.mk text) in
  if not (is "+") && not (is "") then Value.vfalse else Value.vtrue
130 131 132

(** [validate_integer s] builds a [Value.Integer] from the text of [s] with
    [Intervals.V.mk].  A [Failure] raised during the conversion is turned into
    the schema error for [integer]. *)
let validate_integer s =
  try
    let digits = Utf8.get_str s in
    Value.Integer (Intervals.V.mk digits)
  with Failure _ -> simple_type_error "integer"

(* Matches a dot and everything after it up to the end of the subject.  Not
   referenced in the visible part of this file. *)
let strip_decimal_RE = Pcre.regexp "\\..*$"
137 138 139 140

(** [parse_date s] matches [s] against the anchored date pattern and returns
    the association list for the year, month and day fields, each validated
    as an integer.  Raises the schema error for [date] when [s] does not
    match.  The regular expression is compiled once, when the module is
    initialised. *)
let parse_date =
  let date_rex = Pcre.regexp (add_limits date_RE_raw) in
  fun s ->
    let groups =
      try pcre_extract ~rex:date_rex s
      with Not_found -> simple_type_error "date"
    in
    let field label idx = (qualify label, validate_integer groups.(idx)) in
    [ field "year" 1; field "month" 2; field "day" 3 ]
146 147 148 149

(** [parse_time s] matches [s] against the anchored time pattern and returns
    the association list for the hour, minute and second fields, each
    validated as an integer.  Raises the schema error for [time] when [s]
    does not match. *)
let parse_time =
  let time_rex = Pcre.regexp (add_limits time_RE_raw) in
  fun s ->
    let groups =
      try pcre_extract ~rex:time_rex s
      with Not_found -> simple_type_error "time"
    in
    let field label idx = (qualify label, validate_integer groups.(idx)) in
    [ field "hour" 1; field "minute" 2; field "second" 3 ]
155 156 157 158

(** [parse_timezone s] returns the association list for the positive, hour
    and minute fields of a timezone.  When group 1 of the match is the letter
    Z the result is positive with hour 0 and minute 0; otherwise the sign
    comes from group 3 and hour and minute from groups 4 and 5.  Raises the
    schema error for [timezone] when [s] does not match. *)
let parse_timezone =
  let tz_rex = Pcre.regexp (add_limits timezone_RE_raw) in
  fun s ->
    let groups =
      try pcre_extract ~rex:tz_rex s
      with Not_found -> simple_type_error "timezone"
    in
    let fields positive hour minute =
      [qualify "positive", positive;
       qualify "hour", validate_integer hour;
       qualify "minute", validate_integer minute]
    in
    if Utf8.equal groups.(1) (Utf8.mk "Z") then
      fields Value.vtrue (Utf8.mk "0") (Utf8.mk "0")
    else
      fields (parse_sign groups.(3)) groups.(4) groups.(5)
169 170
  (* Optional-timezone variant of parse_timezone: an empty string yields the
  empty list, anything else yields a one-element list pairing the timezone
  label with the record built from the parsed fields. *)
let parse_timezone' s =
  if not (is_empty s) then
    let tz_record = Value.vrecord (parse_timezone s) in
    [ (qualify "timezone", tz_record) ]
  else
    []
176

177
(** [validate_string s] wraps [s] as a CDuce string value; it never fails. *)
let validate_string s = Value.string_utf8 s

(** Like [validate_string], after [normalize_white_space] in [`Replace]
    mode. *)
let validate_normalizedString s =
  let replaced = normalize_white_space `Replace s in
  validate_string replaced

(** Like [validate_string], after [normalize_white_space] in [`Collapse]
    mode. *)
let validate_token s =
  let collapsed = normalize_white_space `Collapse s in
  validate_string collapsed

(** Splits [s] at white space and returns the sequence of the pieces, each
    validated with [validate_token]. *)
let validate_token_list s =
  let pieces = split_xml_S s in
  Value.sequence (List.map validate_token pieces)

(** [validate_interval interval type_name s] parses [s] as an integer and
    returns it as a [Value.Integer] when [Intervals.contains] accepts it for
    [interval].  Both a parse failure and an out-of-interval value raise the
    schema error for [type_name]. *)
let validate_interval interval type_name s =
  let reject () = simple_type_error type_name in
  let integer =
    try Intervals.V.mk (Utf8.get_str s) with Failure _ -> reject ()
  in
  if not (Intervals.contains integer interval) then reject ()
  else Value.Integer integer
195 196 197 198 199 200 201 202
(* Validators of the integer-derived built-ins: each is validate_interval
   specialised to one interval and one type name.  The intervals are built
   once, when the module is initialised.  Intervals.left / Intervals.right
   presumably build the half-lines ending / starting at the given bound --
   confirm against the Intervals module. *)
let validate_nonPositiveInteger =
  let up_to_zero = Intervals.left Intervals.V.zero in
  validate_interval up_to_zero "nonPositiveInteger"
let validate_negativeInteger =
  let up_to_minus_one = Intervals.left Intervals.V.minus_one in
  validate_interval up_to_minus_one "negativeInteger"
let validate_nonNegativeInteger =
  let from_zero = Intervals.right Intervals.V.zero in
  validate_interval from_zero "nonNegativeInteger"
let validate_positiveInteger =
  let from_one = Intervals.right Intervals.V.one in
  validate_interval from_one "positiveInteger"

(* Bounded integer types, using the bounds defined at the top of the file. *)
let validate_long =
  let range = Intervals.bounded long_l long_r in
  validate_interval range "long"
let validate_int =
  let range = Intervals.bounded int_l int_r in
  validate_interval range "int"
let validate_short =
  let range = Intervals.bounded short_l short_r in
  validate_interval range "short"
let validate_byte =
  let range = Intervals.bounded byte_l byte_r in
  validate_interval range "byte"
208

209 210 211 212 213 214 215
(** [validate_bool s] maps [true] and [1] to [Value.vtrue], [false] and [0]
    to [Value.vfalse], and raises the schema error for [boolean] on any other
    input. *)
let validate_bool s =
  let is text = Utf8.equal s (Utf8.mk text) in
  if is "true" || is "1" then Value.vtrue
  else if is "false" || is "0" then Value.vfalse
  else simple_type_error "boolean"
216

217
(** [validate_duration s] parses a duration and returns a record holding the
    positive field plus one integer field for each of year, month, day, hour,
    minute and second that is actually present in [s].  Raises the schema
    error for [duration] when [s] does not match or a part fails integer
    validation.

    Capture groups: 1 sign, 3 years, 5 months, 7 days, 10 hours, 12 minutes,
    14 seconds (the even-numbered groups in between also include the
    designator letter, group 8 is the whole time part). *)
let validate_duration =
  let rex = pcre_regexp
  "^([+-])?P((\\d+)Y)?((\\d+)M)?((\\d+)D)?(T((\\d+)H)?((\\d+)M)?((\\d+)S)?)?$"
  in
  fun s ->
    let abort () = simple_type_error "duration" in
    let groups = try pcre_extract ~rex s with Not_found -> abort () in
    (* a part that did not take part in the match contributes no field *)
    let part label idx =
      if is_empty groups.(idx) then []
      else [qualify label, validate_integer groups.(idx)]
    in
    try
      let fields =
        List.concat
          [ [qualify "positive", parse_sign groups.(1)];
            part "year" 3;
            part "month" 5;
            part "day" 7;
            part "hour" 10;
            part "minute" 12;
            part "second" 14 ]
      in
      Value.vrecord fields
    with Schema_builtin_error _ -> abort ()
242 243 244 245 246 247 248

(** [validate_dateTime s] parses an optional sign, a date, the letter T, a
    time and an optional timezone, and returns the record made of the
    positive field, the date fields, the time fields and, when present, the
    timezone field.  Any failure is reported as the schema error for
    [dateTime].

    The sub-patterns go through [strip_parens] first so that, assuming it
    deletes their parentheses, the only capture groups left are the four
    written in the format string: sign, date, time, timezone. *)
let validate_dateTime =
  let pattern =
    sprintf "^([+-])?(%s)T(%s)(%s)?$"
      (strip_parens date_RE_raw) (strip_parens time_RE_raw)
      (strip_parens timezone_RE_raw)
  in
  let rex = Pcre.regexp pattern in
  fun s ->
    let abort () = simple_type_error "dateTime" in
    let groups = try pcre_extract ~rex s with Not_found -> abort () in
    try
      let sign = [ qualify "positive", parse_sign groups.(1) ] in
      let fields =
        List.concat
          [ sign;
            parse_date groups.(2);
            parse_time groups.(3);
            parse_timezone' groups.(4) ]
      in
      Value.vrecord fields
    with Schema_builtin_error _ -> abort ()
260 261 262 263

(** [validate_gYearMonth s] returns a record with the positive, year and
    month fields (groups 1 to 3) and, when group 4 is not empty, the timezone
    field.  Any failure is reported as the schema error for [gYearMonth]. *)
let validate_gYearMonth =
  let rex = Pcre.regexp (add_limits gYearMonth_RE_raw) in
  fun s ->
    let fail () = simple_type_error "gYearMonth" in
    let groups = try pcre_extract ~rex s with Not_found -> fail () in
    try
      let sign = (qualify "positive", parse_sign groups.(1)) in
      let year = (qualify "year", validate_integer groups.(2)) in
      let month = (qualify "month", validate_integer groups.(3)) in
      Value.vrecord (sign :: year :: month :: parse_timezone' groups.(4))
    with Schema_builtin_error _ -> fail ()
275 276 277 278

(** [validate_gYear s] returns a record with the positive and year fields
    (groups 1 and 2) and, when group 3 is not empty, the timezone field.  Any
    failure is reported as the schema error for [gYear]. *)
let validate_gYear =
  let rex = Pcre.regexp (add_limits gYear_RE_raw) in
  fun s ->
    let fail () = simple_type_error "gYear" in
    let groups = try pcre_extract ~rex s with Not_found -> fail () in
    try
      let sign = (qualify "positive", parse_sign groups.(1)) in
      let year = (qualify "year", validate_integer groups.(2)) in
      Value.vrecord (sign :: year :: parse_timezone' groups.(3))
    with Schema_builtin_error _ -> fail ()
289 290 291 292

(** [validate_gMonthDay s] returns a record with the month and day fields
    (groups 1 and 2) and, when group 3 is not empty, the timezone field.  Any
    failure is reported as the schema error for [gMonthDay]. *)
let validate_gMonthDay =
  let rex = Pcre.regexp (add_limits gMonthDay_RE_raw) in
  fun s ->
    let fail () = simple_type_error "gMonthDay" in
    let groups = try pcre_extract ~rex s with Not_found -> fail () in
    try
      let month = (qualify "month", validate_integer groups.(1)) in
      let day = (qualify "day", validate_integer groups.(2)) in
      Value.vrecord (month :: day :: parse_timezone' groups.(3))
    with Schema_builtin_error _ -> fail ()
303 304 305 306

(** [validate_gDay s] returns a record with the day field (group 1) and, when
    group 2 is not empty, the timezone field.  Any failure is reported as the
    schema error for [gDay]. *)
let validate_gDay =
  let rex = Pcre.regexp (add_limits gDay_RE_raw) in
  fun s ->
    let fail () = simple_type_error "gDay" in
    let groups = try pcre_extract ~rex s with Not_found -> fail () in
    try
      let day = [ qualify "day", validate_integer groups.(1) ] in
      Value.vrecord (day @ parse_timezone' groups.(2))
    with Schema_builtin_error _ -> fail ()
316 317 318 319

(** [validate_gMonth s] returns a record with the month field (group 1) and,
    when group 2 is not empty, the timezone field.  Any failure is reported
    as the schema error for [gMonth]. *)
let validate_gMonth =
  let rex = Pcre.regexp (add_limits gMonth_RE_raw) in
  fun s ->
    let fail () = simple_type_error "gMonth" in
    let groups = try pcre_extract ~rex s with Not_found -> fail () in
    try
      let month = [ qualify "month", validate_integer groups.(1) ] in
      Value.vrecord (month @ parse_timezone' groups.(2))
    with Schema_builtin_error _ -> fail ()
329 330 331 332 333 334

(** [validate_time s] parses a time followed by an optional timezone and
    returns the record made of the time fields and, when present, the
    timezone field.  Any failure is reported as the schema error for [time].
    Group 1 is the whole time, group 2 the whole timezone. *)
let validate_time =
  let pattern =
    sprintf "^(%s)(%s)?$" (strip_parens time_RE_raw)
      (strip_parens timezone_RE_raw)
  in
  let rex = Pcre.regexp pattern in
  fun s ->
    let abort () = simple_type_error "time" in
    let groups = try pcre_extract ~rex s with Not_found -> abort () in
    try
      (* parse_timezone' yields no field for an empty timezone group *)
      Value.vrecord (parse_time groups.(1) @ parse_timezone' groups.(2))
    with Schema_builtin_error _ -> abort ()
345 346 347 348 349 350

(** [validate_date s] parses an optional minus sign, a date and an optional
    timezone, and returns the record made of the positive field, the date
    fields and, when present, the timezone field.  Any failure is reported as
    the schema error for [date].  Group 1 is the sign, group 2 the whole
    date, group 3 the whole timezone. *)
let validate_date =
  let pattern =
    sprintf "^(-)?(%s)(%s)?$" (strip_parens date_RE_raw)
      (strip_parens timezone_RE_raw)
  in
  let rex = Pcre.regexp pattern in
  fun s ->
    let abort () = simple_type_error "date" in
    let groups = try pcre_extract ~rex s with Not_found -> abort () in
    try
      let sign = (qualify "positive", parse_sign groups.(1)) in
      (* parse_timezone' yields no field for an empty timezone group *)
      Value.vrecord
        (sign :: (parse_date groups.(2) @ parse_timezone' groups.(3)))
    with Schema_builtin_error _ -> abort ()
362 363

(** [validate_hexBinary s] decodes a string of hexadecimal digit pairs into
    the string of the corresponding bytes and returns it as a CDuce string.

    Raises the schema error for [hexBinary] when the length of [s] is odd or
    when [s] contains a character that is not a hexadecimal digit.  The
    digits are checked here because [char_of_hex] fails an assertion on any
    other character, which would escape as an assertion failure instead of a
    validation error. *)
let validate_hexBinary s =
  let abort () = simple_type_error "hexBinary" in
  let is_hex_digit = function
    | '0' .. '9' | 'a' .. 'f' | 'A' .. 'F' -> true
    | _ -> false
  in
  let hex = Utf8.get_str s in
  let len = String.length hex in
  if len mod 2 <> 0 then abort ();
  (* a Buffer avoids in-place mutation of a string *)
  let decoded = Buffer.create (len / 2) in
  let rec aux idx =
    if idx < len then begin
      (* len is even, so idx + 1 is always a valid index here *)
      let ms = hex.[idx] and ls = hex.[idx + 1] in
      if not (is_hex_digit ms && is_hex_digit ls) then abort ();
      Buffer.add_char decoded (char_of_hex ms ls);
      aux (idx + 2)
    end
  in
  aux 0;
  validate_string (Utf8.mk (Buffer.contents decoded))
378

379 380 381
(** [validate_base64Binary s] decodes [s] with [Netencoding.Base64.decode]
    and returns the decoded bytes as a CDuce string.

    A decoder failure is reported as the schema error for [base64Binary], in
    the same way [validate_anyURI] maps the exception of its library to a
    schema error.  NOTE(review): the decoder is assumed to signal malformed
    input with [Invalid_argument] or [Failure] -- confirm against the
    installed version of the library. *)
let validate_base64Binary s =
  let encoded = Utf8.get_str s in
  let decoded =
    try Netencoding.Base64.decode encoded
    with Invalid_argument _ | Failure _ -> simple_type_error "base64Binary"
  in
  validate_string (Utf8.mk decoded)
382 383

(** [validate_anyURI s] parses [s] with [Neturl.url_of_string] under
    [Neturl.ip_url_syntax], prints the URL back with [Neturl.string_of_url]
    and returns the result as a CDuce string.  [Neturl.Malformed_URL] is
    turned into the schema error for [anyURI]. *)
let validate_anyURI s =
  let raw = Utf8.get_str s in
  try
    let url = Neturl.url_of_string Neturl.ip_url_syntax raw in
    let printed = Neturl.string_of_url url in
    validate_string (Utf8.mk printed)
  with Neturl.Malformed_URL -> simple_type_error "anyURI"
389

390 391 392
  (** {2 API backend} *)

(* Table of the built-in types, keyed by prefixed name.  Each entry is a
   triple: simple type definition, CDuce type, validation function. *)
let builtins = Hashtbl.create 50

(* [reg name spec] stores [spec] under the prefixed form of [name]. *)
let reg name spec =
  let key = add_xsd_prefix name in
  Hashtbl.add builtins key spec
394
(* [alias alias name] registers a new built-in called [alias] that shares the
   CDuce type and the validator of the already registered built-in [name].
   The copied type definition carries the new name: a primitive is renamed, a
   derived type keeps its variety, facets and base.  Raises Not_found when
   [name] is not registered. *)
let alias alias name =
  let new_name = add_xsd_prefix alias in
  let target = add_xsd_prefix name in
  let (st_def, descr, validator) = Hashtbl.find builtins target in
  let renamed =
    match st_def with
    | Primitive _ -> Primitive new_name
    | Derived (_, variety, facets, base) ->
        Derived (Some new_name, variety, facets, base)
  in
  Hashtbl.add builtins new_name (renamed, descr, validator)
(* [restrict' name basename new_facets] builds the definition of a type named
   [name] derived from the registered built-in [basename]: it keeps the
   variety of the base and merges the facets of the base with [new_facets]
   through merge_facets.  Nothing is registered here.  Raises Not_found when
   [basename] is not registered. *)
let restrict' name basename new_facets =
  let derived_name = add_xsd_prefix name in
  let base_name = add_xsd_prefix basename in
  let (base, _, _) = Hashtbl.find builtins base_name in
  let variety = variety_of_simple_type_definition base in
  let inherited = facets_of_simple_type_definition base in
  let facets = merge_facets inherited new_facets in
  Derived (Some derived_name, variety, facets, base)
(* [list' name itemname] builds the definition of a list type named [name]
   whose items are of the registered built-in [itemname], with no facets.
   Nothing is registered here.  Raises Not_found when [itemname] is not
   registered. *)
let list' name itemname =
  let list_name = add_xsd_prefix name in
  let item_name = add_xsd_prefix itemname in
  let (item, _, _) = Hashtbl.find builtins item_name in
  Derived (Some list_name, List item, no_facets, item)

(* Fill the builtins table with every built-in type handled by this module.
   Each entry gives the simple type definition, the CDuce type and the
   validator.  The order of the statements matters: alias, restrict' and
   list' look their base type up in the table and raise Not_found when it has
   not been registered yet.

   The byte entry uses validate_byte; registering it with the validator of
   short would accept values outside the byte bounds. *)
let fill () = (* fill "builtins" hashtbl *)
  let primitive name = Primitive (add_xsd_prefix name) in

  (* primitive builtins *)

  reg "anySimpleType"
    (primitive "anySimpleType", Builtin_defs.string, validate_string);
  alias "anyType" "anySimpleType";  (* TODO BUG HERE *)
  reg "string"
    (primitive "string", Builtin_defs.string, validate_string);

    (* TODO following types not yet supported (see "unsupported" above) *)
  alias "decimal" "string";
  alias "float" "string";
  alias "double" "string";
  alias "NOTATION" "string";
  alias "QName" "string";

  reg "boolean"
    (primitive "boolean", Builtin_defs.bool, validate_bool);
  reg "hexBinary"
    (primitive "hexBinary", Builtin_defs.string, validate_hexBinary);
  reg "base64Binary"
    (primitive "base64Binary", Builtin_defs.string, validate_base64Binary);
  reg "anyURI"
    (primitive "anyURI", Builtin_defs.string, validate_anyURI);
  reg "duration"
    (primitive "duration", duration_type, validate_duration);
  reg "dateTime"
    (primitive "dateTime", dateTime_type, validate_dateTime);
  reg "time"
    (primitive "time", time_type, validate_time);
  reg "date"
    (primitive "date", date_type, validate_date);
  reg "gYearMonth"
    (primitive "gYearMonth", gYearMonth_type, validate_gYearMonth);
  reg "gYear"
    (primitive "gYear", gYear_type, validate_gYear);
  reg "gMonthDay"
    (primitive "gMonthDay", gMonthDay_type, validate_gMonthDay);
  reg "gDay"
    (primitive "gDay", gDay_type, validate_gDay);
  reg "gMonth"
    (primitive "gMonth", gMonth_type, validate_gMonth);

  (* derived builtins *)

  reg "integer"
    (restrict' "integer" "decimal" no_facets, (* fake restriction *)
    Builtin_defs.int, validate_integer);
  reg "nonPositiveInteger"
    (restrict' "nonPositiveInteger" "integer"
      { no_facets with maxInclusive = Some (Value.Integer zero, false) },
    nonPositiveInteger_type, validate_nonPositiveInteger);
  reg "negativeInteger"
    (restrict' "negativeInteger" "nonPositiveInteger"
      { no_facets with maxInclusive = Some (Value.Integer minus_one, false) },
    negativeInteger_type, validate_negativeInteger);
  reg "nonNegativeInteger"
    (restrict' "nonNegativeInteger" "integer"
      { no_facets with minInclusive = Some (Value.Integer zero, false) },
    nonNegativeInteger_type, validate_nonNegativeInteger);
  reg "positiveInteger"
    (restrict' "positiveInteger" "nonNegativeInteger"
      { no_facets with minInclusive = Some (Value.Integer one, false) },
    positiveInteger_type, validate_positiveInteger);
  reg "long"
    (restrict' "long" "integer"
      { no_facets with
          minInclusive = Some (Value.Integer long_l, false);
          maxInclusive = Some (Value.Integer long_r, false)},
    long_type, validate_long);
  reg "int"
    (restrict' "int" "long"
      { no_facets with
          minInclusive = Some (Value.Integer int_l, false);
          maxInclusive = Some (Value.Integer int_r, false)},
    int_type, validate_int);
  reg "short"
    (restrict' "short" "int"
      { no_facets with
          minInclusive = Some (Value.Integer short_l, false);
          maxInclusive = Some (Value.Integer short_r, false)},
    short_type, validate_short);
  reg "byte"
    (restrict' "byte" "short"
      { no_facets with
          minInclusive = Some (Value.Integer byte_l, false);
          maxInclusive = Some (Value.Integer byte_r, false)},
    byte_type, validate_byte);
  reg "normalizedString"
    (restrict' "normalizedString" "string"
      { no_facets with whiteSpace = `Replace, false },
    Builtin_defs.string, validate_normalizedString);
  reg "token"
    (restrict' "token" "normalizedString"
      { no_facets with whiteSpace = `Collapse, false },
    Builtin_defs.string, validate_token);
  alias "language" "token";
  alias "Name" "token";
  alias "NMTOKEN" "token";
  alias "NCName" "token";
  alias "ID" "token";
  alias "IDREF" "token";
  alias "ENTITY" "token";
  reg "NMTOKENS"
    (list' "NMTOKENS" "token",
    string_list_type, validate_token_list);
  alias "IDREFS" "NMTOKENS";
  alias "ENTITIES" "NMTOKENS"
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543

(* Populate the table when the module is initialised.  A Not_found here means
   that fill refers to a base type before registering it, which is a
   programming error. *)
let () =
  try fill ()
  with Not_found -> assert false

  (** {2 API} *)

(** [is_builtin name] tells whether [name] is a key of the built-in table. *)
let is_builtin name = Hashtbl.mem builtins name

(** [iter_builtin f] applies [f] to the type definition of every entry of the
    built-in table. *)
let iter_builtin f =
  let visit _ (type_def, _, _) = f type_def in
  Hashtbl.iter visit builtins

(** [lookup name] returns the entry stored under [name].
    Raises [Not_found] when there is none. *)
let lookup name = Hashtbl.find builtins name

(* Projections on triples.  fst and snd shadow the pair projections of the
   standard library from this point on. *)
let fst triple = let (first, _, _) = triple in first
let snd triple = let (_, second, _) = triple in second
let trd triple = let (_, _, third) = triple in third

(** [get_builtin name] is the simple type definition registered under [name].
    Raises [Not_found] when [name] is not a built-in. *)
let get_builtin name =
  let (type_def, _, _) = lookup name in
  type_def
544 545 546 547 548 549
(** [cd_type_of_builtin name] is the CDuce type registered under [name].
    When [name] is one of the [unsupported] types a warning is first printed
    on the standard error formatter.  Raises [Not_found] when [name] is not a
    built-in.

    The warning used to read "thread" where "treated" was meant. *)
let cd_type_of_builtin name   =
  if List.mem name unsupported then
    Format.fprintf Format.err_formatter
      "Warning: %s isn't properly supported and is treated as a string by CDuce@."
      (Utf8.get_str name);
  snd (lookup name)
550
(** [validate_builtin name] is the validation function registered under
    [name].  Raises [Not_found] when [name] is not a built-in. *)
let validate_builtin name =
  let (_, _, validator) = lookup name in
  validator
551