load_xml.ml 3.65 KB
Newer Older
1
2
3
(* Loading XML documents *)

open Value
4
open Ident
5
open Encodings
6

7
let keep_ns = ref true
8

9
10
11
12
13
type buf =
    { mutable buffer : string;
      mutable pos : int;
      mutable length : int }

14
let txt = { buffer = String.create 1024; pos = 0; length = 1024 }
15

16
let resize txt n  =
17
  let new_len = txt.length * 2 + n in
18
  let new_buf = String.create new_len in
19
20
21
  String.unsafe_blit txt.buffer 0 new_buf 0 txt.pos;
  txt.buffer <- new_buf;
  txt.length <- new_len
22

23
let add_string txt s =
24
  let len = String.length s in
25
  let new_pos = txt.pos + len in
26
  if new_pos > txt.length then resize txt len;
27
28
  String.unsafe_blit s 0 txt.buffer txt.pos len;
  txt.pos <- new_pos
29
30
31
32
33

let rec only_ws s i =
  (i = 0) ||
  (let i = pred i in match (String.unsafe_get s i) with
     | ' ' | '\t' | '\n' | '\r' -> only_ws s i
34
     | _ -> false) 
35
36


37
38
39
40
let string s q =
  let s = Utf8.mk s in
  String_utf8 (Utf8.start_index s,Utf8.end_index s, s, q)

41

42
let attrib att = 
43
44
  (* TODO: better error message *)
  let att = List.map (fun (n,v) -> LabelPool.mk n, string_utf8 v) att in
45
  Imap.create (Array.of_list att)
46

47
48
49
50
51
let elem ns (tag_ns,tag) att child =
  if !keep_ns then
    XmlNs (Atom (Atoms.V.mk tag_ns tag), Record (attrib att), child, ns)
  else
    Xml (Atom (Atoms.V.mk tag_ns tag), Record (attrib att), child)
52

53
54
type stack = 
  | Element of Value.t * stack
55
  | Start of Ns.table * Ns.qname * (Ns.qname * Utf8.t) list * Ns.table * stack
56
57
  | String of string * stack
  | Empty
58

59
let stack = ref Empty
60
let ns_table = ref Ns.empty_table
61
62

let rec create_elt accu = function
63
64
  | String (s,st) -> create_elt (string s accu) st
  | Element (x,st) -> create_elt (Pair (x,accu)) st
65
66
67
  | Start (ns,name,att,old_table,st) -> 
      stack := Element (elem ns name att accu, st);
      ns_table := old_table
68
  | Empty -> assert false
69

70
71
let start_element_handler name att =
  if not (only_ws txt.buffer txt.pos) then 
72
    stack := String (String.sub txt.buffer 0 txt.pos, !stack); 
73
74
75
  txt.pos <- 0;

  let (table,name,att) = Ns.process_start_tag !ns_table name att in
76
  stack := Start (table,name,att,!ns_table, !stack);
77
  ns_table := table
78
79
80
81
82
83
84
85

let end_element_handler _ =
  let accu =
    if only_ws txt.buffer txt.pos 
    then nil 
    else string (String.sub txt.buffer 0 txt.pos) nil in
  txt.pos <- 0; 
  create_elt accu !stack
86

87
88
let text_handler = add_string txt

89

90
let xml_parser = ref (fun s -> failwith "No XML parser available")
91
92


93
let load_xml ?(ns=false) s =
94
  try
95
    keep_ns := ns;
96
    !xml_parser s;
97
    match !stack with
98
      | Element (x,Empty) -> stack := Empty; x
99
      | _ -> Value.failwith' "No XML stream to parse"
100
101
102
103
  with e -> stack := Empty; txt.pos <-0; 
    match e with 
      | Ns.UnknownPrefix _ -> Value.failwith' "Unknown namespace prefix"
      | e -> raise e
104

105
      
106
107
108
109

let load_html s =
  let rec val_of_doc q = function
    | Nethtml.Data data -> 
110
	if (only_ws data (String.length data)) then q else string data q
111
    | Nethtml.Element (tag, att, child) -> 
112
	let att = List.map (fun (n,v) -> ((Ns.empty, U.mk n), U.mk v)) att in
113
	Pair (elem Ns.empty_table (Ns.empty,U.mk tag) att (val_of_docs child), q)
114
115
116
117
118
  and val_of_docs = function
    | [] -> nil
    | h::t -> val_of_doc (val_of_docs t) h
  in	

119
  Location.protect_op "load_html";
120
121
  let parse src = Nethtml.parse_document ~dtd:Nethtml.relaxed_html40_dtd src in
  let doc = 
122
123
124
125
126
127
128
129
130
    if Url.is_url s then
      parse (Lexing.from_string (Url.load_url s))
    else
      let ic = open_in s in
      let doc = 
	try parse (Lexing.from_channel ic) 
	with exn -> close_in ic; raise exn in
      close_in ic;
      doc
131
  in
132
133
134
135
136
  let doc = Nethtml.decode ~subst:(fun _ -> "???") doc in
  let doc = Nethtml.map_list 
	      (Netconversion.convert ~in_enc:`Enc_iso88591
		 ~out_enc:`Enc_utf8) doc in
  val_of_docs doc