load_xml.ml 5.67 KB
Newer Older
1
2
(* Loading XML documents *)

3
4
5
6
7
8
ifdef EXPAT then
  let expat_support = true
else
  let expat_support = false

let use_parser = ref (if expat_support then `Expat else `Pxp)
9
10
11
12
13

open Pxp_yacc
open Pxp_lexer_types
open Pxp_types
open Value
14
open Ident
15
open Encodings
16

17

18
19
20
21
22
type buf =
    { mutable buffer : string;
      mutable pos : int;
      mutable length : int }

23
let txt = { buffer = String.create 1024; pos = 0; length = 1024 }
24

25
let resize txt n  =
26
  let new_len = txt.length * 2 + n in
27
  let new_buf = String.create new_len in
28
29
30
  String.unsafe_blit txt.buffer 0 new_buf 0 txt.pos;
  txt.buffer <- new_buf;
  txt.length <- new_len
31

32
let add_string txt s =
33
  let len = String.length s in
34
  let new_pos = txt.pos + len in
35
  if new_pos > txt.length then resize txt len;
36
37
  String.unsafe_blit s 0 txt.buffer txt.pos len;
  txt.pos <- new_pos
38
39
40
41
42

let rec only_ws s i =
  (i = 0) ||
  (let i = pred i in match (String.unsafe_get s i) with
     | ' ' | '\t' | '\n' | '\r' -> only_ws s i
43
     | _ -> false) 
44
45


46
47
48
49
let string s q =
  let s = Utf8.mk s in
  String_utf8 (Utf8.start_index s,Utf8.end_index s, s, q)

50

51
let attrib att = 
52
53
54
  (* TODO: better error message *)
  let att = List.map (fun (n,v) -> LabelPool.mk n, string_utf8 v) att in
  LabelMap.from_list (fun _ _ -> failwith "Invalid XML document: uniqueness of attributes") att
55

56
let elem (tag_ns,tag) att child =
57
  Xml (Atom (Atoms.V.mk tag_ns tag), Record (attrib att), child)
58

59
(*
60
class warner = object method warn w =  print_endline ("PXP WARNING: " ^ w) end
61
*)
62

63
64
type token = 
  | Element of Value.t 
65
  | Start of Ns.qname * (Ns.qname * Utf8.t) list * Ns.table
66
67
  | String of string

68
let stack = ref []
69
let ns_table = ref Ns.empty_table
70
71
72
73

let rec create_elt accu = function
  | String s :: st -> create_elt (string s accu) st
  | Element x :: st -> create_elt (Pair (x,accu)) st
74
75
76
  | Start (name,att,table) :: st -> 
      stack := Element (elem name att accu) :: st;
      ns_table := table
77
78
  | [] -> assert false

79
80
81
let start_element_handler name att =
  if not (only_ws txt.buffer txt.pos) then 
    stack := String (String.sub txt.buffer 0 txt.pos) :: !stack; 
82
83
84
85
86
  txt.pos <- 0;

  let (table,name,att) = Ns.process_start_tag !ns_table name att in
  stack := Start (name,att,!ns_table) :: !stack;
  ns_table := table
87
88
89
90
91
92
93
94

let end_element_handler _ =
  let accu =
    if only_ws txt.buffer txt.pos 
    then nil 
    else string (String.sub txt.buffer 0 txt.pos) nil in
  txt.pos <- 0; 
  create_elt accu !stack
95

96
97
ifdef EXPAT then

98
99
100
101
102
let load_expat =
  let buflen = 1024 in
  let buf = String.create buflen in
  fun s -> 
  let ic = 
103
    if Url.is_url s then
104
      let msg = 
105
        Printf.sprintf "load_xml, file \"%s\": URLs support is not available for expat, yet." s
106
107
      in
      raise (Location.Generic msg)
108
109
110
111
112
113
114
    else  
      try open_in s
      with exn ->
        let msg = 
	  Printf.sprintf "load_xml, file \"%s\": %s" s (Printexc.to_string exn)
        in
        raise (Location.Generic msg)
115
  in
116
  let p = Expat.parser_create "" in
117
118
  Expat.set_start_element_handler p start_element_handler;
  Expat.set_end_element_handler p end_element_handler;
119
  Expat.set_character_data_handler p (add_string txt);
120
121
  let rec loop () =
    let n = input ic buf 0 buflen in
122
    if (n > 0) then (Expat.parse_sub p buf 0 n; loop ()) 
123
124
125
126
127
128
129
  in
  try 
    loop(); 
    Expat.final p;
    close_in ic;
  with 
      Expat.Expat_error e -> 
130
131
132
133
134
135
136
137
138
139
140
141
	close_in ic;
	let line = Expat.get_current_line_number p 
	and col  = Expat.get_current_column_number p in
	let msg =
	  Printf.sprintf
	    "load_xml, file \"%s\", at line %i, column %i: %s"
	    s 
	    (Expat.get_current_line_number p)
	    (Expat.get_current_column_number p)
	    (Expat.xml_error_to_string e)
	in
	raise (Location.Generic msg)
142
143
144
145
146
else

let load_expat s =
  failwith "Expat support not included"

147
148
let pxp_handle_event = function
  | E_start_tag (name,att,_) -> start_element_handler name att
149
  | E_char_data data -> add_string txt data 
150
  | E_end_tag (_,_) -> end_element_handler ()
151
152
  | _ -> ()

153
154
155
156
157
158
159
let pxp_config = 
  { default_config with 
      (* warner = new warner; *)
      encoding = `Enc_utf8;
      store_element_positions = false;
      drop_ignorable_whitespace = true
  }
160

161
162
let load_pxp s =
  try
163
164
165
166
167
    let src = 
      match s with 
        | Url.Url s -> from_string s
        | Url.Filename s -> from_file s in
    let mgr = create_entity_manager pxp_config src in
168
    process_entity pxp_config (`Entry_document[`Extend_dtd_fully]) mgr pxp_handle_event;
169
170
  with exn ->
    raise (Location.Generic (Pxp_types.string_of_exn exn))
171

172
173


174
let load_xml s =
175
  Location.protect_op "load_xml";
176
177
178
  try
    (match !use_parser with
       | `Expat -> load_expat s
179
       | `Pxp -> load_pxp (Url.process s));
180
181
182
183
184
    match !stack with
      | [ Element x ] -> stack := []; x
      | _ -> assert false
  with e -> stack := []; txt.pos <-0; raise e

185
      
186
187
188
189

let load_html s =
  let rec val_of_doc q = function
    | Nethtml.Data data -> 
190
	if (only_ws data (String.length data)) then q else string data q
191
    | Nethtml.Element (tag, att, child) -> 
192
193
	let att = List.map (fun (n,v) -> ((Ns.empty, U.mk n), U.mk v)) att in
	Pair (elem (Ns.empty,U.mk tag) att (val_of_docs child), q)
194
195
196
197
198
  and val_of_docs = function
    | [] -> nil
    | h::t -> val_of_doc (val_of_docs t) h
  in	

199
  Location.protect_op "load_html";
200
201
202
203
204
205
206
207
208
209
210
  let parse src = Nethtml.parse_document ~dtd:Nethtml.relaxed_html40_dtd src in
  let doc = 
    match Url.process s with 
      | Url.Filename s ->
          let ic = open_in s in
          let doc = parse (Lexing.from_channel ic) in
          close_in ic;
	  doc
      | Url.Url s ->
          parse (Lexing.from_string s) 
  in
211
212
213
214
215
216
  let doc = Nethtml.decode ~subst:(fun _ -> "???") doc in
  let doc = Nethtml.map_list 
	      (Netconversion.convert ~in_enc:`Enc_iso88591
		 ~out_enc:`Enc_utf8) doc in
  val_of_docs doc

217
  
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232