load_xml.ml 5.04 KB
Newer Older
1
2
(* Loading XML documents *)

3
4
5
6
7
8
ifdef EXPAT then
  let expat_support = true
else
  let expat_support = false

let use_parser = ref (if expat_support then `Expat else `Pxp)
9
10
11
12
13

open Pxp_yacc
open Pxp_lexer_types
open Pxp_types
open Value
14
open Ident
15
open Encodings
16

17
18
19
20
21
type buf =
    { mutable buffer : string;
      mutable pos : int;
      mutable length : int }

22
let txt = { buffer = String.create 1024; pos = 0; length = 1024 }
23

24
25
let resize n =
  let new_len = txt.length * 2 + n in
26
  let new_buf = String.create new_len in
27
28
29
  String.unsafe_blit txt.buffer 0 new_buf 0 txt.pos;
  txt.buffer <- new_buf;
  txt.length <- new_len
30

31
let add_string s =
32
  let len = String.length s in
33
34
35
36
  let new_pos = txt.pos + len in
  if new_pos > txt.length then resize len;
  String.unsafe_blit s 0 txt.buffer txt.pos len;
  txt.pos <- new_pos
37
38
39
40
41

let rec only_ws s i =
  (i = 0) ||
  (let i = pred i in match (String.unsafe_get s i) with
     | ' ' | '\t' | '\n' | '\r' -> only_ws s i
42
     | _ -> false) 
43
44


45
46
47
48
let string s q =
  let s = Utf8.mk s in
  String_utf8 (Utf8.start_index s,Utf8.end_index s, s, q)

49

50
let attrib att = 
51
52
53
  (* TODO: better error message *)
  let att = List.map (fun (n,v) -> LabelPool.mk n, string_utf8 v) att in
  LabelMap.from_list (fun _ _ -> failwith "Invalid XML document: uniqueness of attributes") att
54

55
let elem (tag_ns,tag) att child =
56
  Xml (Atom (Atoms.V.mk tag_ns tag), Record (attrib att), child)
57

58
(*
59
class warner = object method warn w =  print_endline ("PXP WARNING: " ^ w) end
60
*)
61

62
63
type token = 
  | Element of Value.t 
64
  | Start of Ns.qname * (Ns.qname * Utf8.t) list * Ns.table
65
66
  | String of string

67
let stack = ref []
68
let ns_table = ref Ns.empty_table
69
70
71
72

let rec create_elt accu = function
  | String s :: st -> create_elt (string s accu) st
  | Element x :: st -> create_elt (Pair (x,accu)) st
73
74
75
  | Start (name,att,table) :: st -> 
      stack := Element (elem name att accu) :: st;
      ns_table := table
76
77
  | [] -> assert false

78
79
80
let start_element_handler name att =
  if not (only_ws txt.buffer txt.pos) then 
    stack := String (String.sub txt.buffer 0 txt.pos) :: !stack; 
81
82
83
84
85
  txt.pos <- 0;

  let (table,name,att) = Ns.process_start_tag !ns_table name att in
  stack := Start (name,att,!ns_table) :: !stack;
  ns_table := table
86
87
88
89
90
91
92
93

let end_element_handler _ =
  let accu =
    if only_ws txt.buffer txt.pos 
    then nil 
    else string (String.sub txt.buffer 0 txt.pos) nil in
  txt.pos <- 0; 
  create_elt accu !stack
94

95
96
ifdef EXPAT then

97
98
99
100
101
102
103
104
105
106
107
108
let load_expat =
  let buflen = 1024 in
  let buf = String.create buflen in
  fun s -> 
  let ic = 
    try open_in s
    with exn ->
      let msg = 
	Printf.sprintf "load_xml, file \"%s\": %s" s (Printexc.to_string exn)
      in
      raise (Location.Generic msg)
  in
109
  let p = Expat.parser_create "" in
110
111
112
  Expat.set_start_element_handler p start_element_handler;
  Expat.set_end_element_handler p end_element_handler;
  Expat.set_character_data_handler p add_string;
113
114
  let rec loop () =
    let n = input ic buf 0 buflen in
115
    if (n > 0) then (Expat.parse_sub p buf 0 n; loop ()) 
116
117
118
119
120
121
122
  in
  try 
    loop(); 
    Expat.final p;
    close_in ic;
  with 
      Expat.Expat_error e -> 
123
124
125
126
127
128
129
130
131
132
133
134
	close_in ic;
	let line = Expat.get_current_line_number p 
	and col  = Expat.get_current_column_number p in
	let msg =
	  Printf.sprintf
	    "load_xml, file \"%s\", at line %i, column %i: %s"
	    s 
	    (Expat.get_current_line_number p)
	    (Expat.get_current_column_number p)
	    (Expat.xml_error_to_string e)
	in
	raise (Location.Generic msg)
135
136
137
138
139
else

let load_expat s =
  failwith "Expat support not included"

140
141
142
143
let pxp_handle_event = function
  | E_start_tag (name,att,_) -> start_element_handler name att
  | E_char_data data -> add_string data
  | E_end_tag (_,_) -> end_element_handler ()
144
145
  | _ -> ()

146
147
148
149
150
151
152
let pxp_config = 
  { default_config with 
      (* warner = new warner; *)
      encoding = `Enc_utf8;
      store_element_positions = false;
      drop_ignorable_whitespace = true
  }
153

154
155
156
let load_pxp s =
  try
    let mgr = create_entity_manager pxp_config (from_file s) in
157
    process_entity pxp_config (`Entry_document[`Extend_dtd_fully]) mgr pxp_handle_event;
158
159
  with exn ->
    raise (Location.Generic (Pxp_types.string_of_exn exn))
160

161
let load_xml s =
162
  Location.protect_op "load_xml";
163
164
165
166
167
168
169
170
171
  try
    (match !use_parser with
       | `Expat -> load_expat s
       | `Pxp -> load_pxp s);
    match !stack with
      | [ Element x ] -> stack := []; x
      | _ -> assert false
  with e -> stack := []; txt.pos <-0; raise e

172
      
173
174
175
176

let load_html s =
  let rec val_of_doc q = function
    | Nethtml.Data data -> 
177
	if (only_ws data (String.length data)) then q else string data q
178
    | Nethtml.Element (tag, att, child) -> 
179
180
	let att = List.map (fun (n,v) -> ((Ns.empty, U.mk n), U.mk v)) att in
	Pair (elem (Ns.empty,U.mk tag) att (val_of_docs child), q)
181
182
183
184
185
  and val_of_docs = function
    | [] -> nil
    | h::t -> val_of_doc (val_of_docs t) h
  in	

186
  Location.protect_op "load_html";
187
188
189
190
  let ic = open_in s in
  let doc = Nethtml.parse_document 
	      ~dtd:Nethtml.relaxed_html40_dtd 
	      (Lexing.from_channel ic) in
191
  let doc = Nethtml.decode ~enc:`Enc_utf8 ~subst:(fun _ -> "???") doc in
192
193
  close_in ic;
  val_of_docs doc
194