load_xml.ml 4.5 KB
Newer Older
1
2
(* Loading XML documents *)

3
let use_parser = ref `Pxp
4
5
6
7
8

open Pxp_yacc
open Pxp_lexer_types
open Pxp_types
open Value
9
open Ident
10
open Encodings
11

12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
type buf =
    { mutable buffer : string;
      mutable pos : int;
      mutable length : int }

let create n = { buffer = String.create n; pos = 0; length = n }

let resize b n =
  let new_len = b.length * 2 + n in
  let new_buf = String.create new_len in
  String.unsafe_blit b.buffer 0 new_buf 0 b.pos;
  b.buffer <- new_buf;
  b.length <- new_len

let add_string b s =
  let len = String.length s in
  let new_pos = b.pos + len in
  if new_pos > b.length then resize b len;
  String.unsafe_blit s 0 b.buffer b.pos len;
  b.pos <- new_pos

let rec only_ws s i =
  (i = 0) ||
  (let i = pred i in match (String.unsafe_get s i) with
     | ' ' | '\t' | '\n' | '\r' -> only_ws s i
37
     | _ -> false) 
38
39


40
41
42
43
let string s q =
  let s = Utf8.mk s in
  String_utf8 (Utf8.start_index s,Utf8.end_index s, s, q)

44

45
let attrib att = 
46
  let att = List.map (fun (l,v) -> LabelPool.mk (U.mk l), string v nil) att in
47
  LabelMap.from_list (fun _ _ -> assert false) att
48
49

let elem tag att child =
50
  Xml (Atom (Atoms.mk (U.mk tag)), Record (attrib att), child)
51

52
(*
53
class warner = object method warn w =  print_endline ("PXP WARNING: " ^ w) end
54
*)
55

56
57
58
59
60
type token = 
  | Element of Value.t 
  | Start of string * (string * string) list 
  | String of string

61
62
63
64
65
66
67
68
69
70
71
72
73
74
let stack = ref []
let txt = create 1024


let rec create_elt accu = function
  | String s :: st -> create_elt (string s accu) st
  | Element x :: st -> create_elt (Pair (x,accu)) st
  | Start (name,att) :: st -> stack := Element (elem name att accu) :: st
  | [] -> assert false


let buflen = 1000
let buf = String.create buflen

75
76
ifdef EXPAT then

77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
let load_expat s =
  let p = Expat.parser_create "" in
  Expat.set_start_element_handler p 
    (fun name att -> 
      if not (only_ws txt.buffer txt.pos) then 
	stack := String (String.sub txt.buffer 0 txt.pos) :: !stack; 
      txt.pos <- 0;   
      stack := Start (name,att) :: !stack);
  Expat.set_end_element_handler p 
    (fun _ -> 
      let accu =
	if only_ws txt.buffer txt.pos 
	then nil 
	else string (String.sub txt.buffer 0 txt.pos) nil in
      txt.pos <- 0; 
      create_elt accu !stack);
  Expat.set_character_data_handler p (add_string txt);
94
95
(*  Gc.full_major (); *)
(*   Gc.compact (); *)
96
97
98
99
  let ic = open_in s in
  let rec loop () =
    let n = input ic buf 0 buflen in
    if (n > 0) then 
100
101
(*(Expat.parse p (String.sub buf 0 n); loop ()) *)
      (Expat.parse_sub p buf 0 n; loop ()) 
102
103
104
105
106
107
108
109
110
111
112
113
  in
  try 
    loop(); 
    Expat.final p;
    close_in ic;
    match !stack with
      | [ Element x ] -> stack := []; x
      | _ -> assert false
  with 
      Expat.Expat_error e -> 
	failwith ("Expat ("^s^"):"^Expat.xml_error_to_string e)

114
115
116
117
118
119
else

let load_expat s =
  failwith "Expat support not included"


120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138

let handle_event = function
  | E_start_tag (name,att,_) ->
      if not (only_ws txt.buffer txt.pos) then 
	stack := String (String.sub txt.buffer 0 txt.pos) :: !stack; 
      txt.pos <- 0;   
      stack := Start (name,att) :: !stack
  | E_char_data data ->
      add_string txt data 
  | E_end_tag (_,_) ->
      let accu =
	if only_ws txt.buffer txt.pos 
	then nil 
	else string (String.sub txt.buffer 0 txt.pos) nil in
      txt.pos <- 0; 
      create_elt accu !stack
  | _ -> ()

let load_pxp s =
139
  let config = { default_config with 
140
		   (* warner = new warner; *)
141
		   encoding = `Enc_utf8;
142
143
144
145
146
		   store_element_positions = false;
		   drop_ignorable_whitespace = true
	       }
  in
  let mgr = create_entity_manager config (from_file s) in
147
148
149
150
151
152
153
154
155
  process_entity config (`Entry_document[]) mgr handle_event;
  match !stack with
    | [ Element x ] -> stack := []; x
    | _ -> assert false

let load_xml_aux s = 
  match !use_parser with
    | `Expat -> load_expat s
    | `Pxp -> load_pxp s
156

157
let load_xml s =
158
  Location.protect_op "load_xml";
159
  try load_xml_aux s 
160
161
162
163
  with exn ->
    raise
      (Location.Generic (Pxp_types.string_of_exn exn))
      
164
165
166
167

let load_html s =
  let rec val_of_doc q = function
    | Nethtml.Data data -> 
168
	if (only_ws data (String.length data)) then q else string data q
169
170
171
172
173
174
175
    | Nethtml.Element (tag, att, child) -> 
	Pair (elem tag att (val_of_docs child), q)
  and val_of_docs = function
    | [] -> nil
    | h::t -> val_of_doc (val_of_docs t) h
  in	

176
  Location.protect_op "load_html";
177
178
179
180
  let ic = open_in s in
  let doc = Nethtml.parse_document 
	      ~dtd:Nethtml.relaxed_html40_dtd 
	      (Lexing.from_channel ic) in
181
  let doc = Nethtml.decode ~enc:`Enc_utf8 ~subst:(fun _ -> "???") doc in
182
183
  close_in ic;
  val_of_docs doc
184