load_xml.ml 4.37 KB
Newer Older
1
2
(* Loading XML documents *)

3
let use_parser = ref `Pxp
4
5
6
7
8

open Pxp_yacc
open Pxp_lexer_types
open Pxp_types
open Value
9
open Ident
10
open Encodings
11

12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
type buf =
    { mutable buffer : string;
      mutable pos : int;
      mutable length : int }

let create n = { buffer = String.create n; pos = 0; length = n }

let resize b n =
  let new_len = b.length * 2 + n in
  let new_buf = String.create new_len in
  String.unsafe_blit b.buffer 0 new_buf 0 b.pos;
  b.buffer <- new_buf;
  b.length <- new_len

let add_string b s =
  let len = String.length s in
  let new_pos = b.pos + len in
  if new_pos > b.length then resize b len;
  String.unsafe_blit s 0 b.buffer b.pos len;
  b.pos <- new_pos

let rec only_ws s i =
  (i = 0) ||
  (let i = pred i in match (String.unsafe_get s i) with
     | ' ' | '\t' | '\n' | '\r' -> only_ws s i
37
     | _ -> false) 
38
39


40
41
42
43
let string s q =
  let s = Utf8.mk s in
  String_utf8 (Utf8.start_index s,Utf8.end_index s, s, q)

44

45
let attrib att = 
46
  let att = List.map (fun (l,v) -> LabelPool.mk (U.mk l), string v nil) att in
47
  LabelMap.from_list (fun _ _ -> assert false) att
48
49

let elem tag att child =
50
  Xml (Atom (Atoms.mk (U.mk tag)), Record (attrib att), child)
51

52
(*
53
class warner = object method warn w =  print_endline ("PXP WARNING: " ^ w) end
54
*)
55

56
57
58
59
60
type token = 
  | Element of Value.t 
  | Start of string * (string * string) list 
  | String of string

61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
let stack = ref []
let txt = create 1024


let rec create_elt accu = function
  | String s :: st -> create_elt (string s accu) st
  | Element x :: st -> create_elt (Pair (x,accu)) st
  | Start (name,att) :: st -> stack := Element (elem name att accu) :: st
  | [] -> assert false


let buflen = 1000
let buf = String.create buflen

let load_expat s =
  let p = Expat.parser_create "" in
  Expat.set_start_element_handler p 
    (fun name att -> 
      if not (only_ws txt.buffer txt.pos) then 
	stack := String (String.sub txt.buffer 0 txt.pos) :: !stack; 
      txt.pos <- 0;   
      stack := Start (name,att) :: !stack);
  Expat.set_end_element_handler p 
    (fun _ -> 
      let accu =
	if only_ws txt.buffer txt.pos 
	then nil 
	else string (String.sub txt.buffer 0 txt.pos) nil in
      txt.pos <- 0; 
      create_elt accu !stack);
  Expat.set_character_data_handler p (add_string txt);
  let ic = open_in s in
  let rec loop () =
    let n = input ic buf 0 buflen in
    if (n > 0) then 
(*(Expat.parse p (String.sub buf 0 n); loop ())*)
 (Expat.parse_sub p buf 0 n; loop ()) 
  in
  try 
    loop(); 
    Expat.final p;
    close_in ic;
    match !stack with
      | [ Element x ] -> stack := []; x
      | _ -> assert false
  with 
      Expat.Expat_error e -> 
	failwith ("Expat ("^s^"):"^Expat.xml_error_to_string e)


let handle_event = function
  | E_start_tag (name,att,_) ->
      if not (only_ws txt.buffer txt.pos) then 
	stack := String (String.sub txt.buffer 0 txt.pos) :: !stack; 
      txt.pos <- 0;   
      stack := Start (name,att) :: !stack
  | E_char_data data ->
      add_string txt data 
  | E_end_tag (_,_) ->
      let accu =
	if only_ws txt.buffer txt.pos 
	then nil 
	else string (String.sub txt.buffer 0 txt.pos) nil in
      txt.pos <- 0; 
      create_elt accu !stack
  | _ -> ()

let load_pxp s =
129
  let config = { default_config with 
130
		   (* warner = new warner; *)
131
		   encoding = `Enc_utf8;
132
133
134
135
136
		   store_element_positions = false;
		   drop_ignorable_whitespace = true
	       }
  in
  let mgr = create_entity_manager config (from_file s) in
137
138
139
140
141
142
143
144
145
  process_entity config (`Entry_document[]) mgr handle_event;
  match !stack with
    | [ Element x ] -> stack := []; x
    | _ -> assert false

let load_xml_aux s = 
  match !use_parser with
    | `Expat -> load_expat s
    | `Pxp -> load_pxp s
146

147
let load_xml s =
148
  Location.protect_op "load_xml";
149
  try load_xml_aux s 
150
151
152
153
  with exn ->
    raise
      (Location.Generic (Pxp_types.string_of_exn exn))
      
154
155
156
157

let load_html s =
  let rec val_of_doc q = function
    | Nethtml.Data data -> 
158
	if (only_ws data (String.length data)) then q else string data q
159
160
161
162
163
164
165
    | Nethtml.Element (tag, att, child) -> 
	Pair (elem tag att (val_of_docs child), q)
  and val_of_docs = function
    | [] -> nil
    | h::t -> val_of_doc (val_of_docs t) h
  in	

166
  Location.protect_op "load_html";
167
168
169
170
  let ic = open_in s in
  let doc = Nethtml.parse_document 
	      ~dtd:Nethtml.relaxed_html40_dtd 
	      (Lexing.from_channel ic) in
171
  let doc = Nethtml.decode ~enc:`Enc_utf8 ~subst:(fun _ -> "???") doc in
172
173
  close_in ic;
  val_of_docs doc
174