load_xml.ml 3.73 KB
Newer Older
1
2
3
4
5
6
7
8
(* Loading XML documents *)

(*TODO: close the file ! *)

open Pxp_yacc
open Pxp_lexer_types
open Pxp_types
open Value
9
open Ident
10
open Encodings
11

12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
type buf =
    { mutable buffer : string;
      mutable pos : int;
      mutable length : int }

let create n = { buffer = String.create n; pos = 0; length = n }

let resize b n =
  let new_len = b.length * 2 + n in
  let new_buf = String.create new_len in
  String.unsafe_blit b.buffer 0 new_buf 0 b.pos;
  b.buffer <- new_buf;
  b.length <- new_len

let add_string b s =
  let len = String.length s in
  let new_pos = b.pos + len in
  if new_pos > b.length then resize b len;
  String.unsafe_blit s 0 b.buffer b.pos len;
  b.pos <- new_pos

let rec only_ws s i =
  (i = 0) ||
  (let i = pred i in match (String.unsafe_get s i) with
     | ' ' | '\t' | '\n' | '\r' -> only_ws s i
     | _ -> false)
38
39


40
41
42
43
let string s q =
  let s = Utf8.mk s in
  String_utf8 (Utf8.start_index s,Utf8.end_index s, s, q)

44

45
let attrib att = 
46
  let att = List.map (fun (l,v) -> LabelPool.mk (U.mk l), string v nil) att in
47
  LabelMap.from_list (fun _ _ -> assert false) att
48
49

let elem tag att child =
50
  Xml (Atom (Atoms.mk (U.mk tag)), Pair (Record (attrib att), child))
51

52
53
class warner = object method warn w =  print_endline ("PXP WARNING: " ^ w) end

54
let load_xml_aux s =
55
  let config = { default_config with 
56
		   (* warner = new warner; *)
57
		   encoding = `Enc_utf8;
58
59
60
61
62
63
64
65
66
67
		   store_element_positions = false;
		   drop_ignorable_whitespace = true
	       }
  in
  let mgr = create_entity_manager config (from_file s) in
  let next_event = 
    create_pull_parser config (`Entry_document[]) mgr in
  let curr = ref E_end_of_stream in
  let get () =
    match next_event () with 
68
69
      | Some (E_error exn) -> failwith (Pxp_types.string_of_exn exn)
      | Some E_end_of_stream -> failwith "Unexpected end of XML stream"
70
71
72
      | Some x -> curr := x 
      | None -> () in

73
  let txt = create 1024 in
74

75
  let rec parse_elt name att =
76
    let elt = elem name att (parse_seq ()) in
77
78
79
80
81
    (match !curr with
       | E_end_tag (_,_) -> get ()
       | _ -> failwith "Expect end_tag");
    elt
      
82
83

  and parse_seq () =
84
    match !curr with
85
      | E_start_tag (name,att,_) ->
86
	  get ();
87
88
89
90
91
92
93
94
95
96
97
98
99
	  if only_ws txt.buffer txt.pos then
	    let () = txt.pos <- 0 in
	    let e1 = parse_elt name att in
	    let rest = parse_seq () in
	    Pair (e1,rest)
	  else
	    let s = String.sub txt.buffer 0 txt.pos in
	    let () = txt.pos <- 0 in
	    let e1 = parse_elt name att in
	    let rest = parse_seq () in
	    let q = Pair (e1,rest) in
	    string s q

100
      | E_char_data data ->
101
	  get();
102
	  add_string txt data;
103
	  parse_seq ()
104

105
      | E_end_tag (_,_) ->
106
107
108
109
110
111
	  if only_ws txt.buffer txt.pos then (txt.pos <- 0; nil)
	  else
	    let s = String.sub txt.buffer 0 txt.pos in
	    txt.pos <- 0;
	    string s nil

112
113
114
115
116
117
118
119
      | _ -> failwith "Expect start_tag, char_data, or end_tag"
	  
  and parse_doc () =
    match !curr with
      | E_start_tag (name,att,_) ->  get (); parse_elt name att
      | _ -> get (); parse_doc ()
  in
  get ();
120
121
  parse_doc ()

122

123
let load_xml s =
124
  Location.protect_op "load_xml";
125
  try load_xml_aux s 
126
127
128
129
  with exn ->
    raise
      (Location.Generic (Pxp_types.string_of_exn exn))
      
130
131
132
133

let load_html s =
  let rec val_of_doc q = function
    | Nethtml.Data data -> 
134
	if (only_ws data (String.length data)) then q else string data q
135
136
137
138
139
140
141
    | Nethtml.Element (tag, att, child) -> 
	Pair (elem tag att (val_of_docs child), q)
  and val_of_docs = function
    | [] -> nil
    | h::t -> val_of_doc (val_of_docs t) h
  in	

142
  Location.protect_op "load_html";
143
144
145
146
  let ic = open_in s in
  let doc = Nethtml.parse_document 
	      ~dtd:Nethtml.relaxed_html40_dtd 
	      (Lexing.from_channel ic) in
147
  let doc = Nethtml.decode ~enc:`Enc_utf8 ~subst:(fun _ -> "???") doc in
148
149
  close_in ic;
  val_of_docs doc
150