load_xml.ml 3.8 KB
Newer Older
1
2
3
4
5
6
7
8
(* Loading XML documents *)

(*TODO: close the file ! *)

open Pxp_yacc
open Pxp_lexer_types
open Pxp_types
open Value
9
open Ident
10
open Encodings
11

12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
type buf =
    { mutable buffer : string;
      mutable pos : int;
      mutable length : int }

let create n = { buffer = String.create n; pos = 0; length = n }

let resize b n =
  let new_len = b.length * 2 + n in
  let new_buf = String.create new_len in
  String.unsafe_blit b.buffer 0 new_buf 0 b.pos;
  b.buffer <- new_buf;
  b.length <- new_len

let add_string b s =
  let len = String.length s in
  let new_pos = b.pos + len in
  if new_pos > b.length then resize b len;
  String.unsafe_blit s 0 b.buffer b.pos len;
  b.pos <- new_pos

let rec only_ws s i =
  (i = 0) ||
  (let i = pred i in match (String.unsafe_get s i) with
     | ' ' | '\t' | '\n' | '\r' -> only_ws s i
     | _ -> false)
38
39


40
41
42
43
let string s q =
  let s = Utf8.mk s in
  String_utf8 (Utf8.start_index s,Utf8.end_index s, s, q)

44

45
let attrib att = 
46
  let att = List.map (fun (l,v) -> LabelPool.mk (U.mk l), string v nil) att in
47
  LabelMap.from_list (fun _ _ -> assert false) att
48
49

let elem tag att child =
50
  Xml (Atom (Atoms.mk (U.mk tag)), Pair (Record (attrib att), child))
51

52
53
class warner = object method warn w =  print_endline ("PXP WARNING: " ^ w) end

54
55
56
57
58
type token = 
  | Element of Value.t 
  | Start of string * (string * string) list 
  | String of string

59
let load_xml_aux s =
60
  let config = { default_config with 
61
		   (* warner = new warner; *)
62
		   encoding = `Enc_utf8;
63
64
65
66
67
68
69
70
71
		   store_element_positions = false;
		   drop_ignorable_whitespace = true
	       }
  in
  let mgr = create_entity_manager config (from_file s) in
  let next_event = 
    create_pull_parser config (`Entry_document[]) mgr in
  let get () =
    match next_event () with 
72
      | Some (E_error exn) -> failwith (Pxp_types.string_of_exn exn)
73
74
      | Some E_end_of_stream | None -> failwith "Unexpected end of XML stream"
      | Some x -> x  in
75

76
  let txt = create 1024 in
77

78
79
80
81
82
83
84
85
  let rec create_elt accu = function
    | String s :: st -> create_elt (string s accu) st
    | Element x :: st -> create_elt (Pair (x,accu)) st
    | [ Start (name,att) ] -> elem name att accu
    | Start (name,att) :: st -> parse_seq (Element (elem name att accu) :: st)
    | [] -> assert false
  and parse_seq stack =
    match get() with
86
      | E_start_tag (name,att,_) ->
87
88
	  if only_ws txt.buffer txt.pos then
	    let () = txt.pos <- 0 in
89
	    parse_seq (Start (name,att) :: stack)
90
91
92
	  else
	    let s = String.sub txt.buffer 0 txt.pos in
	    let () = txt.pos <- 0 in
93
	    parse_seq (Start (name,att) :: String s :: stack)
94
      | E_char_data data ->
95
	  add_string txt data;
96
	  parse_seq stack
97
      | E_end_tag (_,_) ->
98
99
100
	  if only_ws txt.buffer txt.pos then 
	    (txt.pos <- 0; 
	     create_elt nil stack)
101
102
103
	  else
	    let s = String.sub txt.buffer 0 txt.pos in
	    txt.pos <- 0;
104
	    create_elt (string s nil) stack
105
106
      | _ -> failwith "Expect start_tag, char_data, or end_tag"
  in
107
108
109
110
  let rec parse_doc () =
    match get () with
      | E_start_tag (name,att,_) -> parse_seq [ Start (name,att) ]
      | _ -> parse_doc () in
111
112
  parse_doc ()

113

114
let load_xml s =
115
  Location.protect_op "load_xml";
116
  try load_xml_aux s 
117
118
119
120
  with exn ->
    raise
      (Location.Generic (Pxp_types.string_of_exn exn))
      
121
122
123
124

let load_html s =
  let rec val_of_doc q = function
    | Nethtml.Data data -> 
125
	if (only_ws data (String.length data)) then q else string data q
126
127
128
129
130
131
132
    | Nethtml.Element (tag, att, child) -> 
	Pair (elem tag att (val_of_docs child), q)
  and val_of_docs = function
    | [] -> nil
    | h::t -> val_of_doc (val_of_docs t) h
  in	

133
  Location.protect_op "load_html";
134
135
136
137
  let ic = open_in s in
  let doc = Nethtml.parse_document 
	      ~dtd:Nethtml.relaxed_html40_dtd 
	      (Lexing.from_channel ic) in
138
  let doc = Nethtml.decode ~enc:`Enc_utf8 ~subst:(fun _ -> "???") doc in
139
140
  close_in ic;
  val_of_docs doc
141