load_xml.ml 3.71 KB
Newer Older
1
2
3
4
5
6
7
8
(* Loading XML documents *)

(*TODO: close the file ! *)

open Pxp_yacc
open Pxp_lexer_types
open Pxp_types
open Value
9
open Ident
10
open Encodings
11

12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
type buf =
    { mutable buffer : string;
      mutable pos : int;
      mutable length : int }

let create n = { buffer = String.create n; pos = 0; length = n }

let resize b n =
  let new_len = b.length * 2 + n in
  let new_buf = String.create new_len in
  String.unsafe_blit b.buffer 0 new_buf 0 b.pos;
  b.buffer <- new_buf;
  b.length <- new_len

let add_string b s =
  let len = String.length s in
  let new_pos = b.pos + len in
  if new_pos > b.length then resize b len;
  String.unsafe_blit s 0 b.buffer b.pos len;
  b.pos <- new_pos

let rec only_ws s i =
  (i = 0) ||
  (let i = pred i in match (String.unsafe_get s i) with
     | ' ' | '\t' | '\n' | '\r' -> only_ws s i
     | _ -> false)
38
39


40
41
42
43
let string s q =
  let s = Utf8.mk s in
  String_utf8 (Utf8.start_index s,Utf8.end_index s, s, q)

44

45
let attrib att = 
46
  let att = List.map (fun (l,v) -> LabelPool.mk (U.mk l), string v nil) att in
47
  LabelMap.from_list (fun _ _ -> assert false) att
48
49

let elem tag att child =
50
  Xml (Atom (Atoms.mk (U.mk tag)), Pair (Record (attrib att), child))
51

52
(*
53
class warner = object method warn w =  print_endline ("PXP WARNING: " ^ w) end
54
*)
55

56
57
58
59
60
type token = 
  | Element of Value.t 
  | Start of string * (string * string) list 
  | String of string

61
let load_xml_aux s =
62
  let config = { default_config with 
63
		   (* warner = new warner; *)
64
		   encoding = `Enc_utf8;
65
66
67
68
69
70
71
72
73
		   store_element_positions = false;
		   drop_ignorable_whitespace = true
	       }
  in
  let mgr = create_entity_manager config (from_file s) in
  let next_event = 
    create_pull_parser config (`Entry_document[]) mgr in
  let get () =
    match next_event () with 
74
      | Some (E_error exn) -> failwith (Pxp_types.string_of_exn exn)
75
76
      | Some E_end_of_stream | None -> failwith "Unexpected end of XML stream"
      | Some x -> x  in
77

78
  let txt = create 1024 in
79

80
81
82
83
84
85
86
87
  let rec create_elt accu = function
    | String s :: st -> create_elt (string s accu) st
    | Element x :: st -> create_elt (Pair (x,accu)) st
    | [ Start (name,att) ] -> elem name att accu
    | Start (name,att) :: st -> parse_seq (Element (elem name att accu) :: st)
    | [] -> assert false
  and parse_seq stack =
    match get() with
88
      | E_start_tag (name,att,_) ->
89
90
91
92
93
	  let stack =
	    if only_ws txt.buffer txt.pos then stack 
	    else String (String.sub txt.buffer 0 txt.pos) :: stack in
	  txt.pos <- 0; 
	  parse_seq (Start (name,att) :: stack)
94
      | E_char_data data ->
95
	  add_string txt data;
96
	  parse_seq stack
97
      | E_end_tag (_,_) ->
98
99
100
101
102
103
	  let accu = 
	    if only_ws txt.buffer txt.pos 
	    then nil 
	    else string (String.sub txt.buffer 0 txt.pos) nil in
	  txt.pos <- 0; 
	  create_elt accu stack
104
105
      | _ -> failwith "Expect start_tag, char_data, or end_tag"
  in
106
107
108
109
  let rec parse_doc () =
    match get () with
      | E_start_tag (name,att,_) -> parse_seq [ Start (name,att) ]
      | _ -> parse_doc () in
110
111
  parse_doc ()

112

113
let load_xml s =
114
  Location.protect_op "load_xml";
115
  try load_xml_aux s 
116
117
118
119
  with exn ->
    raise
      (Location.Generic (Pxp_types.string_of_exn exn))
      
120
121
122
123

let load_html s =
  let rec val_of_doc q = function
    | Nethtml.Data data -> 
124
	if (only_ws data (String.length data)) then q else string data q
125
126
127
128
129
130
131
    | Nethtml.Element (tag, att, child) -> 
	Pair (elem tag att (val_of_docs child), q)
  and val_of_docs = function
    | [] -> nil
    | h::t -> val_of_doc (val_of_docs t) h
  in	

132
  Location.protect_op "load_html";
133
134
135
136
  let ic = open_in s in
  let doc = Nethtml.parse_document 
	      ~dtd:Nethtml.relaxed_html40_dtd 
	      (Lexing.from_channel ic) in
137
  let doc = Nethtml.decode ~enc:`Enc_utf8 ~subst:(fun _ -> "???") doc in
138
139
  close_in ic;
  val_of_docs doc
140