load_xml.ml 3.7 KB
Newer Older
1
2
3
4
5
6
7
8
(* Loading XML documents *)

(*TODO: close the file ! *)

open Pxp_yacc
open Pxp_lexer_types
open Pxp_types
open Value
9
open Ident
10
open Encodings
11

12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
type buf =
    { mutable buffer : string;
      mutable pos : int;
      mutable length : int }

let create n = { buffer = String.create n; pos = 0; length = n }

let resize b n =
  let new_len = b.length * 2 + n in
  let new_buf = String.create new_len in
  String.unsafe_blit b.buffer 0 new_buf 0 b.pos;
  b.buffer <- new_buf;
  b.length <- new_len

let add_string b s =
  let len = String.length s in
  let new_pos = b.pos + len in
  if new_pos > b.length then resize b len;
  String.unsafe_blit s 0 b.buffer b.pos len;
  b.pos <- new_pos

let rec only_ws s i =
  (i = 0) ||
  (let i = pred i in match (String.unsafe_get s i) with
     | ' ' | '\t' | '\n' | '\r' -> only_ws s i
     | _ -> false)
38
39


40
41
42
43
let string s q =
  let s = Utf8.mk s in
  String_utf8 (Utf8.start_index s,Utf8.end_index s, s, q)

44

45
let attrib att = 
46
  let att = List.map (fun (l,v) -> LabelPool.mk (U.mk l), string v nil) att in
47
  LabelMap.from_list (fun _ _ -> assert false) att
48
49

let elem tag att child =
50
  Xml (Atom (Atoms.mk (U.mk tag)), Pair (Record (attrib att), child))
51

52
53
class warner = object method warn w =  print_endline ("PXP WARNING: " ^ w) end

54
55
56
57
58
type token = 
  | Element of Value.t 
  | Start of string * (string * string) list 
  | String of string

59
let load_xml_aux s =
60
  let config = { default_config with 
61
		   (* warner = new warner; *)
62
		   encoding = `Enc_utf8;
63
64
65
66
67
68
69
70
71
		   store_element_positions = false;
		   drop_ignorable_whitespace = true
	       }
  in
  let mgr = create_entity_manager config (from_file s) in
  let next_event = 
    create_pull_parser config (`Entry_document[]) mgr in
  let get () =
    match next_event () with 
72
      | Some (E_error exn) -> failwith (Pxp_types.string_of_exn exn)
73
74
      | Some E_end_of_stream | None -> failwith "Unexpected end of XML stream"
      | Some x -> x  in
75

76
  let txt = create 1024 in
77

78
79
80
81
82
83
84
85
  let rec create_elt accu = function
    | String s :: st -> create_elt (string s accu) st
    | Element x :: st -> create_elt (Pair (x,accu)) st
    | [ Start (name,att) ] -> elem name att accu
    | Start (name,att) :: st -> parse_seq (Element (elem name att accu) :: st)
    | [] -> assert false
  and parse_seq stack =
    match get() with
86
      | E_start_tag (name,att,_) ->
87
88
89
90
91
	  let stack =
	    if only_ws txt.buffer txt.pos then stack 
	    else String (String.sub txt.buffer 0 txt.pos) :: stack in
	  txt.pos <- 0; 
	  parse_seq (Start (name,att) :: stack)
92
      | E_char_data data ->
93
	  add_string txt data;
94
	  parse_seq stack
95
      | E_end_tag (_,_) ->
96
97
98
99
100
101
	  let accu = 
	    if only_ws txt.buffer txt.pos 
	    then nil 
	    else string (String.sub txt.buffer 0 txt.pos) nil in
	  txt.pos <- 0; 
	  create_elt accu stack
102
103
      | _ -> failwith "Expect start_tag, char_data, or end_tag"
  in
104
105
106
107
  let rec parse_doc () =
    match get () with
      | E_start_tag (name,att,_) -> parse_seq [ Start (name,att) ]
      | _ -> parse_doc () in
108
109
  parse_doc ()

110

111
let load_xml s =
112
  Location.protect_op "load_xml";
113
  try load_xml_aux s 
114
115
116
117
  with exn ->
    raise
      (Location.Generic (Pxp_types.string_of_exn exn))
      
118
119
120
121

let load_html s =
  let rec val_of_doc q = function
    | Nethtml.Data data -> 
122
	if (only_ws data (String.length data)) then q else string data q
123
124
125
126
127
128
129
    | Nethtml.Element (tag, att, child) -> 
	Pair (elem tag att (val_of_docs child), q)
  and val_of_docs = function
    | [] -> nil
    | h::t -> val_of_doc (val_of_docs t) h
  in	

130
  Location.protect_op "load_html";
131
132
133
134
  let ic = open_in s in
  let doc = Nethtml.parse_document 
	      ~dtd:Nethtml.relaxed_html40_dtd 
	      (Lexing.from_channel ic) in
135
  let doc = Nethtml.decode ~enc:`Enc_utf8 ~subst:(fun _ -> "???") doc in
136
137
  close_in ic;
  val_of_docs doc
138