load_xml.ml 2.78 KB
Newer Older
1
2
3
4
5
6
7
8
(* Loading XML documents *)

(*TODO: close the file ! *)

open Pxp_yacc
open Pxp_lexer_types
open Pxp_types
open Value
9
open Ident
10

11
12
13
14
15
16
17
18
19
let is_ws s =
  let rec check i = 
    (i < 0) || 
    (match s.[i] with
       | ' ' | '\t' | '\n' | '\r' -> check (i - 1) 
       | _ -> false) in
  check (String.length s - 1)


20
let string s q =
21
  String (0,String.length s,s,q)
22

23
let attrib att = 
24
25
  let att = List.map (fun (l,v) -> LabelPool.mk l, string v nil) att in
  LabelMap.from_list (fun _ _ -> assert false) att
26
27

let elem tag att child =
28
  Xml (Atom (Atoms.mk tag), Pair (Record (attrib att), child))
29
30

let load_xml_aux s =
31
32
33
34
35
36
37
38
39
40
41
  let config = { default_config with 
		   store_element_positions = false;
		   drop_ignorable_whitespace = true
	       }
  in
  let mgr = create_entity_manager config (from_file s) in
  let next_event = 
    create_pull_parser config (`Entry_document[]) mgr in
  let curr = ref E_end_of_stream in
  let get () =
    match next_event () with 
42
43
      | Some (E_error exn) -> failwith (Pxp_types.string_of_exn exn)
      | Some E_end_of_stream -> failwith "Unexpected end of XML stream"
44
45
46
      | Some x -> curr := x 
      | None -> () in

47
48
  let txt = Buffer.create 1024 in

49
  let rec parse_elt name att =
50
    let elt = elem name att (parse_seq ()) in
51
52
53
54
55
    (match !curr with
       | E_end_tag (_,_) -> get ()
       | _ -> failwith "Expect end_tag");
    elt
      
56
57
58
59
60

  and dump_txt q =
    let data = Buffer.contents txt in
    Buffer.clear txt;
    if (is_ws data) then q () else string data (q ())
61
      
62
  and parse_seq () =
63
    match !curr with
64
      | E_start_tag (name,att,_) ->
65
	  get ();
66
67
68
69
70
	  dump_txt (fun () ->
		      let e1 = parse_elt name att in
		      let rest = parse_seq () in
		      Pair (e1, rest)
		   )
71
      | E_char_data data ->
72
73
74
	  get();
	  Buffer.add_string txt data;
	  parse_seq ()
75
      | E_end_tag (_,_) ->
76
	  dump_txt (fun () -> nil)
77
78
79
80
81
82
83
84
      | _ -> failwith "Expect start_tag, char_data, or end_tag"
	  
  and parse_doc () =
    match !curr with
      | E_start_tag (name,att,_) ->  get (); parse_elt name att
      | _ -> get (); parse_doc ()
  in
  get ();
85
86
  parse_doc ()

87

88
let load_xml s =
89
  Location.protect_op "load_xml";
90
  try load_xml_aux s 
91
92
93
94
  with exn ->
    raise
      (Location.Generic (Pxp_types.string_of_exn exn))
      
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114

let load_html s =
  let rec val_of_doc q = function
    | Nethtml.Data data -> 
	if (is_ws data) then q else string data q
    | Nethtml.Element (tag, att, child) -> 
	Pair (elem tag att (val_of_docs child), q)
  and val_of_docs = function
    | [] -> nil
    | h::t -> val_of_doc (val_of_docs t) h
  in	

  Location.protect_op "load_xml";
  let ic = open_in s in
  let doc = Nethtml.parse_document 
	      ~dtd:Nethtml.relaxed_html40_dtd 
	      (Lexing.from_channel ic) in
  let doc = Nethtml.decode ~subst:(fun _ -> "???") doc in
  close_in ic;
  val_of_docs doc