load_xml.ml 2.77 KB
Newer Older
1
2
3
4
5
6
7
8
9
(* Loading XML documents *)

(*TODO: close the file ! *)

open Pxp_yacc
open Pxp_lexer_types
open Pxp_types
open Value

10
11
12
13
14
15
16
17
18
let is_ws s =
  let rec check i = 
    (i < 0) || 
    (match s.[i] with
       | ' ' | '\t' | '\n' | '\r' -> check (i - 1) 
       | _ -> false) in
  check (String.length s - 1)


19
let string s q =
20
  String (0,String.length s,s,q)
21

22
23
24
25
26
let attrib att = 
  let att = List.map (fun (l,v) -> Types.LabelPool.mk l, string v nil) att in
  SortedMap.from_list (fun _ _ -> assert false) att

let elem tag att child =
27
  Xml (Atom (Atoms.mk tag), Pair (Record (attrib att), child))
28
29

let load_xml_aux s =
30
31
32
33
34
35
36
37
38
39
40
  let config = { default_config with 
		   store_element_positions = false;
		   drop_ignorable_whitespace = true
	       }
  in
  let mgr = create_entity_manager config (from_file s) in
  let next_event = 
    create_pull_parser config (`Entry_document[]) mgr in
  let curr = ref E_end_of_stream in
  let get () =
    match next_event () with 
41
42
      | Some (E_error exn) -> failwith (Pxp_types.string_of_exn exn)
      | Some E_end_of_stream -> failwith "Unexpected end of XML stream"
43
44
45
      | Some x -> curr := x 
      | None -> () in

46
47
  let txt = Buffer.create 1024 in

48
  let rec parse_elt name att =
49
    let elt = elem name att (parse_seq ()) in
50
51
52
53
54
    (match !curr with
       | E_end_tag (_,_) -> get ()
       | _ -> failwith "Expect end_tag");
    elt
      
55
56
57
58
59

  and dump_txt q =
    let data = Buffer.contents txt in
    Buffer.clear txt;
    if (is_ws data) then q () else string data (q ())
60
      
61
  and parse_seq () =
62
    match !curr with
63
      | E_start_tag (name,att,_) ->
64
	  get ();
65
66
67
68
69
	  dump_txt (fun () ->
		      let e1 = parse_elt name att in
		      let rest = parse_seq () in
		      Pair (e1, rest)
		   )
70
      | E_char_data data ->
71
72
73
	  get();
	  Buffer.add_string txt data;
	  parse_seq ()
74
      | E_end_tag (_,_) ->
75
	  dump_txt (fun () -> nil)
76
77
78
79
80
81
82
83
      | _ -> failwith "Expect start_tag, char_data, or end_tag"
	  
  and parse_doc () =
    match !curr with
      | E_start_tag (name,att,_) ->  get (); parse_elt name att
      | _ -> get (); parse_doc ()
  in
  get ();
84
85
  parse_doc ()

86

87
let load_xml s =
88
  Location.protect_op "load_xml";
89
  try load_xml_aux s 
90
91
92
93
  with exn ->
    raise
      (Location.Generic (Pxp_types.string_of_exn exn))
      
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113

let load_html s =
  let rec val_of_doc q = function
    | Nethtml.Data data -> 
	if (is_ws data) then q else string data q
    | Nethtml.Element (tag, att, child) -> 
	Pair (elem tag att (val_of_docs child), q)
  and val_of_docs = function
    | [] -> nil
    | h::t -> val_of_doc (val_of_docs t) h
  in	

  Location.protect_op "load_xml";
  let ic = open_in s in
  let doc = Nethtml.parse_document 
	      ~dtd:Nethtml.relaxed_html40_dtd 
	      (Lexing.from_channel ic) in
  let doc = Nethtml.decode ~subst:(fun _ -> "???") doc in
  close_in ic;
  val_of_docs doc