(* Loading XML documents *) ifdef EXPAT then let expat_support = true else let expat_support = false let use_parser = ref (if expat_support then `Expat else `Pxp) open Pxp_yacc open Pxp_lexer_types open Pxp_types open Value open Ident open Encodings open List open Url type buf = { mutable buffer : string; mutable pos : int; mutable length : int } let txt = { buffer = String.create 1024; pos = 0; length = 1024 } let resize txt n = let new_len = txt.length * 2 + n in let new_buf = String.create new_len in String.unsafe_blit txt.buffer 0 new_buf 0 txt.pos; txt.buffer <- new_buf; txt.length <- new_len let add_string txt s = let len = String.length s in let new_pos = txt.pos + len in if new_pos > txt.length then resize txt len; String.unsafe_blit s 0 txt.buffer txt.pos len; txt.pos <- new_pos let rec only_ws s i = (i = 0) || (let i = pred i in match (String.unsafe_get s i) with | ' ' | '\t' | '\n' | '\r' -> only_ws s i | _ -> false) let string s q = let s = Utf8.mk s in String_utf8 (Utf8.start_index s,Utf8.end_index s, s, q) let attrib att = (* TODO: better error message *) let att = List.map (fun (n,v) -> LabelPool.mk n, string_utf8 v) att in LabelMap.from_list (fun _ _ -> failwith "Invalid XML document: uniqueness of attributes") att let elem (tag_ns,tag) att child = Xml (Atom (Atoms.V.mk tag_ns tag), Record (attrib att), child) (* class warner = object method warn w = print_endline ("PXP WARNING: " ^ w) end *) type token = | Element of Value.t | Start of Ns.qname * (Ns.qname * Utf8.t) list * Ns.table | String of string let stack = ref [] let ns_table = ref Ns.empty_table let rec create_elt accu = function | String s :: st -> create_elt (string s accu) st | Element x :: st -> create_elt (Pair (x,accu)) st | Start (name,att,table) :: st -> stack := Element (elem name att accu) :: st; ns_table := table | [] -> assert false let start_element_handler name att = if not (only_ws txt.buffer txt.pos) then stack := String (String.sub txt.buffer 0 txt.pos) :: !stack; txt.pos <- 0; let (table,name,att) = Ns.process_start_tag !ns_table name att in stack := Start (name,att,!ns_table) :: !stack; ns_table := table let end_element_handler _ = let accu = if only_ws txt.buffer txt.pos then nil else string (String.sub txt.buffer 0 txt.pos) nil in txt.pos <- 0; create_elt accu !stack ifdef EXPAT then let load_expat = let buflen = 1024 in let buf = String.create buflen in fun s -> let ic = try open_in s with exn -> let msg = Printf.sprintf "load_xml, file \"%s\": %s" s (Printexc.to_string exn) in raise (Location.Generic msg) in let p = Expat.parser_create "" in Expat.set_start_element_handler p start_element_handler; Expat.set_end_element_handler p end_element_handler; Expat.set_character_data_handler p add_string txt; let rec loop () = let n = input ic buf 0 buflen in if (n > 0) then (Expat.parse_sub p buf 0 n; loop ()) in try loop(); Expat.final p; close_in ic; with Expat.Expat_error e -> close_in ic; let line = Expat.get_current_line_number p and col = Expat.get_current_column_number p in let msg = Printf.sprintf "load_xml, file \"%s\", at line %i, column %i: %s" s (Expat.get_current_line_number p) (Expat.get_current_column_number p) (Expat.xml_error_to_string e) in raise (Location.Generic msg) else let load_expat s = failwith "Expat support not included" let pxp_handle_event = function | E_start_tag (name,att,_) -> start_element_handler name att | E_char_data data -> add_string txt data | E_end_tag (_,_) -> end_element_handler () | _ -> () let pxp_config = { default_config with (* warner = new warner; *) encoding = `Enc_utf8; store_element_positions = false; drop_ignorable_whitespace = true } let load_pxp s = try let mgr = create_entity_manager pxp_config (match s with | Url(s) -> from_string s | Filename(s) -> from_file s ) in process_entity pxp_config (`Entry_document[`Extend_dtd_fully]) mgr pxp_handle_event; with exn -> raise (Location.Generic (Pxp_types.string_of_exn exn)) let load_xml s = Location.protect_op "load_xml"; try (match !use_parser with | `Expat -> load_expat s | `Pxp -> load_pxp (process_url s)); match !stack with | [ Element x ] -> stack := []; x | _ -> assert false with e -> stack := []; txt.pos <-0; raise e let load_html s = let rec val_of_doc q = function | Nethtml.Data data -> if (only_ws data (String.length data)) then q else string data q | Nethtml.Element (tag, att, child) -> let att = List.map (fun (n,v) -> ((Ns.empty, U.mk n), U.mk v)) att in Pair (elem (Ns.empty,U.mk tag) att (val_of_docs child), q) and val_of_docs = function | [] -> nil | h::t -> val_of_doc (val_of_docs t) h in Location.protect_op "load_html"; match process_url s with | Filename(s) -> ( let ic = open_in s in let doc = Nethtml.parse_document ~dtd:Nethtml.relaxed_html40_dtd (Lexing.from_channel ic) in let doc = Nethtml.decode ~enc:`Enc_utf8 ~subst:(fun _ -> "???") doc in close_in ic; val_of_docs doc) | Url s -> ( let doc = Nethtml.parse_document ~dtd:Nethtml.relaxed_html40_dtd (Lexing.from_string s) in let doc = Nethtml.decode ~enc:`Enc_utf8 ~subst:(fun _ -> "???") doc in val_of_docs doc)