xml.scm
(module xml mzscheme
	(require (lib "xml.ss"  "xml"))
	(require (lib "pregexp.ss" "mzlib"))
	(require (lib "list.ss" "mzlib"))
	(provide read-xexpr
		 write-xexpr
		 xexpr-remove-whitespace
		 xpath-xexpr
		 xe-1
		 xe-1-more
		 xe-0-more

		 xexpr-sax
		 xexpr-get-attr 

		 xexpr-elem
		 xexpr-elems
		 xexpr-attrs
		 xexpr-attr
		 )

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;=pod
;;
;;=head1 HO-Utils - XML Extensions
;;
;;=head2 Synopsys
;;
;;=syn scm,8
;;
;; >(require (lib "xml.scm" "xml"))
;; >(require (planet "xml.scm" ("oesterholt" "ho-utils.plt" 1 0)))
;;
;; >(define p (open-input-port "some.xml"))
;; >(define e (read-xexpr p))
;; >(set! e (xexpr-remove-whitespace e))
;; >(xpath-xexpr "/sdf-styles/style[1]/name" e)
;; ("Style 1")
;; >(xpath-xexpr "/*/*/name" e)
;; ("Style 1" "Style 2" "Style 3")
;; >(xpath-xexpr "/*/*/name[2]" e)
;; ("Style 2")
;; >(xe-1 e  "/*/*/fg-color[1]/r" (lambda (x) (error "HE!")) string->number)
;; 232
;; >(xe-1 e  "/*/*/qq-color[1]/r" (lambda (x) (error "HE!")) string->number)
;; HE!
;; >(xe-0-more e "/*/*/fg-color/r" string->number)
;; (2 232 4 23)
;; >(xe-1-more e "/*/*/tr-color" (lambda (x) (error "ZERO!")) string->number)
;; ZERO!
;; > (write-xexpr e "new.xml")
;; > (define e (read-xexpr "<?xml version=\"1.0\" encoding=\"UTF-8\" ?><a>Hello!<b><c>1</c><d>2</d></b><b>Hi</b></a>"))
;; > (xexpr-sax (lambda (element attributes value tree-level intermediate xexprs)
;;                   (display (format "~s:~s=~s (~s)~%" tree-level element value intermediate))
;;                   #t)
;;              e)
;; 0:a="Hello!" (#t)
;; 1:b="" (#t)
;; 2:c="1" (#f)
;; 2:d="2" (#f)
;; 1:b="Hi" (#f)
;;
;; >(define X (xexpr-elem 'sdf (xexpr-attrs (xexpr-attr 'version 1.0))
;;                   (xexpr-elems (xexpr-elem 'fg-color "#434343")
;;                                (xexpr-elem 'bg-color "black"))))
;;
;; >(write-xexpr X)
;;
;;=head2 Reading/Writing XML Documents
;;
;;=head3 C<(read-xexpr port) : xexpr>
;;
;;Reads XML from port and converts it directly to an XExpression, removing
;;all whitespace only values.
;;
;;=head3 C<(write-xexpr xexpr . port) : void>
;;
;;Writes the xexpr as XML document to the default output port, or port.
;;
;;
;;=head2 Processing X-Expressions
;;
;;=head3 C<(xexpr-remove-whitespace xexpr) : xexpr>
;;
;;Removes all whitespace only values from xexpr.
;;
;;=head3 C<(xpath-xexpr xpath e) : xexpr>
;;
;;Evaluates xpath on e and returns the result set for this evaluation,
;;which is a list of XML subexpressions or (end-)values. If the expression
;;doesn't compute, the empty list is returned. If no elements are found,
;;also the empty list is returned.
;;
;;B<This function can't handle values for intermediate nodes yet!>
;;
;;B<precondition:> C<xpath> must be a valid XPath B<and> white space must
;;have been removed from C<xexpr> using C<xexpr-remove-whitespace>.
;;
;;=head3 C<(xe-1 xexpr xpath handler . convertor) : value>
;;
;;Expects 1 result in the result set of (xpath-xexpr xpath xexpr).
;;Returns (convertor applied to) this one result, or the result
;;of handler if 0 or more than 1 results have been returned.
;;
;;=head3 C<(xe-1-more xexpr xpath handler . convertor) : list of values>
;;
;;Expects 1 or more results in the result set of (xpath-xexpr xpath xexpr).
;;Returns (convertor applied to) this result-set (list of results), or the result
;;of handler if 0 results have been returned.
;;
;;=head3 C<(xe-0-more xexpr xpath . convertor) : list of values>
;;
;;Expects 0 or more results in the result set of (xpath-xexpr xpath xexpr).
;;Returns (convertor applied to) this result-set (list of results).
;;
;;=head3 C<(xexpr-get-attr attributes attribute . default) : string>
;;
;;Returns #f (or (car default)), if attribute is not found in attributes.E<lb>
;;Returns the associated value with attribute, otherwise.E<lb>
;;
;;Can be used in conjunction with the callback that must be provided for C<xexpr-sax>.
;;
;;=head3 C<(xexpr-sax callback xexpr) : boolean>
;;
;;Traverses the xexpr depth first, calling C<callback> for each element
;;with parameters C<element>, C<attributes>, C<value>, C<level>, C<intermediate-node> and C<xexprs>.
;;
;;If C<intermediate-node> is #f, this is and end node in the XML tree, otherwise,
;;this node contains other nodes. C<level> gives the current depth in the
;;XML tree, 0 being the top level.E<lb>
;;
;;=over 1
;;
;;=over 1
;;
;;=item C<element>
;;
;;is a symbol giving the element name of the current node.
;;
;;=item C<attributes>
;;
;;is a list of attributes C<(attribute-name:symbol value:string))>.
;;
;;=item C<value>
;;
;;is a string containing the value of the current node (works also for intermediate nodes).
;;
;;=item C<xexprs>
;;
;;is '(), if this is an end node. It is a list of all next level nodes, otherwise.
;;
;;=back
;;
;;=back
;;
;;I<Return value of callback:> if C<#f>, C<xexpr-sax> will not go depth first.
;;if C<#t>, C<xexpr-sax> will go depth first. This makes it possible to call
;;the sax parser again with a different C<callback> function on deeper level nodes.
;;
;;The basic behaviour of C<xexpr-sax> is O(xml-tags) (it traverses all xml-tags of
;;an XML document twice) (first pass assembling the value of a node and the next level nodes
;;of that node; second pass traversing the next level nodes).
;;
;;=head2 Creating X-Expressions
;;
;;=head3 C<(xexpr-elems first-elements . elements) : xexpr elements>
;;
;;Creates a sequence of xexpr XML elements.
;;
;;=head3 C<(xexpr-elem element . value|attributes|xexprs{0,3}) : xexpr element>
;;
;;Creates an xexpr element with optional value and/or attributes.
;;
;;=head3 C<(xexpr-attr attr value) : xexpr attribute>
;;
;;Creates an attribute.
;;
;;=head3 C<(xexpr-attrs . attributes) : xexpr attributes>
;;
;;Creates the attributes for an xexpression element
;;
;;=head2 Info
;;
;;Author: Hans Oesterholt-Dijkema, License: LGPL, (c) 2006.
;;
;;=cut
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


;; THIS IS WAY TO SLOW!
; (define reg-ws (pregexp "^[ \t\r\n]+$"))
;
; (define (is-space? str)
;   (not (eq? (pregexp-match reg-ws str) #f)))
;;;;;;;

;; THIS IS ORDERS OF MAGNITUDE FASTER
; (define (is-space? str)
;   (do
;       ((i 0 (+ i 1))
;        (n (string-length str)))
;       ((or (= i n)
; 	   (not (char-whitespace? (string-ref str i))))
;        (= i n))
;     ()))

; (define (is-space-char? L)
;   (if (null? L)
;       #t
;       (if (char-whitespace? (car L))
; 	  (is-space-char? (cdr L))
; 	  #f)))

; (define (is-space? str)	  
;   (is-space-char? (string->list str)))

;;; THIS ONE SEEMS TO BE THE FASTEST
; (define re (regexp "^[ \t\n]*$"))
; (define (is-space? x) (regexp-match re x))
(define (is-space? x) (regexp-match #rx"^[ \t\n]*$" x))

; (define (is-space? x) (andmap char-whitespace? (string->list x)))


(define (xexpr-remove-whitespace xexpr)
  
  (define (filter L n)
    (if (null? L)
	L
	(if (string? (car L))
	    (if (null? (cdr L))
		(if (is-space? (car L))
		    (if (= n 0)
			(list "")
			(list))
		    (cons (car L) (filter (cdr L) (+ n 1))))
		(if (is-space? (car L))
		    (filter (cdr L) (+ n 1))
		    (cons (car L) (filter (cdr L) (+ n 1)))))
	    (if (list? (car L))
		(cons (xexpr-remove-whitespace (car L)) 
		      (filter (cdr L) (+ n 1)))
		(cons (car L) (filter (cdr L) (+ n 1)))))))

  (filter xexpr 0))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define re-slash         (pregexp "[/]"))
(define re-bracket-left  (pregexp "\\["))
(define re-bracket-right (pregexp "\\]"))

(define (xpath-xexpr path xexpr)

  (define (iterate-childs-for-all-symbols path-elem xexpressions)
    (if (null? xexpressions)
	(list)
	(if (or (eq? path-elem '*) (eq? (caar xexpressions) path-elem))
	    (cons (car xexpressions) (iterate-childs-for-all-symbols path-elem (cdr xexpressions)))
	    (iterate-childs-for-all-symbols path-elem (cdr xexpressions)))))

  (define (iterate-childs-for-nth-of-symbol path-elem n xexpressions)
    (if (< n 1)
	(list)
	(let ((R (iterate-childs-for-all-symbols path-elem xexpressions)))
	  (if (> n (length R))
	      (list)
	      (list (list-ref R (- n 1)))))))

  (define (find-attribute attribute xexpressions)

    (define (find L attribute)
      (if (null? L)
	  #f
	  (if (eq? (caar L) attribute)
	      (cadar L)
	      (find (cdr L) attribute))))

    (filter (lambda (e)
	      (not (eq? e #f)))
	    (map (lambda (xexpr)
		   (if (list? xexpr)
		       (find (cadr xexpr) attribute)
		       #f))
		 xexpressions)))

  (define (calculate-for-xpath-element elem xexprs)
    (cond
     ((eq? (car elem) '@)  (find-attribute (cadr elem) xexprs))
     ((= (length elem) 2)  (if (eq? (cadr elem) '*)
				(iterate-childs-for-all-symbols (car elem) xexprs)
				(iterate-childs-for-nth-of-symbol (car elem) (cadr elem) xexprs)))
     ((= (length elem) 1)  (iterate-childs-for-all-symbols (car elem) xexprs))
     (else (list))))

  (define (xml-values result-set)
    (apply append 
	   (filter (lambda (e) 
		     (list? (car e)))
		   (map (lambda (expr)
			  (cddr expr))
			result-set))))

  (define (all-values result-set)
    (apply append
	   (map (lambda (expr)
		  (if (list? expr)
		      (cddr expr)
		      (list expr)))
		result-set)))

  (define (unroll-xpath xpath-elements xexpressions)
    ;(display (format "~s, ~s~%" xpath-elements xexpressions))
    (if (null? xpath-elements)
	xexpressions
	(let* ((xpath-elem (car xpath-elements))
	       (result-set (calculate-for-xpath-element xpath-elem xexpressions)))

	  ;(display (format "result-set: ~s~%" result-set))

	  (if (null? result-set)
	      (list)
	      (if (null? (cdr xpath-elements))
		  (all-values result-set)
		  (if (eq? (caadr xpath-elements) '@)
		      (unroll-xpath (cdr xpath-elements) result-set)
		      (unroll-xpath (cdr xpath-elements) (xml-values result-set))))))))

  (let ((path-elements (pregexp-split re-slash path)))
    ;(write path-elements)(newline)
    (unroll-xpath 
      (filter (lambda (element)
		(not (eq? element '%nil%)))
	      (map (lambda (element)
		     (if (string=? element "")
			 '%nil%
			 (if (string=? (substring element 0 1) "@")
			     (list '@ (string->symbol (substring element 1)))
			     (let ((S (pregexp-split re-bracket-left element)))
			       (if (= (length S) 2)
				   (list (string->symbol (car S))
					 (string->number (pregexp-replace re-bracket-right (cadr S) "")))
				   (list (string->symbol element)))))))
		   path-elements))
     (list xexpr)))
  )

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define (read-xexpr port)
  (if (string? port)
      (let ((S (open-input-string port)))
	(read-xexpr S))
      (xml->xexpr (document-element (read-xml port)))))

(define (write-xexpr x . port)
  (let ((d (make-document
            (make-prolog (list (make-pi #f #f 'xml "version=\"1.0\" encoding=\"UTF-8\"")) #f)
            (if (list? x) (xexpr->xml x) x)
            '())))
    (if (null? port)
        (write-xml d)
        (write-xml d (car port)))))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define (xe-1 e xpath handler . convertor)
  (let ((values (xpath-xexpr xpath e)))
    ;(display (format "~s ~s ~s~%" values handler convertor))
    (if (null? values)
	(handler values)
	(if (null? (cdr values))
	    (if (null? convertor)
		(car values)
		((car convertor) (car values)))
	    (handler values)))))

(define (xe-1-more e xpath handler . convertor)
  (let ((values (xpath-xexpr xpath e)))
    (if (null? values)
	(handler values)
	(if (null? convertor)
	    values
	    (map (car convertor) values)))))

(define (xe-0-more e xpath . convertor)
  (let ((values (xpath-xexpr xpath e)))
    (if (null? convertor)
	values
	(map (car convertor) values))))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define (xexpr-sax f xexpr)

  (define (traverse-xml-depth-first xexpr level)

    (define (assemble-element-value L intermediate)
      (if (null? L)
	  (list "" (list))
	  (let ((v (assemble-element-value (cdr L) intermediate)))
	    (if (string? (car L))
		(cons (string-append (car L) (car v)) (cdr v))
		(begin
		  (set-car! intermediate #t)
		  (list (car v) (cons (car L) (cadr v))))))))

    (define (iterate xexpr)
      (if (null? xexpr)
	  #t
	  (let ((xexpr-element (car xexpr)))
	    (if (list? xexpr-element)
		(let ((e (car xexpr-element))
		      (a (cadr xexpr-element))
		      (v (cddr xexpr-element))
		      (i (list #f)))
		  (let ((value (assemble-element-value v i)))
		    (let ((result (f e a (car value) level (car i) (cadr value))))
		      (if (car i)
			  (if result
			      (begin
				(traverse-xml-depth-first v (+ level 1))
				(iterate (cdr xexpr)))
			      (iterate (cdr xexpr)))
			  (iterate (cdr xexpr))))))
		(iterate (cdr xexpr))))))
    
    (iterate xexpr))

  (if (null? xexpr)
      #t
      (if (symbol? (car xexpr))
	  (traverse-xml-depth-first (list xexpr) 0)
	  (traverse-xml-depth-first xexpr 0))))


(define (xexpr-get-attr attributes name . default)
  (let ((a (assq name attributes)))
    (if (eq? a #f)
        (if (null? default)
            #f
            (car default))
        (cadr a))))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Creating X expressions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define (xexpr-elems . args)
  (if (not (null? args))
      (if (null? (cdr args))
          (if (null? (car args))
              (list '%xexpr-elems%)
              (if (list? (caar args))
                  (cons '%xexpr-elems% (car args))
                  (cons '%xexpr-elems% args)))
          (cons '%xexpr-elems% args))
      (cons '%xexpr-elems% args)))

(define (xexpr-elem e . args)

  (define (convert v)
    (if (list? v)
	v
	(format "~a" v)))

  (define (get-attrs v)
    (cdr v))

  (define (get-xexpr v)
    (cdr v))

  (define (attributes? v)
    (if (list? v)
        (if (null? v)
            #f
            (if (eq? (car v) '%attributes)
                #t
                #f))
        #f))

  (define (xexpr? v)
    (if (list? v)
	(if (null? v)
	    #f
	    (if (eq? (car v) '%xexpr-elems%)
		#t
		#f))
	#f))

  (let ((a #f)
        (v #f)
        (x #f))
    (for-each (lambda (r)
                (if (attributes? r)
                    (set! a (get-attrs r))
                    (if (xexpr? r)
                        (set! x (get-xexpr r))
                        (set! v (convert r)))))
              args)

    ;(display (format "a=~s, v=~s, x=~s~%" a v x))

    (cond
     ((and a v x) (cons e (cons a (cons v x))))
     ((and a v)   (list e a v))
     ((and a x)   (cons e (cons a x)))
     ((and v x)   (cons e (cons '() (cons v x))))
     (a           (list e a))
     (v           (list e '() v))
     (x           (cons e (cons '() x)))
     (else        (list e '())))))

(define (xexpr-attr a v)
  (list a (format "~a" v)))

(define (xexpr-attrs . attrs)
  (cons '%attributes attrs))

) ;; module