xexp.rkt
#lang racket/base
;; For legal info, see file "info.rkt".

(require (planet neil/mcfly:1:0))

(doc (section "Introduction")

     (para (italic "Note: This package is in a state of active development, and
                    some interface changes, perhaps not backward-compatible,
                    are expected. Documentation is also in-progress."))

     (para (hyperlink "http://pobox.com/~oleg/ftp/Scheme/SXML.html"
                      "SXML")
           " is a representation for XML in Scheme, defined by Oleg
Kiselyov. ``SXML/xexp'' is the temporary name for a format for Racket that's
based on SXML and is mostly compatible with it.  SXML/xexp is used for both
HTML and XML.  The current plan is, hopefully, for the ``/xexp'' part of the
name to go away, and for SXML and SXML/xexp to merge.  For now, Racket language
identifiers based on SXML/xexp will have ``"
           (racketfont "xexp")
           "''instead of ``"
           (racketfont "sxml")
           "'', because we do not want to call something ``SXML'' if it is not
strictly SXML. (And, historically, ``xexp'' was much more different from SXML,
while we experimented with unifying SXML, SHTML, and PLT "
           (racketfont "xexpr")
           ", but we have decided to move back to as compatible with SXML as
practical.)"))

(doc (subsection "Differences with SXML")

     (para "SXML/xexp can be defined as differences from SXML:")

     (itemlist

      (item (italic "xexp")
            " syntax must be ordered as in SXML first normal form (1NF).  For
example, any attributes list must precede child elements. SXML/xexp tools "
            (italic "may")
            " be permissive about accepting other orderings, but generally
should not emit any ordering but 1NF ordering.")

      (item "The SXML keyword symbols, such as "
            (racket *TOP*)
            " may be in lowercase (e.g., "
            (racket *top*)
            ")")

      (item (italic "xexp")
            " adds a special "
            (racket &)
            " syntax for character entity references. The syntax is "
            (racket (& #,(italic "val")))
            ", where "
            (racket #,(italic "val"))
            " is the symbolic name of the character as a symbol, or an integer
with the numeric value of the character.")))

(doc (section "SXML and SXML/xexp Tools")

     (para "Libraries using SXML/xexp include:")

     (tabular
      (list

       (list (hyperlink "http://www.neilvandyke.org/racket-html-writing/"
                        "html-writing")
             (list "Writing HTML from SXML/xexp."))

       (list (hyperlink "http://www.neilvandyke.org/racket-html-template/"
                        "html-template")
             (list "Writing HTML from SXML/xexp templates."))

       (list (hyperlink "http://www.neilvandyke.org/racket-html-parsing/"
                        "html-parsing")
             (list "Permissively parsing HTML to SXML/xexp."))

       (list (hyperlink "http://www.neilvandyke.org/webscraperhelper/"
                        "WebScraperHelper")
             (list "Example-based SXPath query generation for SXML/xexp."))))

     (para "There are also some older libraries for SXML, which often can be
used for SXML/xexp:")

     (tabular
      (list

       (list (hyperlink "http://okmij.org/ftp/Scheme/xml.html#SXPath"
                        "SXPath")
             "XPath query language implementation for SXML, by Oleg Kiselyov.")

       (list (hyperlink "http://celtic.benderweb.net/sxml-match/"
                        "sxml-match")
             "Pattern-matching of SXML, by Jim Bender.")

       (list (hyperlink "http://www196.pair.com/lisovsky/xml/ssax/"
                        "SSAX")
             "Parsing of XML to SXML, by Oleg Kiselyov, and maintained by Kirill
              Lisovsky."))))

(doc (section "Definitions")

     (para "Some definitions used by many SXML/xexp packages..."))

(doc (subsection "Exceptions"))

(doc (defstruct (exn:fail:invalid-xexp exn:fail)
         ((expected     string?)
          (context-xexp any/c)
          (invalid-xexp any/c))
       #:transparent)
     "!!!")
(provide (struct-out exn:fail:invalid-xexp))
;; !!! (provide (struct exn:fail:invalid-xexp))
(define-struct (exn:fail:invalid-xexp exn:fail)
  (expected
   context-xexp
   invalid-xexp)
  #:transparent)

(doc (defproc (make-invalid-xexp-exn
               (                     sym                symbol?)
               (#:continuation-marks continuation-marks continuation-marks?)
               (#:expected           expected           string?)
               (#:invalid-xexp       invalid-xexp       any/c)
               (#:context-xexp       context-xexp       any/c (void)))
         fail:exn:invalid-xexp?
       (para "Constructs a "
             (racket fail:exn:invalid-xexp)
             " exception object.")))
(provide make-invalid-xexp-exn)
(define (make-invalid-xexp-exn
         sym
         #:continuation-marks continuation-marks
         #:expected           expected
         #:invalid-xexp       invalid-xexp
         #:context-xexp       (context-xexp (void)))
  (exn:fail:invalid-xexp
   (if (void? context-xexp)
       (format "~A: invalid xexp: expected ~A; got ~S"
               sym
               expected
               invalid-xexp)
       (format "~A: invalid xexp: expected ~A; got ~S in ~S"
               sym
               expected
               invalid-xexp
               context-xexp))
   continuation-marks
   expected
   context-xexp
   invalid-xexp))

(doc (defform (raise-invalid-xexp-exn !!!)
       "!!!"))
(provide       raise-invalid-xexp-exn)
(define-syntax raise-invalid-xexp-exn
  (syntax-rules ()
    ((_ SYM
        #:expected     EXPECTED
        #:invalid-xexp INVALID-XEXP
        #:context-xexp CONTEXT-XEXP)
     (raise (make-invalid-xexp-exn
             SYM
             #:continuation-marks (current-continuation-marks)
             #:expected           EXPECTED
             #:invalid-xexp       INVALID-XEXP
             #:context-xexp       CONTEXT-XEXP)))
    ((_ SYM
        #:expected     EXPECTED
        #:invalid-xexp INVALID-XEXP)
     (raise-invalid-xexp-exn
      SYM
      #:expected           EXPECTED
      #:invalid-xexp       INVALID-XEXP
      #:context-xexp       (void)))))

(doc (subsection "Misc.")

     (para "The following definitions are used by some "
           (italic "xexp")
           "-related libraries."))

(doc (defproc (make-xexp-char-ref (val symbol?))
         xexp-char-ref?
       "Yields an SXML/xexp "
       (racket xexp)
       " character entity reference for "
       (racket val)
       ".  For example:"
       (racketinput (make-xexp-char-ref 'rArr)
                    #,(racketresult (& rArr)))
       (racketinput (make-xexp-char-ref 151)
                    #,(racketresult (& 151)))))
(define (make-xexp-char-ref val)
  (if (or (symbol? val) (integer? val))
      `(& ,val)
      (error 'make-xexp-char-ref
             "invalid xexp reference value: ~S"
             val)))

;; TODO: !!!
;;
;; (define (xexp-entity? x)
;;   (and (xexp-char-ref-value entity) #t))

;; (:%find-first-xexp-non-extraneous-list (Any -> Any))
(define (%find-first-xexp-non-extraneous-list lst)
  (let loop ((lst lst))
    ;; (: loop (Any -> Any))
    (cond ((null? lst) #f)
          ((pair? lst)
           (loop (car lst))
           (loop (cdr lst)))
          (else lst))))

;; (:%xexp:assert-only-xexp-extraneous-lists (Any -> Void))
(define (%xexp:assert-only-xexp-extraneous-lists lst)
  (cond ((%find-first-xexp-non-extraneous-list lst)
         => (lambda (x)
              (raise-invalid-xexp-exn
               '%xexp:assert-only-xexp-extraneous-lists
               #:expected "nothing except extraneous lists and nulls"
               #:invalid-xexp x)))))

(doc (defproc (xexp-char-ref-value (char-ref xexp-char-ref?))
         (or/c symbol? integer?)
       "Yields the symbol or integer value for SXML/xexp character reference "
       (racket char-ref)
       ".  Raises exception "
       (racket exn:fail:invalid-xexp)
       " on an error.  For example:"
       (racketinput (xexp-char-ref-value '(& nbsp))
                    #,(racketresult nbsp))
       (racketinput (xexp-char-ref-value '(& 2000))
                    #,(racketresult 2000))))
(provide xexp-char-ref-value)
(define (xexp-char-ref-value char-ref)
  (if (and (pair? char-ref)
           (eq? '& (car char-ref)))
      ;; TODO: What is this monstrosity?  Rewrite.
      (let loop-find-symbol ((lst (cdr char-ref)))
        (cond ((null? lst)
               (raise-invalid-xexp-exn
                'xexp-char-ref-value
                #:expected     "proper list in xexp-char-ref body"
                #:invalid-xexp lst
                #:context-xexp char-ref))
              ((pair? lst)
               (let ((head (car lst)))
                 (cond ((symbol? head)
                        (%xexp:assert-only-xexp-extraneous-lists (cdr lst))
                        head)
                       ((pair? head)
                        (cond ((loop-find-symbol head)
                               => (lambda (found)
                                    (%xexp:assert-only-xexp-extraneous-lists (cdr head))
                                    (%xexp:assert-only-xexp-extraneous-lists (cdr lst))
                                    found))
                              ((loop-find-symbol (cdr head))
                               => (lambda (found)
                                    (%xexp:assert-only-xexp-extraneous-lists (cdr lst))
                                    found))
                              (else (loop-find-symbol (cdr lst)))))
                       ((null? head) (loop-find-symbol (cdr lst)))
                       (else (raise-invalid-xexp-exn
                              'xexp-char-ref-value
                              #:expected     "xexp-char-ref body"
                              #:invalid-xexp head
                              #:context-xexp char-ref)))))
              (else (raise-invalid-xexp-exn
                     'xexp-char-ref-value
                     #:expected     "proper list in xexp-char-ref body"
                     #:invalid-xexp lst
                     #:context-xexp char-ref))))
      (raise-invalid-xexp-exn 'xexp-char-ref-value
                              #:expected     "xexp-char-ref"
                              #:invalid-xexp char-ref)))

;; TODO: !!! move this to html-specific, shared by "html-parsing" and
;; "html-writing".
(doc (defthing always-empty-html-elements
         (list/c symbol?)
       (para "Deprecated. This is a legacy definition from HtmlPrag that will
eventually disappear."))
     (para "List of symbols for names of HTML elements that can never have content.
For example, the "
           (code "br")
           " element."))
(provide always-empty-html-elements)
(define always-empty-html-elements
  '(area base br frame hr img input isindex keygen link meta object param
         spacer wbr))

(define %xexp:whitespace-char-list
  (list #\space
        #\tab
        #\newline
        #\return))

(define (%xexp:whitespace-char? c)
  (memv c %xexp:whitespace-char-list))

;; PI ::= (? PI-TARGET [* [| STRING (SYMBOL [+ STRING ] ) ] ] )
;;
;; PI-TARGET ::= SYMBOL

;;CDSECT ::= (!cdata [* STRING ] )

;;(!cdata "<greeting>Hello, world!</greeting>")

;;(? xml (version "1.1"))
;;(greeting "Hello, world!")

;;PROLOG ::= XML-DECL [* MISC ] [? doctypedecl [* MISC ] ]

;;XML-DECL ::= (? xml VERSION-INFO [? ENCODINGDECL ] [? SD-DECL ] )

;;VERSION-INFO ::= (version [| "1.0" "1.1" ] )

;;MISC ::= [| COMMENT PI ]

;;DOCTYPEDECL ::= (!doctype NAME [? EXTERNAL-ID ] [? INT-SUBSET ] )

;;EXTERNAL-ID ::= [| (system SYSTEM-LITERAL)
;;                   (public PUBID-LITERAL SYSTEM-LITERAL) ]

;;INT-SUBSET ::= (internal [* [| MARKUP-DECL DECL-SEP ] ])

;;DECL-SEP ::= PE-REFERENCE
;;
;;MARKUP-DECL ::= [| ELEMENT-DECL ATTLIST-DECL ENTITY-DECL NOTATION-DECL PI COMMENT ]

;;EXT-SUBSET ::= [? TEXT-DECL ] EXT-SUBSET-DECL

;;EXT-SUBSET-DECL ::= [* [| MARKUP-DECL CONDITIONAL-SECT DECL-SEP ] ]

;;(? xml (version "1.1"))
;;(!doctype greeting SYSTEM "hello.dtd")
;;(greeting "Hello, world!")

;;(? xml (version "1.1") (encoding "UTF-8"))
;;(!doctype greeting ((!ELEMENT greeting (PCDATA))))
;;(greeting "Hello, world!")

;; SD-DECL ::= (standalone [| "yes" "no" ] )

;;(? xml (version "1.1") (standalone "yes"))

;;(!attlist poem xml:space (default preserve) "preserve")
;;(!attlist pre xml:space (preserve) fixed "preserve")

;;(p ((xml:lang "en")) "The quick brown fox jumps over the lazy dog.")
;;(p ((xml:lang "en-GB")) "What colour is it?")
;;(p ((xml:lang "en-US")) "What color is it?")
;;(sp ((who "Faust") (desc "leide") (xml:lang "de"))
;;    (l "Habe nun, ach! Philosophie,")
;;    (l "Juristerei, und Medizin")
;;    (l "und leider auch Theologie")
;;    (l "durchaus studiert mid hei" #xdf "em Bem" #xfc "h'n."))

;;(!attlist poem  xml:lang CDATA "fr")
;;(!attlist gloss xml:lang CDATA "en")
;;(!attlist note  xml:lang CDATA "en")

;; ELEMENT ::= ( NAME [? ( [* ATTRIBUTE ] ) ] )

;; ATTRIBUTE ::= (NAME [* ATTR-VALUE-PART ] )

;;(!attlist termdef
;;          (id      id      #REQUIRED)
;;          (name    cdata   #IMPLIED))
;;(!attlist list
;;          (type    (| bullets ordered glossary) ordered))
;;(!attlist form
;;          (method  cdata   #FIXED "POST"))

;;(!-- "declare the parameter entity \"ISOLat2\"...")
;;(!entity % ISOLat2 SYSTEM "http://www.xml.com/iso/isolat2-xml.entities")
;;(!-- "... now reference it.")
;;(% ISOLat2)

;; | 4.2 Entity Declarations

;; (!entity Pub-Status "This is a pre-release of the specification.")

;; | 4.2.2 External Entities

;;(!entity open-hatch
;;         SYSTEM
;;         "http://www.textuality.com/boilerplate/OpenHatch.xml")
;;(!entity open-hatch
;;         PUBLIC
;;         "-//Textuality//TEXT Standard open-hatch boilerplate//EN"
;;         "http://www.textuality.com/boilerplate/OpenHatch.xml")
;;(!entity hatch-pic SYSTEM "../grafix/OpenHatch.gif" NDATA gif)

;; | 4.3 Parsed Entities

;; | 4.3.1 The Text Declaration

;; (? xml VERSION-INFO ENCODING-DECL)

;; ---------------------

;; document
;;
;; prolog
;; element
;; misc*
;;
;; ;; http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-prolog-dtd
;;
;; (?xml version STRING [ encoding STRING ] [ standalone BOOL ] )
;;
;; (!doctype NAME [ EXTERNAL-ID ] [ ( INT-SUBSET ) ] )
;;
;; EXTERNAL-ID ::= system SYSTEM-LITERAL
;;                 | public PUBID-LITERAL SYSTEM-LITERAL
;;
;; SYSTEM-LITERAL ::= STRING
;;
;; PUBID-LITERAL ::= STRING
;;
;; ;; http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-cdata-sect
;;
;; (!cdata [ STRING ]+ )
;;
;; (!-- [ STRING ]+ )
;;
;;
;; (!entity % draft 'INCLUDE')
;; (!entity % final 'IGNORE')
;;
;; ;; http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-condition-sect
;;
;; (!% draft [ XEXP ]+ )
;; (!% final [ XEXP ]+ )
;;
;; (!% draft
;;     (!element book ((* comments) title body (? supplements))))
;;
;; (!% final
;;     (!element book (title body (? supplements))))
;;
;;
;; ;; 4.1 Character and Entity References
;; ;; http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-references
;;
;; CharRef
;; 42 #x42
;;
;;
;; EntityRef
;; nbsp
;; copy
;;
;; PEReference
;; (% !!!)
;;
;; (!entity NAME ENTITYDEF)
;; (!entity % NAME PEDEF)
;;
;; (!entity Pub-Status "This is a pre-release of the specification.")
;;
;; ;; 4.5 Construction of Entity Replacement Text
;;
;; (!entity % pub #xC9 "ditions Gallimard")
;; (!entity rights "All rights reserved")
;; (!entity book "La Peste: Albert Camus, " #xA9 " 1947 " (% pub) ". " rights)
;;
;;
;; ;; 4.6 Predefined Entities
;;
;; (!entity lt #x38 #x60)
;; (!entity gt #x62)
;;
;; ;; 4.7 Notation Declarations
;;
;; (!notation NAME EXTERNAL-ID-OR-PUBLIC-ID)
;;
;; PUBLIC-ID ::= public PUBID-LITERAL
;;
;;
;;
;; ;; (?splice ...) used to write multiple elements at once when only one xexp is
;; ;; expected.  and to efficiently incorporate large content lists.
;;
;; @section Introduction
;;
;;
;;
;; (define (xexp-elem-name elem)
;;
;;   )
;;
;; (define (xexp-elem-attrs elem)
;;
;;   )
;;
;; (define (xexp-elem-content elem)
;;
;;   )
;;
;; (define (sxml->xexp sxml)
;;   '!!!)
;;
;; (define (xexp->xexp xexp)
;;   '!!!)

(doc history

     (#:planet 2:0 #:date "2012-06-12"
               (itemlist
                (item "Somewhat heavy changes.  (Previous version was labeled
                       as in-development, as is this one.)")
                (item "Converted to McFly.")
                (item "Removed the "
                      (code "typed/racket/base/no-check")
                      " for now.")))

     (#:version "0.1" #:planet 1:0 #:date "2011-08-21"
                (itemlist
                 (item
                  "Part of forked development from HtmlPrag."))))