test-html-parsing.rkt
#lang racket/base
;; For legal info, see file "info.rkt".

(require (planet neil/overeasy:3:1)
         "html-parsing.rkt")

(test-section 'test-html-parsing

  (test (html->xexp "<a>>") '(*TOP* (a ">")))
  (test (html->xexp "<a<>") '(*TOP* (a "<" ">")))

  (test (html->xexp "<>")      '(*TOP* "<" ">"))
  (test (html->xexp "< >")
        ;; `(*TOP* "<" ">")
        '(*TOP* "<" " " ">"))
  (test (html->xexp "< a>")
        ;; `(*TOP* (a))
        '(*TOP*  "<" " a" ">"))
  (test (html->xexp "< a / >")
        ;; `(*TOP* (a))
        '(*TOP* "<" " a / " ">"))

  (test (html->xexp "<a<")  '(*TOP* (a "<")))
  (test (html->xexp "<a<b") '(*TOP* (a (b))))

  (test (html->xexp "><a>") '(*TOP* ">" (a)))

  (test (html->xexp "</>") '(*TOP*))

  (test (html->xexp "<\">") '(*TOP* "<" "\"" ">"))

  (test (html->xexp (string-append "<a>xxx<plaintext>aaa" "\n"
                                   "bbb" "\n"
                                   "c<c<c"))
        `(*TOP*
          (a "xxx" (plaintext ,(string-append "aaa" "\n")
                              ,(string-append "bbb" "\n")
                              "c<c<c"))))

  (test (html->xexp "aaa<!-- xxx -->bbb")
        `(*TOP*
          "aaa" (*COMMENT* " xxx ")   "bbb"))

  (test (html->xexp "aaa<! -- xxx -->bbb")
        `(*TOP*
          "aaa" (*COMMENT* " xxx ")   "bbb"))

  (test (html->xexp "aaa<!-- xxx --->bbb")
        `(*TOP*
          "aaa" (*COMMENT* " xxx -")  "bbb"))

  (test (html->xexp "aaa<!-- xxx ---->bbb")
        `(*TOP*
          "aaa" (*COMMENT* " xxx --") "bbb"))

  (test (html->xexp "aaa<!-- xxx -y-->bbb")
        `(*TOP*
          "aaa" (*COMMENT* " xxx -y") "bbb"))

  (test (html->xexp "aaa<!----->bbb")
        `(*TOP*
          "aaa" (*COMMENT* "-")       "bbb"))

  (test (html->xexp "aaa<!---->bbb")
        `(*TOP*
          "aaa" (*COMMENT* "")        "bbb"))

  (test (html->xexp "aaa<!--->bbb")
        `(*TOP* "aaa" (*COMMENT* "->bbb")))

  (test (html->xexp "<hr>")   `(*TOP* (hr)))
  (test (html->xexp "<hr/>")  `(*TOP* (hr)))
  (test (html->xexp "<hr />") `(*TOP* (hr)))

  (test (html->xexp "<hr noshade>")
        `(*TOP* (hr (@ (noshade)))))
  (test (html->xexp "<hr noshade/>")
        `(*TOP* (hr (@ (noshade)))))
  (test (html->xexp "<hr noshade />")
        `(*TOP* (hr (@ (noshade)))))
  (test (html->xexp "<hr noshade / >")
        `(*TOP* (hr (@ (noshade)))))
  (test (html->xexp "<hr noshade=1 />")
        `(*TOP* (hr (@ (noshade "1")))))
  (test (html->xexp "<hr noshade=1/>")
        `(*TOP* (hr (@ (noshade "1/")))))

  (test (html->xexp "<q>aaa<p/>bbb</q>ccc</p>ddd")
        `(*TOP* (q "aaa" (p) "bbb") "ccc" "ddd"))

  (test (html->xexp "&lt;") `(*TOP* "<"))
  (test (html->xexp "&gt;") `(*TOP* ">"))

  (test (html->xexp "Gilbert &amp; Sullivan")
        `(*TOP* "Gilbert & Sullivan"))
  (test (html->xexp "Gilbert &amp Sullivan")
        `(*TOP* "Gilbert & Sullivan"))
  (test (html->xexp "Gilbert & Sullivan")
        `(*TOP* "Gilbert & Sullivan"))

  (test (html->xexp "Copyright &copy; Foo")
        `(*TOP* "Copyright "
                (& copy)
                " Foo"))
  (test (html->xexp "aaa&copy;bbb")
        `(*TOP*
          "aaa" (& copy) "bbb"))
  (test (html->xexp "aaa&copy")
        `(*TOP*
          "aaa" (& copy)))

  (test (html->xexp "&#42;")  '(*TOP* "*"))
  (test (html->xexp "&#42")   '(*TOP* "*"))
  (test (html->xexp "&#42x")  '(*TOP* "*x"))
  (test (html->xexp "&#151")  `(*TOP* ,(string (integer->char 151))))
  (test (html->xexp "&#1000") `(*TOP* ,(string (integer->char 1000))))
  (test (html->xexp "&#x42")  '(*TOP* "B"))
  (test (html->xexp "&#xA2")  `(*TOP* ,(string (integer->char 162))))
  (test (html->xexp "&#xFF")  `(*TOP* ,(string (integer->char 255))))
  (test (html->xexp "&#x100") `(*TOP* ,(string (integer->char 256))))
  (test (html->xexp "&#X42")  '(*TOP* "B"))
  (test (html->xexp "&42;")   '(*TOP* "&42;"))

  (test (html->xexp (string-append "aaa&copy;bbb&amp;ccc&lt;ddd&&gt;"
                                   "eee&#42;fff&#1000;ggg&#x5a;hhh"))
        `(*TOP*
          "aaa"
          (& copy)
          ,(string-append "bbb&ccc<ddd&>eee*fff"
                          (string (integer->char 1000))
                          "gggZhhh")))

  (test (html->xexp
         (string-append
          "<IMG src=\"http://e.e/aw/pics/listings/"
          "ebayLogo_38x16.gif\" border=0 width=\"38\" height=\"16\" "
          "HSPACE=5 VSPACE=0\">2</FONT>"))
        `(*TOP*
          (img (@
                (src
                 "http://e.e/aw/pics/listings/ebayLogo_38x16.gif")
                (border "0") (width "38") (height "16")
                (hspace "5") (vspace "0")))
          "2"))

  (test (html->xexp "<aaa bbb=ccc\"ddd>eee")
        `(*TOP* (aaa (@ (bbb "ccc") (ddd)) "eee")))
  (test (html->xexp "<aaa bbb=ccc \"ddd>eee")
        `(*TOP* (aaa (@ (bbb "ccc") (ddd)) "eee")))

  (test (html->xexp
         (string-append
          "<HTML><Head><Title>My Title</Title></Head>"
          "<Body BGColor=\"white\" Foo=42>"
          "This is a <B><I>bold-italic</B></I> test of </Erk>"
          "broken HTML.<br>Yes it is.</Body></HTML>"))
        `(*TOP*
          (html (head (title "My Title"))
                (body (@ (bgcolor "white") (foo "42"))
                      "This is a "
                      (b (i "bold-italic"))
                      " test of "
                      "broken HTML."
                      (br)
                      "Yes it is."))))

  (test (html->xexp
         (string-append
          "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\""
          " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">"))
        `(*TOP*
          (*DECL*
           DOCTYPE
           html
           PUBLIC
           "-//W3C//DTD XHTML 1.0 Strict//EN"
           "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd")))

  (test (html->xexp
         (string-append
          "<html xmlns=\"http://www.w3.org/1999/xhtml\" "
          "xml:lang=\"en\" "
          "lang=\"en\">"))
        `(*TOP*
          (html (@ (xmlns "http://www.w3.org/1999/xhtml")
                   (xml:lang "en") (lang "en")))))

  (test (html->xexp
         (string-append
          "<html:html xmlns:html=\"http://www.w3.org/TR/REC-html40\">"
          "<html:head><html:title>Frobnostication</html:title></html:head>"
          "<html:body><html:p>Moved to <html:a href=\"http://frob.com\">"
          "here.</html:a></html:p></html:body></html:html>"))
        `(*TOP*
          (html (@ (xmlns:html "http://www.w3.org/TR/REC-html40"))
                (head (title "Frobnostication"))
                (body (p "Moved to "
                         (a (@ (href "http://frob.com"))
                            "here."))))))

  (test (html->xexp
         (string-append
          "<RESERVATION xmlns:HTML=\"http://www.w3.org/TR/REC-html40\">"
          "<NAME HTML:CLASS=\"largeSansSerif\">Layman, A</NAME>"
          "<SEAT CLASS=\"Y\" HTML:CLASS=\"largeMonotype\">33B</SEAT>"
          "<HTML:A HREF=\"/cgi-bin/ResStatus\">Check Status</HTML:A>"
          "<DEPARTURE>1997-05-24T07:55:00+1</DEPARTURE></RESERVATION>"))
        `(*TOP*
          (reservation (@ (,(string->symbol "xmlns:HTML")
                           "http://www.w3.org/TR/REC-html40"))
                       (name (@ (class "largeSansSerif"))
                             "Layman, A")
                       (seat (@ (class "Y") (class "largeMonotype"))
                             "33B")
                       (a (@ (href "/cgi-bin/ResStatus"))
                          "Check Status")
                       (departure "1997-05-24T07:55:00+1"))))

  (test (html->xexp
         (string-append
          "<html><head><title></title><title>whatever</title></head><body>"
          "<a href=\"url\">link</a><p align=center><ul compact style=\"aa\">"
          "<p>BLah<!-- comment <comment> --> <i> italic <b> bold <tt> ened </i>"
          " still &lt; bold </b></body><P> But not done yet..."))
        `(*TOP*
          (html (head (title) (title "whatever"))
                (body (a (@ (href "url")) "link")
                      (p (@ (align "center"))
                         (ul (@ (compact) (style "aa"))))
                      (p "BLah"
                         (*COMMENT* " comment <comment> ")
                         " "
                         (i " italic " (b " bold " (tt " ened ")))
                         " still < bold "))
                (p " But not done yet..."))))

  (test (html->xexp "<?xml version=\"1.0\" encoding=\"UTF-8\"?>")
        `(*TOP*
          (*PI* xml "version=\"1.0\" encoding=\"UTF-8\"")))

  (test (html->xexp "<?php php_info(); ?>")
        `(*TOP* (*PI* php "php_info(); ")))
  (test (html->xexp "<?php php_info(); ?")
        `(*TOP* (*PI* php "php_info(); ?")))
  (test (html->xexp "<?php php_info(); ")
        `(*TOP* (*PI* php "php_info(); ")))

  (test (html->xexp "<?foo bar ? baz > blort ?>")
        `(*TOP*
          (*PI* foo "bar ? baz > blort ")))

  (test (html->xexp "<?foo b?>x")
        `(*TOP* (*PI* foo "b") "x"))
  (test (html->xexp "<?foo ?>x")
        `(*TOP* (*PI* foo "")  "x"))
  (test (html->xexp "<?foo ?>x")
        `(*TOP* (*PI* foo "")  "x"))
  (test (html->xexp "<?foo?>x")
        `(*TOP* (*PI* foo "")  "x"))
  (test (html->xexp "<?f?>x")
        `(*TOP* (*PI* f   "")  "x"))
  (test (html->xexp "<??>x")
        `(*TOP* (*PI* #f  "")  "x"))
  (test (html->xexp "<?>x")
        `(*TOP* (*PI* #f  ">x")))

  (test (html->xexp "<foo bar=\"baz\">blort")
        `(*TOP* (foo (@ (bar "baz")) "blort")))
  (test (html->xexp "<foo bar='baz'>blort")
        `(*TOP* (foo (@ (bar "baz")) "blort")))
  (test (html->xexp "<foo bar=\"baz'>blort")
        `(*TOP* (foo (@ (bar "baz'>blort")))))
  (test (html->xexp "<foo bar='baz\">blort")
        `(*TOP* (foo (@ (bar "baz\">blort")))))

  (test (html->xexp (string-append "<p>A</p>"
                                   "<script>line0 <" "\n"
                                   "line1" "\n"
                                   "<line2></script>"
                                   "<p>B</p>"))
        `(*TOP* (p "A")
                (script ,(string-append "line0 <" "\n")
                        ,(string-append "line1"   "\n")
                        "<line2>")
                (p "B")))

  (test (html->xexp "<xmp>a<b>c</XMP>d")
        `(*TOP* (xmp "a<b>c") "d"))
  (test (html->xexp "<XMP>a<b>c</xmp>d")
        `(*TOP* (xmp "a<b>c") "d"))
  (test (html->xexp "<xmp>a<b>c</foo:xmp>d")
        `(*TOP* (xmp "a<b>c") "d"))
  (test (html->xexp "<foo:xmp>a<b>c</xmp>d")
        `(*TOP* (xmp "a<b>c") "d"))
  (test (html->xexp "<foo:xmp>a<b>c</foo:xmp>d")
        `(*TOP* (xmp "a<b>c") "d"))
  (test (html->xexp "<foo:xmp>a<b>c</bar:xmp>d")
        `(*TOP* (xmp "a<b>c") "d"))

  (test (html->xexp "<xmp>a</b>c</xmp>d")
        `(*TOP* (xmp "a</b>c")     "d"))
  (test (html->xexp "<xmp>a</b >c</xmp>d")
        `(*TOP* (xmp "a</b >c")    "d"))
  (test (html->xexp "<xmp>a</ b>c</xmp>d")
        `(*TOP* (xmp "a</ b>c")    "d"))
  (test (html->xexp "<xmp>a</ b >c</xmp>d")
        `(*TOP* (xmp "a</ b >c")   "d"))
  (test (html->xexp "<xmp>a</b:x>c</xmp>d")
        `(*TOP* (xmp "a</b:x>c")   "d"))
  (test (html->xexp "<xmp>a</b::x>c</xmp>d")
        `(*TOP* (xmp "a</b::x>c")  "d"))
  (test (html->xexp "<xmp>a</b:::x>c</xmp>d")
        `(*TOP* (xmp "a</b:::x>c") "d"))
  (test (html->xexp "<xmp>a</b:>c</xmp>d")
        `(*TOP* (xmp "a</b:>c")    "d"))
  (test (html->xexp "<xmp>a</b::>c</xmp>d")
        `(*TOP* (xmp "a</b::>c")   "d"))
  (test (html->xexp "<xmp>a</xmp:b>c</xmp>d")
        `(*TOP* (xmp "a</xmp:b>c") "d"))

  (let ((expected `(*TOP* (p "real1")
                          "\n"
                          (xmp "\n"
                               ,(string-append "alpha"       "\n")
                               ,(string-append "<P>fake</P>" "\n")
                               ,(string-append "bravo"       "\n"))
                          (p "real2"))))

    (test (html->xexp (string-append "<P>real1</P>" "\n"
                                     "<XMP>"        "\n"
                                     "alpha"        "\n"
                                     "<P>fake</P>"  "\n"
                                     "bravo"        "\n"
                                     "</XMP "       "\n"
                                     "<P>real2</P>"))
          expected)

    (test (html->xexp (string-append "<P>real1</P>" "\n"
                                     "<XMP>"        "\n"
                                     "alpha"        "\n"
                                     "<P>fake</P>"  "\n"
                                     "bravo"        "\n"
                                     "</XMP"        "\n"
                                     "<P>real2</P>"))
          expected))

  (test (html->xexp "<xmp>a</xmp>x")
        `(*TOP* (xmp "a")   "x"))
  (test (html->xexp (string-append "<xmp>a" "\n" "</xmp>x"))
        `(*TOP* (xmp ,(string-append "a" "\n")) "x"))
  (test (html->xexp "<xmp></xmp>x")
        `(*TOP* (xmp)       "x"))

  (test (html->xexp "<xmp>a</xmp") `(*TOP* (xmp "a")))
  (test (html->xexp "<xmp>a</xm")  `(*TOP* (xmp "a</xm")))
  (test (html->xexp "<xmp>a</x")   `(*TOP* (xmp "a</x")))
  (test (html->xexp "<xmp>a</")    `(*TOP* (xmp "a</")))
  (test (html->xexp "<xmp>a<")     `(*TOP* (xmp "a<")))
  (test (html->xexp "<xmp>a")      `(*TOP* (xmp "a")))
  (test (html->xexp "<xmp>")       `(*TOP* (xmp)))
  (test (html->xexp "<xmp")        `(*TOP* (xmp)))

  (test (html->xexp "<xmp x=42 ")
        `(*TOP* (xmp (@ (x "42")))))
  (test (html->xexp "<xmp x= ")   `(*TOP* (xmp (@ (x)))))
  (test (html->xexp "<xmp x ")    `(*TOP* (xmp (@ (x)))))
  (test (html->xexp "<xmp x")     `(*TOP* (xmp (@ (x)))))

  (test (html->xexp "<script>xxx")
        `(*TOP* (script "xxx")))
  (test (html->xexp "<script/>xxx")
        `(*TOP* (script) "xxx"))

  (test (html->xexp "<html xml:lang=\"en\" lang=\"en\">")
        `(*TOP* (html (@ (xml:lang "en") (lang "en")))))

  (test (html->xexp "<a href=/foo.html>")
        `(*TOP* (a (@ (href "/foo.html")))))
  (test (html->xexp "<a href=/>foo.html")
        `(*TOP* (a (@ (href "/")) "foo.html")))

  ;; TODO: Add verbatim-pair cases with attributes in the end tag.

  (test (html->xexp "&copy;")
        `(*TOP* (& copy)))
  (test (html->xexp "&rArr;")
        `(*TOP* (& ,(string->symbol "rArr"))))
  (test (html->xexp "&#151;")
        `(*TOP* ,(string (integer->char 151))))

  (test (html->xexp "&#999;")
        `(*TOP* ,(string (integer->char 999))))

  (test (html->xexp "xxx<![CDATA[abc]]>yyy")
        `(*TOP* "xxx" "abc" "yyy"))

  (test (html->xexp "xxx<![CDATA[ab]c]]>yyy")
        `(*TOP* "xxx" "ab]c" "yyy"))

  (test (html->xexp "xxx<![CDATA[ab]]c]]>yyy")
        `(*TOP* "xxx" "ab]]c" "yyy"))

  (test (html->xexp "xxx<![CDATA[]]]>yyy")
        `(*TOP* "xxx" "]" "yyy"))

  (test (html->xexp "xxx<![CDATAyyy")
        `(*TOP* "xxx" "<![CDATA" "yyy"))

  (test (html->xexp "<html><div><p>P1</p><p>P2</p></div><p>P3</p>")
        `(*TOP* (html (div (p "P1")
                           (p "P2"))
                      (p "P3")))
        #:id 'parent-constraints-with-div)

  (test (html->xexp "&#151;")
        `(*TOP* ,(string (integer->char 151)))
        #:id 'once-again-converting-character-references-above-126-to-string)

  (test (html->xexp "<ul><li>a<p>b</p>")
        `(*TOP* (ul (li "a" (p "b"))))
        #:id 'p-element-can-be-child-of-li-element)

  ;; TODO: Document this.
  ;;
  ;; (define html-1 "<myelem myattr=\"&\">")
  ;; (define shtml   (html->xexp html-1))
  ;; shtml
  ;; (define html-2 (shtml->html shtml))
  ;; html-2

  )