test-html-parsing.rkt
#lang racket/base
;; See file "html-parsing.rkt" for legal info.
;; $Id: test-html-parsing.rkt,v 1.7 2011/03/04 07:52:42 neilpair Exp $

(require (planet neil/testeez:1:1)
         "html-parsing.rkt")

(testeez
 "test-html-parsing.rkt"

 (test-define "" lf (string (integer->char 10)))

 (test/equal "" (html->xexp "<a>>") '(*TOP* (a ">")))
 (test/equal "" (html->xexp "<a<>") '(*TOP* (a "<" ">")))

 (test/equal "" (html->xexp "<>")      '(*TOP* "<" ">"))
 (test/equal "" (html->xexp "< >")
             ;; `(*TOP* "<" ">")
             '(*TOP* "<" " " ">"))
 (test/equal "" (html->xexp "< a>")
             ;; `(*TOP* (a))
             '(*TOP*  "<" " a" ">"))
 (test/equal "" (html->xexp "< a / >")
             ;; `(*TOP* (a))
             '(*TOP* "<" " a / " ">"))

 (test/equal "" (html->xexp "<a<")  '(*TOP* (a "<")))
 (test/equal "" (html->xexp "<a<b") '(*TOP* (a (b))))

 (test/equal "" (html->xexp "><a>") '(*TOP* ">" (a)))

 (test/equal "" (html->xexp "</>") '(*TOP*))

 (test/equal "" (html->xexp "<\">") '(*TOP* "<" "\"" ">"))

 (test/equal ""
             (html->xexp (string-append "<a>xxx<plaintext>aaa" lf
                                        "bbb" lf
                                        "c<c<c"))
             `(*TOP*
               (a "xxx" (plaintext ,(string-append "aaa" lf)
                                   ,(string-append "bbb" lf)
                                   "c<c<c"))))

 (test/equal ""
             (html->xexp "aaa<!-- xxx -->bbb")
             `(*TOP*
               "aaa" (*COMMENT* " xxx ")   "bbb"))

 (test/equal ""
             (html->xexp "aaa<! -- xxx -->bbb")
             `(*TOP*
               "aaa" (*COMMENT* " xxx ")   "bbb"))

 (test/equal ""
             (html->xexp "aaa<!-- xxx --->bbb")
             `(*TOP*
               "aaa" (*COMMENT* " xxx -")  "bbb"))

 (test/equal ""
             (html->xexp "aaa<!-- xxx ---->bbb")
             `(*TOP*
               "aaa" (*COMMENT* " xxx --") "bbb"))

 (test/equal ""
             (html->xexp "aaa<!-- xxx -y-->bbb")
             `(*TOP*
               "aaa" (*COMMENT* " xxx -y") "bbb"))

 (test/equal ""
             (html->xexp "aaa<!----->bbb")
             `(*TOP*
               "aaa" (*COMMENT* "-")       "bbb"))

 (test/equal ""
             (html->xexp "aaa<!---->bbb")
             `(*TOP*
               "aaa" (*COMMENT* "")        "bbb"))

 (test/equal ""
             (html->xexp "aaa<!--->bbb")
             `(*TOP* "aaa" (*COMMENT* "->bbb")))

 (test/equal "" (html->xexp "<hr>")   `(*TOP* (hr)))
 (test/equal "" (html->xexp "<hr/>")  `(*TOP* (hr)))
 (test/equal "" (html->xexp "<hr />") `(*TOP* (hr)))

 (test/equal ""
             (html->xexp "<hr noshade>")
             `(*TOP* (hr (@ (noshade)))))
 (test/equal ""
             (html->xexp "<hr noshade/>")
             `(*TOP* (hr (@ (noshade)))))
 (test/equal ""
             (html->xexp "<hr noshade />")
             `(*TOP* (hr (@ (noshade)))))
 (test/equal ""
             (html->xexp "<hr noshade / >")
             `(*TOP* (hr (@ (noshade)))))
 (test/equal ""
             (html->xexp "<hr noshade=1 />")
             `(*TOP* (hr (@ (noshade "1")))))
 (test/equal ""
             (html->xexp "<hr noshade=1/>")
             `(*TOP* (hr (@ (noshade "1/")))))

 (test/equal ""
             (html->xexp "<q>aaa<p/>bbb</q>ccc</p>ddd")
             `(*TOP* (q "aaa" (p) "bbb") "ccc" "ddd"))

 (test/equal "" (html->xexp "&lt;") `(*TOP* "<"))
 (test/equal "" (html->xexp "&gt;") `(*TOP* ">"))

 (test/equal ""
             (html->xexp "Gilbert &amp; Sullivan")
             `(*TOP* "Gilbert & Sullivan"))
 (test/equal ""
             (html->xexp "Gilbert &amp Sullivan")
             `(*TOP* "Gilbert & Sullivan"))
 (test/equal ""
             (html->xexp "Gilbert & Sullivan")
             `(*TOP* "Gilbert & Sullivan"))

 (test/equal ""
             (html->xexp "Copyright &copy; Foo")
             `(*TOP* "Copyright "
                     (& copy)
                     " Foo"))
 (test/equal ""
             (html->xexp "aaa&copy;bbb")
             `(*TOP*
               "aaa" (& copy) "bbb"))
 (test/equal ""
             (html->xexp "aaa&copy")
             `(*TOP*
               "aaa" (& copy)))

 (test/equal "" (html->xexp "&#42;")  '(*TOP* "*"))
 (test/equal "" (html->xexp "&#42")   '(*TOP* "*"))
 (test/equal "" (html->xexp "&#42x")  '(*TOP* "*x"))
 (test/equal "" (html->xexp "&#151")  `(*TOP* ,(integer->char 151)))
 (test/equal "" (html->xexp "&#1000") `(*TOP* ,(integer->char 1000)))
 (test/equal "" (html->xexp "&#x42")  '(*TOP* "B"))
 (test/equal "" (html->xexp "&#xA2")  `(*TOP* ,(integer->char 162)))
 (test/equal "" (html->xexp "&#xFF")  `(*TOP* ,(integer->char 255)))
 (test/equal "" (html->xexp "&#x100") `(*TOP* ,(integer->char 256)))
 (test/equal "" (html->xexp "&#X42")  '(*TOP* "B"))
 (test/equal "" (html->xexp "&42;")   '(*TOP* "&42;"))

 (test/equal ""
             (html->xexp (string-append "aaa&copy;bbb&amp;ccc&lt;ddd&&gt;"
                                        "eee&#42;fff&#1000;ggg&#x5a;hhh"))
             `(*TOP*
               "aaa"
               (& copy)
               "bbb&ccc<ddd&>eee*fff"
               ,(integer->char 1000)
               "gggZhhh"))

 (test/equal ""
             (html->xexp
              (string-append
               "<IMG src=\"http://e.e/aw/pics/listings/"
               "ebayLogo_38x16.gif\" border=0 width=\"38\" height=\"16\" "
               "HSPACE=5 VSPACE=0\">2</FONT>"))
             `(*TOP*
               (img (@
                     (src
                      "http://e.e/aw/pics/listings/ebayLogo_38x16.gif")
                     (border "0") (width "38") (height "16")
                     (hspace "5") (vspace "0")))
               "2"))

 (test/equal ""
             (html->xexp "<aaa bbb=ccc\"ddd>eee")
             `(*TOP* (aaa (@ (bbb "ccc") (ddd)) "eee")))
 (test/equal ""
             (html->xexp "<aaa bbb=ccc \"ddd>eee")
             `(*TOP* (aaa (@ (bbb "ccc") (ddd)) "eee")))

 (test/equal ""
             (html->xexp
              (string-append
               "<HTML><Head><Title>My Title</Title></Head>"
               "<Body BGColor=\"white\" Foo=42>"
               "This is a <B><I>bold-italic</B></I> test of </Erk>"
               "broken HTML.<br>Yes it is.</Body></HTML>"))
             `(*TOP*
               (html (head (title "My Title"))
                     (body (@ (bgcolor "white") (foo "42"))
                           "This is a "
                           (b (i "bold-italic"))
                           " test of "
                           "broken HTML."
                           (br)
                           "Yes it is."))))

 (test/equal ""
             (html->xexp
              (string-append
               "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\""
               " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">"))
             `(*TOP*
               (*DECL*
                DOCTYPE
                html
                PUBLIC
                "-//W3C//DTD XHTML 1.0 Strict//EN"
                "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd")))

 (test/equal ""
             (html->xexp
              (string-append
               "<html xmlns=\"http://www.w3.org/1999/xhtml\" "
               "xml:lang=\"en\" "
               "lang=\"en\">"))
             `(*TOP*
               (html (@ (xmlns "http://www.w3.org/1999/xhtml")
                        (xml:lang "en") (lang "en")))))

 (test/equal
  ""
  (html->xexp
   (string-append
    "<html:html xmlns:html=\"http://www.w3.org/TR/REC-html40\">"
    "<html:head><html:title>Frobnostication</html:title></html:head>"
    "<html:body><html:p>Moved to <html:a href=\"http://frob.com\">"
    "here.</html:a></html:p></html:body></html:html>"))
  `(*TOP*
    (html (@ (xmlns:html "http://www.w3.org/TR/REC-html40"))
          (head (title "Frobnostication"))
          (body (p "Moved to "
                   (a (@ (href "http://frob.com"))
                      "here."))))))

 (test/equal ""
             (html->xexp
              (string-append
               "<RESERVATION xmlns:HTML=\"http://www.w3.org/TR/REC-html40\">"
               "<NAME HTML:CLASS=\"largeSansSerif\">Layman, A</NAME>"
               "<SEAT CLASS=\"Y\" HTML:CLASS=\"largeMonotype\">33B</SEAT>"
               "<HTML:A HREF=\"/cgi-bin/ResStatus\">Check Status</HTML:A>"
               "<DEPARTURE>1997-05-24T07:55:00+1</DEPARTURE></RESERVATION>"))
             `(*TOP*
               (reservation (@ (,(string->symbol "xmlns:HTML")
                                "http://www.w3.org/TR/REC-html40"))
                            (name (@ (class "largeSansSerif"))
                                  "Layman, A")
                            (seat (@ (class "Y") (class "largeMonotype"))
                                  "33B")
                            (a (@ (href "/cgi-bin/ResStatus"))
                               "Check Status")
                            (departure "1997-05-24T07:55:00+1"))))

 (test/equal
  ""
  (html->xexp
   (string-append
    "<html><head><title></title><title>whatever</title></head><body>"
    "<a href=\"url\">link</a><p align=center><ul compact style=\"aa\">"
    "<p>BLah<!-- comment <comment> --> <i> italic <b> bold <tt> ened </i>"
    " still &lt; bold </b></body><P> But not done yet..."))
  `(*TOP*
    (html (head (title) (title "whatever"))
          (body (a (@ (href "url")) "link")
                (p (@ (align "center"))
                   (ul (@ (compact) (style "aa"))))
                (p "BLah"
                   (*COMMENT* " comment <comment> ")
                   " "
                   (i " italic " (b " bold " (tt " ened ")))
                   " still < bold "))
          (p " But not done yet..."))))

 (test/equal ""
             (html->xexp "<?xml version=\"1.0\" encoding=\"UTF-8\"?>")
             `(*TOP*
               (*PI* xml "version=\"1.0\" encoding=\"UTF-8\"")))

 (test/equal ""
             (html->xexp "<?php php_info(); ?>")
             `(*TOP* (*PI* php "php_info(); ")))
 (test/equal ""
             (html->xexp "<?php php_info(); ?")
             `(*TOP* (*PI* php "php_info(); ?")))
 (test/equal ""
             (html->xexp "<?php php_info(); ")
             `(*TOP* (*PI* php "php_info(); ")))

 (test/equal ""
             (html->xexp "<?foo bar ? baz > blort ?>")
             `(*TOP*
               (*PI* foo "bar ? baz > blort ")))

 (test/equal ""
             (html->xexp "<?foo b?>x")
             `(*TOP* (*PI* foo "b") "x"))
 (test/equal ""
             (html->xexp "<?foo ?>x")
             `(*TOP* (*PI* foo "")  "x"))
 (test/equal ""
             (html->xexp "<?foo ?>x")
             `(*TOP* (*PI* foo "")  "x"))
 (test/equal ""
             (html->xexp "<?foo?>x")
             `(*TOP* (*PI* foo "")  "x"))
 (test/equal ""
             (html->xexp "<?f?>x")
             `(*TOP* (*PI* f   "")  "x"))
 (test/equal ""
             (html->xexp "<??>x")
             `(*TOP* (*PI* #f  "")  "x"))
 (test/equal ""
             (html->xexp "<?>x")
             `(*TOP* (*PI* #f  ">x")))

 (test/equal ""
             (html->xexp "<foo bar=\"baz\">blort")
             `(*TOP* (foo (@ (bar "baz")) "blort")))
 (test/equal ""
             (html->xexp "<foo bar='baz'>blort")
             `(*TOP* (foo (@ (bar "baz")) "blort")))
 (test/equal ""
             (html->xexp "<foo bar=\"baz'>blort")
             `(*TOP* (foo (@ (bar "baz'>blort")))))
 (test/equal ""
             (html->xexp "<foo bar='baz\">blort")
             `(*TOP* (foo (@ (bar "baz\">blort")))))

 (test/equal ""
             (html->xexp (string-append "<p>A</p>"
                                        "<script>line0 <" lf
                                        "line1" lf
                                        "<line2></script>"
                                        "<p>B</p>"))
             `(*TOP* (p "A")
                     (script ,(string-append "line0 <" lf)
                             ,(string-append "line1"   lf)
                             "<line2>")
                     (p "B")))

 (test/equal ""
             (html->xexp "<xmp>a<b>c</XMP>d")
             `(*TOP* (xmp "a<b>c") "d"))
 (test/equal ""
             (html->xexp "<XMP>a<b>c</xmp>d")
             `(*TOP* (xmp "a<b>c") "d"))
 (test/equal ""
             (html->xexp "<xmp>a<b>c</foo:xmp>d")
             `(*TOP* (xmp "a<b>c") "d"))
 (test/equal ""
             (html->xexp "<foo:xmp>a<b>c</xmp>d")
             `(*TOP* (xmp "a<b>c") "d"))
 (test/equal ""
             (html->xexp "<foo:xmp>a<b>c</foo:xmp>d")
             `(*TOP* (xmp "a<b>c") "d"))
 (test/equal ""
             (html->xexp "<foo:xmp>a<b>c</bar:xmp>d")
             `(*TOP* (xmp "a<b>c") "d"))

 (test/equal ""
             (html->xexp "<xmp>a</b>c</xmp>d")
             `(*TOP* (xmp "a</b>c")     "d"))
 (test/equal ""
             (html->xexp "<xmp>a</b >c</xmp>d")
             `(*TOP* (xmp "a</b >c")    "d"))
 (test/equal ""
             (html->xexp "<xmp>a</ b>c</xmp>d")
             `(*TOP* (xmp "a</ b>c")    "d"))
 (test/equal ""
             (html->xexp "<xmp>a</ b >c</xmp>d")
             `(*TOP* (xmp "a</ b >c")   "d"))
 (test/equal ""
             (html->xexp "<xmp>a</b:x>c</xmp>d")
             `(*TOP* (xmp "a</b:x>c")   "d"))
 (test/equal ""
             (html->xexp "<xmp>a</b::x>c</xmp>d")
             `(*TOP* (xmp "a</b::x>c")  "d"))
 (test/equal ""
             (html->xexp "<xmp>a</b:::x>c</xmp>d")
             `(*TOP* (xmp "a</b:::x>c") "d"))
 (test/equal ""
             (html->xexp "<xmp>a</b:>c</xmp>d")
             `(*TOP* (xmp "a</b:>c")    "d"))
 (test/equal ""
             (html->xexp "<xmp>a</b::>c</xmp>d")
             `(*TOP* (xmp "a</b::>c")   "d"))
 (test/equal ""
             (html->xexp "<xmp>a</xmp:b>c</xmp>d")
             `(*TOP* (xmp "a</xmp:b>c") "d"))

 (test-define "expected output for next two tests"
              expected
              `(*TOP* (p "real1")
                      ,lf
                      (xmp ,lf
                           ,(string-append "alpha"       lf)
                           ,(string-append "<P>fake</P>" lf)
                           ,(string-append "bravo"       lf))
                      (p "real2")))

 (test/equal ""
             (html->xexp (string-append "<P>real1</P>" lf
                                        "<XMP>"        lf
                                        "alpha"        lf
                                        "<P>fake</P>"  lf
                                        "bravo"        lf
                                        "</XMP "       lf
                                        "<P>real2</P>"))
             expected)

 (test/equal ""
             (html->xexp (string-append "<P>real1</P>" lf
                                        "<XMP>"        lf
                                        "alpha"        lf
                                        "<P>fake</P>"  lf
                                        "bravo"        lf
                                        "</XMP"        lf
                                        "<P>real2</P>"))
             expected)

 (test/equal ""
             (html->xexp "<xmp>a</xmp>x")
             `(*TOP* (xmp "a")   "x"))
 (test/equal ""
             (html->xexp (string-append "<xmp>a" lf "</xmp>x"))
             `(*TOP* (xmp ,(string-append "a" lf)) "x"))
 (test/equal ""
             (html->xexp "<xmp></xmp>x")
             `(*TOP* (xmp)       "x"))

 (test/equal ""
             (html->xexp "<xmp>a</xmp") `(*TOP* (xmp "a")))
 (test/equal ""
             (html->xexp "<xmp>a</xm")  `(*TOP* (xmp "a</xm")))
 (test/equal ""
             (html->xexp "<xmp>a</x")   `(*TOP* (xmp "a</x")))
 (test/equal ""
             (html->xexp "<xmp>a</")    `(*TOP* (xmp "a</")))
 (test/equal ""
             (html->xexp "<xmp>a<")     `(*TOP* (xmp "a<")))
 (test/equal ""
             (html->xexp "<xmp>a")      `(*TOP* (xmp "a")))
 (test/equal ""
             (html->xexp "<xmp>")       `(*TOP* (xmp)))
 (test/equal ""
             (html->xexp "<xmp")        `(*TOP* (xmp)))

 (test/equal ""
             (html->xexp "<xmp x=42 ")
             `(*TOP* (xmp (@ (x "42")))))
 (test/equal ""
             (html->xexp "<xmp x= ")   `(*TOP* (xmp (@ (x)))))
 (test/equal ""
             (html->xexp "<xmp x ")    `(*TOP* (xmp (@ (x)))))
 (test/equal ""
             (html->xexp "<xmp x")     `(*TOP* (xmp (@ (x)))))

 (test/equal ""
             (html->xexp "<script>xxx")
             `(*TOP* (script "xxx")))
 (test/equal ""
             (html->xexp "<script/>xxx")
             `(*TOP* (script) "xxx"))

 (test/equal ""
             (html->xexp "<html xml:lang=\"en\" lang=\"en\">")
             `(*TOP* (html (@ (xml:lang "en") (lang "en")))))

 (test/equal ""
             (html->xexp "<a href=/foo.html>")
             `(*TOP* (a (@ (href "/foo.html")))))
 (test/equal ""
             (html->xexp "<a href=/>foo.html")
             `(*TOP* (a (@ (href "/")) "foo.html")))

 ;; TODO: Add verbatim-pair cases with attributes in the end tag.

 (test/equal ""
             (html->xexp "&copy;")
             `(*TOP* (& copy)))
 (test/equal ""
             (html->xexp "&rArr;")
             `(*TOP* (& ,(string->symbol "rArr"))))
 (test/equal ""
             (html->xexp "&#151;")
             `(*TOP* ,(integer->char 151)))

 (test/equal ""
             (html->xexp "&#999;")
             `(*TOP* ,(integer->char 999)))

 (test/equal ""
             (html->xexp "xxx<![CDATA[abc]]>yyy")
             `(*TOP* "xxx" "abc" "yyy"))

 (test/equal ""
             (html->xexp "xxx<![CDATA[ab]c]]>yyy")
             `(*TOP* "xxx" "ab]c" "yyy"))

 (test/equal ""
             (html->xexp "xxx<![CDATA[ab]]c]]>yyy")
             `(*TOP* "xxx" "ab]]c" "yyy"))

 (test/equal ""
             (html->xexp "xxx<![CDATA[]]]>yyy")
             `(*TOP* "xxx" "]" "yyy"))

 (test/equal ""
             (html->xexp "xxx<![CDATAyyy")
             `(*TOP* "xxx" "<![CDATA" "yyy"))

 (test/equal "parent constraints with div"
             (html->xexp "<html><div><p>P1</p><p>P2</p></div><p>P3</p>")
             `(*TOP* (html (div (p "P1")
                                (p "P2"))
                           (p "P3"))))

 (test/equal "we no longer convert character references above 126 to string"
             (html->xexp "&#151;")
             `(*TOP* ,(integer->char 151)))

 (test/equal "p element can be child of li element"
             (html->xexp "<ul><li>a<p>b</p>")
             `(*TOP* (ul (li "a" (p "b")))))

 ;; TODO: Document this.
 ;;
 ;; (define html-1 "<myelem myattr=\"&\">")
 ;; (define shtml   (html->xexp html-1))
 ;; shtml
 ;; (define html-2 (shtml->html shtml))
 ;; html-2

 )