#lang racket/base
(require (planet neil/testeez:1:1)
"html-parsing.rkt")
(testeez
"test-html-parsing.rkt"
(test-define "" lf (string (integer->char 10)))
(test/equal "" (html->xexp "<a>>") '(*TOP* (a ">")))
(test/equal "" (html->xexp "<a<>") '(*TOP* (a "<" ">")))
(test/equal "" (html->xexp "<>") '(*TOP* "<" ">"))
(test/equal "" (html->xexp "< >")
'(*TOP* "<" " " ">"))
(test/equal "" (html->xexp "< a>")
'(*TOP* "<" " a" ">"))
(test/equal "" (html->xexp "< a / >")
'(*TOP* "<" " a / " ">"))
(test/equal "" (html->xexp "<a<") '(*TOP* (a "<")))
(test/equal "" (html->xexp "<a<b") '(*TOP* (a (b))))
(test/equal "" (html->xexp "><a>") '(*TOP* ">" (a)))
(test/equal "" (html->xexp "</>") '(*TOP*))
(test/equal "" (html->xexp "<\">") '(*TOP* "<" "\"" ">"))
(test/equal ""
(html->xexp (string-append "<a>xxx<plaintext>aaa" lf
"bbb" lf
"c<c<c"))
`(*TOP*
(a "xxx" (plaintext ,(string-append "aaa" lf)
,(string-append "bbb" lf)
"c<c<c"))))
(test/equal ""
(html->xexp "aaa<!-- xxx -->bbb")
`(*TOP*
"aaa" (*COMMENT* " xxx ") "bbb"))
(test/equal ""
(html->xexp "aaa<! -- xxx -->bbb")
`(*TOP*
"aaa" (*COMMENT* " xxx ") "bbb"))
(test/equal ""
(html->xexp "aaa<!-- xxx --->bbb")
`(*TOP*
"aaa" (*COMMENT* " xxx -") "bbb"))
(test/equal ""
(html->xexp "aaa<!-- xxx ---->bbb")
`(*TOP*
"aaa" (*COMMENT* " xxx --") "bbb"))
(test/equal ""
(html->xexp "aaa<!-- xxx -y-->bbb")
`(*TOP*
"aaa" (*COMMENT* " xxx -y") "bbb"))
(test/equal ""
(html->xexp "aaa<!----->bbb")
`(*TOP*
"aaa" (*COMMENT* "-") "bbb"))
(test/equal ""
(html->xexp "aaa<!---->bbb")
`(*TOP*
"aaa" (*COMMENT* "") "bbb"))
(test/equal ""
(html->xexp "aaa<!--->bbb")
`(*TOP* "aaa" (*COMMENT* "->bbb")))
(test/equal "" (html->xexp "<hr>") `(*TOP* (hr)))
(test/equal "" (html->xexp "<hr/>") `(*TOP* (hr)))
(test/equal "" (html->xexp "<hr />") `(*TOP* (hr)))
(test/equal ""
(html->xexp "<hr noshade>")
`(*TOP* (hr (@ (noshade)))))
(test/equal ""
(html->xexp "<hr noshade/>")
`(*TOP* (hr (@ (noshade)))))
(test/equal ""
(html->xexp "<hr noshade />")
`(*TOP* (hr (@ (noshade)))))
(test/equal ""
(html->xexp "<hr noshade / >")
`(*TOP* (hr (@ (noshade)))))
(test/equal ""
(html->xexp "<hr noshade=1 />")
`(*TOP* (hr (@ (noshade "1")))))
(test/equal ""
(html->xexp "<hr noshade=1/>")
`(*TOP* (hr (@ (noshade "1/")))))
(test/equal ""
(html->xexp "<q>aaa<p/>bbb</q>ccc</p>ddd")
`(*TOP* (q "aaa" (p) "bbb") "ccc" "ddd"))
(test/equal "" (html->xexp "<") `(*TOP* "<"))
(test/equal "" (html->xexp ">") `(*TOP* ">"))
(test/equal ""
(html->xexp "Gilbert & Sullivan")
`(*TOP* "Gilbert & Sullivan"))
(test/equal ""
(html->xexp "Gilbert & Sullivan")
`(*TOP* "Gilbert & Sullivan"))
(test/equal ""
(html->xexp "Gilbert & Sullivan")
`(*TOP* "Gilbert & Sullivan"))
(test/equal ""
(html->xexp "Copyright © Foo")
`(*TOP* "Copyright "
(& copy)
" Foo"))
(test/equal ""
(html->xexp "aaa©bbb")
`(*TOP*
"aaa" (& copy) "bbb"))
(test/equal ""
(html->xexp "aaa©")
`(*TOP*
"aaa" (& copy)))
(test/equal "" (html->xexp "*") '(*TOP* "*"))
(test/equal "" (html->xexp "*") '(*TOP* "*"))
(test/equal "" (html->xexp "*x") '(*TOP* "*x"))
(test/equal "" (html->xexp "—") `(*TOP* ,(integer->char 151)))
(test/equal "" (html->xexp "Ϩ") `(*TOP* ,(integer->char 1000)))
(test/equal "" (html->xexp "B") '(*TOP* "B"))
(test/equal "" (html->xexp "¢") `(*TOP* ,(integer->char 162)))
(test/equal "" (html->xexp "ÿ") `(*TOP* ,(integer->char 255)))
(test/equal "" (html->xexp "Ā") `(*TOP* ,(integer->char 256)))
(test/equal "" (html->xexp "B") '(*TOP* "B"))
(test/equal "" (html->xexp "&42;") '(*TOP* "&42;"))
(test/equal ""
(html->xexp (string-append "aaa©bbb&ccc<ddd&>"
"eee*fffϨgggZhhh"))
`(*TOP*
"aaa"
(& copy)
"bbb&ccc<ddd&>eee*fff"
,(integer->char 1000)
"gggZhhh"))
(test/equal ""
(html->xexp
(string-append
"<IMG src=\"http://e.e/aw/pics/listings/"
"ebayLogo_38x16.gif\" border=0 width=\"38\" height=\"16\" "
"HSPACE=5 VSPACE=0\">2</FONT>"))
`(*TOP*
(img (@
(src
"http://e.e/aw/pics/listings/ebayLogo_38x16.gif")
(border "0") (width "38") (height "16")
(hspace "5") (vspace "0")))
"2"))
(test/equal ""
(html->xexp "<aaa bbb=ccc\"ddd>eee")
`(*TOP* (aaa (@ (bbb "ccc") (ddd)) "eee")))
(test/equal ""
(html->xexp "<aaa bbb=ccc \"ddd>eee")
`(*TOP* (aaa (@ (bbb "ccc") (ddd)) "eee")))
(test/equal ""
(html->xexp
(string-append
"<HTML><Head><Title>My Title</Title></Head>"
"<Body BGColor=\"white\" Foo=42>"
"This is a <B><I>bold-italic</B></I> test of </Erk>"
"broken HTML.<br>Yes it is.</Body></HTML>"))
`(*TOP*
(html (head (title "My Title"))
(body (@ (bgcolor "white") (foo "42"))
"This is a "
(b (i "bold-italic"))
" test of "
"broken HTML."
(br)
"Yes it is."))))
(test/equal ""
(html->xexp
(string-append
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\""
" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">"))
`(*TOP*
(*DECL*
DOCTYPE
html
PUBLIC
"-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd")))
(test/equal ""
(html->xexp
(string-append
"<html xmlns=\"http://www.w3.org/1999/xhtml\" "
"xml:lang=\"en\" "
"lang=\"en\">"))
`(*TOP*
(html (@ (xmlns "http://www.w3.org/1999/xhtml")
(xml:lang "en") (lang "en")))))
(test/equal
""
(html->xexp
(string-append
"<html:html xmlns:html=\"http://www.w3.org/TR/REC-html40\">"
"<html:head><html:title>Frobnostication</html:title></html:head>"
"<html:body><html:p>Moved to <html:a href=\"http://frob.com\">"
"here.</html:a></html:p></html:body></html:html>"))
`(*TOP*
(html (@ (xmlns:html "http://www.w3.org/TR/REC-html40"))
(head (title "Frobnostication"))
(body (p "Moved to "
(a (@ (href "http://frob.com"))
"here."))))))
(test/equal ""
(html->xexp
(string-append
"<RESERVATION xmlns:HTML=\"http://www.w3.org/TR/REC-html40\">"
"<NAME HTML:CLASS=\"largeSansSerif\">Layman, A</NAME>"
"<SEAT CLASS=\"Y\" HTML:CLASS=\"largeMonotype\">33B</SEAT>"
"<HTML:A HREF=\"/cgi-bin/ResStatus\">Check Status</HTML:A>"
"<DEPARTURE>1997-05-24T07:55:00+1</DEPARTURE></RESERVATION>"))
`(*TOP*
(reservation (@ (,(string->symbol "xmlns:HTML")
"http://www.w3.org/TR/REC-html40"))
(name (@ (class "largeSansSerif"))
"Layman, A")
(seat (@ (class "Y") (class "largeMonotype"))
"33B")
(a (@ (href "/cgi-bin/ResStatus"))
"Check Status")
(departure "1997-05-24T07:55:00+1"))))
(test/equal
""
(html->xexp
(string-append
"<html><head><title></title><title>whatever</title></head><body>"
"<a href=\"url\">link</a><p align=center><ul compact style=\"aa\">"
"<p>BLah<!-- comment <comment> --> <i> italic <b> bold <tt> ened </i>"
" still < bold </b></body><P> But not done yet..."))
`(*TOP*
(html (head (title) (title "whatever"))
(body (a (@ (href "url")) "link")
(p (@ (align "center"))
(ul (@ (compact) (style "aa"))))
(p "BLah"
(*COMMENT* " comment <comment> ")
" "
(i " italic " (b " bold " (tt " ened ")))
" still < bold "))
(p " But not done yet..."))))
(test/equal ""
(html->xexp "<?xml version=\"1.0\" encoding=\"UTF-8\"?>")
`(*TOP*
(*PI* xml "version=\"1.0\" encoding=\"UTF-8\"")))
(test/equal ""
(html->xexp "<?php php_info(); ?>")
`(*TOP* (*PI* php "php_info(); ")))
(test/equal ""
(html->xexp "<?php php_info(); ?")
`(*TOP* (*PI* php "php_info(); ?")))
(test/equal ""
(html->xexp "<?php php_info(); ")
`(*TOP* (*PI* php "php_info(); ")))
(test/equal ""
(html->xexp "<?foo bar ? baz > blort ?>")
`(*TOP*
(*PI* foo "bar ? baz > blort ")))
(test/equal ""
(html->xexp "<?foo b?>x")
`(*TOP* (*PI* foo "b") "x"))
(test/equal ""
(html->xexp "<?foo ?>x")
`(*TOP* (*PI* foo "") "x"))
(test/equal ""
(html->xexp "<?foo ?>x")
`(*TOP* (*PI* foo "") "x"))
(test/equal ""
(html->xexp "<?foo?>x")
`(*TOP* (*PI* foo "") "x"))
(test/equal ""
(html->xexp "<?f?>x")
`(*TOP* (*PI* f "") "x"))
(test/equal ""
(html->xexp "<??>x")
`(*TOP* (*PI* #f "") "x"))
(test/equal ""
(html->xexp "<?>x")
`(*TOP* (*PI* #f ">x")))
(test/equal ""
(html->xexp "<foo bar=\"baz\">blort")
`(*TOP* (foo (@ (bar "baz")) "blort")))
(test/equal ""
(html->xexp "<foo bar='baz'>blort")
`(*TOP* (foo (@ (bar "baz")) "blort")))
(test/equal ""
(html->xexp "<foo bar=\"baz'>blort")
`(*TOP* (foo (@ (bar "baz'>blort")))))
(test/equal ""
(html->xexp "<foo bar='baz\">blort")
`(*TOP* (foo (@ (bar "baz\">blort")))))
(test/equal ""
(html->xexp (string-append "<p>A</p>"
"<script>line0 <" lf
"line1" lf
"<line2></script>"
"<p>B</p>"))
`(*TOP* (p "A")
(script ,(string-append "line0 <" lf)
,(string-append "line1" lf)
"<line2>")
(p "B")))
(test/equal ""
(html->xexp "<xmp>a<b>c</XMP>d")
`(*TOP* (xmp "a<b>c") "d"))
(test/equal ""
(html->xexp "<XMP>a<b>c</xmp>d")
`(*TOP* (xmp "a<b>c") "d"))
(test/equal ""
(html->xexp "<xmp>a<b>c</foo:xmp>d")
`(*TOP* (xmp "a<b>c") "d"))
(test/equal ""
(html->xexp "<foo:xmp>a<b>c</xmp>d")
`(*TOP* (xmp "a<b>c") "d"))
(test/equal ""
(html->xexp "<foo:xmp>a<b>c</foo:xmp>d")
`(*TOP* (xmp "a<b>c") "d"))
(test/equal ""
(html->xexp "<foo:xmp>a<b>c</bar:xmp>d")
`(*TOP* (xmp "a<b>c") "d"))
(test/equal ""
(html->xexp "<xmp>a</b>c</xmp>d")
`(*TOP* (xmp "a</b>c") "d"))
(test/equal ""
(html->xexp "<xmp>a</b >c</xmp>d")
`(*TOP* (xmp "a</b >c") "d"))
(test/equal ""
(html->xexp "<xmp>a</ b>c</xmp>d")
`(*TOP* (xmp "a</ b>c") "d"))
(test/equal ""
(html->xexp "<xmp>a</ b >c</xmp>d")
`(*TOP* (xmp "a</ b >c") "d"))
(test/equal ""
(html->xexp "<xmp>a</b:x>c</xmp>d")
`(*TOP* (xmp "a</b:x>c") "d"))
(test/equal ""
(html->xexp "<xmp>a</b::x>c</xmp>d")
`(*TOP* (xmp "a</b::x>c") "d"))
(test/equal ""
(html->xexp "<xmp>a</b:::x>c</xmp>d")
`(*TOP* (xmp "a</b:::x>c") "d"))
(test/equal ""
(html->xexp "<xmp>a</b:>c</xmp>d")
`(*TOP* (xmp "a</b:>c") "d"))
(test/equal ""
(html->xexp "<xmp>a</b::>c</xmp>d")
`(*TOP* (xmp "a</b::>c") "d"))
(test/equal ""
(html->xexp "<xmp>a</xmp:b>c</xmp>d")
`(*TOP* (xmp "a</xmp:b>c") "d"))
(test-define "expected output for next two tests"
expected
`(*TOP* (p "real1")
,lf
(xmp ,lf
,(string-append "alpha" lf)
,(string-append "<P>fake</P>" lf)
,(string-append "bravo" lf))
(p "real2")))
(test/equal ""
(html->xexp (string-append "<P>real1</P>" lf
"<XMP>" lf
"alpha" lf
"<P>fake</P>" lf
"bravo" lf
"</XMP " lf
"<P>real2</P>"))
expected)
(test/equal ""
(html->xexp (string-append "<P>real1</P>" lf
"<XMP>" lf
"alpha" lf
"<P>fake</P>" lf
"bravo" lf
"</XMP" lf
"<P>real2</P>"))
expected)
(test/equal ""
(html->xexp "<xmp>a</xmp>x")
`(*TOP* (xmp "a") "x"))
(test/equal ""
(html->xexp (string-append "<xmp>a" lf "</xmp>x"))
`(*TOP* (xmp ,(string-append "a" lf)) "x"))
(test/equal ""
(html->xexp "<xmp></xmp>x")
`(*TOP* (xmp) "x"))
(test/equal ""
(html->xexp "<xmp>a</xmp") `(*TOP* (xmp "a")))
(test/equal ""
(html->xexp "<xmp>a</xm") `(*TOP* (xmp "a</xm")))
(test/equal ""
(html->xexp "<xmp>a</x") `(*TOP* (xmp "a</x")))
(test/equal ""
(html->xexp "<xmp>a</") `(*TOP* (xmp "a</")))
(test/equal ""
(html->xexp "<xmp>a<") `(*TOP* (xmp "a<")))
(test/equal ""
(html->xexp "<xmp>a") `(*TOP* (xmp "a")))
(test/equal ""
(html->xexp "<xmp>") `(*TOP* (xmp)))
(test/equal ""
(html->xexp "<xmp") `(*TOP* (xmp)))
(test/equal ""
(html->xexp "<xmp x=42 ")
`(*TOP* (xmp (@ (x "42")))))
(test/equal ""
(html->xexp "<xmp x= ") `(*TOP* (xmp (@ (x)))))
(test/equal ""
(html->xexp "<xmp x ") `(*TOP* (xmp (@ (x)))))
(test/equal ""
(html->xexp "<xmp x") `(*TOP* (xmp (@ (x)))))
(test/equal ""
(html->xexp "<script>xxx")
`(*TOP* (script "xxx")))
(test/equal ""
(html->xexp "<script/>xxx")
`(*TOP* (script) "xxx"))
(test/equal ""
(html->xexp "<html xml:lang=\"en\" lang=\"en\">")
`(*TOP* (html (@ (xml:lang "en") (lang "en")))))
(test/equal ""
(html->xexp "<a href=/foo.html>")
`(*TOP* (a (@ (href "/foo.html")))))
(test/equal ""
(html->xexp "<a href=/>foo.html")
`(*TOP* (a (@ (href "/")) "foo.html")))
(test/equal ""
(html->xexp "©")
`(*TOP* (& copy)))
(test/equal ""
(html->xexp "⇒")
`(*TOP* (& ,(string->symbol "rArr"))))
(test/equal ""
(html->xexp "—")
`(*TOP* ,(integer->char 151)))
(test/equal ""
(html->xexp "ϧ")
`(*TOP* ,(integer->char 999)))
(test/equal ""
(html->xexp "xxx<![CDATA[abc]]>yyy")
`(*TOP* "xxx" "abc" "yyy"))
(test/equal ""
(html->xexp "xxx<![CDATA[ab]c]]>yyy")
`(*TOP* "xxx" "ab]c" "yyy"))
(test/equal ""
(html->xexp "xxx<![CDATA[ab]]c]]>yyy")
`(*TOP* "xxx" "ab]]c" "yyy"))
(test/equal ""
(html->xexp "xxx<![CDATA[]]]>yyy")
`(*TOP* "xxx" "]" "yyy"))
(test/equal ""
(html->xexp "xxx<![CDATAyyy")
`(*TOP* "xxx" "<![CDATA" "yyy"))
(test/equal "parent constraints with div"
(html->xexp "<html><div><p>P1</p><p>P2</p></div><p>P3</p>")
`(*TOP* (html (div (p "P1")
(p "P2"))
(p "P3"))))
(test/equal "we no longer convert character references above 126 to string"
(html->xexp "—")
`(*TOP* ,(integer->char 151)))
(test/equal "p element can be child of li element"
(html->xexp "<ul><li>a<p>b</p>")
`(*TOP* (ul (li "a" (p "b")))))
)