#lang racket/base
(require (planet neil/xexp:1:0))
(define %html-parsing:empty-token-symbol '*empty*)
(define %html-parsing:end-token-symbol '*end*)
(define %html-parsing:start-token-symbol '*start*)
(define %html-parsing:entity-token-symbol '*entity*)
(define %html-parsing:text-string-token-symbol '*text-string*)
(define %html-parsing:text-char-token-symbol '*text-char*)
(define %html-parse:make-html-tokenizer
(letrec ((no-token '())
(verbatim-to-eof-elems '(plaintext))
(verbatim-pair-elems '(script server style xmp))
(ws-chars (list #\space
(integer->char 9)
(integer->char 10)
(integer->char 11)
(integer->char 12)
(integer->char 13)))
(gosc/string-or-false
(lambda (os)
(let ((s (get-output-string os)))
(if (string=? s "") #f s))))
(gosc/symbol-or-false
(lambda (os)
(let ((s (gosc/string-or-false os)))
(if s (string->symbol s) #f))))
)
(lambda (in normalized?)
(letrec
(
(c #f)
(next-c #f)
(c-consumed? #t)
(read-c (lambda ()
(if c-consumed?
(if next-c
(begin (set! c next-c)
(set! next-c #f))
(set! c (read-char in)))
(set! c-consumed? #t))))
(unread-c (lambda ()
(if c-consumed?
(set! c-consumed? #f)
(error '%html-parse:make-html-tokenizer
"already unread: ~S"
c))))
(push-c (lambda (new-c)
(if c-consumed?
(begin (set! c new-c)
(set! c-consumed? #f))
(if next-c
(error '%html-parse:make-html-tokenizer
"pushback full: ~S"
c)
(begin (set! next-c c)
(set! c new-c)
(set! c-consumed? #f))))))
(c-eof? (lambda () (eof-object? c)))
(c-amp? (lambda () (eqv? c #\&)))
(c-apos? (lambda () (eqv? c #\')))
(c-bang? (lambda () (eqv? c #\!)))
(c-colon? (lambda () (eqv? c #\:)))
(c-quot? (lambda () (eqv? c #\")))
(c-equals? (lambda () (eqv? c #\=)))
(c-gt? (lambda () (eqv? c #\>)))
(c-lsquare? (lambda () (eqv? c #\[)))
(c-lt? (lambda () (eqv? c #\<)))
(c-minus? (lambda () (eqv? c #\-)))
(c-pound? (lambda () (eqv? c #\#)))
(c-ques? (lambda () (eqv? c #\?)))
(c-semi? (lambda () (eqv? c #\;)))
(c-slash? (lambda () (eqv? c #\/)))
(c-splat? (lambda () (eqv? c #\*)))
(c-lf? (lambda () (eqv? c #\newline)))
(c-angle? (lambda () (memv c '(#\< #\>))))
(c-ws? (lambda () (memv c ws-chars)))
(c-alpha? (lambda () (char-alphabetic? c)))
(c-digit? (lambda () (char-numeric? c)))
(c-alphanum? (lambda () (or (c-alpha?) (c-digit?))))
(c-hexlet? (lambda () (memv c '(#\a #\b #\c #\d #\e #\f
#\A #\B #\C #\D #\E #\F))))
(skip-ws (lambda () (read-c) (if (c-ws?) (skip-ws) (unread-c))))
(if-read-chars
(lambda (match-chars yes-thunk no-proc)
(let loop ((chars match-chars)
(match-count 0))
(if (null? chars)
(yes-thunk)
(begin (read-c)
(if (eqv? c (car chars))
(begin (loop (cdr chars) (+ 1 match-count)))
(begin (unread-c)
(no-proc match-chars match-count))))))))
(write-chars-count
(lambda (chars count port)
(let loop ((chars chars)
(count count))
(or (zero? count)
(begin (write-char (car chars) port)
(loop (cdr chars)
(- count 1)))))))
(make-start-token
(if normalized?
(lambda (name ns attrs)
(list name (cons '@ attrs)))
(lambda (name ns attrs)
(if (null? attrs)
(list name)
(list name (cons '@ attrs))))))
(make-empty-token
(lambda (name ns attrs)
(cons %html-parsing:empty-token-symbol
(make-start-token name ns attrs))))
(make-end-token
(if normalized?
(lambda (name ns attrs)
(list %html-parsing:end-token-symbol
name
(cons '@ attrs)))
(lambda (name ns attrs)
(if (null? attrs)
(list %html-parsing:end-token-symbol name)
(list %html-parsing:end-token-symbol
name
(cons '@ attrs))))))
(make-comment-token
(lambda (str) (list '*COMMENT* str)))
(make-decl-token
(lambda (parts) (cons '*DECL* parts)))
(scan-qname
(lambda (verbatim-safe?)
(letrec ((os #f)
(ns '())
(vcolons 0)
(good-os (lambda ()
(or os
(begin (set! os (open-output-string))
os)))))
(let loop ()
(read-c)
(cond ((c-eof?) #f)
((or (c-ws?) (c-splat?))
(if verbatim-safe?
(unread-c)
#f))
((or (c-angle?) (c-equals?) (c-quot?) (c-apos?))
(unread-c))
((c-colon?)
(or (null? ns)
(set! ns (cons ":" ns)))
(if os
(begin
(set! ns (cons (get-output-string os)
ns))
(set! os #f))
#f)
(loop))
((c-slash?)
(read-c)
(cond ((or (c-eof?)
(c-ws?)
(c-equals?)
(c-apos?)
(c-quot?)
(c-angle?)
(c-splat?))
(unread-c)
(push-c #\/))
(else (write-char #\/ (good-os))
(write-char c os)
(loop))))
(else (write-char c (good-os))
(loop))))
(let ((ns (if (null? ns)
#f
(apply string-append
(reverse ns))))
(localname (if os (get-output-string os) #f)))
(if verbatim-safe?
(cons ns localname)
(if localname
(if ns
(if (or (string=? ns "xml")
(string=? ns "xmlns"))
(string->symbol (string-append ns
":"
localname))
(cons ns
(string->symbol (string-downcase
localname))))
(string->symbol (string-downcase localname)))
(if ns
(string->symbol (string-downcase ns))
#f)))))))
(scan-tag
(lambda (start?)
(skip-ws)
(let ((tag-name (scan-qname #f))
(tag-ns #f)
(tag-attrs #f)
(tag-empty? #f))
(if (pair? tag-name)
(begin (set! tag-ns (car tag-name))
(set! tag-name (cdr tag-name)))
#f)
(set! tag-attrs
(let scan-attr-list ()
(read-c)
(cond ((c-eof?) '())
((c-angle?) (unread-c) '())
((c-slash?)
(set! tag-empty? #t)
(scan-attr-list))
((c-alpha?)
(unread-c)
(let ((attr (scan-attr)))
(cons attr (scan-attr-list))))
(else (scan-attr-list)))))
(let loop ()
(read-c)
(cond ((c-eof?) no-token)
((c-slash?) (set! tag-empty? #t) (loop))
((c-gt?) #f)
((c-ws?) (loop))
(else (unread-c))))
(cond ((not start?) #f)
(tag-empty? #f)
((memq tag-name verbatim-to-eof-elems)
(set! nexttok verbeof-nexttok))
((memq tag-name verbatim-pair-elems)
(set! nexttok (make-verbpair-nexttok tag-name))))
(if start?
(if tag-empty?
(make-empty-token tag-name tag-ns tag-attrs)
(make-start-token tag-name tag-ns tag-attrs))
(make-end-token tag-name tag-ns tag-attrs)))))
(scan-attr
(lambda ()
(let ((name (scan-qname #f))
(val #f))
(if (pair? name)
(set! name (cdr name))
#f)
(let loop-equals-or-end ()
(read-c)
(cond ((c-eof?) no-token)
((c-ws?) (loop-equals-or-end))
((c-equals?)
(let loop-quote-or-unquoted ()
(read-c)
(cond ((c-eof?) no-token)
((c-ws?) (loop-quote-or-unquoted))
((or (c-apos?) (c-quot?))
(let ((term c))
(set! val (open-output-string))
(let loop-quoted-val ()
(read-c)
(cond ((c-eof?) #f)
((eqv? c term) #f)
(else (write-char c val)
(loop-quoted-val))))))
((c-angle?) (unread-c))
(else
(set! val (open-output-string))
(write-char c val)
(let loop-unquoted-val ()
(read-c)
(cond ((c-eof?) no-token)
((c-apos?) #f)
((c-quot?) #f)
((or (c-ws?) (c-angle?)
)
(unread-c))
(else (write-char c val)
(loop-unquoted-val))))))))
(else (unread-c))))
(if normalized?
(list name (if val
(get-output-string val)
(symbol->string name)))
(if val
(list name (get-output-string val))
(list name))))))
(scan-comment
(lambda ()
(let ((os (open-output-string))
(state 'start-minus))
(let loop ()
(read-c)
(cond ((c-eof?) #f)
((c-minus?)
(set! state
(case state
((start-minus) 'start-minus-minus)
((start-minus-minus body) 'end-minus)
((end-minus) 'end-minus-minus)
((end-minus-minus) (write-char #\- os) state)
(else (error '<%html-parse:make-html-tokenizer>
"invalid state: ~S"
state))))
(loop))
((and (c-gt?) (eq? state 'end-minus-minus)) #f)
(else (case state
((end-minus) (write-char #\- os))
((end-minus-minus) (display "--" os)))
(set! state 'body)
(write-char c os)
(loop))))
(make-comment-token (get-output-string os)))))
(scan-possible-cdata
(lambda ()
(if-read-chars
'(#\C #\D #\A #\T #\A #\[)
(lambda ()
(scan-cdata))
(lambda (chars count)
(let ((os (open-output-string)))
(display "<![" os)
(write-chars-count chars count os)
(get-output-string os))))))
(scan-cdata
(lambda ()
(let ((os (open-output-string)))
(let loop ()
(if-read-chars
'(#\] #\] #\>)
(lambda () (get-output-string os))
(lambda (chars count)
(if (zero? count)
(if (eof-object? c)
(get-output-string os)
(begin (write-char c os)
(read-c)
(loop)))
(begin (write-char #\] os)
(if (= count 2)
(push-c #\])
#f)
(loop)))))))))
(scan-pi
(lambda ()
(skip-ws)
(let ((name (open-output-string))
(val (open-output-string)))
(let scan-name ()
(read-c)
(cond ((c-eof?) #f)
((c-ws?) #f)
((c-alpha?) (write-char c name) (scan-name))
(else (unread-c))))
(set! name (gosc/symbol-or-false name))
(let scan-val ()
(read-c)
(cond ((c-eof?) #f)
((c-ques?)
(read-c)
(cond ((c-eof?) (write-char #\? val))
((c-gt?) #f)
(else (write-char #\? val)
(unread-c)
(scan-val))))
(else (write-char c val) (scan-val))))
(list '*PI*
name
(get-output-string val)))))
(scan-decl
(letrec
((scan-parts
(lambda ()
(let ((part (open-output-string))
(nonsymbol? #f)
(state 'before)
(last? #f))
(let loop ()
(read-c)
(cond ((c-eof?) #f)
((c-ws?)
(case state
((before) (loop))
((quoted) (write-char c part) (loop))))
((and (c-gt?) (not (eq? state 'quoted)))
(set! last? #t))
((and (c-lt?) (not (eq? state 'quoted)))
(unread-c))
((c-quot?)
(case state
((before) (set! state 'quoted) (loop))
((unquoted) (unread-c))
((quoted) #f)))
(else
(if (eq? state 'before)
(set! state 'unquoted)
#f)
(set! nonsymbol? (or nonsymbol?
(not (c-alphanum?))))
(write-char c part)
(loop))))
(set! part (get-output-string part))
(if (string=? part "")
'()
(cons (if (or (eq? state 'quoted) nonsymbol?)
part
(string->symbol part))
(if last?
'()
(scan-parts))))))))
(lambda () (make-decl-token (scan-parts)))))
(scan-entity
(lambda ()
(read-c)
(cond ((c-eof?) "&")
((c-alpha?)
(let ((name (open-output-string)))
(write-char c name)
(let loop ()
(read-c)
(cond ((c-eof?) #f)
((c-alpha?) (write-char c name) (loop))
((c-semi?) #f)
(else (unread-c))))
(set! name (string->symbol (get-output-string name)))
(let ((pair (assq name '((amp . "&")
(apos . "'")
(gt . ">")
(lt . "<")
(quot . "\"")))))
(if pair
(cdr pair)
(make-xexp-char-ref name)))))
((c-pound?)
(let ((num (open-output-string))
(hex? #f))
(read-c)
(cond ((c-eof?) #f)
((memv c '(#\x #\X)) (set! hex? #t) (read-c)))
(let loop ()
(cond ((c-eof?) #f)
((c-semi?) #f)
((or (c-digit?) (and hex? (c-hexlet?)))
(write-char c num)
(read-c)
(loop))
(else (unread-c))))
(set! num (get-output-string num))
(if (string=? num "")
"&#;"
(let ((n (string->number num (if hex? 16 10))))
(if (<= 32 n 126)
(string (integer->char n))
(integer->char n))))))
(else (unread-c) "&"))))
(normal-nexttok
(lambda ()
(read-c)
(cond ((c-eof?) no-token)
((c-lt?)
(let loop ()
(read-c)
(cond ((c-eof?) "<")
((c-slash?) (scan-tag #f))
((c-ques?) (scan-pi))
((c-alpha?) (unread-c) (scan-tag #t))
((c-bang?)
(read-c)
(if (c-lsquare?)
(scan-possible-cdata)
(let loop ()
(cond ((c-eof?) no-token)
((c-ws?) (read-c) (loop))
((c-minus?) (scan-comment))
(else (unread-c)
(scan-decl))))))
(else (unread-c) "<"))))
((c-gt?) ">")
(else (let ((os (open-output-string)))
(let loop ()
(cond ((c-eof?) #f)
((c-angle?) (unread-c))
((c-amp?)
(let ((entity (scan-entity)))
(if (string? entity)
(begin (display entity os)
(read-c)
(loop))
(let ((saved-nexttok nexttok))
(set! nexttok
(lambda ()
(set! nexttok
saved-nexttok)
entity))))))
(else (write-char c os)
(or (c-lf?)
(begin (read-c) (loop))))))
(let ((text (get-output-string os)))
(if (equal? text "")
(nexttok)
text)))))))
(verbeof-nexttok
(lambda ()
(read-c)
(if (c-eof?)
no-token
(let ((os (open-output-string)))
(let loop ()
(or (c-eof?)
(begin (write-char c os)
(or (c-lf?)
(begin (read-c) (loop))))))
(get-output-string os)))))
(make-verbpair-nexttok
(lambda (elem-name)
(lambda ()
(let ((os (open-output-string)))
(let loop ()
(read-c)
(cond ((c-eof?)
(set! nexttok normal-nexttok))
((c-lt?)
(read-c)
(cond ((c-eof?)
(set! nexttok normal-nexttok)
(write-char #\< os))
((c-slash?)
(read-c)
(cond
((c-eof?)
(display "</" os))
((c-alpha?)
(unread-c)
(let* ((vqname (scan-qname #t))
(ns (car vqname))
(local (cdr vqname)))
(if (and local
(eqv? (string->symbol
(string-downcase local))
elem-name))
(begin
(let scan-to-end ()
(read-c)
(cond ((c-eof?) #f)
((c-gt?) #f)
((c-lt?) (unread-c))
((c-alpha?)
(unread-c)
(scan-attr)
(scan-to-end))
(else (scan-to-end))))
(set! nexttok
(lambda ()
(set! nexttok
normal-nexttok)
(make-end-token
elem-name #f '()))))
(begin
(display "</" os)
(if ns
(begin (display ns os)
(display ":" os))
#f)
(if local
(display local os)
#f)
(loop)))))
(else
(unread-c)
(display "</" os)
(loop))))
(else
(unread-c)
(write-char #\< os)
(loop))))
(else
(write-char c os)
(or (c-lf?) (loop)))))
(or (gosc/string-or-false os) (nexttok))))))
(nexttok #f))
(set! nexttok normal-nexttok)
(lambda () (nexttok))))))
(define (%html-parse:tokenize-html in normalized?)
(let ((next-tok (%html-parse:make-html-tokenizer in normalized?)))
(let loop ((tok (next-tok)))
(if (null? tok)
'()
(cons tok (loop (next-tok)))))))
(define (%html-parse:xexp-token-kind token)
(cond ((string? token) %html-parsing:text-string-token-symbol)
((char? token) %html-parsing:text-char-token-symbol)
((list? token)
(let ((s (car token)))
(if (memq s `(*COMMENT*
*DECL*
*PI*
,%html-parsing:empty-token-symbol
,%html-parsing:end-token-symbol
,%html-parsing:entity-token-symbol))
s
%html-parsing:start-token-symbol)))
(else (error '%html-parse:xexp-token-kind
"unrecognized token kind: ~S"
token))))
(define %html-parse:empty-elements
(cons '& always-empty-html-elements))
(define %html-parse:parse-html/tokenizer
(letrec ((empty-elements
%html-parse:empty-elements)
(parent-constraints
'((area . (map))
(body . (html))
(caption . (table))
(colgroup . (table))
(dd . (dl))
(dt . (dl))
(frame . (frameset))
(head . (html))
(isindex . (head))
(li . (dir menu ol ul))
(meta . (head))
(noframes . (frameset))
(option . (select))
(p . (body li td th))
(param . (applet))
(tbody . (table))
(td . (tr))
(th . (tr))
(thead . (table))
(title . (head))
(tr . (table tbody thead))))
(token-kinds-that-always-get-added
`(*COMMENT*
*DECL*
*PI*
,%html-parsing:entity-token-symbol
,%html-parsing:text-string-token-symbol
,%html-parsing:text-char-token-symbol))
(start-tag-name (lambda (tag-token) (car tag-token)))
(end-tag-name (lambda (tag-token) (list-ref tag-token 1))))
(lambda (tokenizer normalized?)
(let ((begs (list (vector #f '()))))
(letrec ((add-thing-as-child-of-current-beg
(lambda (tok)
(let ((beg (car begs)))
(vector-set! beg 1 (cons tok (vector-ref beg 1))))))
(beg->elem
(lambda (beg)
(let ((elem-name (vector-ref beg 0))
(attrs-and-contents (reverse (vector-ref beg 1))))
(cons elem-name attrs-and-contents))))
(finish-current-beg-and-return-elem
(lambda ()
(let ((elem (beg->elem (car begs))))
(set! begs (cdr begs))
(or (null? begs)
(add-thing-as-child-of-current-beg elem))
elem)))
(finish-current-beg
(lambda ()
(finish-current-beg-and-return-elem)))
(finish-all-begs-and-return-top
(lambda ()
(let loop ()
(let ((elem (finish-current-beg-and-return-elem)))
(if (car elem)
(loop)
(cdr elem))))))
(finish-begs-up-to-and-including-name
(lambda (name)
(let loop-find-name ((find-begs begs)
(depth 1))
(let ((beg-name (vector-ref (car find-begs) 0)))
(cond ((not beg-name)
(void))
((eqv? name beg-name)
(let loop-finish ((depth depth))
(or (zero? depth)
(begin
(finish-current-beg)
(loop-finish (- depth 1))))))
(else
(loop-find-name (cdr find-begs)
(+ depth 1))))))))
(finish-begs-upto-but-not-including-names
(lambda (names)
(let loop-find-name ((find-begs begs)
(depth 0))
(let ((beg-name (vector-ref (car find-begs) 0)))
(cond ((not beg-name)
(void))
((memq beg-name names)
(let loop-finish ((depth depth))
(or (zero? depth)
(begin
(finish-current-beg)
(loop-finish (- depth 1))))))
(else
(loop-find-name (cdr find-begs)
(+ depth 1)))))))))
(let loop ()
(let ((tok (tokenizer)))
(if (null? tok)
(finish-all-begs-and-return-top)
(let ((kind (%html-parse:xexp-token-kind tok)))
(cond ((memv kind token-kinds-that-always-get-added)
(add-thing-as-child-of-current-beg tok))
((eqv? kind %html-parsing:start-token-symbol)
(let* ((name (start-tag-name tok))
(cell (assq name parent-constraints)))
(and cell
(finish-begs-upto-but-not-including-names
(cons 'div (cdr cell))))
(if (memq name empty-elements)
(add-thing-as-child-of-current-beg tok)
(set! begs (cons (vector (car tok)
(cdr tok))
begs)))))
((eqv? kind %html-parsing:empty-token-symbol)
(add-thing-as-child-of-current-beg (cdr tok)))
((eqv? kind %html-parsing:end-token-symbol)
(let ((name (end-tag-name tok)))
(if name
(finish-begs-up-to-and-including-name
name)
(and (vector-ref (car begs) 0)
(finish-current-beg)))))
(else (error 'parse-html/tokenizer
"unknown tag kind: ~S"
kind)))
(loop))))))))))
(define (%html-parse:parse-html input normalized? top?)
(let ((parse
(lambda ()
(%html-parse:parse-html/tokenizer
(%html-parse:make-html-tokenizer
(cond ((input-port? input) input)
((string? input) (open-input-string input))
(else (error
'%html-parse:parse-html
"invalid input type: ~S"
input)))
normalized?)
normalized?))))
(if top?
(cons '*TOP* (parse))
(parse))))
(define (html->xexp input)
(%html-parse:parse-html input #f #t))
(provide html->xexp)