;;; @Package HtmlPrag ;;; @Subtitle Pragmatic Parsing and Emitting of HTML using SXML and SHTML ;;; @HomePage http://www.neilvandyke.org/htmlprag/ ;;; @Author Neil W. Van Dyke ;;; @AuthorEmail neil@@neilvandyke.org ;;; @Version 0.16 ;;; @Date 2005-12-18 ;; $Id: htmlprag.scm,v 1.385 2005/12/19 03:28:28 neil Exp $ ;;; @legal ;;; Copyright @copyright{} 2003 - 2005 Neil W. Van Dyke. This program is Free ;;; Software; you can redistribute it and/or modify it under the terms of the ;;; GNU Lesser General Public License as published by the Free Software ;;; Foundation; either version 2.1 of the License, or (at your option) any ;;; later version. This program is distributed in the hope that it will be ;;; useful, but without any warranty; without even the implied warranty of ;;; merchantability or fitness for a particular purpose. See ;;; @indicateurl{http://www.gnu.org/copyleft/lesser.html} for details. For ;;; other license options and consulting, contact the author. ;;; @end legal (define-syntax %htmlprag:testeez (syntax-rules () ((_ x ...) ;; (testeez x ...) (error "Tests disabled.") ))) ;;; @section Introduction ;;; HtmlPrag provides permissive HTML parsing and emitting capability to Scheme ;;; programs. The parser is useful for software agent extraction of ;;; information from Web pages, for programmatically transforming HTML files, ;;; and for implementing interactive Web browsers. HtmlPrag emits ``SHTML,'' ;;; which is an encoding of HTML in ;;; @uref{http://pobox.com/~oleg/ftp/Scheme/SXML.html, SXML}, so that ;;; conventional HTML may be processed with XML tools such as ;;; @uref{http://pair.com/lisovsky/query/sxpath/, SXPath}. Like Oleg ;;; Kiselyov's @uref{http://pobox.com/~oleg/ftp/Scheme/xml.html#HTML-parser, ;;; SSAX-based HTML parser}, HtmlPrag provides a permissive tokenizer, but also ;;; attempts to recover structure. HtmlPrag also includes procedures for ;;; encoding SHTML in HTML syntax. ;;; ;;; The HtmlPrag parsing behavior is permissive in that it accepts erroneous ;;; HTML, handling several classes of HTML syntax errors gracefully, without ;;; yielding a parse error. This is crucial for parsing arbitrary real-world ;;; Web pages, since many pages actually contain syntax errors that would ;;; defeat a strict or validating parser. HtmlPrag's handling of errors is ;;; intended to generally emulate popular Web browsers' interpretation of the ;;; structure of erroneous HTML. We euphemistically term this kind of parse ;;; ``pragmatic.'' ;;; ;;; HtmlPrag also has some support for XHTML, although XML namespace qualifiers ;;; are currently accepted but stripped from the resulting SHTML. Note that ;;; valid XHTML input is of course better handled by a validating XML parser ;;; like Kiselyov's ;;; @uref{http://pobox.com/~oleg/ftp/Scheme/xml.html#XML-parser, SSAX}. ;;; ;;; HtmlPrag requires R5RS, SRFI-6, and SRFI-23. ;; The following bindings are used internally by HtmlPrag for portability, with ;; the intention that packagings of HtmlPrag use more efficient procedures for ;; the particular Scheme implementation. This is waiting on universal support ;; of SRFI-0. ;; @defproc %htmlprag:a2c num ;; ;; Returns the character with ASCII value @var{num}. In most Scheme ;; implementations, this is the same as @code{integer->char}. Two exceptions ;; are Scheme 48 0.57 and Scsh 0.6.3, for which the user must manually edit ;; file @code{htmlprag.scm} to bind this variable to @code{ascii->char}. A ;; future version of HtmlPrag will automatically use @code{ascii->char} where ;; available. (define %htmlprag:a2c integer->char) ;; @defproc %htmlprag:append! a b ;; ;; Returns a concatenation of lists @var{a} and @var{b}, modifying the tail of ;; @var{a} to point to the head of @var{b} if both lists are non-null. A ;; future version should use the more general @code{append!} where available. (define (%htmlprag:append! a b) (cond ((null? a) b) ((null? b) a) (else (let loop ((sub a)) (if (null? (cdr sub)) (begin (set-cdr! sub b) a) (loop (cdr sub))))))) ;; @defproc %htmlprag:reverse!ok lst ;; ;; Returns a reversed list @var{lst}, possibly destructive. A future version ;; will use @code{reverse!} where available, and @code{reverse} elsewhere. (define %htmlprag:reverse!ok reverse) ;; @defproc %htmlprag:down str ;; ;; Returns a string that is equivalent to @var{str} with all characters mapped ;; to lowercase, as if by @code{char-downcase}, without mutating @var{str}. A ;; future version should use the Scheme implementation's native nondestructive ;; procedure where available. ;; @defproc %htmlprag:error proc-str msg obj ;; ;; For Bigloo, this is changed to: ;; ;; @lisp ;; (define %htmlprag:error error) ;; @end lisp ;; TODO: Maybe go back to requiring a SRFI-23 "error". (define-syntax %htmlprag:error (syntax-rules () ((_ p m o) (error (string-append p " : " m) o)) ;; ((_ p m o) (error p m o)))) )) (define (%htmlprag:down s) (list->string (map char-downcase (string->list s)))) ;; @defproc %htmlprag:down!ok str ;; ;; Returns a string that is equivalent to @var{str} with all characters mapped ;; to lowercase, as if by @code{char-downcase}, possibly mutating @var{str}. ;; A future version should use the Scheme implementation's native destructive ;; or nondestructive procedure where available. (define %htmlprag:down!ok %htmlprag:down) ;; @defproc %htmlprag:gosc os ;; ;; One-shot version of the conventional @code{get-output-string}. The result ;; of any subsequent attempt to write to the port or get the output string is ;; undefined. This may or may not free up resources. (define (%htmlprag:gosc os) (let ((str (get-output-string os))) ;; Note: By default, we don't call close-output-port, since at least one ;; tested Scheme implementation barfs on that. ;; ;; (close-output-port os) str)) ;;; @section SHTML and SXML ;;; SHTML is a variant of SXML, with two minor but useful extensions: ;;; ;;; @enumerate ;;; ;;; @item ;;; The SXML keyword symbols, such as @code{*TOP*}, are defined to be in all ;;; uppercase, regardless of the case-sensitivity of the reader of the hosting ;;; Scheme implementation in any context. This avoids several pitfalls. ;;; ;;; @item ;;; Since not all character entity references used in HTML can be converted to ;;; Scheme characters in all R5RS Scheme implementations, nor represented in ;;; conventional text files or other common external text formats to which one ;;; might wish to write SHTML, SHTML adds a special @code{&} syntax for ;;; non-ASCII (or non-Extended-ASCII) characters. The syntax is @code{(& ;;; @var{val})}, where @var{val} is a symbol or string naming with the symbolic ;;; name of the character, or an integer with the numeric value of the ;;; character. ;;; ;;; @end enumerate ;;; @defvar shtml-comment-symbol ;;; @defvarx shtml-decl-symbol ;;; @defvarx shtml-empty-symbol ;;; @defvarx shtml-end-symbol ;;; @defvarx shtml-entity-symbol ;;; @defvarx shtml-pi-symbol ;;; @defvarx shtml-start-symbol ;;; @defvarx shtml-text-symbol ;;; @defvarx shtml-top-symbol ;;; ;;; These variables are bound to the following case-sensitive symbols used in ;;; SHTML, respectively: @code{*COMMENT*}, @code{*DECL*}, @code{*EMPTY*}, ;;; @code{*END*}, @code{*ENTITY*}, @code{*PI*}, @code{*START*}, @code{*TEXT*}, ;;; and @code{*TOP*}. These can be used in lieu of the literal symbols in ;;; programs read by a case-insensitive Scheme reader.@footnote{Scheme ;;; implementators who have not yet made @code{read} case-sensitive by default ;;; are encouraged to do so.} (define shtml-comment-symbol (string->symbol "*COMMENT*")) (define shtml-decl-symbol (string->symbol "*DECL*")) (define shtml-empty-symbol (string->symbol "*EMPTY*")) (define shtml-end-symbol (string->symbol "*END*")) (define shtml-entity-symbol (string->symbol "*ENTITY*")) (define shtml-pi-symbol (string->symbol "*PI*")) (define shtml-start-symbol (string->symbol "*START*")) (define shtml-text-symbol (string->symbol "*TEXT*")) (define shtml-top-symbol (string->symbol "*TOP*")) ;;; @defvar shtml-named-char-id ;;; @defvarx shtml-numeric-char-id ;;; ;;; These variables are bound to the SHTML entity public identifier strings ;;; used in SHTML @code{*ENTITY*} named and numeric character entity ;;; references. (define shtml-named-char-id "shtml-named-char") (define shtml-numeric-char-id "shtml-numeric-char") ;;; @defproc make-shtml-entity val ;;; ;;; Yields an SHTML character entity reference for @var{val}. For example: ;;; ;;; @lisp ;;; (make-shtml-entity "rArr") @result{} (& rArr) ;;; (make-shtml-entity (string->symbol "rArr")) @result{} (& rArr) ;;; (make-shtml-entity 151) @result{} (& 151) ;;; @end lisp (define (make-shtml-entity val) (list '& (cond ((symbol? val) val) ((integer? val) val) ((string? val) (string->symbol val)) (else (%htmlprag:error "make-shtml-entity" "invalid SHTML entity value:" val))))) ;; TODO: ;; ;; (define (shtml-entity? x) ;; (and (shtml-entity-value entity) #t)) ;;; @defproc shtml-entity-value obj ;;; ;;; Yields the value for the SHTML entity @var{obj}, or @code{#f} if @var{obj} ;;; is not a recognized entity. Values of named entities are symbols, and ;;; values of numeric entities are numbers. An error may raised if @var{obj} ;;; is an entity with system ID inconsistent with its public ID. For example: ;;; ;;; @lisp ;;; (define (f s) (shtml-entity-value (cadr (html->shtml s)))) ;;; (f " ") @result{} nbsp ;;; (f "ߐ") @result{} 2000 ;;; @end lisp (define (shtml-entity-value entity) (cond ((not (pair? entity)) #f) ((null? (cdr entity)) #f) ((eqv? (car entity) '&) ;; TODO: Error-check for extraneous list members? (let ((val (cadr entity))) (cond ((symbol? val) val) ((integer? val) val) ((string? val) (string->symbol val)) (else #f)))) ((eqv? (car entity) shtml-entity-symbol) (if (null? (cddr entity)) #f (let ((public-id (list-ref entity 1)) (system-id (list-ref entity 2))) ;; TODO: Error-check for extraneous list members? (cond ((equal? public-id shtml-named-char-id) (string->symbol system-id)) ((equal? public-id shtml-numeric-char-id) (string->number system-id)) (else #f))))) (else #f))) ;;; @section Tokenizing ;;; The tokenizer is used by the higher-level structural parser, but can also ;;; be called directly for debugging purposes or unusual applications. Some of ;;; the list structure of tokens, such as for start tag tokens, is mutated and ;;; incorporated into the SHTML list structure emitted by the parser. ;; TODO: Document the token format. ;;; @defproc make-html-tokenizer in normalized? ;;; ;;; Constructs an HTML tokenizer procedure on input port @var{in}. If boolean ;;; @var{normalized?} is true, then tokens will be in a format conducive to use ;;; with a parser emitting normalized SXML. Each call to the resulting ;;; procedure yields a successive token from the input. When the tokens have ;;; been exhausted, the procedure returns the null list. For example: ;;; ;;; @lisp ;;; (define input (open-input-string "bar")) ;;; (define next (make-html-tokenizer input #f)) ;;; (next) @result{} (a (@@ (href "foo"))) ;;; (next) @result{} "bar" ;;; (next) @result{} (*END* a) ;;; (next) @result{} () ;;; (next) @result{} () ;;; @end lisp (define make-html-tokenizer ;; TODO: Have the tokenizer replace contiguous whitespace within individual ;; text tokens with single space characters (except for when in `pre' and ;; verbatim elements). The parser will introduce new contiguous whitespace ;; (e.g., when text tokens are concatenated, invalid end tags are removed, ;; whitespace is irrelevant between certain elements), but then the parser ;; only has to worry about the first and last character of each string. ;; Perhaps the text tokens should have both leading and trailing whitespace ;; stripped, and contain flags for whether or not leading and trailing ;; whitespace occurred. (letrec ((no-token '()) ;; TODO: Maybe make these three variables options. (verbatim-to-eof-elems '(plaintext)) (verbatim-pair-elems '(script server style xmp)) (ws-chars (list #\space (%htmlprag:a2c 9) (%htmlprag:a2c 10) (%htmlprag:a2c 11) (%htmlprag:a2c 12) (%htmlprag:a2c 13))) (gosc/string-or-false (lambda (os) (let ((s (%htmlprag:gosc os))) (if (string=? s "") #f s)))) (gosc/symbol-or-false (lambda (os) (let ((s (gosc/string-or-false os))) (if s (string->symbol s) #f)))) ) (lambda (in normalized?) ;; TODO: Make a tokenizer option that causes XML namespace qualifiers to ;; be ignored. (letrec ( ;; Port buffer with inexpensive unread of one character and slightly ;; more expensive pushback of second character to unread. The ;; procedures themselves do no consing. The tokenizer currently ;; needs two-symbol lookahead, due to ambiguous "/" while parsing ;; element and attribute names, which could be either empty-tag ;; syntax or XML qualified names. (c #f) (next-c #f) (c-consumed? #t) (read-c (lambda () (if c-consumed? (if next-c (begin (set! c next-c) (set! next-c #f)) (set! c (read-char in))) (set! c-consumed? #t)))) (unread-c (lambda () (if c-consumed? (set! c-consumed? #f) ;; TODO: Procedure name in error message really ;; isn't "make-html-tokenizer"... (%htmlprag:error "make-html-tokenizer" "already unread:" c)))) (push-c (lambda (new-c) (if c-consumed? (begin (set! c new-c) (set! c-consumed? #f)) (if next-c (%htmlprag:error "make-html-tokenizer" "pushback full:" c) (begin (set! next-c c) (set! c new-c) (set! c-consumed? #f)))))) ;; TODO: These procedures are a temporary convenience for ;; enumerating the pertinent character classes, with an eye towards ;; removing redundant tests of character class. These procedures ;; should be eliminated in a future version. (c-eof? (lambda () (eof-object? c))) (c-amp? (lambda () (eqv? c #\&))) (c-apos? (lambda () (eqv? c #\'))) (c-bang? (lambda () (eqv? c #\!))) (c-colon? (lambda () (eqv? c #\:))) (c-quot? (lambda () (eqv? c #\"))) (c-equals? (lambda () (eqv? c #\=))) (c-gt? (lambda () (eqv? c #\>))) (c-lsquare? (lambda () (eqv? c #\[))) (c-lt? (lambda () (eqv? c #\<))) (c-minus? (lambda () (eqv? c #\-))) (c-pound? (lambda () (eqv? c #\#))) (c-ques? (lambda () (eqv? c #\?))) (c-semi? (lambda () (eqv? c #\;))) (c-slash? (lambda () (eqv? c #\/))) (c-splat? (lambda () (eqv? c #\*))) (c-lf? (lambda () (eqv? c #\newline))) (c-angle? (lambda () (memv c '(#\< #\>)))) (c-ws? (lambda () (memv c ws-chars))) (c-alpha? (lambda () (char-alphabetic? c))) (c-digit? (lambda () (char-numeric? c))) (c-alphanum? (lambda () (or (c-alpha?) (c-digit?)))) (c-hexlet? (lambda () (memv c '(#\a #\b #\c #\d #\e #\f #\A #\B #\C #\D #\E #\F)))) (skip-ws (lambda () (read-c) (if (c-ws?) (skip-ws) (unread-c)))) (if-read-chars (lambda (match-chars yes-thunk no-proc) (let loop ((chars match-chars) (match-count 0)) (if (null? chars) (yes-thunk) (begin (read-c) (if (eqv? c (car chars)) (begin (loop (cdr chars) (+ 1 match-count))) (begin (unread-c) (no-proc match-chars match-count)))))))) (write-chars-count (lambda (chars count port) (let loop ((chars chars) (count count)) (or (zero? count) (begin (write-char (car chars) port) (loop (cdr chars) (- count 1))))))) (make-start-token (if normalized? (lambda (name ns attrs) (list name (cons '@ attrs))) (lambda (name ns attrs) (if (null? attrs) (list name) (list name (cons '@ attrs)))))) (make-empty-token (lambda (name ns attrs) (cons shtml-empty-symbol (make-start-token name ns attrs)))) (make-end-token (if normalized? (lambda (name ns attrs) (list shtml-end-symbol name (cons '@ attrs))) (lambda (name ns attrs) (if (null? attrs) (list shtml-end-symbol name) (list shtml-end-symbol name (cons '@ attrs)))))) (make-comment-token (lambda (str) (list shtml-comment-symbol str))) (make-decl-token (lambda (parts) (cons shtml-decl-symbol parts))) (scan-qname ;; TODO: Make sure we don't accept local names that have "*", since ;; this can break SXML tools. Have to validate this afterwards if ;; "verbatim-safe?". Also check for "@" and maybe "@@". Check ;; qname parsing code, especially for verbatim mode. This is ;; important! (lambda (verbatim-safe?) ;; Note: If we accept some invalid local names, we only need two ;; symbols of lookahead to determine the end of a qname. (letrec ((os #f) (ns '()) (vcolons 0) (good-os (lambda () (or os (begin (set! os (open-output-string)) os))))) (let loop () (read-c) (cond ((c-eof?) #f) ((or (c-ws?) (c-splat?)) (if verbatim-safe? (unread-c))) ((or (c-angle?) (c-equals?) (c-quot?) (c-apos?)) (unread-c)) ((c-colon?) (or (null? ns) (set! ns (cons ":" ns))) (if os (begin (set! ns (cons (%htmlprag:gosc os) ns)) (set! os #f))) (loop)) ((c-slash?) (read-c) (cond ((or (c-eof?) (c-ws?) (c-equals?) (c-apos?) (c-quot?) (c-angle?) (c-splat?)) (unread-c) (push-c #\/)) (else (write-char #\/ (good-os)) (write-char c os) (loop)))) (else (write-char c (good-os)) (loop)))) (let ((ns (if (null? ns) #f (apply string-append (%htmlprag:reverse!ok ns)))) (local (if os (%htmlprag:gosc os) #f))) (if verbatim-safe? ;; TODO: Make sure we don't have ambiguous ":" or drop ;; any characters! (cons ns local) ;; Note: We represent "xml:" and "xmlns:" syntax as ;; normal qnames, for lack of something better to do with ;; them when we don't support XML namespaces. ;; ;; TODO: Local names are currently forced to lowercase, ;; since HTML is usually case-insensitive. If XML ;; namespaces are used, we might wish to keep local names ;; case-sensitive. (if local (if ns (if (or (string=? ns "xml") (string=? ns "xmlns")) (string->symbol (string-append ns ":" local)) (cons ns (string->symbol (%htmlprag:down!ok local)))) (string->symbol (%htmlprag:down!ok local))) (if ns (string->symbol (%htmlprag:down!ok ns)) ;; TODO: Ensure in rest of code that returning #f ;; as a name here is OK. #f))))))) (scan-tag (lambda (start?) (skip-ws) (let ((tag-name (scan-qname #f)) (tag-ns #f) (tag-attrs #f) (tag-empty? #f)) ;; Scan element name. (if (pair? tag-name) (begin (set! tag-ns (car tag-name)) (set! tag-name (cdr tag-name)))) ;; TODO: Ensure there's no case in which a #f tag-name isn't ;; compensated for later. ;; ;; Scan element attributes. (set! tag-attrs (let scan-attr-list () (read-c) (cond ((c-eof?) '()) ((c-angle?) (unread-c) '()) ((c-slash?) (set! tag-empty? #t) (scan-attr-list)) ((c-alpha?) (unread-c) (let ((attr (scan-attr))) (cons attr (scan-attr-list)))) (else (scan-attr-list))))) ;; Find ">" or unnatural end. (let loop () (read-c) (cond ((c-eof?) no-token) ((c-slash?) (set! tag-empty? #t) (loop)) ((c-gt?) #f) ((c-ws?) (loop)) (else (unread-c)))) ;; Change the tokenizer mode if necessary. (cond ((not start?) #f) (tag-empty? #f) ;; TODO: Maybe make one alist lookup here, instead of ;; two. ((memq tag-name verbatim-to-eof-elems) (set! nexttok verbeof-nexttok)) ((memq tag-name verbatim-pair-elems) (set! nexttok (make-verbpair-nexttok tag-name)))) ;; Return a token object. (if start? (if tag-empty? (make-empty-token tag-name tag-ns tag-attrs) (make-start-token tag-name tag-ns tag-attrs)) (make-end-token tag-name tag-ns tag-attrs))))) (scan-attr (lambda () (let ((name (scan-qname #f)) (val #f)) (if (pair? name) (set! name (cdr name))) (let loop-equals-or-end () (read-c) (cond ((c-eof?) no-token) ((c-ws?) (loop-equals-or-end)) ((c-equals?) (let loop-quote-or-unquoted () (read-c) (cond ((c-eof?) no-token) ((c-ws?) (loop-quote-or-unquoted)) ((or (c-apos?) (c-quot?)) (let ((term c)) (set! val (open-output-string)) (let loop-quoted-val () (read-c) (cond ((c-eof?) #f) ((eqv? c term) #f) (else (write-char c val) (loop-quoted-val)))))) ((c-angle?) (unread-c)) (else (set! val (open-output-string)) (write-char c val) (let loop-unquoted-val () (read-c) (cond ((c-eof?) no-token) ((c-apos?) #f) ((c-quot?) #f) ((or (c-ws?) (c-angle?) ;;(c-slash?) ) (unread-c)) ;; Note: We can treat a slash in an ;; unquoted attribute value as a ;; value constituent because the ;; slash is specially-handled only ;; for XHTML, and XHTML attribute ;; values must always be quoted. We ;; could do lookahead for "/>", but ;; that wouldn't let us parse HTML ;; "" correctly, so this is ;; an easier and more correct way to ;; do things. (else (write-char c val) (loop-unquoted-val)))))))) (else (unread-c)))) (if normalized? (list name (if val (%htmlprag:gosc val) (symbol->string name))) (if val (list name (%htmlprag:gosc val)) (list name)))))) (scan-comment ;; TODO: Rewrite this to use tail recursion rather than a state ;; variable. (lambda () (let ((os (open-output-string)) (state 'start-minus)) (let loop () (read-c) (cond ((c-eof?) #f) ((c-minus?) (set! state (case state ((start-minus) 'start-minus-minus) ((start-minus-minus body) 'end-minus) ((end-minus) 'end-minus-minus) ((end-minus-minus) (write-char #\- os) state) (else (%htmlprag:error "make-html-tokenizer" "invalid state:" state)))) (loop)) ((and (c-gt?) (eq? state 'end-minus-minus)) #f) (else (case state ((end-minus) (write-char #\- os)) ((end-minus-minus) (display "--" os))) (set! state 'body) (write-char c os) (loop)))) (make-comment-token (%htmlprag:gosc os))))) (scan-possible-cdata (lambda () ;; Read ") (lambda () (%htmlprag:gosc os)) (lambda (chars count) (if (zero? count) (if (eof-object? c) (%htmlprag:gosc os) (begin (write-char c os) (read-c) (loop))) (begin (write-char #\] os) (if (= count 2) (push-c #\])) (loop))))))))) (scan-pi (lambda () (skip-ws) (let ((name (open-output-string)) (val (open-output-string))) (let scan-name () (read-c) (cond ((c-eof?) #f) ((c-ws?) #f) ((c-alpha?) (write-char c name) (scan-name)) (else (unread-c)))) ;; TODO: Do we really want to emit #f for PI name? (set! name (gosc/symbol-or-false name)) (let scan-val () (read-c) (cond ((c-eof?) #f) ;; ((c-amp?) (display (scan-entity) val) ;; (scan-val)) ((c-ques?) (read-c) (cond ((c-eof?) (write-char #\? val)) ((c-gt?) #f) (else (write-char #\? val) (unread-c) (scan-val)))) (else (write-char c val) (scan-val)))) (list shtml-pi-symbol name (%htmlprag:gosc val))))) (scan-decl ;; TODO: Find if SXML includes declaration forms, and if so, use ;; whatever format SXML wants. ;; ;; TODO: Rewrite to eliminate state variables. (letrec ((scan-parts (lambda () (let ((part (open-output-string)) (nonsymbol? #f) (state 'before) (last? #f)) (let loop () (read-c) (cond ((c-eof?) #f) ((c-ws?) (case state ((before) (loop)) ((quoted) (write-char c part) (loop)))) ((and (c-gt?) (not (eq? state 'quoted))) (set! last? #t)) ((and (c-lt?) (not (eq? state 'quoted))) (unread-c)) ((c-quot?) (case state ((before) (set! state 'quoted) (loop)) ((unquoted) (unread-c)) ((quoted) #f))) (else (if (eq? state 'before) (set! state 'unquoted)) (set! nonsymbol? (or nonsymbol? (not (c-alphanum?)))) (write-char c part) (loop)))) (set! part (%htmlprag:gosc part)) (if (string=? part "") '() (cons (if (or (eq? state 'quoted) nonsymbol?) part ;; TODO: Normalize case of things we make ;; into symbols here. (string->symbol part)) (if last? '() (scan-parts)))))))) (lambda () (make-decl-token (scan-parts))))) (scan-entity (lambda () (read-c) (cond ((c-eof?) "&") ((c-alpha?) ;; TODO: Do entity names have a maximum length? (let ((name (open-output-string))) (write-char c name) (let loop () (read-c) (cond ((c-eof?) #f) ((c-alpha?) (write-char c name) (loop)) ((c-semi?) #f) (else (unread-c)))) (set! name (%htmlprag:gosc name)) ;; TODO: Make the entity map an option. (let ((pair (assoc name '(("amp" . "&") ("apos" . "'") ("gt" . ">") ("lt" . "<") ("quot" . "\""))))) (if pair (cdr pair) (make-shtml-entity name))))) ((c-pound?) (let ((num (open-output-string)) (hex? #f)) (read-c) (cond ((c-eof?) #f) ((memv c '(#\x #\X)) (set! hex? #t) (read-c))) (let loop () (cond ((c-eof?) #f) ((c-semi?) #f) ((or (c-digit?) (and hex? (c-hexlet?))) (write-char c num) (read-c) (loop)) (else (unread-c)))) (set! num (%htmlprag:gosc num)) (if (string=? num "") "&#;" (let ((n (string->number num (if hex? 16 10)))) (if (<= 32 n 126) ;; (and (<= 32 n 255) (not (= n 127))) (string (%htmlprag:a2c n)) (make-shtml-entity n)))))) (else (unread-c) "&")))) (normal-nexttok (lambda () (read-c) (cond ((c-eof?) no-token) ((c-lt?) (let loop () (read-c) (cond ((c-eof?) "<") ((c-ws?) (loop)) ((c-slash?) (scan-tag #f)) ((c-ques?) (scan-pi)) ((c-alpha?) (unread-c) (scan-tag #t)) ((c-bang?) (read-c) (if (c-lsquare?) (scan-possible-cdata) (let loop () (cond ((c-eof?) no-token) ((c-ws?) (read-c) (loop)) ((c-minus?) (scan-comment)) (else (unread-c) (scan-decl)))))) (else (unread-c) "<")))) ((c-gt?) ">") (else (let ((os (open-output-string))) (let loop () (cond ((c-eof?) #f) ((c-angle?) (unread-c)) ((c-amp?) (let ((entity (scan-entity))) (if (string? entity) (begin (display entity os) (read-c) (loop)) (let ((saved-nexttok nexttok)) (set! nexttok (lambda () (set! nexttok saved-nexttok) entity)))))) (else (write-char c os) (or (c-lf?) (begin (read-c) (loop)))))) (let ((text (%htmlprag:gosc os))) (if (equal? text "") (nexttok) text))))))) (verbeof-nexttok (lambda () (read-c) (if (c-eof?) no-token (let ((os (open-output-string))) (let loop () (or (c-eof?) (begin (write-char c os) (or (c-lf?) (begin (read-c) (loop)))))) (%htmlprag:gosc os))))) (make-verbpair-nexttok (lambda (elem-name) (lambda () (let ((os (open-output-string))) ;; Accumulate up to a newline-terminated line. (let loop () (read-c) (cond ((c-eof?) ;; Got EOF in verbatim context, so set the normal ;; nextok procedure, then fall out of loop. (set! nexttok normal-nexttok)) ((c-lt?) ;; Got "<" in verbatim context, so get next ;; character. (read-c) (cond ((c-eof?) ;; Got "<" then EOF, so set to the normal ;; nexttok procedure, add the "<" to the ;; verbatim string, and fall out of loop. (set! nexttok normal-nexttok) (write-char #\< os)) ((c-slash?) ;; Got "symbol (%htmlprag:down local)) elem-name)) ;; This is the terminator tag, so ;; scan to the end of it, set the ;; nexttok, and fall out of the loop. (begin (let scan-to-end () (read-c) (cond ((c-eof?) #f) ((c-gt?) #f) ((c-lt?) (unread-c)) ((c-alpha?) (unread-c) ;; Note: This is an ;; expensive way to skip ;; over an attribute, but ;; in practice more ;; verbatim end tags will ;; not have attributes. (scan-attr) (scan-to-end)) (else (scan-to-end)))) (set! nexttok (lambda () (set! nexttok normal-nexttok) (make-end-token elem-name #f '())))) ;; This isn't the terminator tag, so ;; add to the verbatim string the ;; "bar") #f) ;;; @result{} ((a (@@ (href "foo"))) "bar" (*END* a)) ;;; @end lisp (define (tokenize-html in normalized?) (let ((next-tok (make-html-tokenizer in normalized?))) (let loop ((tok (next-tok))) (if (null? tok) '() (cons tok (loop (next-tok))))))) ;;; @defproc shtml-token-kind token ;;; ;;; Returns a symbol indicating the kind of tokenizer @var{token}: ;;; @code{*COMMENT*}, @code{*DECL*}, @code{*EMPTY*}, @code{*END*}, ;;; @code{*ENTITY*}, @code{*PI*}, @code{*START*}, @code{*TEXT*}. ;;; This is used by higher-level parsing code. For example: ;;; ;;; @lisp ;;; (map shtml-token-kind ;;; (tokenize-html (open-input-string ">shtml} rather than calling the tokenizer directly. ;; @defvar %htmlprag:empty-elements ;; ;; List of names of HTML element types that have no content, represented as a ;; list of symbols. This is used internally by the parser and encoder. The ;; effect of mutating this list is undefined. ;; TODO: Document exactly which elements these are, after we make the new ;; parameterized parser constructor. (define %htmlprag:empty-elements '(& area base br frame hr img input isindex keygen link meta object param spacer wbr)) ;;; @defproc parse-html/tokenizer tokenizer normalized? ;;; ;;; Emits a parse tree like @code{html->shtml} and related procedures, except ;;; using @var{tokenizer} as a source of tokens, rather than tokenizing from an ;;; input port. This procedure is used internally, and generally should not be ;;; called directly. (define parse-html/tokenizer ;; TODO: Document the algorithm, then see if rewriting as idiomatic Scheme ;; can make it more clear. (letrec ((empty-elements ;; TODO: Maybe make this an option. This might also be an ;; acceptable way to parse old HTML that uses the `p' element as a ;; paragraph terminator. %htmlprag:empty-elements) (parent-constraints ;; TODO: Maybe make this an option. '((area . (map)) (body . (html)) (caption . (table)) (colgroup . (table)) (dd . (dl)) (dt . (dl)) (frame . (frameset)) (head . (html)) (isindex . (head)) (li . (dir menu ol ul)) (meta . (head)) (noframes . (frameset)) (option . (select)) (p . (body td th)) (param . (applet)) (tbody . (table)) (td . (tr)) (th . (tr)) (thead . (table)) (title . (head)) (tr . (table tbody thead)))) (start-tag-name (lambda (tag-token) (car tag-token))) (end-tag-name (lambda (tag-token) (list-ref tag-token 1)))) (lambda (tokenizer normalized?) ;; Example `begs' value: ;; ;; ( ((head ...) . ( (title ...) )) ;; ((html ...) . ( (head ...) (*COMMENT* ...) )) ;; (#f . ( (html ...) (*DECL* doctype ...) )) ) (let ((begs (list (cons #f '())))) (letrec ((add-to-current-beg (lambda (tok) (set-cdr! (car begs) (cons tok (cdr (car begs)))))) (finish-all-begs (lambda () (let ((toplist #f)) (map (lambda (beg) (set! toplist (finish-beg beg))) begs) toplist))) (finish-beg (lambda (beg) (let ((start-tok (car beg))) (if start-tok (%htmlprag:append! (car beg) (%htmlprag:reverse!ok (cdr beg))) (%htmlprag:reverse!ok (cdr beg)))))) (finish-begs-to (lambda (name lst) (let* ((top (car lst)) (starttag (car top))) (cond ((not starttag) #f) ((eqv? name (start-tag-name starttag)) (set! begs (cdr lst)) (finish-beg top) #t) (else (if (finish-begs-to name (cdr lst)) (begin (finish-beg top) #t) #f)))))) (finish-begs-upto (lambda (parents lst) (let* ((top (car lst)) (starttag (car top))) (cond ((not starttag) #f) ((memq (start-tag-name starttag) parents) (set! begs lst) #t) (else (if (finish-begs-upto parents (cdr lst)) (begin (finish-beg top) #t) #f))))))) (let loop () (let ((tok (tokenizer))) (if (null? tok) (finish-all-begs) (let ((kind (shtml-token-kind tok))) (cond ((memv kind `(,shtml-comment-symbol ,shtml-decl-symbol ,shtml-entity-symbol ,shtml-pi-symbol ,shtml-text-symbol)) (add-to-current-beg tok)) ((eqv? kind shtml-start-symbol) (let* ((name (start-tag-name tok)) (cell (assq name parent-constraints))) (and cell (finish-begs-upto (cons 'div (cdr cell)) begs)) (add-to-current-beg tok) (or (memq name empty-elements) (set! begs (cons (cons tok '()) begs))))) ((eqv? kind shtml-empty-symbol) ;; Empty tag token, so just add it to current ;; beginning while stripping off leading `*EMPTY*' ;; symbol so that the token becomes normal SXML ;; element syntax. (add-to-current-beg (cdr tok))) ((eqv? kind shtml-end-symbol) (let ((name (end-tag-name tok))) (if name ;; Try to finish to a start tag matching this ;; end tag. If none, just drop the token, ;; though we used to add it to the current ;; beginning. (finish-begs-to name begs) ;; We have an anonymous end tag, so match it ;; with the most recent beginning. If no ;; beginning to match, then just drop the ;; token, though we used to add it to the ;; current beginning. (and (car (car begs)) (begin (finish-beg (car begs)) (set! begs (cdr begs))))))) (else (%htmlprag:error "parse-html/tokenizer" "unknown tag kind:" kind))) (loop)))))))))) ;; @defproc %htmlprag:parse-html input normalized? top? ;; ;; This procedure is now used internally by @code{html->shtml} and its ;; variants, and should not be used directly by programs. The interface is ;; likely to change in future versions of HtmlPrag. (define (%htmlprag:parse-html input normalized? top?) (let ((parse (lambda () (parse-html/tokenizer (make-html-tokenizer (cond ((input-port? input) input) ((string? input) (open-input-string input)) (else (%htmlprag:error "%htmlprag:parse-html" "invalid input type:" input))) normalized?) normalized?)))) (if top? (cons shtml-top-symbol (parse)) (parse)))) ;;; @defproc html->sxml-0nf input ;;; @defprocx html->sxml-1nf input ;;; @defprocx html->sxml-2nf input ;;; @defprocx html->sxml input ;;; @defprocx html->shtml input ;;; ;;; Permissively parse HTML from @var{input}, which is either an input port or ;;; a string, and emit an SHTML equivalent or approximation. To borrow and ;;; slightly modify an example from Kiselyov's discussion of his HTML parser: ;;; ;;; @lisp ;;; (html->shtml ;;; "whatever ;;; link