#lang scheme/base ;;; @Package HtmlPrag ;;; @Subtitle Pragmatic Parsing and Emitting of HTML using SXML and SHTML ;;; @HomePage http://www.neilvandyke.org/htmlprag/ ;;; @Author Neil Van Dyke ;;; @Version 0.17 ;;; @Date 2009-08-16 ;;; @PLaneT neil/htmlprag:1:4 ;; $Id: htmlprag.ss,v 1.408 2009/08/16 15:16:19 neilpair Exp $ ;;; @legal ;;; Copyright @copyright{} 2003 -- 2009 Neil Van Dyke. This program is ;;; Software; you can redistribute it and/or modify it under the terms of the ;;; GNU Lesser General Public License as published by the Free Software ;;; Foundation; either version 3 of the License (LGPL 3), or (at your option) ;;; any later version. This program is distributed in the hope that it will be ;;; useful, but without any warranty; without even the implied warranty of ;;; merchantability or fitness for a particular purpose. See ;;; @indicateurl{http://www.gnu.org/licenses/} for details. For other licenses ;;; and consulting, please contact the author. ;;; @end legal ;;; @section Introduction ;;; HtmlPrag provides permissive HTML parsing and emitting capability to Scheme ;;; programs. The parser is useful for software agent extraction of ;;; information from Web pages, for programmatically transforming HTML files, ;;; and for implementing interactive Web browsers. HtmlPrag emits ``SHTML,'' ;;; which is an encoding of HTML in ;;; @uref{http://pobox.com/~oleg/ftp/Scheme/SXML.html, SXML}, so that ;;; conventional HTML may be processed with XML tools such as ;;; @uref{http://pair.com/lisovsky/query/sxpath/, SXPath}. Like Oleg ;;; Kiselyov's @uref{http://pobox.com/~oleg/ftp/Scheme/xml.html#HTML-parser, ;;; SSAX-based HTML parser}, HtmlPrag provides a permissive tokenizer, but also ;;; attempts to recover structure. HtmlPrag also includes procedures for ;;; encoding SHTML in HTML syntax. ;;; ;;; The HtmlPrag parsing behavior is permissive in that it accepts erroneous ;;; HTML, handling several classes of HTML syntax errors gracefully, without ;;; yielding a parse error. This is crucial for parsing arbitrary real-world ;;; Web pages, since many pages actually contain syntax errors that would ;;; defeat a strict or validating parser. HtmlPrag's handling of errors is ;;; intended to generally emulate popular Web browsers' interpretation of the ;;; structure of erroneous HTML. We euphemistically term this kind of parse ;;; ``pragmatic.'' ;;; ;;; HtmlPrag also has some support for XHTML, although XML namespace qualifiers ;;; are currently accepted but stripped from the resulting SHTML. Note that ;;; valid XHTML input is of course better handled by a strict XML parser. ;;; ;;; HtmlPrag requires R5RS, SRFI-6, and SRFI-23. This version of HtmlPrag is ;;; specific to PLT Scheme, due to a transition period in how portability is ;;; handled, but the exceedingly portable version 0.16 is available at: ;;; @uref{http://www.neilvandyke.org/htmlprag/htmlprag-0-16.scm} (define (%gosc os) (begin0 (get-output-string os) (close-output-port os))) ;;; @section SHTML and SXML ;;; SHTML is a variant of SXML, with two minor but useful extensions: ;;; ;;; @itemize ;;; ;;; @item ;;; The SXML keyword symbols, such as @code{*TOP*}, are defined to be in all ;;; uppercase, regardless of the case-sensitivity of the reader of the hosting ;;; Scheme implementation in any context. This avoids several pitfalls. ;;; ;;; @item ;;; Since not all character entity references used in HTML can be converted to ;;; Scheme characters in all R5RS Scheme implementations, nor represented in ;;; conventional text files or other common external text formats to which one ;;; might wish to write SHTML, SHTML adds a special @code{&} syntax for ;;; non-ASCII (or non-Extended-ASCII) characters. The syntax is @code{(& ;;; @var{val})}, where @var{val} is a symbol or string naming with the symbolic ;;; name of the character, or an integer with the numeric value of the ;;; character. ;;; ;;; @end itemize ;;; @defvar shtml-comment-symbol ;;; @defvarx shtml-decl-symbol ;;; @defvarx shtml-empty-symbol ;;; @defvarx shtml-end-symbol ;;; @defvarx shtml-entity-symbol ;;; @defvarx shtml-pi-symbol ;;; @defvarx shtml-start-symbol ;;; @defvarx shtml-text-symbol ;;; @defvarx shtml-top-symbol ;;; ;;; These variables are bound to the following case-sensitive symbols used in ;;; SHTML, respectively: @code{*COMMENT*}, @code{*DECL*}, @code{*EMPTY*}, ;;; @code{*END*}, @code{*ENTITY*}, @code{*PI*}, @code{*START*}, @code{*TEXT*}, ;;; and @code{*TOP*}. These can be used in lieu of the literal symbols in ;;; programs read by a case-insensitive Scheme reader. (define shtml-comment-symbol (string->symbol "*COMMENT*")) (define shtml-decl-symbol (string->symbol "*DECL*")) (define shtml-empty-symbol (string->symbol "*EMPTY*")) (define shtml-end-symbol (string->symbol "*END*")) (define shtml-entity-symbol (string->symbol "*ENTITY*")) (define shtml-pi-symbol (string->symbol "*PI*")) (define shtml-start-symbol (string->symbol "*START*")) (define shtml-text-symbol (string->symbol "*TEXT*")) (define shtml-top-symbol (string->symbol "*TOP*")) ;;; @defvar shtml-named-char-id ;;; @defvarx shtml-numeric-char-id ;;; ;;; These variables are bound to the SHTML entity public identifier strings ;;; used in SHTML @code{*ENTITY*} named and numeric character entity ;;; references. (define shtml-named-char-id "shtml-named-char") (define shtml-numeric-char-id "shtml-numeric-char") ;;; @defproc make-shtml-entity val ;;; ;;; Yields an SHTML character entity reference for @var{val}. For example: ;;; ;;; @lisp ;;; (make-shtml-entity "rArr") @result{} (& rArr) ;;; (make-shtml-entity (string->symbol "rArr")) @result{} (& rArr) ;;; (make-shtml-entity 151) @result{} (& 151) ;;; @end lisp (define (make-shtml-entity val) (list '& (cond ((symbol? val) val) ((integer? val) val) ((string? val) (string->symbol val)) (else (error 'make-shtml-entity "invalid SHTML entity value: ~S" val))))) ;; TODO: ;; ;; (define (shtml-entity? x) ;; (and (shtml-entity-value entity) #t)) ;;; @defproc shtml-entity-value obj ;;; ;;; Yields the value for the SHTML entity @var{obj}, or @code{#f} if @var{obj} ;;; is not a recognized entity. Values of named entities are symbols, and ;;; values of numeric entities are numbers. An error may raised if @var{obj} ;;; is an entity with system ID inconsistent with its public ID. For example: ;;; ;;; @lisp ;;; (define (f s) (shtml-entity-value (cadr (html->shtml s)))) ;;; (f " ") @result{} nbsp ;;; (f "ߐ") @result{} 2000 ;;; @end lisp (define (shtml-entity-value entity) (cond ((not (pair? entity)) #f) ((null? (cdr entity)) #f) ((eqv? (car entity) '&) ;; TODO: Error-check for extraneous list members? (let ((val (cadr entity))) (cond ((symbol? val) val) ((integer? val) val) ((string? val) (string->symbol val)) (else #f)))) ((eqv? (car entity) shtml-entity-symbol) (if (null? (cddr entity)) #f (let ((public-id (list-ref entity 1)) (system-id (list-ref entity 2))) ;; TODO: Error-check for extraneous list members? (cond ((equal? public-id shtml-named-char-id) (string->symbol system-id)) ((equal? public-id shtml-numeric-char-id) (string->number system-id)) (else #f))))) (else #f))) ;;; @section Tokenizing ;;; The tokenizer is used by the higher-level structural parser, but can also ;;; be called directly for debugging purposes or unusual applications. Some of ;;; the list structure of tokens, such as for start tag tokens, is mutated and ;;; incorporated into the SHTML list structure emitted by the parser. ;;; @defproc make-html-tokenizer in normalized? ;;; ;;; Constructs an HTML tokenizer procedure on input port @var{in}. If boolean ;;; @var{normalized?} is true, then tokens will be in a format conducive to use ;;; with a parser emitting normalized SXML. Each call to the resulting ;;; procedure yields a successive token from the input. When the tokens have ;;; been exhausted, the procedure returns the null list. For example: ;;; ;;; @lisp ;;; (define input (open-input-string "<a href=\"foo\">bar</a>")) ;;; (define next (make-html-tokenizer input #f)) ;;; (next) @result{} (a (@@ (href "foo"))) ;;; (next) @result{} "bar" ;;; (next) @result{} (*END* a) ;;; (next) @result{} () ;;; (next) @result{} () ;;; @end lisp (define make-html-tokenizer ;; TODO: Have the tokenizer replace contiguous whitespace within individual ;; text tokens with single space characters (except for when in `pre' and ;; verbatim elements). The parser will introduce new contiguous whitespace ;; (e.g., when text tokens are concatenated, invalid end tags are removed, ;; whitespace is irrelevant between certain elements), but then the parser ;; only has to worry about the first and last character of each string. ;; Perhaps the text tokens should have both leading and trailing whitespace ;; stripped, and contain flags for whether or not leading and trailing ;; whitespace occurred. (letrec ((no-token '()) ;; TODO: Maybe make these three variables options. (verbatim-to-eof-elems '(plaintext)) (verbatim-pair-elems '(script server style xmp)) (ws-chars (list #\space (integer->char 9) (integer->char 10) (integer->char 11) (integer->char 12) (integer->char 13))) (gosc/string-or-false (lambda (os) (let ((s (%gosc os))) (if (string=? s "") #f s)))) (gosc/symbol-or-false (lambda (os) (let ((s (gosc/string-or-false os))) (if s (string->symbol s) #f)))) ) (lambda (in normalized?) ;; TODO: Make a tokenizer option that causes XML namespace qualifiers to ;; be ignored. (letrec ( ;; Port buffer with inexpensive unread of one character and slightly ;; more expensive pushback of second character to unread. The ;; procedures themselves do no consing. The tokenizer currently ;; needs two-symbol lookahead, due to ambiguous "/" while parsing ;; element and attribute names, which could be either empty-tag ;; syntax or XML qualified names. (c #f) (next-c #f) (c-consumed? #t) (read-c (lambda () (if c-consumed? (if next-c (begin (set! c next-c) (set! next-c #f)) (set! c (read-char in))) (set! c-consumed? #t)))) (unread-c (lambda () (if c-consumed? (set! c-consumed? #f) ;; TODO: Procedure name in error message really ;; isn't "make-html-tokenizer"... (error 'make-html-tokenizer "already unread: ~S" c)))) (push-c (lambda (new-c) (if c-consumed? (begin (set! c new-c) (set! c-consumed? #f)) (if next-c (error 'make-html-tokenizer "pushback full: ~S" c) (begin (set! next-c c) (set! c new-c) (set! c-consumed? #f)))))) ;; TODO: These procedures are a temporary convenience for ;; enumerating the pertinent character classes, with an eye towards ;; removing redundant tests of character class. These procedures ;; should be eliminated in a future version. (c-eof? (lambda () (eof-object? c))) (c-amp? (lambda () (eqv? c #\&))) (c-apos? (lambda () (eqv? c #\'))) (c-bang? (lambda () (eqv? c #\!))) (c-colon? (lambda () (eqv? c #\:))) (c-quot? (lambda () (eqv? c #\"))) (c-equals? (lambda () (eqv? c #\=))) (c-gt? (lambda () (eqv? c #\>))) (c-lsquare? (lambda () (eqv? c #\[))) (c-lt? (lambda () (eqv? c #\<))) (c-minus? (lambda () (eqv? c #\-))) (c-pound? (lambda () (eqv? c #\#))) (c-ques? (lambda () (eqv? c #\?))) (c-semi? (lambda () (eqv? c #\;))) (c-slash? (lambda () (eqv? c #\/))) (c-splat? (lambda () (eqv? c #\*))) (c-lf? (lambda () (eqv? c #\newline))) (c-angle? (lambda () (memv c '(#\< #\>)))) (c-ws? (lambda () (memv c ws-chars))) (c-alpha? (lambda () (char-alphabetic? c))) (c-digit? (lambda () (char-numeric? c))) (c-alphanum? (lambda () (or (c-alpha?) (c-digit?)))) (c-hexlet? (lambda () (memv c '(#\a #\b #\c #\d #\e #\f #\A #\B #\C #\D #\E #\F)))) (skip-ws (lambda () (read-c) (if (c-ws?) (skip-ws) (unread-c)))) (if-read-chars (lambda (match-chars yes-thunk no-proc) (let loop ((chars match-chars) (match-count 0)) (if (null? chars) (yes-thunk) (begin (read-c) (if (eqv? c (car chars)) (begin (loop (cdr chars) (+ 1 match-count))) (begin (unread-c) (no-proc match-chars match-count)))))))) (write-chars-count (lambda (chars count port) (let loop ((chars chars) (count count)) (or (zero? count) (begin (write-char (car chars) port) (loop (cdr chars) (- count 1))))))) (make-start-token (if normalized? (lambda (name ns attrs) (list name (cons '@ attrs))) (lambda (name ns attrs) (if (null? attrs) (list name) (list name (cons '@ attrs)))))) (make-empty-token (lambda (name ns attrs) (cons shtml-empty-symbol (make-start-token name ns attrs)))) (make-end-token (if normalized? (lambda (name ns attrs) (list shtml-end-symbol name (cons '@ attrs))) (lambda (name ns attrs) (if (null? attrs) (list shtml-end-symbol name) (list shtml-end-symbol name (cons '@ attrs)))))) (make-comment-token (lambda (str) (list shtml-comment-symbol str))) (make-decl-token (lambda (parts) (cons shtml-decl-symbol parts))) (scan-qname ;; TODO: Make sure we don't accept local names that have "*", since ;; this can break SXML tools. Have to validate this afterwards if ;; "verbatim-safe?". Also check for "@" and maybe "@@". Check ;; qname parsing code, especially for verbatim mode. This is ;; important! (lambda (verbatim-safe?) ;; Note: If we accept some invalid local names, we only need two ;; symbols of lookahead to determine the end of a qname. (letrec ((os #f) (ns '()) (vcolons 0) (good-os (lambda () (or os (begin (set! os (open-output-string)) os))))) (let loop () (read-c) (cond ((c-eof?) #f) ((or (c-ws?) (c-splat?)) (if verbatim-safe? (unread-c) #f)) ((or (c-angle?) (c-equals?) (c-quot?) (c-apos?)) (unread-c)) ((c-colon?) (or (null? ns) (set! ns (cons ":" ns))) (if os (begin (set! ns (cons (%gosc os) ns)) (set! os #f)) #f) (loop)) ((c-slash?) (read-c) (cond ((or (c-eof?) (c-ws?) (c-equals?) (c-apos?) (c-quot?) (c-angle?) (c-splat?)) (unread-c) (push-c #\/)) (else (write-char #\/ (good-os)) (write-char c os) (loop)))) (else (write-char c (good-os)) (loop)))) (let ((ns (if (null? ns) #f (apply string-append (reverse ns)))) (local (if os (%gosc os) #f))) (if verbatim-safe? ;; TODO: Make sure we don't have ambiguous ":" or drop ;; any characters! (cons ns local) ;; Note: We represent "xml:" and "xmlns:" syntax as ;; normal qnames, for lack of something better to do with ;; them when we don't support XML namespaces. ;; ;; TODO: Local names are currently forced to lowercase, ;; since HTML is usually case-insensitive. If XML ;; namespaces are used, we might wish to keep local names ;; case-sensitive. (if local (if ns (if (or (string=? ns "xml") (string=? ns "xmlns")) (string->symbol (string-append ns ":" local)) (cons ns (string->symbol (string-downcase local)))) (string->symbol (string-downcase local))) (if ns (string->symbol (string-downcase ns)) ;; TODO: Ensure in rest of code that returning #f ;; as a name here is OK. #f))))))) (scan-tag (lambda (start?) (skip-ws) (let ((tag-name (scan-qname #f)) (tag-ns #f) (tag-attrs #f) (tag-empty? #f)) ;; Scan element name. (if (pair? tag-name) (begin (set! tag-ns (car tag-name)) (set! tag-name (cdr tag-name))) #f) ;; TODO: Ensure there's no case in which a #f tag-name isn't ;; compensated for later. ;; ;; Scan element attributes. (set! tag-attrs (let scan-attr-list () (read-c) (cond ((c-eof?) '()) ((c-angle?) (unread-c) '()) ((c-slash?) (set! tag-empty? #t) (scan-attr-list)) ((c-alpha?) (unread-c) (let ((attr (scan-attr))) (cons attr (scan-attr-list)))) (else (scan-attr-list))))) ;; Find ">" or unnatural end. (let loop () (read-c) (cond ((c-eof?) no-token) ((c-slash?) (set! tag-empty? #t) (loop)) ((c-gt?) #f) ((c-ws?) (loop)) (else (unread-c)))) ;; Change the tokenizer mode if necessary. (cond ((not start?) #f) (tag-empty? #f) ;; TODO: Maybe make one alist lookup here, instead of ;; two. ((memq tag-name verbatim-to-eof-elems) (set! nexttok verbeof-nexttok)) ((memq tag-name verbatim-pair-elems) (set! nexttok (make-verbpair-nexttok tag-name)))) ;; Return a token object. (if start? (if tag-empty? (make-empty-token tag-name tag-ns tag-attrs) (make-start-token tag-name tag-ns tag-attrs)) (make-end-token tag-name tag-ns tag-attrs))))) (scan-attr (lambda () (let ((name (scan-qname #f)) (val #f)) (if (pair? name) (set! name (cdr name)) #f) (let loop-equals-or-end () (read-c) (cond ((c-eof?) no-token) ((c-ws?) (loop-equals-or-end)) ((c-equals?) (let loop-quote-or-unquoted () (read-c) (cond ((c-eof?) no-token) ((c-ws?) (loop-quote-or-unquoted)) ((or (c-apos?) (c-quot?)) (let ((term c)) (set! val (open-output-string)) (let loop-quoted-val () (read-c) (cond ((c-eof?) #f) ((eqv? c term) #f) (else (write-char c val) (loop-quoted-val)))))) ((c-angle?) (unread-c)) (else (set! val (open-output-string)) (write-char c val) (let loop-unquoted-val () (read-c) (cond ((c-eof?) no-token) ((c-apos?) #f) ((c-quot?) #f) ((or (c-ws?) (c-angle?) ;;(c-slash?) ) (unread-c)) ;; Note: We can treat a slash in an ;; unquoted attribute value as a ;; value constituent because the ;; slash is specially-handled only ;; for XHTML, and XHTML attribute ;; values must always be quoted. We ;; could do lookahead for "/>", but ;; that wouldn't let us parse HTML ;; "<a href=/>" correctly, so this is ;; an easier and more correct way to ;; do things. (else (write-char c val) (loop-unquoted-val)))))))) (else (unread-c)))) (if normalized? (list name (if val (%gosc val) (symbol->string name))) (if val (list name (%gosc val)) (list name)))))) (scan-comment ;; TODO: Rewrite this to use tail recursion rather than a state ;; variable. (lambda () (let ((os (open-output-string)) (state 'start-minus)) (let loop () (read-c) (cond ((c-eof?) #f) ((c-minus?) (set! state (case state ((start-minus) 'start-minus-minus) ((start-minus-minus body) 'end-minus) ((end-minus) 'end-minus-minus) ((end-minus-minus) (write-char #\- os) state) (else (error '<make-html-tokenizer> "invalid state: ~S" state)))) (loop)) ((and (c-gt?) (eq? state 'end-minus-minus)) #f) (else (case state ((end-minus) (write-char #\- os)) ((end-minus-minus) (display "--" os))) (set! state 'body) (write-char c os) (loop)))) (make-comment-token (%gosc os))))) (scan-possible-cdata (lambda () ;; Read "<!" and current character is "[", so try to read the ;; rest of the CDATA start delimeter. (if-read-chars '(#\C #\D #\A #\T #\A #\[) (lambda () ;; Successfully read CDATA section start delimiter, so read ;; the section. (scan-cdata)) (lambda (chars count) ;; Did not read rest of CDATA section start delimiter, so ;; return a string for what we did read. (let ((os (open-output-string))) (display "<![" os) (write-chars-count chars count os) (%gosc os)))))) (scan-cdata (lambda () (let ((os (open-output-string))) (let loop () (if-read-chars '(#\] #\] #\>) (lambda () (%gosc os)) (lambda (chars count) (if (zero? count) (if (eof-object? c) (%gosc os) (begin (write-char c os) (read-c) (loop))) (begin (write-char #\] os) (if (= count 2) (push-c #\]) #f) (loop))))))))) (scan-pi (lambda () (skip-ws) (let ((name (open-output-string)) (val (open-output-string))) (let scan-name () (read-c) (cond ((c-eof?) #f) ((c-ws?) #f) ((c-alpha?) (write-char c name) (scan-name)) (else (unread-c)))) ;; TODO: Do we really want to emit #f for PI name? (set! name (gosc/symbol-or-false name)) (let scan-val () (read-c) (cond ((c-eof?) #f) ;; ((c-amp?) (display (scan-entity) val) ;; (scan-val)) ((c-ques?) (read-c) (cond ((c-eof?) (write-char #\? val)) ((c-gt?) #f) (else (write-char #\? val) (unread-c) (scan-val)))) (else (write-char c val) (scan-val)))) (list shtml-pi-symbol name (%gosc val))))) (scan-decl ;; TODO: Find if SXML includes declaration forms, and if so, use ;; whatever format SXML wants. ;; ;; TODO: Rewrite to eliminate state variables. (letrec ((scan-parts (lambda () (let ((part (open-output-string)) (nonsymbol? #f) (state 'before) (last? #f)) (let loop () (read-c) (cond ((c-eof?) #f) ((c-ws?) (case state ((before) (loop)) ((quoted) (write-char c part) (loop)))) ((and (c-gt?) (not (eq? state 'quoted))) (set! last? #t)) ((and (c-lt?) (not (eq? state 'quoted))) (unread-c)) ((c-quot?) (case state ((before) (set! state 'quoted) (loop)) ((unquoted) (unread-c)) ((quoted) #f))) (else (if (eq? state 'before) (set! state 'unquoted) #f) (set! nonsymbol? (or nonsymbol? (not (c-alphanum?)))) (write-char c part) (loop)))) (set! part (%gosc part)) (if (string=? part "") '() (cons (if (or (eq? state 'quoted) nonsymbol?) part ;; TODO: Normalize case of things we make ;; into symbols here. (string->symbol part)) (if last? '() (scan-parts)))))))) (lambda () (make-decl-token (scan-parts))))) (scan-entity (lambda () (read-c) (cond ((c-eof?) "&") ((c-alpha?) ;; TODO: Do entity names have a maximum length? (let ((name (open-output-string))) (write-char c name) (let loop () (read-c) (cond ((c-eof?) #f) ((c-alpha?) (write-char c name) (loop)) ((c-semi?) #f) (else (unread-c)))) (set! name (%gosc name)) ;; TODO: Make the entity map an option. (let ((pair (assoc name '(("amp" . "&") ("apos" . "'") ("gt" . ">") ("lt" . "<") ("quot" . "\""))))) (if pair (cdr pair) (make-shtml-entity name))))) ((c-pound?) (let ((num (open-output-string)) (hex? #f)) (read-c) (cond ((c-eof?) #f) ((memv c '(#\x #\X)) (set! hex? #t) (read-c))) (let loop () (cond ((c-eof?) #f) ((c-semi?) #f) ((or (c-digit?) (and hex? (c-hexlet?))) (write-char c num) (read-c) (loop)) (else (unread-c)))) (set! num (%gosc num)) (if (string=? num "") "&#;" (let ((n (string->number num (if hex? 16 10)))) (if (<= 32 n 126) ;; (and (<= 32 n 255) (not (= n 127))) (string (integer->char n)) (make-shtml-entity n)))))) (else (unread-c) "&")))) (normal-nexttok (lambda () (read-c) (cond ((c-eof?) no-token) ((c-lt?) (let loop () (read-c) (cond ((c-eof?) "<") ((c-ws?) (loop)) ((c-slash?) (scan-tag #f)) ((c-ques?) (scan-pi)) ((c-alpha?) (unread-c) (scan-tag #t)) ((c-bang?) (read-c) (if (c-lsquare?) (scan-possible-cdata) (let loop () (cond ((c-eof?) no-token) ((c-ws?) (read-c) (loop)) ((c-minus?) (scan-comment)) (else (unread-c) (scan-decl)))))) (else (unread-c) "<")))) ((c-gt?) ">") (else (let ((os (open-output-string))) (let loop () (cond ((c-eof?) #f) ((c-angle?) (unread-c)) ((c-amp?) (let ((entity (scan-entity))) (if (string? entity) (begin (display entity os) (read-c) (loop)) (let ((saved-nexttok nexttok)) (set! nexttok (lambda () (set! nexttok saved-nexttok) entity)))))) (else (write-char c os) (or (c-lf?) (begin (read-c) (loop)))))) (let ((text (%gosc os))) (if (equal? text "") (nexttok) text))))))) (verbeof-nexttok (lambda () (read-c) (if (c-eof?) no-token (let ((os (open-output-string))) (let loop () (or (c-eof?) (begin (write-char c os) (or (c-lf?) (begin (read-c) (loop)))))) (%gosc os))))) (make-verbpair-nexttok (lambda (elem-name) (lambda () (let ((os (open-output-string))) ;; Accumulate up to a newline-terminated line. (let loop () (read-c) (cond ((c-eof?) ;; Got EOF in verbatim context, so set the normal ;; nextok procedure, then fall out of loop. (set! nexttok normal-nexttok)) ((c-lt?) ;; Got "<" in verbatim context, so get next ;; character. (read-c) (cond ((c-eof?) ;; Got "<" then EOF, so set to the normal ;; nexttok procedure, add the "<" to the ;; verbatim string, and fall out of loop. (set! nexttok normal-nexttok) (write-char #\< os)) ((c-slash?) ;; Got "</", so... (read-c) (cond ((c-eof?) (display "</" os)) ((c-alpha?) ;; Got "</" followed by alpha, so unread ;; the alpha, scan qname, compare... (unread-c) (let* ((vqname (scan-qname #t)) (ns (car vqname)) (local (cdr vqname))) ;; Note: We ignore XML namespace ;; qualifier for purposes of comparison. ;; ;; Note: We're interning strings here for ;; comparison when in theory there could ;; be many such unique interned strings ;; in a valid HTML document, although in ;; practice this should not be a problem. (if (and local (eqv? (string->symbol (string-downcase local)) elem-name)) ;; This is the terminator tag, so ;; scan to the end of it, set the ;; nexttok, and fall out of the loop. (begin (let scan-to-end () (read-c) (cond ((c-eof?) #f) ((c-gt?) #f) ((c-lt?) (unread-c)) ((c-alpha?) (unread-c) ;; Note: This is an ;; expensive way to skip ;; over an attribute, but ;; in practice more ;; verbatim end tags will ;; not have attributes. (scan-attr) (scan-to-end)) (else (scan-to-end)))) (set! nexttok (lambda () (set! nexttok normal-nexttok) (make-end-token elem-name #f '())))) ;; This isn't the terminator tag, so ;; add to the verbatim string the ;; "</" and the characters of what we ;; were scanning as a qname, and ;; recurse in the loop. (begin (display "</" os) (if ns (begin (display ns os) (display ":" os)) #f) (if local (display local os) #f) (loop))))) (else ;; Got "</" and non-alpha, so unread new ;; character, add the "</" to verbatim ;; string, then loop. (unread-c) (display "</" os) (loop)))) (else ;; Got "<" and non-slash, so unread the new ;; character, write the "<" to the verbatim ;; string, then loop. (unread-c) (write-char #\< os) (loop)))) (else ;; Got non-"<" in verbatim context, so just add it ;; to the buffer, then, if it's not a linefeed, fall ;; out of the loop so that the token can be ;; returned. (write-char c os) (or (c-lf?) (loop))))) ;; Return the accumulated line string, if non-null, or call ;; nexttok. (or (gosc/string-or-false os) (nexttok)))))) (nexttok #f)) (set! nexttok normal-nexttok) (lambda () (nexttok)))))) ;;; @defproc tokenize-html in normalized? ;;; ;;; Returns a list of tokens from input port @var{in}, normalizing according to ;;; boolean @var{normalized?}. This is probably most useful as a debugging ;;; convenience. For example: ;;; ;;; @lisp ;;; (tokenize-html (open-input-string "<a href=\"foo\">bar</a>") #f) ;;; @result{} ((a (@@ (href "foo"))) "bar" (*END* a)) ;;; @end lisp (define (tokenize-html in normalized?) (let ((next-tok (make-html-tokenizer in normalized?))) (let loop ((tok (next-tok))) (if (null? tok) '() (cons tok (loop (next-tok))))))) ;;; @defproc shtml-token-kind token ;;; ;;; Returns a symbol indicating the kind of tokenizer @var{token}: ;;; @code{*COMMENT*}, @code{*DECL*}, @code{*EMPTY*}, @code{*END*}, ;;; @code{*ENTITY*}, @code{*PI*}, @code{*START*}, @code{*TEXT*}. ;;; This is used by higher-level parsing code. For example: ;;; ;;; @lisp ;;; (map shtml-token-kind ;;; (tokenize-html (open-input-string "<a<b>><c</</c") #f)) ;;; @result{} (*START* *START* *TEXT* *START* *END* *END*) ;;; @end lisp (define (shtml-token-kind token) (cond ((string? token) shtml-text-symbol) ((list? token) (let ((s (list-ref token 0))) (if (memq s `(,shtml-comment-symbol ,shtml-decl-symbol ,shtml-empty-symbol ,shtml-end-symbol ,shtml-entity-symbol ,shtml-pi-symbol)) s shtml-start-symbol))) (else (error 'shtml-token-kind "unrecognized token kind: ~S" token)))) ;;; @section Parsing ;;; Most applications will call a parser procedure such as ;;; @code{html->shtml} rather than calling the tokenizer directly. ;; @defvar %empty-elements ;; ;; List of names of HTML element types that have no content, represented as a ;; list of symbols. This is used internally by the parser and encoder. The ;; effect of mutating this list is undefined. ;; TODO: Document exactly which elements these are, after we make the new ;; parameterized parser constructor. (define %empty-elements '(& area base br frame hr img input isindex keygen link meta object param spacer wbr)) ;;; @defproc parse-html/tokenizer tokenizer normalized? ;;; ;;; Emits a parse tree like @code{html->shtml} and related procedures, except ;;; using @var{tokenizer} as a source of tokens, rather than tokenizing from an ;;; input port. This procedure is used internally, and generally should not be ;;; called directly. (define parse-html/tokenizer ;; Note: This algorithm was originally written in 2001 (as part of the first ;; Scheme library the author ever wrote), and then on 2009-08-16 was revamped ;; to not use mutable pairs, for PLT 4 compatibility. It could still use ;; some work to be more functional, but it works for now. (letrec ((empty-elements ;; TODO: Maybe make this an option. This might also be an ;; acceptable way to parse old HTML that uses the `p' element as a ;; paragraph terminator. %empty-elements) (parent-constraints ;; TODO: Maybe make this an option. '((area . (map)) (body . (html)) (caption . (table)) (colgroup . (table)) (dd . (dl)) (dt . (dl)) (frame . (frameset)) (head . (html)) (isindex . (head)) (li . (dir menu ol ul)) (meta . (head)) (noframes . (frameset)) (option . (select)) (p . (body td th)) (param . (applet)) (tbody . (table)) (td . (tr)) (th . (tr)) (thead . (table)) (title . (head)) (tr . (table tbody thead)))) (token-kinds-that-always-get-added `(,shtml-comment-symbol ,shtml-decl-symbol ,shtml-entity-symbol ,shtml-pi-symbol ,shtml-text-symbol)) (start-tag-name (lambda (tag-token) (car tag-token))) (end-tag-name (lambda (tag-token) (list-ref tag-token 1)))) (lambda (tokenizer normalized?) (let ((begs (list (vector #f '())))) (letrec ((add-thing-as-child-of-current-beg (lambda (tok) (let ((beg (car begs))) (vector-set! beg 1 (cons tok (vector-ref beg 1)))))) (beg->elem (lambda (beg) (let ((elem-name (vector-ref beg 0)) (attrs-and-contents (reverse (vector-ref beg 1)))) (cons elem-name attrs-and-contents)))) (finish-current-beg-and-return-elem (lambda () (let ((elem (beg->elem (car begs)))) (set! begs (cdr begs)) (or (null? begs) (add-thing-as-child-of-current-beg elem)) elem))) (finish-current-beg (lambda () (finish-current-beg-and-return-elem))) (finish-all-begs-and-return-top (lambda () (let loop () (let ((elem (finish-current-beg-and-return-elem))) (if (car elem) (loop) (cdr elem)))))) (finish-begs-up-to-and-including-name (lambda (name) (let loop-find-name ((find-begs begs) (depth 1)) (let ((beg-name (vector-ref (car find-begs) 0))) (cond ((not beg-name) ;; We reached the root without finding a ;; matching beg, so simply discard it. (void)) ((eqv? name beg-name) ;; We found a match, so finish the begs up to ;; depth. (let loop-finish ((depth depth)) (or (zero? depth) (begin (finish-current-beg) (loop-finish (- depth 1)))))) (else ;; Didn't find a match yet, and there's still at ;; least one more beg to look at, so recur. (loop-find-name (cdr find-begs) (+ depth 1)))))))) (finish-begs-upto-but-not-including-names (lambda (names) (let loop-find-name ((find-begs begs) (depth 0)) (let ((beg-name (vector-ref (car find-begs) 0))) (cond ((not beg-name) ;; We reached the root without finding a ;; matching beg, so simply discard it. (void)) ((memq beg-name names) ;; We found a match, so finish the begs up to ;; depth. (let loop-finish ((depth depth)) (or (zero? depth) (begin (finish-current-beg) (loop-finish (- depth 1)))))) (else ;; Didn't find a match yet, and there's still at ;; least one more beg to look at, so recur. (loop-find-name (cdr find-begs) (+ depth 1))))))))) (let loop () (let ((tok (tokenizer))) (if (null? tok) (finish-all-begs-and-return-top) (let ((kind (shtml-token-kind tok))) (cond ((memv kind token-kinds-that-always-get-added) (add-thing-as-child-of-current-beg tok)) ((eqv? kind shtml-start-symbol) (let* ((name (start-tag-name tok)) (cell (assq name parent-constraints))) (and cell (finish-begs-upto-but-not-including-names (cons 'div (cdr cell)))) (if (memq name empty-elements) (add-thing-as-child-of-current-beg tok) (set! begs (cons (vector (car tok) (cdr tok)) begs))))) ((eqv? kind shtml-empty-symbol) ;; Empty tag token, so just add it to current ;; beginning while stripping off leading `*EMPTY*' ;; symbol so that the token becomes normal SXML ;; element syntax. (add-thing-as-child-of-current-beg (cdr tok))) ((eqv? kind shtml-end-symbol) (let ((name (end-tag-name tok))) (if name ;; Try to finish to a start tag matching this ;; end tag. If none, just drop the token, ;; though we used to add it to the current ;; beginning. (finish-begs-up-to-and-including-name name) ;; We have an anonymous end tag, so match it ;; with the most recent beginning. If no ;; beginning to match, then just drop the ;; token, though we used to add it to the ;; current beginning. (and (vector-ref (car begs) 0) (finish-current-beg))))) (else (error 'parse-html/tokenizer "unknown tag kind: ~S" kind))) (loop)))))))))) ;; TODO: Quote of message to a user: ;; ;; >I think this behavior is due to HtmlPrag's use in "parse-html/tokenizer" ;; >of its local "parent-constraints" variable. ;; > ;; >The following line of code from the variable binding expresses the ;; >constraint that any "p" element can have as immediate parent element ;; >only "body", "td", or "th": ;; > ;; > (p . (body td th)) ;; > ;; >I think I know a good heuristic for dealing with unfamiliar but ;; >seemingly well-formed elements, like "page" in this case, but I'm afraid ;; >I don't have time to implement it right now. (I am job-hunting right ;; >now, and there are many other coding things I need to do first.) ;; > ;; >Would adding "page" to the above line of the HtmlPrag source code work ;; >around the current problem, or do you need a better solution right now? ;; @defproc %parse-html input normalized? top? ;; ;; This procedure is now used internally by @code{html->shtml} and its ;; variants, and should not be used directly by programs. The interface is ;; likely to change in future versions of HtmlPrag. (define (%parse-html input normalized? top?) (let ((parse (lambda () (parse-html/tokenizer (make-html-tokenizer (cond ((input-port? input) input) ((string? input) (open-input-string input)) (else (error '%parse-html "invalid input type: ~S" input))) normalized?) normalized?)))) (if top? (cons shtml-top-symbol (parse)) (parse)))) ;;; @defproc html->sxml-0nf input ;;; @defprocx html->sxml-1nf input ;;; @defprocx html->sxml-2nf input ;;; @defprocx html->sxml input ;;; @defprocx html->shtml input ;;; ;;; Permissively parse HTML from @var{input}, which is either an input port or ;;; a string, and emit an SHTML equivalent or approximation. To borrow and ;;; slightly modify an example from Kiselyov's discussion of his HTML parser: ;;; ;;; @lisp ;;; (html->shtml ;;; "<html><head><title></title><title>whatever</title></head><body> ;;; <a href=\"url\">link</a><p align=center><ul compact style=\"aa\"> ;;; <p>BLah<!-- comment <comment> --> <i> italic <b> bold <tt> ened</i> ;;; still < bold </b></body><P> But not done yet...") ;;; @result{} ;;; (*TOP* (html (head (title) (title "whatever")) ;;; (body "\n" ;;; (a (@@ (href "url")) "link") ;;; (p (@@ (align "center")) ;;; (ul (@@ (compact) (style "aa")) "\n")) ;;; (p "BLah" ;;; (*COMMENT* " comment <comment> ") ;;; " " ;;; (i " italic " (b " bold " (tt " ened"))) ;;; "\n" ;;; "still < bold ")) ;;; (p " But not done yet..."))) ;;; @end lisp ;;; ;;; Note that in the emitted SHTML the text token @code{"still < bold"} is ;;; @emph{not} inside the @code{b} element, which represents an unfortunate ;;; failure to emulate all the quirks-handling behavior of some popular Web ;;; browsers. ;;; ;;; The procedures @code{html->sxml-@var{n}nf} for @var{n} 0 through 2 ;;; correspond to 0th through 2nd normal forms of SXML as specified in SXML, ;;; and indicate the minimal requirements of the emitted SXML. ;;; ;;; @code{html->sxml} and @code{html->shtml} are currently aliases for ;;; @code{html->sxml-0nf}, and can be used in scripts and interactively, when ;;; terseness is important and any normal form of SXML would suffice. (define (html->sxml-0nf input) (%parse-html input #f #t)) (define (html->sxml-1nf input) (%parse-html input #f #t)) (define (html->sxml-2nf input) (%parse-html input #t #t)) (define html->sxml html->sxml-0nf) (define html->shtml html->sxml-0nf) ;;; @section Emitting HTML ;;; Two procedures encoding the SHTML representation as conventional HTML, ;;; @code{write-shtml-as-html} and @code{shtml->html}. These are perhaps most ;;; useful for emitting the result of parsed and transformed input HTML. They ;;; can also be used for emitting HTML from generated or handwritten SHTML. (define (%write-shtml-as-html/fixed shtml out foreign-filter) (letrec ((write-shtml-text (lambda (str out) (let ((len (string-length str))) (let loop ((i 0)) (if (< i len) (begin (display (let ((c (string-ref str i))) (case c ;; ((#\") """) ((#\&) "&") ((#\<) "<") ((#\>) ">") (else c))) out) (loop (+ 1 i))) #f))))) (do-thing (lambda (thing) (cond ((string? thing) (write-shtml-text thing out)) ((list? thing) (if (not (null? thing)) (do-list-thing thing) #f)) (else (do-thing (foreign-filter thing #f)))))) (do-list-thing (lambda (thing) (let ((head (car thing))) (cond ((symbol? head) ;; Head is a symbol, so... (cond ((eq? head shtml-comment-symbol) ;; TODO: Make sure the comment text doesn't contain a ;; comment end sequence. (display "<!-- " out) (let ((text (car (cdr thing)))) (if (string? text) ;; TODO: Enforce whitespace safety without ;; padding unnecessarily. ;; ;; (let ((len (string-length text))) ;; (if (= len 0) ;; (display #\space out) ;; (begin (if (not (eqv? ;; (string-ref text 0) ;; #\space)) (display text out) (error 'write-shtml-as-html "invalid SHTML comment text: ~S" thing))) (or (null? (cdr (cdr thing))) (error 'write-shtml-as-html "invalid SHTML comment body: ~S" thing)) (display " -->" out)) ((eq? head shtml-decl-symbol) (let ((head (car (cdr thing)))) (display "<!" out) (display (symbol->string head) out) (for-each (lambda (n) (cond ((symbol? n) (display #\space out) (display (symbol->string n) out)) ((string? n) (display " \"" out) (%write-dquote-ampified n out) (display #\" out)) (else (error 'write-shtml-as-html "invalid SHTML decl: ~S" thing)))) (cdr (cdr thing))) (display #\> out))) ((eq? head shtml-pi-symbol) (display "<?" out) (display (symbol->string (car (cdr thing))) out) (display #\space out) (display (car (cdr (cdr thing))) out) ;; TODO: Error-check that no more rest of PI. (display "?>" out)) ((eq? head shtml-top-symbol) (for-each do-thing (cdr thing))) ((eq? head shtml-empty-symbol) #f) ((eq? head '@) (error 'write-shtml-as-html "illegal position of SHTML attributes: ~S" thing)) ((or (eq? head '&) (eq? head shtml-entity-symbol)) (let ((val (shtml-entity-value thing))) (if val (begin (write-char #\& out) (and (integer? val) (write-char #\# out)) (display val out) (write-char #\; out)) (error 'write-shtml-as-html "invalid SHTML entity reference: ~S" thing)))) ((memq head `(,shtml-end-symbol ,shtml-start-symbol ,shtml-text-symbol)) (error 'write-shtml-as-html "invalid SHTML symbol: ~S" head)) (else (display #\< out) (display head out) (let* ((rest (cdr thing))) (or (null? rest) (let ((second (car rest))) (and (list? second) (not (null? second)) (eq? (car second) '@) (begin (for-each do-attr (cdr second)) (set! rest (cdr rest)))))) (if (memq head %empty-elements) ;; TODO: Error-check to make sure the element ;; has no content other than attributes. We ;; have to test for cases like: (br (@) () ;; (())) (display " />" out) (begin (display #\> out) (for-each do-thing rest) (display "</" out) (display (symbol->string head) out) (display #\> out))))))) ;; ((or (list? head) (string? head)) ;; ;; Head is a list or string, which might occur as the result ;; of an SXML transform, so we'll cope. (else ;; Head is not a symbol, which might occur as the result of ;; an SXML transform, so we'll cope. (for-each do-thing thing)) ;;(else ;; ;; Head is NOT a symbol, list, or string, so error. ;; (error 'write-shtml-as-html ;; "invalid SHTML list: ~S" ;; thing)) )))) (write-attr-val-dquoted (lambda (str out) (display #\" out) (display str out) (display #\" out))) (write-attr-val-squoted (lambda (str out) (display #\' out) (display str out) (display #\' out))) (write-attr-val-dquoted-and-amped (lambda (str out) (display #\" out) (%write-dquote-ampified str out) (display #\" out))) (write-attr-val (lambda (str out) (let ((len (string-length str))) (let find-dquote-and-squote ((i 0)) (if (= i len) (write-attr-val-dquoted str out) (let ((c (string-ref str i))) (cond ((eqv? c #\") (let find-squote ((i (+ 1 i))) (if (= i len) (write-attr-val-squoted str out) (if (eqv? (string-ref str i) #\') (write-attr-val-dquoted-and-amped str out) (find-squote (+ 1 i)))))) ((eqv? c #\') (let find-dquote ((i (+ 1 i))) (if (= i len) (write-attr-val-dquoted str out) (if (eqv? (string-ref str i) #\") (write-attr-val-dquoted-and-amped str out) (find-dquote (+ 1 i)))))) (else (find-dquote-and-squote (+ 1 i)))))))))) (collect-and-write-attr-val ;; TODO: Take another look at this. (lambda (lst out) (let ((os #f)) (let do-list ((lst lst)) (for-each (lambda (thing) (let do-thing ((thing thing)) (cond ((string? thing) (or os (set! os (open-output-string))) (display thing os)) ((list? thing) (do-list thing)) ((eq? thing #t) #f) (else (do-thing (foreign-filter thing #t)))))) lst)) (and os (begin (display #\= out) (write-attr-val (%gosc os) out)))))) (do-attr (lambda (attr) (or (list? attr) (error 'write-shtml-as-html "invalid SHTML attribute: ~S" attr)) (or (null? attr) (let ((name (car attr))) (or (symbol? name) (error 'write-shtml-as-html "invalid name in SHTML attribute: ~S" attr)) (or (eq? name '@) (begin (display #\space out) (display name out) (collect-and-write-attr-val (cdr attr) out) ))))))) (do-thing shtml) #f)) (define (%write-dquote-ampified str out) ;; TODO: If we emit """, we really should parse it, and HTML 4.01 says ;; we should, but anachronisms in HTML create the potential for nasty ;; mutilation of URI in attribute values. (let ((len (string-length str))) (let loop ((i 0)) (if (< i len) (begin (display (let ((c (string-ref str i))) (if (eqv? c #\") """ c)) out) (loop (+ 1 i))) #f)))) ;;; @defproc write-shtml-as-html shtml [out [foreign-filter]] ;;; ;;; Writes a conventional HTML transliteration of the SHTML @var{shtml} to ;;; output port @var{out}. If @var{out} is not specified, the default is the ;;; current output port. HTML elements of types that are always empty are ;;; written using HTML4-compatible XHTML tag syntax. ;;; ;;; If @var{foreign-filter} is specified, it is a procedure of two argument ;;; that is applied to any non-SHTML (``foreign'') object encountered in ;;; @var{shtml}, and should yield SHTML. The first argument is the object, and ;;; the second argument is a boolean for whether or not the object is part of ;;; an attribute value. ;;; ;;; No inter-tag whitespace or line breaks not explicit in @var{shtml} is ;;; emitted. The @var{shtml} should normally include a newline at the end of ;;; the document. For example: ;;; ;;; @lisp ;;; (write-shtml-as-html ;;; '((html (head (title "My Title")) ;;; (body (@@ (bgcolor "white")) ;;; (h1 "My Heading") ;;; (p "This is a paragraph.") ;;; (p "This is another paragraph."))))) ;;; @end lisp ;;; ;;; outputs: ;;; ;;; @example ;;; <html><head><title>My Title</title></head><body bgcolor="whi ;;; te"><h1>My Heading</h1><p>This is a paragraph.</p><p>This is ;;; another paragraph.</p></body></html> ;;; @end example (define write-shtml-as-html (letrec ((error-foreign-filter (lambda (foreign-object in-attribute-value?) (error 'write-shtml-as-html (if in-attribute-value? "unhandled foreign object in shtml attribute value: ~S" "unhandled foreign object in shtml: ~S") foreign-object)))) (lambda (shtml . rest) (case (length rest) ((0) (%write-shtml-as-html/fixed shtml (current-output-port) error-foreign-filter)) ((1) (%write-shtml-as-html/fixed shtml (car rest) error-foreign-filter)) ((2) (%write-shtml-as-html/fixed shtml (car rest) (cadr rest))) (else (error 'write-shtml-as-html "extraneous arguments: ~S" (cddr rest))))))) ;;; @defproc shtml->html shtml ;;; ;;; Yields an HTML encoding of SHTML @var{shtml} as a string. For example: ;;; ;;; @lisp ;;; (shtml->html ;;; (html->shtml ;;; "<P>This is<br<b<I>bold </foo>italic</ b > text.</p>")) ;;; @result{} "<p>This is<br /><b><i>bold italic</i></b> text.</p>" ;;; @end lisp ;;; ;;; Note that, since this procedure constructs a string, it should normally ;;; only be used when the HTML is relatively small. When encoding HTML ;;; documents of conventional size and larger, @code{write-shtml-as-html} is ;;; much more efficient. (define (shtml->html shtml) (let ((os (open-output-string))) (write-shtml-as-html shtml os) (%gosc os))) ;;; @unnumberedsec History ;;; @table @asis ;;; ;;; @item Version 0.17 --- 2009-08-16 --- PLaneT @code{(1 4)} ;;; License is now LGPL3. Converted to author's new Scheme management system. ;;; Revamped high-level parser to not use mutable pairs, for PLT Scheme 4.x ;;; compatibility. Until the new portability mechanism is in place, the ;;; previous portable version of HtmlPrag is available at: ;;; @uref{http://www.neilvandyke.org/htmlprag/htmlprag-0-16.scm} ;;; ;;; @item Version 0.16 --- 2005-12-18 ;;; Documentation fix. ;;; ;;; @item Version 0.15 --- 2005-12-18 ;;; In the HTML parent element constraints that are used for structure ;;; recovery, @code{div} is now always permitted as a parent, as a stopgap ;;; measure until substantial time can be spent reworking the algorithm to ;;; better support @code{div} (bug reported by Corey Sweeney and Jepri). Also ;;; no longer convert to Scheme character any HTML numeric character reference ;;; with value above 126, to avoid Unicode problem with PLT 299/300 (bug ;;; reported by Corey Sweeney). ;;; ;;; @item Version 0.14 --- 2005-06-16 ;;; XML CDATA sections are now tokenized. Thanks to Alejandro Forero Cuervo ;;; for suggesting this feature. The deprecated procedures @code{sxml->html} ;;; and @code{write-sxml-html} have been removed. Minor documentation changes. ;;; ;;; @item Version 0.13 --- 2005-02-23 ;;; HtmlPrag now requires @code{syntax-rules}, and a reader that can read the ;;; at-sign character as a symbol. SHTML now has a special @code{&} element ;;; for character entities, and it is emitted by the parser rather than the old ;;; @code{*ENTITY*} kludge. @code{shtml-entity-value} supports both the new ;;; and the old character entity representations. @code{shtml-entity-value} ;;; now yields @code{#f} on invalid SHTML entity, rather than raising an error. ;;; @code{write-shtml-as-html} now has a third argument, @code{foreign-filter}. ;;; @code{write-shtml-as-html} now emits SHTML @code{&} entity references. ;;; Changed @code{shtml-named-char-id} and @code{shtml-numeric-char-id}, as ;;; previously warned. Testeez is now used for the test suite. Test procedure ;;; is now the internal @code{%htmlprag:test}. Documentation changes. ;;; Notably, much documentation about using HtmlPrag under various particular ;;; Scheme implementations has been removed. ;;; ;;; @item Version 0.12 --- 2004-07-12 ;;; Forward-slash in an unquoted attribute value is now considered a value ;;; constituent rather than an unconsumed terminator of the value (thanks to ;;; Maurice Davis for reporting and a suggested fix). @code{xml:} is now ;;; preserved as a namespace qualifier (thanks to Peter Barabas for ;;; reporting). Output port term of @code{write-shtml-as-html} is now ;;; optional. Began documenting loading for particular implementation-specific ;;; packagings. ;;; ;;; @item Version 0.11 --- 2004-05-13 ;;; To reduce likely namespace collisions with SXML tools, and in anticipation ;;; of a forthcoming set of new features, introduced the concept of ``SHTML,'' ;;; which will be elaborated upon in a future version of HtmlPrag. Renamed ;;; @code{sxml-@var{x}-symbol} to @code{shtml-@var{x}-symbol}, ;;; @code{sxml-html-@var{x}} to @code{shtml-@var{x}}, and ;;; @code{sxml-token-kind} to @code{shtml-token-kind}. @code{html->shtml}, ;;; @code{shtml->html}, and @code{write-shtml-as-html} have been added as ;;; names. Considered deprecated but still defined (see the ``Deprecated'' ;;; section of this documentation) are @code{sxml->html} and ;;; @code{write-sxml-html}. The growing pains should now be all but over. ;;; Internally, @code{htmlprag-internal:error} introduced for Bigloo ;;; portability. SISC returned to the test list; thanks to Scott G. Miller ;;; for his help. Fixed a new character @code{eq?} bug, thanks to SISC. ;;; ;;; @item Version 0.10 --- 2004-05-11 ;;; All public identifiers have been renamed to drop the ``@code{htmlprag:}'' ;;; prefix. The portability identifiers have been renamed to begin with an ;;; @code{htmlprag-internal:} prefix, are now considered strictly ;;; internal-use-only, and have otherwise been changed. @code{parse-html} and ;;; @code{always-empty-html-elements} are no longer public. ;;; @code{test-htmlprag} now tests @code{html->sxml} rather than ;;; @code{parse-html}. SISC temporarily removed from the test list, until an ;;; open source Java that works correctly is found. ;;; ;;; @item Version 0.9 --- 2004-05-07 ;;; HTML encoding procedures added. Added ;;; @code{htmlprag:sxml-html-entity-value}. Upper-case @code{X} in hexadecimal ;;; character entities is now parsed, in addition to lower-case @code{x}. ;;; Added @code{htmlprag:always-empty-html-elements}. Added additional ;;; portability bindings. Added more test cases. ;;; ;;; @item Version 0.8 --- 2004-04-27 ;;; Entity references (symbolic, decimal numeric, hexadecimal numeric) are now ;;; parsed into @code{*ENTITY*} SXML. SXML symbols like @code{*TOP*} are now ;;; always upper-case, regardless of the Scheme implementation. Identifiers ;;; such as @code{htmlprag:sxml-top-symbol} are bound to the upper-case ;;; symbols. Procedures @code{htmlprag:html->sxml-0nf}, ;;; @code{htmlprag:html->sxml-1nf}, and @code{htmlprag:html->sxml-2nf} have ;;; been added. @code{htmlprag:html->sxml} now an alias for ;;; @code{htmlprag:html->sxml-0nf}. @code{htmlprag:parse} has been refashioned ;;; as @code{htmlprag:parse-html} and should no longer be directly. A number ;;; of identifiers have been renamed to be more appropriate when the ;;; @code{htmlprag:} prefix is dropped in some implementation-specific ;;; packagings of HtmlPrag: @code{htmlprag:make-tokenizer} to ;;; @code{htmlprag:make-html-tokenizer}, @code{htmlprag:parse/tokenizer} to ;;; @code{htmlprag:parse-html/tokenizer}, @code{htmlprag:html->token-list} to ;;; @code{htmlprag:tokenize-html}, @code{htmlprag:token-kind} to ;;; @code{htmlprag:sxml-token-kind}, and @code{htmlprag:test} to ;;; @code{htmlprag:test-htmlprag}. Verbatim elements with empty-element tag ;;; syntax are handled correctly. New versions of Bigloo and RScheme tested. ;;; ;;; @item Version 0.7 --- 2004-03-10 ;;; Verbatim pair elements like @code{script} and @code{xmp} are now parsed ;;; correctly. Two Scheme implementations have temporarily been dropped from ;;; regression testing: Kawa, due to a Java bytecode verifier error likely due ;;; to a Java installation problem on the test machine; and SXM 1.1, due to ;;; hitting a limit on the number of literals late in the test suite code. ;;; Tested newer versions of Bigloo, Chicken, Gauche, Guile, MIT Scheme, PLT ;;; MzScheme, RScheme, SISC, and STklos. RScheme no longer requires the ;;; ``@code{(define get-output-string close-output-port)}'' workaround. ;;; ;;; @item Version 0.6 --- 2003-07-03 ;;; Fixed uses of @code{eq?} in character comparisons, thanks to Scott G. ;;; Miller. Added @code{htmlprag:html->normalized-sxml} and ;;; @code{htmlprag:html->nonnormalized-sxml}. Started to add ;;; @code{close-output-port} to uses of output strings, then reverted due to ;;; bug in one of the supported dialects. Tested newer versions of Bigloo, ;;; Gauche, PLT MzScheme, RScheme. ;;; ;;; @item Version 0.5 --- 2003-02-26 ;;; Removed uses of @code{call-with-values}. Re-ordered top-level definitions, ;;; for portability. Now tests under Kawa 1.6.99, RScheme 0.7.3.2, Scheme 48 ;;; 0.57, SISC 1.7.4, STklos 0.54, and SXM 1.1. ;;; ;;; @item Version 0.4 --- 2003-02-19 ;;; Apostrophe-quoted element attribute values are now handled. A bug that ;;; incorrectly assumed left-to-right term evaluation order has been fixed ;;; (thanks to MIT Scheme for confronting us with this). Now also tests OK ;;; under Gauche 0.6.6 and MIT Scheme 7.7.1. Portability improvement for ;;; implementations (e.g., RScheme 0.7.3.2.b6, Stalin 0.9) that cannot read the ;;; at-sign character as a symbol (although those implementations tend to ;;; present other portability issues, as yet unresolved). ;;; ;;; @item Version 0.3 --- 2003-02-05 ;;; A test suite with 66 cases has been added, and necessary changes have been ;;; made for the suite to pass on five popular Scheme implementations. XML ;;; processing instructions are now parsed. Parent constraints have been added ;;; for @code{colgroup}, @code{tbody}, and @code{thead} elements. Erroneous ;;; input, including invalid hexadecimal entity reference syntax and extraneous ;;; double quotes in element tags, is now parsed better. ;;; @code{htmlprag:token-kind} emits symbols more consistent with SXML. ;;; ;;; @item Version 0.2 --- 2003-02-02 ;;; Portability improvements. ;;; ;;; @item Version 0.1 --- 2003-01-31 ;;; Dusted off author's old Guile-specific code from April 2001, converted to ;;; emit SXML, mostly ported to R5RS and SRFI-6, added some XHTML support and ;;; documentation. A little preliminary testing has been done, and the package ;;; is already useful for some applications, but this release should be ;;; considered a preview to invite comments. ;;; ;;; @end table (provide html->shtml html->sxml html->sxml-0nf html->sxml-1nf html->sxml-2nf make-html-tokenizer make-shtml-entity parse-html/tokenizer shtml->html shtml-comment-symbol shtml-decl-symbol shtml-empty-symbol shtml-end-symbol shtml-entity-symbol shtml-entity-value shtml-named-char-id shtml-numeric-char-id shtml-pi-symbol shtml-start-symbol shtml-text-symbol shtml-token-kind shtml-top-symbol tokenize-html write-shtml-as-html)