parser.rkt
;; This is a Racket-based parser for a wiki syntax based closely on
;; WikiCreole, as described below.  To use it, require it with:
;;
;;     (require "parser.ss")
;;
;; Provided functions:
;;
;;   PARSE : port-or-string -> wikitext
;;     Parse input, returning a wikitext object
;;
;;   BODY : wikitext -> list
;;     Return the parsed SEXP from the source document
;;
;;   LOOKUP : wikitext symbol -> string or false
;;     Return named metadata
;;
;;   LOOKUP-PARSED : wikitext symbol -> string or false
;;     Return named metadata.
;;     The keywords 'date' and 'updated' are treated specially,
;;     and this procedure, when retrieving those,
;;     returns a (srfi-19) date object, rather than a string.  For
;;     other keywords, simply the string form is returned, just as
;;     with plain LOOKUP.
;;
;;   SET-METADATA! : wikitext symbol string -> void
;;     force metadata key to have a given value
;;
;;   WIKITEXT? : any -> boolean
;;     Return true if the argument is a wikitext object
;;
;; For an example, see src/wiki2html.scm
;;
;;
;;
;; The dialect parsed here is the consensus
;; WikiCreole syntax of <http://www.wikicreole.org/>.
;; It handles all of the test cases at
;; <http://www.wikicreole.org/wiki/Creole1.0TestCases>,
;; except for one test of wiki-internal links (which is in any case somewhat underspecified).
;;
;; In particular, the supported syntax is
;; //italics//
;; **bold**
;; ##monospace## (the Creole spec doesn't commit itself, but implies this)
;;  * bulleted list (including sublists)
;;  # numbered list (including sublists)
;; >quoted paragraph (including multiple levels)
;; [[link to wikipage]]
;; [[URL|description]]
;; {{image.png}} or {{image.png|alt text}} or {{image.png|att=value;att=value}}
;; == heading
;; === subheading
;; ==== subsubheading
;; line\\break
;; {{{in-line literal text}}}
;; {{{
;; preformatted text
;; }}}
;; ~escaped character, and ~http://url which isn't linked
;;
;; A line which begins with "**", with possible whitespace either
;; side, is a bulletted list if the line before it is a bulleted list,
;; but is a paragraph starting with bold text otherwise.  The
;; analogous statement is true of a line starting with "##".
;;
;; The '>' paragraph quoting is probably an extension to the
;; WikiCreole syntax (the documentation is not not clear).
;; The behaviour of a line starting "##" is not specified in the
;; WikiCreole definition, but this is clearly compatible with it.
;;
;; To this I add syntax:
;;
;;   ::foo bar baz
;;         adds the keyword 'foo' with the string 'bar baz'
;;         (the syntax "^:tag" works, for example, but the doubled
;;         colon fits in a bit more naturally with the WikiCreole syntax).
;;         If a keyword is given once, then its value is the string
;;         after the keyword; if a keyword is given twice, its value
;;         is a list of the strings given on successive occasions.
;;   "quoted"
;;        corresponds to <q>quoted</q> (note that's a double-quote
;;        character, not two single quotes)
;;   <<element-name   content>>
;;         adds <element-name>content</element-name>
  
#lang at-exp racket

(require parser-tools/lex
         parser-tools/yacc
         (prefix-in : parser-tools/lex-sre)
         racket/contract
         scribble/srcdoc
         srfi/9                         ;records
         (prefix-in srfi19: (only-in srfi/19 string->date))
         (only-in srfi/13 substring/shared string-trim-right)
         (prefix-in xml: (only-in xml xexpr?)))

(require/doc scheme/base
             scribble/manual)

;; contract predicates
(define-record-type :wikitext
  (wikitext m b)
  wikitext?
  (b wikitext-body)
  (m wikitext-metadata set-wikitext-metadata!))

(define wikitext/c
  (flat-named-contract "wiki-text" wikitext?))

(provide/doc
 (proc-doc/names wikitext? (-> any/c boolean?) (x)
                 @{Returns @scheme[#t] if @var{x} is a parsed wikitext object.})
 (proc-doc/names parse (-> (or/c port? string?) wikitext/c) (source)
                 @{Parse the @var{source} into a wikitext object.})
 (proc-doc/names body (-> wikitext/c (listof xml:xexpr?)) (wikitext)
                 @{Extract the body of the document from the wikitext object.})
 (proc-doc/names lookup (-> wikitext/c symbol? (or/c string? false/c)) (wikitext key)
                 @{Retrieve the metadata value corresponding to key @var{key}.
                            If several were specified, they are concatenated.})
 (proc-doc/names lookup/multiple
                 (-> wikitext/c symbol? (listof string?))
                 (wikitext key)
                 @{Retrieve the multiple metadata values corresponding to @var{key},
                            or an empty list if there was none.})
 (proc-doc/names lookup-parsed
                 (-> wikitext/c symbol? any)
                 (wikitext key)
                 @{Like @scheme[lookup], except that, depending on the key,
                        the value is returned as a parsed object.  The only ones treated specially
                        are @scheme['date] and @scheme['updated], which are returned as
                        SRFI-19 date objects.})
 (proc-doc/names lookup-keys (-> wikitext/c (listof symbol?)) (wikitext)
                 @{Return the list of available keys.})
 (proc-doc/names set-metadata!
                 (-> wikitext/c symbol? string? any)
                 (wikitext key value)
                 @{Set a metadata key to have the given value.
                       This appends the value, so that the value changes for LOOKUP,
                       but is extended for LOOKUP/MULTIPLE.}))

(provide list-lexemes)                  ;debugging only

;; SRFI-8 receive
(define-syntax receive
  (syntax-rules ()
    ((receive formals expression body ...)
     (call-with-values (lambda () expression)
       (lambda formals body ...)))))

;; ASSERT : predicate
;; Simple assert macro: throws an error if the test isn't true
;; (sophistications would include indicating the source position, but that's included in the traceback)
(define-syntax assert
  (syntax-rules ()
    ((_ test)
     (when (not test)
           (error (format "Assertion failed: ~s => ~s" (quote test) test))))))

;; IN-TYPED-BLOCK? : symbol -> boolean
;; IN-TYPED-BLOCK? : symbol any -> any
;; Given one argument, return true if we are in the given block type
;; Given two arguments, set the block type to the first, and return the second
(define in-typed-block?
  (let ((in-block #f))
    (case-lambda
     ((block-type)
      (and in-block (eqv? block-type in-block)))
     ((new-in value)
      (set! in-block new-in)
      value))))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; lexer definitions

(define-lex-abbrevs
  (horizontal-whitespace (:or #\space #\tab))
  ;; simple-punctuation is the set of characters which don't have
  ;; special meanings in wikitext, and which aren't HTML special
  (simple-punctuation (:or #\, #\. #\? #\' #\; #\: #\! #\@ #\-))
  ;; The following are more restrictive URL regexps.  I don't think I
  ;; need them at present, but keep them around for the mo.
  ;; (url-path-element (:+ (:or alphabetic
  ;;                            (:/ #\0 #\9)
  ;;                            #\/ #\& #\+ #\_ #\= #\# #\@ #\: #\. #\, #\- #\~
  ;;                            ;; include space, so that we match wiki links, too
  ;;                            #\space
  ;;                            ;; Are brackets legit in URLs? (they appear in Wikipedia URLs)
  ;;                            #\( #\)
  ;;                            ;; question marks and percents certainly are (perhaps I should be more systematic here!)
  ;;                            #\? #\%
  ;;                            )))
  ;; (full-url-string (:: (:+ lower-case ) "://"
  ;;                      (:* (:or alphabetic (:/ #\0 #\9) #\. #\: #\@)) ;host, port, and usernames
  ;;                      url-path-string))
  ;; (url-path-string (:: (:* (:: #\/ url-path-element))
  ;;                      url-path-element))
  (url-string (:: (:? (:+ lower-case ) "://")
                  (:+ (:or alphabetic
                           (:/ #\0 #\9)
                           #\/ #\& #\+ #\_ #\= #\# #\@ #\: #\. #\, #\- #\~
                           ;; include space, so that we match wiki links, too
                           #\space
                           ;; Are brackets legit in URLs? (they appear in Wikipedia URLs)
                           #\( #\)
                           ;; question marks and percents certainly are (perhaps I should be more systematic here!)
                           #\? #\%
                           ))))
  (eol-char (union #\newline #\return))
  (chars-to-line-end (:* (char-complement eol-char))))
;; url-regexp-string is used for spotting URLs in text, not in [[...]].
;; The following should arguably match URL-STRING, though that isn't assumed.
;; The place where it doesn't is where a non-enclosed URL has a query string -- we don't include the query.
(define url-regexp-string "(http|ftp)://[A-Za-z0-9/&+_=#@:.,~-]+[A-Za-z0-9/#]")
(define-lex-abbrev string-contents
  (:or alphabetic numeric simple-punctuation horizontal-whitespace))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; block parser -- break the text into chunks

(define-tokens block-value-tokens
  (HEADER METADATA-KEYWORD BESCAPED-TEXT
          QUOTED-PARAGRAPH-LINE QUOTED-PARAGRAPH-BLANK-LINE
          UL-ITEM OL-ITEM
          TABLE-ROW
          ORDINARY-LINE))
(define-empty-tokens block-empty-tokens
  (blank-line horizontal-line EOF))

(define block-lexer
  (lexer-src-pos
   ((eof) 'EOF)

   ((:+ (:: (:* horizontal-whitespace) eol-char)) ;multiple blank lines are equivalent to one
    (token-blank-line))

   ((:: (:>= 1 #\=) chars-to-line-end eol-char)
    (let ((parts (regexp-match #px"^(=+)\\s*(.*[^=\\s])[=\\s]*$" lexeme)))
      (if parts
          (token-HEADER (cons (string-length (list-ref parts 1))
                              (list-ref parts 2)))
          (token-HEADER (cons (count-chars #\= lexeme) ;return a dummy header -- is this the best thing to do?
                              "HEADER")))))

   ((:: "{{{" (:* horizontal-whitespace) eol-char)
    (let loop ((lines '())
               (first-line? #t))
      (cond ((regexp-try-match #rx"^}}}" input-port)
             => (λ (m) (token-BESCAPED-TEXT (apply string-append (reverse lines)))))
            (else
             (let ((line (read-line input-port 'any)))
               (if (eof-object? line)
                   (token-BESCAPED-TEXT (apply string-append (reverse lines)))
                   (loop (cons (if first-line? line (string-append "\n" line))
                               lines)
                         #f)))))))

   ((:: (:* horizontal-whitespace) (:+ (:: ">" (:* horizontal-whitespace))))
    (let ((line (read-line input-port 'any))
          (depth (count-chars #\> lexeme)))
      (if (= (string-length line) 0)
          (token-QUOTED-PARAGRAPH-BLANK-LINE (list depth))
          (token-QUOTED-PARAGRAPH-LINE (list depth line)))))

   ((:: "::" (:+ graphic) (:* horizontal-whitespace) chars-to-line-end eol-char)
    (cond ((regexp-match #px"^::(\\S+)\\s+(.*[^\\s])\\s*$" lexeme)
           => (λ (parts)
                 (token-METADATA-KEYWORD (cons (string->symbol (list-ref parts 1))
                                               (list-ref parts 2)))))
          ((regexp-match #px"^::(\\S+)" lexeme)
           => (λ (parts)
                 (token-METADATA-KEYWORD (cons (string->symbol (list-ref parts 1)) ""))))
          (else (error (format "What? Lexeme ~s matched keywords and didn't match!" lexeme)))))

   ((:: (:* horizontal-whitespace) (:+ "*") chars-to-line-end eol-char)
    (let* ((parts (regexp-match #px"^\\s*(\\*+)\\s*(.*)" lexeme))
           (n-hashes (count-chars #\* (list-ref parts 1))))
      (assert parts)
      (if (and (= n-hashes 2)
               (not (in-typed-block? 'block-ul)))
          (token-ORDINARY-LINE (chop lexeme)) ;this is a **bold** start to a line, not a UL
          (in-typed-block? 'block-ul    ;it's a UL, and we are now in a block-ul
                           (token-UL-ITEM (list n-hashes
                                                (chop (list-ref parts 2))))))))

   ((:: (:* horizontal-whitespace) (:+ "#") chars-to-line-end eol-char)
    (let* ((parts (regexp-match #px"^\\s*(#+)\\s*(.*)" lexeme))
           (n-stars (count-chars #\# (list-ref parts 1))))
      (assert parts)
      (if (and (= n-stars 2)
               (not (in-typed-block? 'block-ol)))
          (token-ORDINARY-LINE (chop lexeme)) ;this is a ##monospace## start to a line, not a OL
          (in-typed-block? 'block-ol    ;it's a OL, and we are now in a block-ol
                           (token-OL-ITEM (list n-stars
                                                (chop (list-ref parts 2))))))))

   ((:: (:* horizontal-whitespace) "|" chars-to-line-end (:? eol-char))
    (token-TABLE-ROW lexeme))

   ((repetition 4 +inf.0 "-")
    (token-horizontal-line))

   ((char-complement eol-char)
    ;; We've matched a single character (ie not the empty string),
    ;; which isn't any of the above, so it's an ordinary line.
    ;; (NB: this has to be the shortest regexp, or else it would match
    ;; all of the above)
    (token-ORDINARY-LINE (string-append lexeme (read-line input-port 'any))))

   ))

(define block-parser
  (parser
   (start start)
   (end EOF)
   (tokens block-value-tokens block-empty-tokens)
   (error (lambda (grammar? token value start end)
            (error 'parser "block ~a error at ~a:~a token=~s~%"
                   (if grammar? "grammar" "lexer")
                   (position-line start) (position-col start)
                   (if value
                       (cons token value)
                       token))))
   ;;(debug "block-debug.txt")
   (src-pos)

   (grammar
    (start (() 'empty)
           ((error start) (error 'parser "Fatal block parser error: ~s" $2))
           ((block-sequence)
            (let loop ((md (make-immutable-hasheqv '()))
                       (content $1)
                       (res '()))
              (cond ((null? content)    ;end
                     (wikitext md (reverse res)))
                    ((eqv? (caar content) '*METADATA*)
                     (let ((k (cadar content))
                           (v (cddar content)))
                       (loop (hash-update md
                                          k
                                          (λ (orig)
                                             (cond ((pair? orig) (append orig (list v)))
                                                   (orig (list orig v))
                                                   (else v)))
                                          #f)
                             (cdr content)
                             res)))
                    (else
                     (loop md
                           (cdr content)
                           (cons (car content) res)))))))

    (block-sequence
     ((block-element block-sequence) (cons $1 $2))
     ((block-element blank-line block-sequence) (cons $1 $3))
     ((block-element blank-line) (list $1)) ;this will be a trailing blank line
     ((block-element) (list $1)))

    (block-element
     ((metadata-line) `(*METADATA* . ,$1))
     ((header) $1)
     ((paragraph) `(p . ,(parse-paragraph (apply string-append $1))))
     ((quoted-block-of-paragraphs) (pack-paragraphs 'blockquote 'p (collect-paragraphs $1)))
     ((block-ul) (in-typed-block? #f       ;as of now, we are no longer in a block-ul
                                  (pack-paragraphs 'ul 'li (collect-paragraphs $1))))
     ((block-ol) (in-typed-block? #f
                                  (pack-paragraphs 'ol 'li (collect-paragraphs $1))))
     ((table) `(table . ,$1))
     ((horizontal-line) '(hr))
     ((BESCAPED-TEXT) (list 'pre $1)))

    (header
     ((HEADER) (cons (case (car $1)
                       ((1) 'h1)
                       ((2) 'h2)
                       ((3) 'h3)
                       ((4) 'h4)
                       ((5) 'h5)
                       (else (error 'parser "Unrecognised header with ~a '='" (car $1))))
                     (parse-paragraph (cdr $1)))))

    (paragraph
     ((ORDINARY-LINE paragraph) `(,$1 " " . ,$2))
     ((ORDINARY-LINE) (list $1)))

    (quoted-block-of-paragraphs
     ((quoted-paragraph quoted-block-of-paragraphs) (append $1 $2))
     ((quoted-paragraph) $1))
    ;; -> (list (list number string ...) ...), where the number is the indentation level
    ;; successive paragraph lines with the same indentation level are coalesced
    (quoted-paragraph
     ((QUOTED-PARAGRAPH-BLANK-LINE) (list $1))
     ((QUOTED-PARAGRAPH-LINE quoted-paragraph)
      (if (= (car $1) (caar $2))
          (cons (if (null? (cdar $2)) $1 (append $1 (list " ") (cdar $2)))
                (cdr $2))
          (cons $1 $2)))
     ((QUOTED-PARAGRAPH-LINE) (list $1)))

    (block-ul
     ((ul-item block-ul) (cons $1 $2))
     ((ul-item) (list $1)))
    (ul-item
     ((UL-ITEM paragraph) (append $1 (list " ") $2))
     ((UL-ITEM) $1))
    (block-ol
     ((ol-item block-ol) (cons $1 $2))
     ((ol-item) (list $1)))
    (ol-item
     ((OL-ITEM paragraph) (append $1 (list " ") $2))
     ((OL-ITEM) $1))

    (table
     ((TABLE-ROW table) (cons `(tr . ,(parse-paragraph $1 #t)) $2))
     ((TABLE-ROW)      `((tr . ,(parse-paragraph $1 #t)))))

    (metadata-line
     ((METADATA-KEYWORD) $1)))))

;; helper functions for the grammar above

;; COUNT-CHARS : char string -> integer
;; Count the number of times that the given character appears in the string
(define (count-chars ch s)
  (let loop ((i (- (string-length s) 1))
             (n 0))
    (cond ((< i 0) n)
          ((char=? (string-ref s i) ch) (loop (- i 1) (+ n 1)))
          (else (loop (- i 1) n)))))

;; COLLECT-PARAGRAPHS : list -> list
;; Paras is a list of (list integer string ...).
;; Transform each of the elements of this list into a list of strings contained inside
;; n-1 lists, where n is the integer at the beginning of the initial list.
;; Also (the hard bit) successive lists with the same n should be run together.
;; For example:
;; turn '((1 "a") (2 "b") (2 "c") (1 "d") (3 "e") (3 "f" "g") (3 "h"))
;; into '(("a") (("b") ("c")) ("d") ((("e") ("f" "g") ("h"))))
(define (collect-paragraphs paras)
  (define (get-sublists l)
    (cond ((null? l) (values 0 '() '()))
          ((null? (cdr l)) (values (caar l) (list (cdar l)) '()))
          (else
           (let ((collect-level (caar l)))
             (let loop ((tail-of-l l)
                        (head-of-l '()))
               (if (and (pair? tail-of-l)
                        (= collect-level (caar tail-of-l)))
                   (loop (cdr tail-of-l) (cons (car tail-of-l) head-of-l))
                   (values collect-level (map cdr (reverse head-of-l)) tail-of-l)))))))
  (define (enlist n l)
    ;; Pack L N levels deep in a list
    (if (> n 1)
        (list (enlist (- n 1) l))
        l))
  (let loop ((rest paras)
             (result '()))
    (receive (level head tail)
        (get-sublists rest)
      (if (null? head)
          (apply append (reverse result))
          (loop tail (cons (enlist level head) result))))))

;; PACK-PARAGRAPHS : symbol symbol list
;; PARAGRAPHS is a list of (lists of) strings
;; Wrap each list of strings in (inner ...), and each list of those in (outer ...)
(define (pack-paragraphs outer inner paragraphs)
  (define (pack-em para)
    (if (list? (car para))
        (cons outer (map pack-em para))
        (cons inner (parse-paragraph (apply string-append para)))))
  (pack-em paragraphs))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; Paragraph parser -- parse the contents of paragraphs
;;
;; Note that the grammar isn't expressed in a LALR grammar, like above,
;; because that can't do the error recovery we require, such that
;; //italics// and **bold** don't have to be closed in a paragraph.
;; Instead, we use a stack and express the grammar procedurally.

(define-tokens paragraph-value-tokens
  (PSTRING PCHAR PESCAPED-TEXT OLINK OIMG OGENERAL))
(define-empty-tokens paragraph-empty-tokens
  (EMPH STRONG QUOTE LINEBREAK LINKSEP TABLE-HEADER-SEPARATOR CLINK CIMG))

(define paragraph-lexer
  (lexer
   ((eof) 'EOF)
   ((:: "~" any-char)
    (token-PCHAR (string-ref lexeme 1)))
   ("//" 'EMPH)
   ("**" 'STRONG)
   ("##" 'MONOSPACE)
   ("\"" 'QUOTE)
   ("\\\\" 'LINEBREAK)                 ;double backslash
   ((:: "[[" url-string)
    (token-OLINK (substring/shared lexeme 2 (string-length lexeme))))
   ((:: (:* horizontal-whitespace) "|=" (:* horizontal-whitespace))
    'TABLE-HEADER-SEPARATOR)
   ((:: (:* horizontal-whitespace) "|" (:* horizontal-whitespace))
    'LINKSEP)
   ("]]" 'CLINK)
   ((:: "<<" (:+ alphabetic) (:* horizontal-whitespace))
    (token-OGENERAL (car (regexp-match #rx"[a-zA-Z]+" lexeme))))
   (">>" 'CGENERAL)
   ("{{{"
    (let ((escaped-text (regexp-match #rx"(.*?)(}}}|$)" input-port)))
      (token-PESCAPED-TEXT (bytes->string/utf-8 (cadr escaped-text)))))
   ("{{"                                ;this must be shorter than the "{{{" match above
    (let ((escaped-text
           (regexp-try-match #px"^[[:alnum:]/][[:alnum:]/&+_=#@:.,~()?%-]*"; url-characters
                             input-port)))
      (if escaped-text
          (token-OIMG (bytes->string/utf-8 (car escaped-text)))
          (token-PSTRING "{{"))))
   ("}}" 'CIMG)
   ((:+ string-contents)
    (token-PSTRING lexeme))
   (any-char
    (token-PCHAR (string-ref lexeme 0)))))

;; We implement the grammar by pushing items onto a stack as we
;; discover them and then, when we discover 'formatting tokens' such
;; as '//', '**', or '[[', either pushing a suitable symbol onto the
;; stack, or popping multiple items to uncover that symbol, if it was
;; present already.

;; PUSH-STACK : list any -> list
;; Return a stack (a list) which has the given argument pushed onto it.
;; Special case: If the argument and the stack top are both strings,
;; just append them
(define (push-stack stack obj)
  (if (and (string? obj)
           (not (null? stack))
           (string? (car stack)))
      (cons (string-append (car stack) obj) (cdr stack))
      (cons obj stack)))

;; POP-STACK : list           -> list-or-false list
;; POP-STACK : list symbol... -> list-or-false list
;; If given one or more symbol arguments, and any of those symbols is the first symbol found
;; when working down the stack, then L is a list containing that symbol
;; and everything above it in the stack, in that order, and S is the
;; stack without this list; if the first symbol found in the stack
;; is not this symbol, then L is #f and S is the original stack.
;;
;; If there is no second argument, and there is at least one symbol in the stack,
;; then L is a list containing that symbol and everything above it in the stack,
;; in that order, and S is the remainder of the stack.
;;
;; If there is no second argument, and there is no symbol in the stack,
;; then L is a list containing the complete stack, and S is '().
;;
;; Return L and S as multiple values.
(define (pop-stack stack . args)
  (let ((args? (and (not (null? args))
                    (andmap symbol? args))))
    (if args?
        (let loop ((res '())
                   (s stack))
          (cond ((null? s)
                 (values #f stack))
                ((memv (car s) args)
                 => (λ (l)
                       (values (cons (car l) res)
                               (cdr s))))
                ((symbol? (car s))
                 (values #f stack))
                (else
                 (loop (cons (car s) res)
                       (cdr s)))))
        (let loop ((res '())
                   (s stack))
          (cond ((null? s)
                 (values res '()))
                ((symbol? (car s))
                 (values (cons (car s) res)
                         (cdr s)))
                (else
                 (loop (cons (car s) res)
                       (cdr s))))))))



;; PARSE-PARAGRAPH : string -> sexp
(define (parse-paragraph str . rest)
  (define (spot-urls s)
    ;; Spot URLs in the string S, avoiding ~escaped ones, and ones already in [[...]]
    (apply string-append
           (let loop ((start 0)
                      (p (regexp-match-positions* url-regexp-string s))
                      (res '()))
             (cond ((null? p)
                    ;; End of the list -- add the remainder of the string to the end, and return it
                    (append res (list (substring s start (string-length s)))))
                   ((and (>= (caar p) 1)
                         (char=? (string-ref s (- (caar p) 1)) #\~))
                    ;; This URL is preceded by a tilde
                    (let ((url-string (regexp-replace* "//" (substring s (caar p) (cdar p)) "~//")))
                      (loop (cdar p)
                            (cdr p)
                            (append res
                                    (list (substring s start (- (caar p) 1)) url-string)))))
                   ((and (>= (caar p) 2)
                         (string=? (substring s (- (caar p) 2) (caar p)) "[["))
                    ;; This URL is preceded by "[[" -- that is, it's explicitly marked up as a URL
                    (loop (cdar p)
                          (cdr p)
                          (append res
                                  (list (substring s start (cdar p))))))
                   (else
                    ;; Normal: add it to the returned list
                    (loop (cdar p)
                          (cdr p)
                          (append res
                                  (list (substring s start (caar p))
                                        "[[" ;... wrapping it in [[...]], to be parsed later
                                        (substring s (caar p) (cdar p))
                                        "]]"))))))))
  ;; CLEAR-STACK : list -> list
  (define (clear-stack stack)
    ;; Clear the given stack, returning the complete stack in reverse order
    ;; (ie, first things first).
    ;; This is the formatted result of the parse.
    ;; If we find a symbol in the stack, this is because the input
    ;; didn't close something, so 'close' it.
    ;; (this automatic closing doesn't work with [[...]] or <<...>>)
    (receive (content popped-stack)
        (pop-stack stack)
      (cond ((null? content)
             (error "clear-stack called with null stack"))
            ((symbol? (car content))
             ;; unclosed elements - re-push the list we received
             ;; ... apart from a few special cases
             (case (car content)
               ((OLINK)    (clear-stack (push-stack popped-stack (make-a-link content))))
               ((OIMG)     (clear-stack (push-stack popped-stack (make-an-img content))))
               ((OGENERAL) (clear-stack (push-stack popped-stack (make-general-element content))))
               ((td)
                (let ((last-el (list-ref content (- (length content) 1))))
                  (cond ((null? (cdr content)) ;empty (td) -- end of file
                         (clear-stack popped-stack))
                        ((and (string? last-el)
                              (string-trim-right last-el))
                         => (λ (trimmed)
                               ;; trim whitespace from the last string element of the (td)
                               (if (= (string-length trimmed) 0)
                                   (clear-stack popped-stack) ;it was empty, so skip this completely
                                   (clear-stack
                                    (push-stack popped-stack
                                                (append (drop-right content 1) (list trimmed)))))))
                        (else
                         (clear-stack (push-stack popped-stack content))))))
               (else
                ;; The 'normal' case.
                ;; Start clearing again.
                (clear-stack (push-stack popped-stack content)))))
            (else
             ;; CONTENT does not start with a symbol,
             ;; because there is no symbol remaining in the stack;
             ;; therefore popped-stack is '() (see pop-stack)
             content))))
  ;; MAKE-A-LINK : list => list
  ;; The CONTENT is '(OLINK ("link text") ?link-content...)
  (define (make-a-link content)
    (assert (and (pair? content)        ; (>= (length content) 2)
                 (pair? (cdr content))))
    (cond ((null? (cddr content));(= (length content) 2)
           `(a ((href ,(normalise-link (caadr content)))) ,(caadr content)))
          (else
           `(a ((href ,(normalise-link (caadr content)))) ,@(cddr content)))))
  ;; MAKE-AN-IMG : list -> list
  ;; The content is '(OIMG ("url-text") ?alt-content)
  (define (make-an-img content)
    (assert (and (pair? content)
                 (pair? (cdr content))))
    `(img ((src ,(caadr content))
           . ,(if (null? (cddr content))
                  '()
                  (map (λ (s)
                          (cond ((regexp-match #rx"^([a-zA-Z0-9]+)=(.+)" s)
                                 => (λ (m) `(,(string->symbol (cadr m)) ,(caddr m))))
                                (else
                                 `(alt ,s))))
                       (regexp-split #px";\\b" (caddr content)))))))
  ;; MAKE-GENERAL-ELEMENT : list => list
  ;; The CONTENT is '(OGENERAL (element-name) element-content ...)
  (define (make-general-element content)
    (assert (and (pair? content)     ;(>= (length content) 2)
                 (pair? (cdr content))))
    (cons (caadr content) (cddr content)))

  ;; set to work...
  (let ((get-lexeme (let ((p (open-input-string (spot-urls str))))
                      (lambda () (paragraph-lexer p))))
        (parsing-table-row? (and (not (null? rest)) (car rest))))
    (let loop ((stack '()))
      (let ((t (get-lexeme)))
        (case (token-name t)
          ((EOF)                        ;END OF PARSE
           (clear-stack stack))
          ((PSTRING)
           (loop (push-stack stack (token-value t))))
          ((PESCAPED-TEXT)
           (loop
            (push-stack stack
                        (escape-xml-chars-in-string (token-value t)))))
          ((PCHAR)
           (loop
            (push-stack stack
                        (escape-char (token-value t)))))
          ((EMPH)
           (receive (content s2)
               (pop-stack stack 'em)
             (loop (push-stack s2 (or content 'em)))))
          ((STRONG)
           (receive (content s2)
               (pop-stack stack 'strong)
             (loop (push-stack s2 (or content 'strong)))))
          ((MONOSPACE)
           (receive (content s2)
               (pop-stack stack 'code)
             (loop (push-stack s2 (or content 'code)))))
          ((QUOTE)
           (receive (content s2)
               (pop-stack stack 'q)
             (loop (push-stack s2 (or content 'q)))))

          ((OLINK)
           (loop
            (push-stack (push-stack stack 'OLINK)
                        (list (token-value t)))))
          ((CLINK)
           (receive (content s2)
               (pop-stack stack 'OLINK)
             ;; if there's any content, it will be '(OLINK ("link") ?link-content)
             (loop (push-stack s2 (if content (make-a-link content) "]]")))))

          ;; LINKSEP is both the separator between OLINK and CLINK, and the table column separator
          ((LINKSEP)
           (if parsing-table-row?
               (receive (content s2)
                   (pop-stack stack 'OLINK 'td 'th)
                 (case (and content (car content))
                   ((OLINK) (loop stack))
                   ((td th) (loop (push-stack (push-stack s2 content) 'td)))
                   (else (loop (push-stack s2 'td)))))
               (loop stack)))
          ((TABLE-HEADER-SEPARATOR)
           (receive (content s2)
               (pop-stack stack 'td 'th)
             (if content
                 (loop (push-stack (push-stack s2 content) 'th))
                 (loop (push-stack s2 'th)))))

          ((OIMG)
           (loop
            (push-stack (push-stack stack 'OIMG)
                                   (list (token-value t)))))
          ((CIMG)
           (receive (content s2)
               (pop-stack stack 'OIMG)
             ;; if there's any content, it will be '(OIMG ("link") ?link-content)
             (loop (push-stack s2 (if content (make-an-img content) "}}")))))

          ((OGENERAL)
           ;; push first '(token-value) then 'OGENERAL to the stack
           (loop (push-stack (push-stack stack 'OGENERAL) (list (string->symbol (token-value t))))))
          ((CGENERAL)
           (receive (content s2)
               (pop-stack stack 'OGENERAL)
             ;; If there's any content, it is '(OGENERAL (token-value) real-content...)
             (loop (push-stack s2 (if content (make-general-element content) ">>")))))

          ((LINEBREAK)
           (loop (push-stack stack '(br))))
          (else
           (error 'parse-paragraph "Unexpected token ~s~a"
                  (token-name t)
                  (if (token? t)
                      (format " (~s)" (token-value t))
                      ""))))))))

;;;;;;;;;;;;;;;
;;
;; Support functions

;; ESCAPE-CHAR : char -> string
;; Convert the character to a string, escaping it if necessary.
(define (escape-char c)
  (case c
    ((#\<) "&lt;")
    ((#\>) "&gt;")
    ((#\&) "&amp;")
    (else (string c))
    ;; The following replaces any non-ASCII characters by HTML character references.
    ;; I think this is unnecessary, and adds complication, so skip it.
    ;;     (else
    ;;      (if (and (char<=? #\! c)
    ;;               (char<=? c #\~))
    ;;          (list->string (list c))
    ;;          (format "&#~a;" (number->string (char->integer c) 16))))
    ))

;; ESCAPE-XML-CHARS-IN-STRING : string -> string
(define (escape-xml-chars-in-string s)
  (regexp-replace* #rx"[&<>]"
                   s
                   (lambda (match)
                     (escape-char (string-ref match 0)))))
;;   (escape-xml-chars-in-string "nothing here") => "nothing here"
;;   (escape-xml-chars-in-string "one & two") => "one &amp; two"
;;   (escape-xml-chars-in-string "<text>") => "&lt;text&gt;"

;; (define (trim-space s)
;;   (regexp-replace (regexp "^ *(.*[^ ]) *$") s "\\1"))
(define (chop s)
  (substring/shared s 0 (- (string-length s) 1)))

(define (normalise-link s)
  (if (regexp-match? (regexp "^[a-z]*://") s)
      s                               ;it's a URL
      (string-downcase (regexp-replace* (regexp " +") s "_"))))

;; Debugging procedure
(define (list-lexemes p lexer-spec)
  (let ((get-lexeme (if (eqv? lexer-spec 'block)
                        (λ () (position-token-token (block-lexer p)))
                        (λ () (paragraph-lexer p)))))
    (let loop ()
      (let ((l (get-lexeme)))
        (if (eqv? l 'EOF)
            '()
            (cons l (loop)))))))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; Exported functions

;; PARSE : port-or-string -> wikitext
(define (parse source)
  (define (make-wiki-parser p)
    (port-count-lines! p)
    (lambda ()
      (block-parser (lambda () (block-lexer p)))))
  (cond ((port? source)
         ((make-wiki-parser source)))
        ((string? source)
         (parse (open-input-string source)))
        (else
         (error 'parser "Can't parse source of type ~s" source))))

;; BODY : wikitext -> list
;; Return the parsed SEXP from the source document
(define (body w)
  (wikitext-body w))

;; The following is a nice idea, but (of course) (body w) is a sexp,
;; not a string, so what's actually required here is a prettyprinter.
;; ;; BODY/METADATA : wikitext -> string
;; ;; Return the body as a string, but with metadata tidied up.
;; (define (body/metadata w)
;;   (with-output-to-string
;;     (λ ()
;;        (for-each (λ (k)
;;                     (for-each (λ (v) (printf "::~a ~a~%" k v))
;;                               (lookup/multiple w k)))
;;                  (lookup-keys w))
;;        (with-input-from-string (body w)
;;          (λ ()
;;             (let loop ((line (read-line)))
;;               (cond ((eof-object? line) (void))
;;                     ((regexp-match? #rx"^::" line) (loop (read-line)))
;;                     (else (display line)
;;                           (loop (read-line))))))))))

;; LOOKUP : wikitext symbol -> string or false
;; Return named metadata, or false if the keyword was not mentioned.
;; If the keyword was specified multiple times, this returns only the final one.
;; If the keyword was present but had no value, this returns an empty string
(define (lookup w key)
  (let ((value (hash-ref (wikitext-metadata w) key #f)))
    (cond ((not value)
           #f)
          ((pair? value)
           (list-ref value (- (length value) 1)))
          (else
           value))))

;; LOOKUP/MULTIPLE : wikitext symbol -> list or false
;; Return named metadata as a list, or false if the keyword was not mentioned.
;; If the keyword was specified multiple times, this returns all of the values in order.
;; If the keyword was present but had no value, return an empty list
(define (lookup/multiple w key)
  (let ((value (hash-ref (wikitext-metadata w) key #f)))
    (cond ((not value)
           #f)
          ((pair? value)
           value)
          ((string=? value "")
           '())
          (else
           (list value)))))

;; PARSE-DATE : string -> srfi-19-date or false
;; Parse a string into a date.  The date can be formatted in a variety
;; of formats, both year-month-day and day-month-year.
(define (parse-date s)
  (define (parse-date* s formats)
    (with-handlers ((exn:fail?
                     (lambda (exn)
                       (parse-date* s (cdr formats)))))
                   (and (not (null? formats))
                        (srfi19:string->date s (car formats)))))
  (if (regexp-match? #rx"^ *[0-9][0-9][0-9][0-9]" s)
      (parse-date* s '("~Y~m~dT~H~M~S" "~Y~m~dT~H~M" "~Y~m~d" "~Y~b~d" "~Y~B~d"))
      (parse-date* s '("~d~b~Y" "~d~B~Y" "~d~m~Y"))))

;; LOOKUP-PARSED : wikitext symbol -> any or false
;; Return named metadata, possibly parsed as appropriate.
;; Return #f if the key is not found, or cannot be parsed.
;; The only ones handled specially at present are
;;    'date    : wikitext symbol -> srfi19:date? or false
;;    'updated : wikitext symbol -> srfi19:date? or false
(define (lookup-parsed w key)
  (let ((val (lookup w key)))
    (case key
      ((date updated) (and val (parse-date val)))
      (else val))))

;; LOOKUP-KEYS : wikitext -> list-of-symbols
;; Return the list of metadata keys
(define (lookup-keys w)
  (hash-map (wikitext-metadata w) (lambda (k v) k)))

;; SET-METADATA! : wikitext symbol string -> void
;; Set a metadata key to have a given value.
;; This appends the value, so that the value changes for LOOKUP,
;; but is extended for LOOKUP/MULTIPLE.
;; Is there a case for a set-metadata/replacement! ?
(define (set-metadata! w key value)
  (let ((prev (lookup/multiple w key)))
    (set-wikitext-metadata! w
                            (hash-set (wikitext-metadata w)
                                      key
                                      (if prev
                                          (append prev (list value))
                                          value)))))

;; SET-PARSED-METADATA! : wikitext symbol any -> void
;; The companion to SET-METADATA! and LOOKUP-PARSED.  Given a parsed object,
;; set that in the metadata.
;; Actually, there seems little point in this, and it's only added complication/testing.
;; (define (set-parsed-metadata! w key value)
;;   (case key
;;     ((date) (set-metadata! w 'date (srfi19:date->string value "~1")))
;;     (else #f)))
;; (provide set-parsed-metadata!)
;; Can't say the following: 'any' isn't allowed in the domain.
;; (provide/contract (set-parsed-metadata!
;;                    (wikitext/c symbol? any . -> . any)))