ssax/input-parse.rkt
#lang racket/base
(require racket/port
         "myenv.ss"
         "parse-error.ss")
(provide (all-defined-out))

;****************************************************************************
;			Simple Parsing of input
;
; The following simple functions surprisingly often suffice to parse
; an input stream. They either skip, or build and return tokens,
; according to inclusion or delimiting semantics. The list of
; characters to expect, include, or to break at may vary from one
; invocation of a function to another. This allows the functions to
; easily parse even context-sensitive languages.
;
; EOF is generally frowned on, and thrown up upon if encountered.
; Exceptions are mentioned specifically. The list of expected characters
; (characters to skip until, or break-characters) may include an EOF
; "character", which is to be coded as symbol *eof*
;
; The input stream to parse is specified as a PORT, which is usually
; the last (and optional) argument. It defaults to the current input
; port if omitted.
;
; IMPORT
; This package relies on a function parser-error, which must be defined
; by a user of the package. The function has the following signature:
;	parser-error PORT MESSAGE SPECIALISING-MSG*
; Many procedures of this package call parser-error to report a parsing
; error.  The first argument is a port, which typically points to the
; offending character or its neighborhood. Most of the Scheme systems
; let the user query a PORT for the current position. MESSAGE is the
; description of the error. Other arguments supply more details about
; the problem.
;
; $Id: input-parse.scm,v 1.1.1.1 2001/07/11 19:33:43 oleg Exp $

;------------------------------------------------------------------------
;		     Preparation and tuning section

; This package is heavily used. Therefore, we take time to tune it in,
; in particular for Gambit.

;------------------------------------------------------------------------

; -- procedure+: peek-next-char PORT
; 	advances to the next character in the PORT and peeks at it.
; 	This function is useful when parsing LR(1)-type languages
; 	(one-char-read-ahead).

(define (peek-next-char port)
  (read-char port) 
  (peek-char port)) 


;------------------------------------------------------------------------

; -- procedure+: assert-curr-char CHAR-LIST STRING PORT
;	Reads a character from the PORT and looks it up
;	in the CHAR-LIST of expected characters
;	If the read character was found among expected, it is returned
;	Otherwise, the procedure writes a nasty message using STRING
;	as a comment, and quits.

(define (assert-curr-char expected-chars comment port)
  (let ((c (read-char port)))
    (if (memq c expected-chars) c
    (parser-error port "Wrong character " c
    	   " (0x" (if (eof-object? c) "*eof*"
    	   	    (number->string (char->integer c) 16)) ") "
    	   comment ". " expected-chars " expected"))))
    	   

; -- procedure+: skip-until CHAR PORT
;	Reads and skips characters from the PORT until the break
;	character is encountered. This break character is returned.

(define (skip-until-char stop-char port)
  (let loop ((c (read-char port)))
    (cond
     ((eqv? c stop-char) c)
     ((eof-object? c)
      (parser-error port "Unexpected EOF while skipping until " stop-char))
     (else (loop (read-char port))))))

; -- procedure+: skip-while CHAR-LIST PORT
;	Reads characters from the PORT and disregards them,
;	as long as they are mentioned in the CHAR-LIST.
;	The first character (which may be EOF) peeked from the stream
;	that is NOT a member of the CHAR-LIST is returned. This character
;	is left on the stream.

(define (skip-while skip-chars port)
  (do ((c (peek-char port) (peek-char port)))
      ((not (memv c skip-chars)) c)
    (read-char port)))

; whitespace const

;------------------------------------------------------------------------
;				Stream tokenizers


; -- procedure+:
;    next-token PREFIX-CHAR-LIST BREAK-CHAR-LIST [COMMENT-STRING] [PORT]
;	skips any number of the prefix characters (members of the
;	PREFIX-CHAR-LIST), if any, and reads the sequence of characters
;	up to (but not including) a break character, one of the
;	BREAK-CHAR-LIST.
;	The string of characters thus read is returned.
;	The break character is left on the input stream
;	The list of break characters may include EOF, which is to be coded as
;	a symbol *eof*. Otherwise, EOF is fatal, generating an error message
;	including a specified COMMENT-STRING (if any)
;
; Note: since we can't tell offhand how large the token being read is
; going to be, we make a guess, pre-allocate a string, and grow it by
; quanta if necessary. The quantum is always the length of the string
; before it was extended the last time. Thus the algorithm does
; a Fibonacci-type extension, which has been proven optimal.
; Note, explicit port specification in read-char, peek-char helps.

; Procedure input-parse:init-buffer
; returns an initial buffer for next-token* procedures.
; The input-parse:init-buffer may allocate a new buffer per each invocation:
;	(define (input-parse:init-buffer) (make-string 32))
; Size 32 turns out to be fairly good, on average.
; That policy is good only when a Scheme system is multi-threaded with
; preemptive scheduling, or when a Scheme system supports shared substrings.
; In all the other cases, it's better for input-parse:init-buffer to
; return the same static buffer. next-token* functions return a copy
; (a substring) of accumulated data, so the same buffer can be reused.
; We shouldn't worry about new token being too large: next-token will use
; a larger buffer automatically. Still, the best size for the static buffer
; is to allow most of the tokens to fit in.
; Using a static buffer _dramatically_ reduces the amount of produced garbage
; (e.g., during XML parsing).
;; ryanc: Unfortunately, single static buffer not safe in Racket
;; FIXME: tune size, see if thread-cell cache is worth it
(define input-parse:init-buffer
  (let ([buffers (make-thread-cell #f)])
    (lambda ()
      (let ([buffer (thread-cell-ref buffers)])
        (or buffer
            (let ([buffer (make-string 512)])
              (thread-cell-set! buffers buffer)
              buffer))))))

(define (next-token prefix-skipped-chars break-chars comment port)
  (let* ((buffer (input-parse:init-buffer))
	 (curr-buf-len (string-length buffer)) (quantum 16))
    (let loop ((i 0) (c (skip-while prefix-skipped-chars port)))
      (cond
        ((memq c break-chars) (substring buffer 0 i))
    	((eof-object? c)
    	  (if (memq '*eof* break-chars)
    	    (substring buffer 0 i)		; was EOF expected?
    	    (parser-error port "EOF while reading a token " comment)))
    	(else
    	  (when (>= i curr-buf-len)	; make space for i-th char in buffer
    	    (begin			; -> grow the buffer by the quantum
    	      (set! buffer (string-append buffer (make-string quantum)))
    	      (set! quantum curr-buf-len)
    	      (set! curr-buf-len (string-length buffer))))
    	  (string-set! buffer i c)
    	  (read-char port)			; move to the next char
    	  (loop (++ i) (peek-char port))
    	  )))))


; -- procedure+: next-token-of INC-CHARSET PORT
;	Reads characters from the PORT that belong to the list of characters
;	INC-CHARSET. The reading stops at the first character which is not
;	a member of the set. This character is left on the stream.
;	All the read characters are returned in a string.
;
; -- procedure+: next-token-of PRED PORT
;	Reads characters from the PORT for which PRED (a procedure of one
;	argument) returns non-#f. The reading stops at the first character
;	for which PRED returns #f. That character is left on the stream.
;	All the results of evaluating of PRED up to #f are returned in a
;	string.
;
;	PRED is a procedure that takes one argument (a character
;	or the EOF object) and returns a character or #f. The returned
;	character does not have to be the same as the input argument
;	to the PRED. For example,
;	(next-token-of (lambda (c)
;			  (cond ((eof-object? c) #f)
;				((char-alphabetic? c) (char-downcase c))
;				(else #f))))
;	will try to read an alphabetic token from the current
;	input port, and return it in lower case.
;
; Note: since we can't tell offhand how large the token being read is
; going to be, we make a guess, pre-allocate a string, and grow it by
; quanta if necessary. The quantum is always the length of the string
; before it was extended the last time. Thus the algorithm does
; a Fibonacci-type extension, which has been proven optimal.
;
; This procedure is similar to next-token but only it implements
; an inclusion rather than delimiting semantics.

(define (next-token-of incl-list/pred port)
  (let* ((buffer (input-parse:init-buffer))
	 (curr-buf-len (string-length buffer))
         (quantum 16))
  (if (procedure? incl-list/pred)
    (let loop ((i 0) (c (peek-char port)))
      (cond
        ((incl-list/pred c) =>
          (lambda (c)
            (when (>= i curr-buf-len)	; make space for i-th char in buffer
              (begin			; -> grow the buffer by the quantum
                (set! buffer (string-append buffer (make-string quantum)))
                (set! quantum curr-buf-len)
                (set! curr-buf-len (string-length buffer))))
            (string-set! buffer i c)
            (read-char port)			; move to the next char
            (loop (++ i) (peek-char port))))
        (else (substring buffer 0 i))))
			; incl-list/pred is a list of allowed characters
    (let loop ((i 0) (c (peek-char port)))
      (cond
        ((not (memq c incl-list/pred)) (substring buffer 0 i))
    	(else
    	  (when (>= i curr-buf-len)	; make space for i-th char in buffer
    	    (begin			; -> grow the buffer by the quantum
    	      (set! buffer (string-append buffer (make-string quantum)))
    	      (set! quantum curr-buf-len)
    	      (set! curr-buf-len (string-length buffer))))
    	  (string-set! buffer i c)
    	  (read-char port)			; move to the next char
    	  (loop (++ i) (peek-char port))
    	  ))))))

;; ============================================================

; -- Function: find-string-from-port? STR IN-PORT
;    Looks for a string STR within the input port IN-PORT
;    When the STR is found, the function returns the number of
;    characters it has read from the port, and the port is set
;    to read the first char after that (that is, after the STR)
;    The function returns #f when the string wasn't found
; Note the function reads the port *STRICTLY* sequentially, and does not
; perform any buffering. So the function can be used even if the port is open
; on a pipe or other communication channel.

(define (find-string-from-port? str input-port)
  (let ([output-counting-port (open-output-nowhere)])
    ;; Could just use file-position, but that returns byte count, not char count
    ;; Could use output string port, but don't need contents (memory use)
    (port-count-lines-enabled output-counting-port)
    (let ([result
           (regexp-match? (regexp-quote str)
                          input-port
                          0 #f
                          output-counting-port)])
      (and result
           (let-values ([(_line _col pos) (port-next-location output-counting-port)])
             (sub1 pos))))))

;; ============================================================

;		Character-encoding
;
; This module deals with particular character-encoding issues such as
; conversions between characters and their ASCII or UCS2 codes, Scheme
; representations of "Carriage Return" (CR), "tabulation" (TAB) and
; other control characters.
;
; This module by necessity is platform-specific as character encoding
; issues are hardly addressed in R5RS. For example, the result of
; char->integer is generally not an ASCII code of an integer (although
; it is, on many Scheme systems, with the important exception of
; Scheme48 and SCSH). The level of support for character sets other
; than ASCII varies widely among Scheme systems.
;
; This file collects various character-encoding functions that are
; necessary for the SSAX XML parser. The functions are of general use
; and scope.
;
; $Id: char-encoding.scm,v 1.1 2003/04/09 20:34:28 oleg Exp $


;	ascii->char INT -> CHAR
; return a character whose ASCII code is INT
; Note, because ascii->char is injective (there are more characters than
; ASCII characters), the inverse transformation is not defined.
(define ascii->char integer->char)


;	ucscode->char INT -> CHAR
; Return a character whose UCS (ISO/IEC 10646) code is INT
; Note
; This function is required for processing of XML character entities:
; According to Section "4.1 Character and Entity References"
; of the XML Recommendation:
;  "[Definition: A character reference refers to a specific character
;   in the ISO/IEC 10646 character set, for example one not directly
;   accessible from available input devices.]"

(define (ucscode->char code)
  (integer->char code))

; Commonly used control characters

(define char-return (ascii->char 13))
(define char-tab    (ascii->char 9))
(define char-newline (ascii->char 10)) ; a.k.a. #\newline, per R5RS
(define char-space (ascii->char 32))