forth/lexer-tx.ss
;; Forth (syntax) reader

;; Part of the brood experiment is to transform the highly
;; reflective/selfmodifying forth structure into a feed-forward
;; layered language approach. This approach facilitates source code
;; processing: representations are more transparent: the meaning
;; doesn't change while staying in one layer.

;; This approach allows a purely functional, purely concatenative
;; language (PCL) as intermediate representation.  However, this
;; approach necessarily gives up some simplicity: feedforward parsing
;; is reduced to a preprocessing step and requires a separate meta
;; language to describe transformations from Forth to the pure
;; concatenative language.

;; While doing this, some requirements need to be met

;; 1) forth should be character-delimited: simplifies on-target lexing
;; 2) first stage tokenizer in brood = word/white/comment segmentation
;; 3) the parser needs to be extensible

;; Here 1) is embedded in 2) to make source processing in an editor
;; (like emacs) easier by preserving the original source characters,
;; enhanced with markup. This representation should not introduce
;; structures that 1) can't interpret. For special purposes, the
;; system should be extensible to allow low-level forth parsing
;; extensions.

#lang scheme/base

(require
 (for-syntax scheme/base)
 (lib "match.ss")
 "../tools/stx.ss"
 "../tools/list.ss"
 "../tools/grabbag.ss"
 )

(provide read-forth-syntax
         read-forth
         string->forth-syntax
         file->forth-syntax
         )


;; Convert (syntax) string to lexed Forth syntax, preserving attributes.
(define (string->forth-syntax _str)
  (syntax-case _str ()
    (str
     (let ((code
            (port->list
             (port (syntax->datum #'str))
             read-forth)))
       (datum->syntax #'str
                      (map 
                       (lambda (atom)
                         (datum->syntax #'str atom))
                       code))))))


;; Syntax has source location from the file, but lexical context is
;; pluggable.
(define (file->forth-syntax file [context-stx #f])
  (let* ((port
          (open-input-file file))
         (forth
          (map
           (lexical-context-from context-stx)
           (port->list
            port
            (lambda (x)
              (read-forth-syntax file x))))))
    (close-input-port port)
    forth))


;; Convert file to lexed Forth syntax with proper lexical context.


(define-syntax in?
  (syntax-rules ()
    ((_ var lst)
     (case var
       (lst #t)
       (else #f)))))


(define (newline? x)
  (in? x (#\newline   ;; unix
          #\return))) ;; windows

(define (white? x)
  (and (char? x)
       (char-whitespace? x)))

(define (not-white? x)
  (not (white? x)))

;; This will emulate a standard forth reader, but preserves all
;; source characters for source text processing.

;; Read the next word, upto but excluding the delimiting whitespace
;; character. This produces either an eof-object, or a token with 3
;; elements: leading whitespace, word characters and delimiting
;; whitespace. Leading whitespace can contain zero
;; elements. Delimiting whitespace can contain zero elements only in
;; the EOF condition. Each element is tagged with it's starting
;; line/column/position.


(define (forth-collect p done?)

  (define (drop) (read-char p))
  (define (head) (peek-char p))
  (define stack '())
  (define (str!)
    (list->string (reverse stack)))

  (define l/c/p
    (let-values
        (((line column pos)
          (port-next-location p)))
      (list
       (or line 1)
       (or column 0)
       pos)))

  (define (done-at delim)
    (drop)
    (list l/c/p (str!) delim))

  (define (collect)
    (cond
     ((eof-object? (head))
      (if (not (null? stack)) ;; allow EOF terminated identifiers
          (begin
            ;; (printf "WARNING: EOF terminated identifer: ~s ~s\n" l/c/p stack)
            (done-at #\newline)) ;; fake newline
          (head)))
     ((done? (head))
      (done-at (head)))
     (else
      (push! stack (head))
      (drop)
      (collect))))
  
  (port-count-lines! p)
  (collect))


;; Read next word or whitespace
(define (forth-white/word p)
  
  (port-count-lines! p)
  (let ((head (peek-char p)))
    (cond
     ((eof-object? head)
      head)
     (else
      (cons (if (white? head)
                'white
                'word)
            (forth-collect p white?))))))


;; Implement character parsing words.
(define (forth-accept p)
  (match
   (forth-white/word p)

   ;; make this extensible
   (('word pos "\\" delimiter)
    (match
     (forth-collect p newline?)
     ((cpos str cdel)
      `(comment ,pos ,str ,cdel))))
   
   (other
    other)))


(define (num/sym str)
  (or
   (string->number str)
   (string->symbol str)))

;; Read discards comments and whitespace.

(define (read/map port fn)
  (match
   (forth-accept port)
   (('word (l c p) str del)
    (fn str l c p))
   ((and obj (= eof-object? #t)) obj)
   (other (read/map port fn))))

(define (read-forth port)
  (read/map
   port
   (lambda (str l c p)
     (num/sym str))))                     

(define (read-forth-syntax file port)
  (read/map
   port
   (lambda (str l c p)
     (datum->syntax
      #f (num/sym str)
      (list file l c p #f)
      #f #f))))