bsl/lexer.rkt
#lang racket

#|

File: lexer.rkt
Author: Bill Turtle (wrturtle)

Defines the lexer for Pyret BSL.

|#

(require parser-tools/lex)
(require (prefix-in : parser-tools/lex-sre))
(require (for-syntax "../utilities.rkt"))
(require 2htdp/image)

(provide/contract
 [expression-lexer (-> input-port? position-token?)]
)

(provide (rename-out [comment comment-re]
                     [alpha-numeric alpha-numeric-re]
                     [maybe-float float-re]
                     [maybe-imag imag-re]
                     [current-source-name lexer-source-name]
                     [identifier identifier-re]
         )
)

(provide value-tokens op-tokens keyword-tokens)

;; -----------------------------------------------------------------------------
;; Error Messages

(define MSG-ERROR-INVALID-NUM "this is not a valid number") 
(define MSG-ERROR-INVALID-OCT "this is not a valid octal number")
(define MSG-ERROR-INVALID-HEX "this is not a valid hexadecimal number")
(define MSG-ERROR-INVALID-IMAG "this is not a valid imaginary number")
(define MSG-ERROR-INVALID-FLOAT "this is not a valid floating-point number")
(define MSG-ERROR-INVALID-INEXACT "this is not a valid inexact number")
(define MSG-ERROR-INVALID-STRING "this is not a valid string")
(define MSG-ERROR-INVALID-TOKEN "this is not a valid piece of code")
(define MSG-ERROR-STRING-OCTAL-OVER-255 "this octal escape sequence is over 255")
(define MSG-ERROR-UNICODE-NUMBER-TOO-LARGE
  (string-append "unicode numbers must be smaller than 0x10ffff"))
(define MSG-ERROR-BAD-UNICODE
  "no hex digit after unicode escape sequence")


;; -----------------------------------------------------------------------------
;; Definitions

(define current-source-name (make-parameter #f))

;; We start out by defining our own type of exception
(define-struct (exn:fail:pyret:lexer 
                exn:fail)
  (a-srcloc)
  #:property prop:exn:srclocs
  (lambda (a-struct)
    (match a-struct
      [(struct exn:fail:pyret:lexer 
         (msg marks a-srcloc))
       (list a-srcloc)])))

(define (start-and-end->srcloc src start end)
  (let ([sn (if (current-source-name)
                (current-source-name)
                #f)])
    (srcloc sn
            (position-line start)
            (position-col start)
            (position-offset start) 
            (- (position-offset end) (position-offset start)))))

(define (pyret-lexer-error message sl)
  (raise
   (make-exn:fail:pyret:lexer
    (string-append "lexer: " message)
    (current-continuation-marks)
    sl)))



;; -----------------------------------------------------------------------------
;; string lexing help functions

(define (make-escaped-string c)
  (case c
    [(#\a) "\a"]
    [(#\b) "\b"]
    [(#\t) "\t"]
    [(#\n) "\n"]
    [(#\v) "\v"]
    [(#\f) "\f"]
    [(#\r) "\r"]
    [(#\e) "\e"]
    [(#\") "\""]
    [(#\') "\'"]
    [(#\\) "\\"]
    [(#\newline) (string-append (string #\\) (string #\newline))]
    [else (string-append (string #\\) (string c))]))

(define (get-hex-unicode ip start-pos)
  (define (hexify str current-pos)
    (let ([the-num (string->number str 16)])
      (if the-num
          (if (> the-num #x10ffff)
              (pyret-lexer-error MSG-ERROR-UNICODE-NUMBER-TOO-LARGE
                                 (start-and-end->srcloc (current-source-name)
                                                        start-pos
                                                        current-pos))
              (string (integer->char (string->number str 16))))
          (pyret-lexer-error MSG-ERROR-BAD-UNICODE
                             (start-and-end->srcloc (current-source-name)
                                                    start-pos
                                                    current-pos)))))
  (define (char-hex? c)
    (member c '(#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9
                #\a #\A #\b #\B #\c #\C #\d #\D #\e #\E #\f #\F)))
  (define (helper str seen stop current-pos)
    (if (= seen stop)
        (hexify str current-pos)
        (let ([next (peek-char-or-special ip)])
          (unless (char? next)
            (pyret-lexer-error MSG-ERROR-INVALID-STRING
                               (start-and-end->srcloc (current-source-name)
                                                      start-pos
                                                      current-pos)))
          (if (not (char-hex? next))
              (hexify str current-pos)
              (let ([next (read-char-or-special ip)])
                (let-values ([(next-line next-col next-off)
                              (port-next-location ip)])
                  (let ([next-pos (make-position next-off next-line next-col)])
                    (helper (string-append str (string next))
                            (add1 seen)
                            stop
                            next-pos))))))))
  (let ([start (read-char ip)])
    (let-values ([(next-l next-c next-o)
                  (port-next-location ip)])
      (let ([limit (case start
                     [(#\x) 2]
                     [(#\u) 4]
                     [(#\U) 8])])
        (helper "" 0 limit (make-position next-o next-l next-c))))))

(define (get-octal-unicode ip start-pos)
  (define (octalify str current-pos)
    (let ([the-num (string->number str 8)])
      (if (> the-num 255)
          (pyret-lexer-error MSG-ERROR-STRING-OCTAL-OVER-255
                             (start-and-end->srcloc (current-source-name)
                                                    start-pos
                                                    (make-position
                                                     (add1 (position-offset current-pos))
                                                     (position-line current-pos)
                                                     (add1 (position-col current-pos)))))
          (string (integer->char (string->number str 8))))))
  (define (char-octal? c) (member c '(#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7)))
  (define (helper str num current-pos)
    (if (= num 3) ; we're done
        (octalify str current-pos)
        (let-values ([(next-line next-col next-pos)
                      (port-next-location ip)])
          (let ([next (peek-char-or-special ip)])
            (unless (char? next)
              (pyret-lexer-error MSG-ERROR-INVALID-STRING
                                 (start-and-end->srcloc (current-source-name)
                                                        start-pos
                                                        (make-position next-pos
                                                                       next-line
                                                                       next-col))))
            (if (not (char-octal? next))
                (octalify str (make-position next-pos next-line next-col))
                (helper (string-append str (string (read-char ip))) 
                        (add1 num)
                        (make-position next-pos next-line next-col)))))))
  (helper "" 0 start-pos))
            
        
  

;; lex-string function lexes a string, given #\' or #\" as the starting
;; character
(define/contract (lex-string ip start-char start-pos)
  (-> input-port? 
      (lambda (sc) (or (equal? #\' sc) (equal? #\" sc))) 
      position? 
      position-token?)
  ;; right now, there is no triple quoting in Pyret
  (lex-normal-string ip start-pos start-char))

(define/contract (lex-normal-string ip start-pos end-char)
  (-> input-port?
      position?
      (lambda (t) (or (equal? t #\") (equal? t #\')))
      position-token?)
  (define (helper str)
    (let-values ([(next-line next-col next-pos)
                  (port-next-location ip)])
      (let ([next (read-char-or-special ip)])
        (unless (char? next)
          (pyret-lexer-error MSG-ERROR-INVALID-STRING
                             (start-and-end->srcloc (current-source-name)
                                                    start-pos
                                                    (make-position next-pos
                                                                   next-line
                                                                   next-col))))
        ;; see if the next char is the end-char
        (if (equal? next end-char)
            ;; looks like we're done
            (make-position-token (token-STRING str)
                                 start-pos
                                 (make-position next-pos
                                                next-line
                                                next-col))
            ;; see if the next char is an escape
            (if (equal? next #\\)
                ;; now we suck up the next char if we can
                (let-values ([(next2-line next2-col next2-pos)
                              (port-next-location ip)])
                  (let ([next2 (peek-char-or-special ip)])
                    (unless (char? next2)
                      (pyret-lexer-error MSG-ERROR-INVALID-STRING
                                         (start-and-end->srcloc (current-source-name)
                                                                start-pos
                                                                (make-position next2-pos
                                                                               next2-line
                                                                               next-col))))
                    (cond
                      [(member next2 '(#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8))
                       ; this is an octal escape sequence
                       (helper 
                        (string-append str 
                                       (get-octal-unicode ip 
                                                          (make-position
                                                           next2-pos 
                                                           next2-line 
                                                           next2-col))))]
                      [(member next2 '(#\x #\u #\U))
                       (helper
                        (string-append str
                                       (get-hex-unicode ip
                                                        (make-position
                                                          next2-pos
                                                          next2-line
                                                          next2-col))))]
                      [else 
                       (helper (string-append str (make-escaped-string (read-char ip))))])))
                (helper (string-append str (string next))))))))
  (helper ""))
                             

                                                    
;; -----------------------------------------------------------------------------
;; Token definitions

;; First, we define the tokens that we are going to lex, starting with
;; tokens that represent values.
(define-tokens value-tokens (NUMBER STRING IMAGE IDENTIFIER))

;; Now, operator tokens
(define-empty-tokens op-tokens ( newline
                                 
                                 OP CP            ; ( )
                                 OB CB            ; [ ]
                                 
                                 COMMA            ; ,
                                 SEMI             ; ;
                                 COLON            ; :
                                 DOT              ; .
                                 
                                 + -
                                 * /
                                 % **
                                 < <=
                                 = !=
                                 >= >

                                 TWO-DOTS         ; ..
                                 THREE-DOTS       ; ...
                                 FOUR-DOTS
                                 FIVE-DOTS
                                 SIX-DOTS
                                 
                                 EOF))

;; Finally, keyword tokens
(define-empty-tokens keyword-tokens ( EMPTY-KW       ; the empty list, []
                                      TRUE-KW        ; True
                                      FALSE-KW       ; False
                                      DEF-KW         ; def
                                      AND-KW         ; and
                                      OR-KW          ; or
                                      NOT-KW         ; not
                                      IF-KW          ; if
                                      ELIF-KW        ; elif
                                      ELSE-KW        ; else
                                      DONE-KW        ; :done
                                      STRUCT-KW      ; struct
                                      IN-KW          ; in
                                      FUN-KW         ; fun

                                      TEST-KW        ; test
                                      TEST-IS-KW     ; is
                                      TEST-WITHIN-KW ; within
                                      TEST-ERROR-KW  ; test_error
                                      TEST-MATCHES-KW ; matches
                                      TEST-RANGE-KW  ; test_range
                                      TEST-FROM-KW   ; from
                                      TEST-TO-KW     ; to

                                      BIG-BANG-KW         ; big_bang
                                    )
)

;; These are abbreviations that we use later in expression-lexer.
(define INEXACT-PREFIX "0nx")
(define LENGTH-INEXACT-PREFIX (string-length INEXACT-PREFIX))
(define-lex-abbrevs
  [line-break #\newline]
  [comment (:: "#" (complement (:: any-string line-break any-string)) line-break)]
  [delimiter blank]
  [letter (:or (:/ #\a #\z) (:/ #\A #\Z))]
  [digit (:/ #\0 #\9)]
  [alpha-numeric (:or letter digit)]
  [identifier (:: letter (:* (:or alpha-numeric #\_)))]
  
  ;; numbers
  [nonzero-digit (:/ #\1 #\9)]
  [maybe-decimal-integer (:or (:: nonzero-digit (:* alpha-numeric))
                              "0")]
  [maybe-octal-integer (:: #\0
                           (:or #\o #\O)
                           (:: (:? #\-) (:* alpha-numeric)))]
  [maybe-hex-integer (:: #\0
                         (:or #\x #\X)
                         (:: (:? #\-) (:* alpha-numeric)))]
  [maybe-float (:or (:: (:* digit)
                        #\.
                        (:* digit)
                        (:? (:: (:or #\e #\E) (:or #\- (:? #\+)) (:* alpha-numeric))))
                    (:: (:+ digit)
                        (:or #\e #\E)
                        (:or #\- (:? #\+))
                        (:* digit)))]
  [maybe-imag (:: (:or maybe-decimal-integer maybe-float)
                  #\i)]
  
  [inexact-prefix-re "0nx"]
  
  [maybe-real-inexact
   (:: inexact-prefix-re
       (:? (:or #\+ #\-))
       (:or maybe-decimal-integer maybe-float))]
  [maybe-imag-inexact-no-op
   (:: inexact-prefix-re
       maybe-imag)]
  [maybe-imag-inexact-with-op
   (:: inexact-prefix-re
       (:or #\+ #\-)
       maybe-imag)]
  [maybe-complex-inexact
   (:: inexact-prefix-re
       (:? (:or #\+ #\-))
       (:or maybe-decimal-integer maybe-float)
       (:or #\+ #\-)
       maybe-imag)]
       
)


(define expression-lexer
  (lexer-src-pos
   [(eof) 'EOF]
   [blank
    (return-without-pos (expression-lexer input-port))]
   [#\newline
    #;(return-without-pos (expression-lexer input-port))
    'newline]
   [#\; 'SEMI]
   [#\, 'COMMA]
   [#\( 'OP]
   [#\) 'CP]
   [#\[ 'OB]
   [#\] 'CB]
   [#\: 'COLON]
   [#\. 'DOT]
   ;; let \ escape a newline
   [(:: #\\ #\newline)
    (return-without-pos (expression-lexer input-port))]
   [#\#
    (begin
      (let ([suck-up-line
             (λ (myself ip)
               (let ([next (read-char-or-special ip)])
                 (if (or (eof-object? next)
                         (and (char? next)
                              (equal? next #\newline)))
                     (return-without-pos (expression-lexer input-port))
                     (myself myself ip))))])
        (suck-up-line suck-up-line input-port)))]
   
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Strings
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   [#\'
    (return-without-pos (lex-string input-port #\' start-pos))]
   [#\"
    (return-without-pos (lex-string input-port #\" start-pos))]
   
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Numbers
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

   [maybe-imag-inexact-no-op
    (let* ([nstr (string-append "0+" (substring lexeme LENGTH-INEXACT-PREFIX))]
           [maybe-num (string->number nstr)])
      (if maybe-num
          (token-NUMBER (string->number (string-append "#i" nstr)))
          (pyret-lexer-error MSG-ERROR-INVALID-INEXACT
                             (start-and-end->srcloc current-source-name
                                                    start-pos
                                                    end-pos))))]
   [maybe-imag-inexact-with-op
    (let* ([nstr (string-append "0" (substring lexeme LENGTH-INEXACT-PREFIX))]
           [maybe-num (string->number nstr)])
      (if maybe-num
          (token-NUMBER (string->number (string-append "#i" nstr)))
          (pyret-lexer-error MSG-ERROR-INVALID-INEXACT
                             (start-and-end->srcloc current-source-name
                                                    start-pos
                                                    end-pos))))]
   [maybe-complex-inexact
    (let* ([nstr (substring lexeme LENGTH-INEXACT-PREFIX)]
           [maybe-num (string->number nstr)])
      (if maybe-num
          (token-NUMBER (string->number (string-append "#i" nstr)))
          (pyret-lexer-error MSG-ERROR-INVALID-INEXACT
                             (start-and-end->srcloc current-source-name
                                                    start-pos
                                                    end-pos))))]
   
   [maybe-real-inexact
    (let* ([nstr (substring lexeme LENGTH-INEXACT-PREFIX)]
           [maybe-num (string->number nstr)])
      (if maybe-num
          (token-NUMBER (string->number (string-append "#i" nstr)))
          (pyret-lexer-error MSG-ERROR-INVALID-INEXACT
                             (start-and-end->srcloc current-source-name
                                                    start-pos
                                                    end-pos))))]


   [maybe-imag
    (let ([new-lexeme (if (equal? (string-ref lexeme 0) #\-)
                          (string-append "0" lexeme)
                          (string-append "0+" lexeme))])
      (let ([maybe-num (string->number new-lexeme)])
        (if maybe-num
            (token-NUMBER maybe-num)
            (pyret-lexer-error MSG-ERROR-INVALID-IMAG
                               (start-and-end->srcloc current-source-name
                                                      start-pos
                                                      end-pos)))))]
   [maybe-decimal-integer
    (let ([maybe-num (string->number lexeme)])
      (if maybe-num
          (token-NUMBER maybe-num)
          (pyret-lexer-error MSG-ERROR-INVALID-NUM
                             (start-and-end->srcloc current-source-name
                                                    start-pos
                                                    end-pos))))]
   
   [maybe-octal-integer
    (let ([new-lexeme lexeme])
      (string-set! new-lexeme 0 #\#)
      (let ([maybe-num (string->number new-lexeme)])
        (if maybe-num
            (token-NUMBER maybe-num)
            (pyret-lexer-error MSG-ERROR-INVALID-OCT
                               (start-and-end->srcloc current-source-name
                                                      start-pos
                                                      end-pos)))))]
   [maybe-hex-integer
    (let ([new-lexeme lexeme])
      (string-set! new-lexeme 0 #\#)
      (let ([maybe-num (string->number new-lexeme)])
        (if maybe-num
            (token-NUMBER maybe-num)
            (pyret-lexer-error MSG-ERROR-INVALID-HEX
                               (start-and-end->srcloc current-source-name
                                                      start-pos
                                                      end-pos)))))]
   [maybe-float
    (let ([maybe-num (string->number lexeme)])
      (if maybe-num
          (token-NUMBER maybe-num)
          (pyret-lexer-error MSG-ERROR-INVALID-FLOAT
                             (start-and-end->srcloc current-source-name
                                                    start-pos
                                                    end-pos))))]
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; operators
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   
   [#\+ '+]
   [#\- '-]
   [#\* '*]
   [#\/ '/]
   [#\% '%]
   [(:: #\* #\*) '**]
   [#\< '<]
   ["<=" '<=]
   ["=" '=]
   ["!=" '!=]
   [">=" '>=]
   [">" '>]
   
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; keywords
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   
   ["empty" 'EMPTY-KW]
   ["True" 'TRUE-KW]
   ["False" 'FALSE-KW]
   ["def" 'DEF-KW]
   ["and" 'AND-KW]
   ["or" 'OR-KW]
   ["not" 'NOT-KW]
   ["if" 'IF-KW]
   ["elif" 'ELIF-KW]
   ["else" 'ELSE-KW]
   [":done" 'DONE-KW]
   ["struct" 'STRUCT-KW]
   ["in" 'IN-KW]
   ["fun" 'FUN-KW]
   ["test" 'TEST-KW]
   ["is" 'TEST-IS-KW]
   ["within" 'TEST-WITHIN-KW]
   ["test_error" 'TEST-ERROR-KW]
   ["matches" 'TEST-MATCHES-KW]
   ["test_range" 'TEST-RANGE-KW]
   ["from" 'TEST-FROM-KW]
   ["to" 'TEST-TO-KW]
   ["big_bang" 'BIG-BANG-KW]

;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Templates
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

   [".." 'TWO-DOTS]
   ["..." 'THREE-DOTS]
   ["...." 'FOUR-DOTS]
   ["....." 'FIVE-DOTS]
   ["......" 'SIX-DOTS]
   
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; other
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   
   [(special)
    (if (image? lexeme)
        (token-IMAGE lexeme)
        (pyret-lexer-error MSG-ERROR-INVALID-TOKEN
                           (start-and-end->srcloc current-source-name
                                                  start-pos
                                                  end-pos)))]
   [identifier
    (token-IDENTIFIER (string->symbol lexeme))]
   
  )
)

;; -----------------------------------------------------------------------------
;; newline-adding-lexer

(define-struct/contract newline-adding-lexer-state [(seen-eof? boolean?)]
  #:transparent
  #:mutable)

(define nal-state (make-parameter (newline-adding-lexer-state #f)))
(provide nal-state newline-adding-lexer-state)

;; This lexer punts to `expression-lexer', but it adds a newline at the end of
;; lex, to make sure that the program still parses if the user fails to type
;; a newline at the end. To use this, you must parameterize the `nal-state'
;; parameter like so:
;;
;; (parameterize ([nal-state (newline-adding-lexer-state #f)])
;;   ...)
(define (newline-adding-lexer ip)
  (let ([token (expression-lexer ip)])
    (if (equal? (position-token-token token) 'EOF)
        (if (newline-adding-lexer-state-seen-eof? (nal-state))
            token
            (begin
              (set-newline-adding-lexer-state-seen-eof?! (nal-state) #t)
              (make-position-token 'newline
                                   (position-token-start-pos token)
                                   (position-token-end-pos token))))
        token)))
(provide newline-adding-lexer)



#|
(define/contract (lex-triple-quoted-string ip start-pos)
  (-> input-port?
      position?
      position-token?)
  (define (helper str num)
    ;; this procedure is a little more complicated, since we have to check
    ;; what's up every time we see a double quote
    (let-values ([(next-pos next-line next-col)
                  (port-next-location ip)])
      (let ([next (read-char-or-special ip)])
        (unless (char? next)
          (pyret-lexer-error MSG-ERROR-INVALID-STRING
                             (start-and-end->srcloc (current-source-name)
                                                    start-pos
                                                    (make-position next-pos
                                                                   next-line
                                                                   next-col))))
        (cond
          [(equal? next #\\)
           (let-values ([(next2-line next2-col next2-pos)
                         (port-next-location ip)])
             (let ([next2 (read-char-or-special ip)])
               (unless (char? next)
                 (pyret-lexer-error MSG-ERROR-INVALID-STRING
                                    (start-and-end->srcloc (current-source-name)
                                                           start-pos
                                                           (make-position next-pos
                                                                          next-line
                                                                          next-col))))
               (helper (string-append str (make-escaped-string next2)))))]
          [(equal? next #\")
           (if (= num 2)
               ;; we're done!
               (make-position-token (token-STRING str)
                                    start-pos
                                    (make-position next-pos next-line next-col))
               ;; see if there are more double quotes...
               (helper str (add1 num)))]
          ;; don't forget about double quotes we may have missed
          [else
           (helper (string-append str (make-string num #\")) 0)]))))
  (helper "" 0))
|#