bsl/lex.rkt
#lang racket

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; file: lex.rkt                                ;;
;; author: Bill Turtle                          ;;
;;                                              ;;
;; lexer for the Pyret/BSL language             ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(require parser-tools/lex
         (prefix-in : parser-tools/lex-sre)
         plai/test-harness)
(provide value-tokens keyword-tokens op-tokens letter digit identifier
         line-break comment lex-string
)
(provide/contract
 [expression-lexer (-> input-port? position-token?)]
)

;; These are 'primitive' values
(define-tokens value-tokens (NUMBER IDENTIFIER STRING CHAR IMAGE))

;; we do not use all of these in BSL, but we may as well reserve them now
(define-empty-tokens keyword-tokens
  (AND AS ASSERT BREAK CLASS CONTINUE DEF
   DEL ELIF ELSE EXCEPT EXEC FINALLY FOR
   FROM GLOBAL IF IMPORT IN IS LAMBDA
   NOT OR PASS PRINT RAISE RETURN TRY
   WHILE WITH YIELD
   ;; these are keywords we added to the language
   FUN STRUCT TRUE FALSE))
(define-empty-tokens op-tokens (newline
                                    
                                    OP CP         ; ( )
                                    OB CB         ; [ ]
                                    
                                    COMMA         ; ,
                                    SEMI          ; ; (not used)
                                    COLON         ; :
                                    PERIOD        ; .
                                    
                                    LESS-EQUAL    ; <=
                                    GREATER-EQUAL ; >=
                                    NOT-EQUAL-C   ; !=
                                    NOT-EQUAL-ML  ; <>
                                    = < > + - * /
                                    == %
                                    **            ; exponentiation
                                    EOF))

(define-lex-abbrevs
  [letter    (:or (:/ "a" "z") (:/ "A" "Z"))]
  [identifier (:: letter (:* (:or letter digit #\_ #\?)))]
  [line-break #\newline]
  ; TODO: fix comment re
  [comment (:: "#" (complement (:: any-string line-break any-string)) line-break)]

  ;; the following are taken from the Python reference
  
  ; strings
  [stringliteral (:: (:? stringprefix) (:or shortstring longstring))]
  [stringprefix (:or "r" "u" "ur" "R" "U" "UR" "Ur" "uR"
                     "b" "B" "br" "BRr" "bR" "BR")]
  [shortstring (:or (:: "'" (:* shortstringitem) "'")
                    (:: #\" (:* shortstringitem) #\"))]
  [longstring (:or (:: "'''" (:* longstringitem) "'''")
                   (:: (:: #\" #\" #\")
                       (:* longstringitem)
                       (:: #\" #\" #\")))]
  [shortstringitem (:or shortstringchar escapeseq)]
  [longstringitem (:or longstringchar escapeseq)]
  [shortstringchar (complement (:: #\\ #\newline #\"))]
  [longstringchar (complement #\newline)]
  [escapeseq (:: #\\ any-char)]
  
  ; Integer and long integer literals
  [longinteger (:: integer (:or #\l #\L))]
  [integer decimalinteger]
  [decimalinteger (:or (:: nonzerodigit (:* digit)) "0")]
  #;[octinteger (:or (:: "0" (:or "o" "O") (:? octdigit)) (:: "0" (:? octdigit)))]
  #;[hexinteger (:: "0" (:or "x" "X") (:? hexdigit))]
  #;[bininteger (:: "0" (:or "b" "B") (:? bindigit))]
  [digit (:/ #\0 #\9)]
  [nonzerodigit (:/ #\1 #\9)]
  #;[octdigit (:/ #\0 #\7)]
  #;[bindigit (:or #\0 #\1)]
  #;[hexdigit (:or digit (:/ #\a #\f) (:/ #\A #\F))]
  
  ; Floating point
  [floatnumber (:or pointfloat exponentfloat)]
  [pointfloat (:or (:: (:? intpart) fraction)
                   (:: intpart #\.))]
  [exponentfloat (:: (:or intpart pointfloat) exponent)]
  [intpart (:+ digit)]
  [fraction (:: "." (:+ digit))]
  [exponent (:: (:or #\e #\E) (:? (:or #\+ #\-)) (:+ digit))]
  [imagnumber (:: (:or floatnumber intpart) (:or #\j #\J))])
                   

(define expression-lexer
  (lexer-src-pos
   [(eof) 'EOF]
   [whitespace
    (return-without-pos (expression-lexer input-port))]
   [#\newline 'newline]
   [comment
    (return-without-pos (expression-lexer input-port))]
   [(:or "=" "+" "-" "*" "/" "<" ">" "**" "%" "==") (string->symbol lexeme)]
   ["(" 'OP]
   [")" 'CP]
   ["[" 'OB]
   ["]" 'CB]
   ["," 'COMMA]
   [";" 'SEMI]
   [":" 'COLON]
   ["." 'PERIOD]
   ["!" 'NOT]
   ["<=" 'LESS-EQUAL]
   [">=" 'GREATER-EQUAL]
   ["!=" 'NOT-EQUAL-C]
   ["<>" 'NOT-EQUAL-ML]
   ["fun" 'FUN]
   ["let" 'LET]
   ["in" 'IN]
   ["where" 'WHERE]
   ["def" 'DEF]
   ["if" 'IF]
   ["elif" 'ELIF]
   ["else" 'ELSE]
   ["and" 'AND]
   ["or" 'OR]
   ["not" 'NOT]
   ["return" 'RETURN]
   ["True" 'TRUE]
   ["False" 'FALSE]
   ["print" 'PRINT]
   ["is" 'IS]
   ["struct" 'STRUCT]
   [(special)
    (token-IMAGE lexeme)] ; FIXME check to make sure it's an image
   [identifier
    (token-IDENTIFIER #;(string->symbol (regexp-replace* #rx"_" lexeme "-"))
                      (string->symbol lexeme))]  ; don't replace _'s yet
   [(:: #\' any-char #\')
    (token-CHAR (string-ref lexeme 1))]
   [(:: #\" (:* (:~ #\")) #\")
    (let* ([len (string-length lexeme)]
           [sub (substring lexeme 1 (- len 1))]) ; strip outer backslashes
      (token-STRING sub))
    #;(token-STRING lexeme)]
   [longinteger
    (let* ([len (string-length lexeme)]
           [intstring (substring lexeme 0 (sub1 len))])
      (token-NUMBER (string->number intstring)))]
   [integer
    (token-NUMBER (string->number lexeme))]
   [decimalinteger (token-NUMBER (string->number lexeme))]
   [floatnumber (token-NUMBER (string->number lexeme))]
   [imagnumber
    (token-NUMBER
      (string->number
        (format "0+~a" 
                (regexp-replace #rx"j" 
                                (regexp-replace #rx"J" lexeme "i")
                                "i"))))]))

(define (lex-string string)
  (define p (open-input-string string))
  (position-token-token (expression-lexer p)))