bsl/lex.rkt
#lang racket

;; file: lex.rkt
;; author: Bill Turtle
;;
;; lexer for the Pyret/BSL language

(require parser-tools/lex
         (prefix-in : parser-tools/lex-sre)
         plai/test-harness)
(provide value-tokens keyword-tokens op-tokens letter digit identifier
         line-break comment expression-lexer)

;; this will be the same for all versions of Pyret
;; TODO: symbols?
(define-tokens value-tokens (NUMBER IDENTIFIER STRING CHAR)) ; values to bind to identifiers

(define-empty-tokens keyword-tokens (FUN DEF LET IN WHERE IF ELIF ELSE RETURN AND OR NOT TRUE FALSE PRINT IS STRUCT))
(define-empty-tokens op-tokens (newline
                                    
                                    OP CP         ; ( )
                                    OB CB         ; [ ]
                                    
                                    COMMA         ; ,
                                    SEMI          ; ; (not used)
                                    COLON         ; :
                                    PERIOD        ; .
                                    
                                    LESS-EQUAL    ; <=
                                    GREATER-EQUAL ; >=
                                    NOT-EQUAL-C   ; !=
                                    NOT-EQUAL-ML  ; <>
                                    = < > + - * /
                                    == %
                                    **            ; exponentiation
                                    EOF))

(define-lex-abbrevs
  [letter    (:or (:/ "a" "z") (:/ "A" "Z"))]
  [digit     (:/ #\0 #\9)]
  [identifier (:: letter (:* (:or letter digit #\_ #\?)))]
  [line-break #\newline]
  [comment (:: "#" (complement (:: any-string line-break any-string)) line-break)])

(define expression-lexer
  (lexer-src-pos
   [(eof) 'EOF]
   [whitespace
    (return-without-pos (expression-lexer input-port))]
   [comment
    (return-without-pos (expression-lexer input-port))]
   [(:or "=" "+" "-" "*" "/" "<" ">" "**" "%" "==") (string->symbol lexeme)]
   ["(" 'OP]
   [")" 'CP]
   ["[" 'OB]
   ["]" 'CB]
   ["," 'COMMA]
   [";" 'SEMI]
   [":" 'COLON]
   ["." 'PERIOD]
   ["!" 'NOT]
   ["<=" 'LESS-EQUAL]
   [">=" 'GREATER-EQUAL]
   ["!=" 'NOT-EQUAL-C]
   ["<>" 'NOT-EQUAL-ML]
   ["fun" 'FUN]
   ["let" 'LET]
   ["in" 'IN]
   ["where" 'WHERE]
   ["def" 'DEF]
   ["if" 'IF]
   ["elif" 'ELIF]
   ["else" 'ELSE]
   ["and" 'AND]
   ["or" 'OR]
   ["not" 'NOT]
   ["return" 'RETURN]
   ["True" 'TRUE]
   ["False" 'FALSE]
   ["print" 'PRINT]
   ["is" 'IS]
   ["struct" 'STRUCT]
   [identifier
    (token-IDENTIFIER #;(string->symbol (regexp-replace* #rx"_" lexeme "-"))
                      (string->symbol lexeme))]  ; don't replace _'s yet
   [(:+ digit) (token-NUMBER (string->number lexeme))]
   [(:: (:+ digit) #\. (:* digit)) (token-NUMBER (string->number lexeme))] ; non-integers
   [(:: #\' any-char #\')
    (token-CHAR (string-ref lexeme 1))]
   [(:: #\" any-string #\")
    (let* ([len (string-length lexeme)]
           [sub (substring lexeme 1 (- len 1))]) ; strip outer quotations
      (token-STRING sub))]))

(define (test-lexer string)
  (define p (open-input-string string))
  (position-token-token (expression-lexer p)))

;(test (test-lexer "true") 'TRUE)
;(test (test-lexer "false") 'FALSE)