bsl/lex.rkt
#lang racket

;; file: lex.rkt
;; author: Bill Turtle
;;
;; lexer (tokenizer) for the Pyret/BSL language

(require parser-tools/lex
         (prefix-in : parser-tools/lex-sre)
         plai/test-harness)
(provide value-tokens keyword-tokens op-tokens letter digit identifier
         line-break comment expression-lexer)

;; this will be the same for all versions of Pyret
;; Tokens that carry a payload: the lexer attaches the numeric value to
;; NUMBER and the (symbol) name to IDENTIFIER.
(define-tokens value-tokens
  (NUMBER IDENTIFIER))

;; Reserved words of the language. These are "empty" tokens: they carry no
;; value beyond their own name.
(define-empty-tokens keyword-tokens
  (DEF LAMBDA          ; function definition / anonymous function
   IF ELIF ELSE        ; conditional branches
   RETURN
   AND OR NOT          ; boolean connectives
   TRUE FALSE))        ; boolean literals
;; Payload-free tokens for layout, punctuation and operators. Punctuation and
;; the multi-character comparisons get descriptive names; the remaining
;; operators are named by their own spelling (the token for "+" is the
;; symbol +, and so on).
(define-empty-tokens op-tokens
  (newline                                  ; line terminator
   OP CP OB CB                              ; (  )  [  ]
   COMMA SEMI COLON PERIOD                  ; ,  ;  :  .   (SEMI is noted as not used)
   NEG LESS-EQUAL GREATER-EQUAL NOT-EQUAL   ; !  <=  >=  !=
   = < > + - * /
   == %
   **                                       ; exponentiation
   EOF))                                    ; end of input

;; Regular-expression abbreviations used by the lexer rules below; they are
;; also exported so other modules can reuse them.
(define-lex-abbrevs
  ;; one ASCII letter
  [letter     (:or (:/ "a" "z") (:/ "A" "Z"))]
  ;; one decimal digit
  [digit      (:/ #\0 #\9)]
  ;; a letter followed by any mix of letters, digits, `_` and `?`
  [identifier (:: letter (:* (:or letter digit #\_ #\?)))]
  [line-break #\newline]
  ;; `#` through the end of the line. The terminating newline is consumed when
  ;; it is present (longest match), but it is optional so that a comment on
  ;; the final line of input without a trailing newline still lexes instead
  ;; of raising a "no match" error.
  [comment    (:: "#" (:* (:~ #\newline)) (:? line-break))])

;; expression-lexer : input-port -> position-token
;; Reads the next token from the port. Whitespace and `#` comments are skipped.
;; Every result is wrapped with source positions by lexer-src-pos.
;;
;; Rule selection is longest-match; when several rules match the same length
;; the earliest rule wins. That ordering is what makes keywords take priority
;; over `identifier`, so keep the keyword rules above the identifier rule.
(define expression-lexer
  (lexer-src-pos
   [(eof) 'EOF]

   ;; Skipped input. The recursive call already returns a position-token, so
   ;; return-without-pos stops it from being wrapped a second time.
   [whitespace
    (return-without-pos (expression-lexer input-port))]
   [comment
    (return-without-pos (expression-lexer input-port))]

   ;; NOTE(review): `whitespace` above also matches #\newline and is listed
   ;; first, so this rule is never selected and no newline token is emitted.
   ;; Left in place (and inert) because enabling it would change the token
   ;; stream seen by the parser -- confirm the intended behaviour there.
   [#\newline (token-newline)] ; (token-newline) returns 'newline

   ;; Operators whose token name is their own spelling.
   [(:or "=" "+" "-" "*" "/" "<" ">" "**" "%" "==") (string->symbol lexeme)]

   ;; Punctuation and comparison operators with descriptive token names.
   ["(" 'OP]
   [")" 'CP]
   ["[" 'OB]
   ["]" 'CB]
   ["," 'COMMA]
   [";" 'SEMI]
   [":" 'COLON]
   ["." 'PERIOD]
   ["!" 'NEG]
   ["<=" 'LESS-EQUAL]
   [">=" 'GREATER-EQUAL]
   ["!=" 'NOT-EQUAL]

   ;; Keywords (must precede `identifier`, see header comment).
   [#\λ 'LAMBDA]
   ["lambda" 'LAMBDA]
   ["def" 'DEF]
   ["if" 'IF]
   ["elif" 'ELIF]
   ["else" 'ELSE]
   ["and" 'AND]
   ["or" 'OR]
   ["not" 'NOT]
   ["return" 'RETURN]
   ["true" 'TRUE]
   ["false" 'FALSE]

   ;; Identifiers: underscores are rewritten to hyphens in the resulting symbol.
   [identifier
    (token-IDENTIFIER (string->symbol (regexp-replace* #rx"_" lexeme "-")))]

   ;; Numeric literals: optional leading minus, one or more digits, and an
   ;; optional fractional part whose digits may be omitted (e.g. 5, -5, 5.25,
   ;; -5.25, 5.). Previously only integers accepted the leading minus, so
   ;; "-5.5" was split into NUMBER -5, PERIOD, NUMBER 5.
   ;; NOTE(review): because the longest match wins, "a-1" lexes as
   ;; IDENTIFIER a followed by NUMBER -1 rather than a subtraction; that
   ;; pre-existing behaviour is unchanged here since fixing it requires the
   ;; parser to handle unary minus.
   [(:: (:? #\-) (:+ digit) (:? (:: #\. (:* digit))))
    (token-NUMBER (string->number lexeme))]))

;; test-lexer : string -> token
;; Testing convenience: lexes the first token found in `string` and returns
;; it with the source-position wrapper stripped off.
(define (test-lexer string)
  (position-token-token
   (expression-lexer (open-input-string string))))

;(test (test-lexer "true") 'TRUE)
;(test (test-lexer "false") 'FALSE)