#lang racket
(require parser-tools/lex
(prefix-in : parser-tools/lex-sre)
plai/test-harness)
(provide value-tokens keyword-tokens op-tokens letter digit identifier
line-break comment lex-string
)
(provide/contract
[expression-lexer (-> input-port? position-token?)]
)
(define-tokens value-tokens (NUMBER IDENTIFIER STRING CHAR IMAGE))
(define-empty-tokens keyword-tokens
(AND AS ASSERT BREAK CLASS CONTINUE DEF
DEL ELIF ELSE EXCEPT EXEC FINALLY FOR
FROM GLOBAL IF IMPORT IN IS LAMBDA
NOT OR PASS PRINT RAISE RETURN TRY
WHILE WITH YIELD
FUN STRUCT TRUE FALSE))
(define-empty-tokens op-tokens (newline
OP CP OB CB
COMMA SEMI COLON PERIOD
LESS-EQUAL GREATER-EQUAL NOT-EQUAL-C NOT-EQUAL-ML = < > + - * /
== %
** EOF))
(define-lex-abbrevs
[letter (:or (:/ "a" "z") (:/ "A" "Z"))]
[identifier (:: letter (:* (:or letter digit #\_ #\?)))]
[line-break #\newline]
[comment (:: "#" (complement (:: any-string line-break any-string)) line-break)]
[stringliteral (:: (:? stringprefix) (:or shortstring longstring))]
[stringprefix (:or "r" "u" "ur" "R" "U" "UR" "Ur" "uR"
"b" "B" "br" "BRr" "bR" "BR")]
[shortstring (:or (:: "'" (:* shortstringitem) "'")
(:: #\" (:* shortstringitem) #\"))]
[longstring (:or (:: "'''" (:* longstringitem) "'''")
(:: (:: #\" #\" #\")
(:* longstringitem)
(:: #\" #\" #\")))]
[shortstringitem (:or shortstringchar escapeseq)]
[longstringitem (:or longstringchar escapeseq)]
[shortstringchar (complement (:: #\\ #\newline #\"))]
[longstringchar (complement #\newline)]
[escapeseq (:: #\\ any-char)]
[longinteger (:: integer (:or #\l #\L))]
[integer decimalinteger]
[decimalinteger (:or (:: nonzerodigit (:* digit)) "0")]
[octinteger (:or (:: "0" (:or "o" "O") (:? octdigit)) (:: "0" (:? octdigit)))]
[hexinteger (:: "0" (:or "x" "X") (:? hexdigit))]
[bininteger (:: "0" (:or "b" "B") (:? bindigit))]
[digit (:/ #\0 #\9)]
[nonzerodigit (:/ #\1 #\9)]
[octdigit (:/ #\0 #\7)]
[bindigit (:or #\0 #\1)]
[hexdigit (:or digit (:/ #\a #\f) (:/ #\A #\F))]
[floatnumber (:or pointfloat exponentfloat)]
[pointfloat (:or (:: (:? intpart) fraction)
(:: intpart #\.))]
[exponentfloat (:: (:or intpart pointfloat) exponent)]
[intpart (:+ digit)]
[fraction (:: "." (:+ digit))]
[exponent (:: (:or #\e #\E) (:? (:or #\+ #\-)) (:+ digit))]
[imagnumber (:: (:or floatnumber intpart) (:or #\j #\J))])
(define expression-lexer
(lexer-src-pos
[(eof) 'EOF]
[whitespace
(return-without-pos (expression-lexer input-port))]
[#\newline 'newline]
[comment
(return-without-pos (expression-lexer input-port))]
[(:or "=" "+" "-" "*" "/" "<" ">" "**" "%" "==") (string->symbol lexeme)]
["(" 'OP]
[")" 'CP]
["[" 'OB]
["]" 'CB]
["," 'COMMA]
[";" 'SEMI]
[":" 'COLON]
["." 'PERIOD]
["!" 'NOT]
["<=" 'LESS-EQUAL]
[">=" 'GREATER-EQUAL]
["!=" 'NOT-EQUAL-C]
["<>" 'NOT-EQUAL-ML]
["fun" 'FUN]
["let" 'LET]
["in" 'IN]
["where" 'WHERE]
["def" 'DEF]
["if" 'IF]
["elif" 'ELIF]
["else" 'ELSE]
["and" 'AND]
["or" 'OR]
["not" 'NOT]
["return" 'RETURN]
["True" 'TRUE]
["False" 'FALSE]
["print" 'PRINT]
["is" 'IS]
["struct" 'STRUCT]
[(special)
(token-IMAGE lexeme)] [identifier
(token-IDENTIFIER (string->symbol (regexp-replace* #rx"_" lexeme "-"))
(string->symbol lexeme))] [(:: #\' any-char #\')
(token-CHAR (string-ref lexeme 1))]
[(:: #\" (:* (:~ #\")) #\")
(let* ([len (string-length lexeme)]
[sub (substring lexeme 1 (- len 1))]) (token-STRING sub))
(token-STRING lexeme)]
[longinteger
(let* ([len (string-length lexeme)]
[intstring (substring lexeme 0 (sub1 len))])
(token-NUMBER (string->number intstring)))]
[integer
(token-NUMBER (string->number lexeme))]
[decimalinteger (token-NUMBER (string->number lexeme))]
[floatnumber (token-NUMBER (string->number lexeme))]
[imagnumber
(token-NUMBER
(string->number
(format "0+~a"
(regexp-replace #rx"j"
(regexp-replace #rx"J" lexeme "i")
"i"))))]))
(define (lex-string string)
(define p (open-input-string string))
(position-token-token (expression-lexer p)))