#lang racket
(require parser-tools/lex)
(require (prefix-in : parser-tools/lex-sre))
(require (for-syntax "../utilities.rkt"))
(require 2htdp/image)
(provide/contract
[expression-lexer (-> input-port? position-token?)]
)
(provide (rename-out [comment comment-re]
[alpha-numeric alpha-numeric-re]
[maybe-float float-re]
[maybe-imag imag-re]
[current-source-name lexer-source-name]
[identifier identifier-re]
)
)
(provide value-tokens op-tokens keyword-tokens)
(define MSG-ERROR-INVALID-NUM "this is not a valid number")
(define MSG-ERROR-INVALID-OCT "this is not a valid octal number")
(define MSG-ERROR-INVALID-HEX "this is not a valid hexadecimal number")
(define MSG-ERROR-INVALID-IMAG "this is not a valid imaginary number")
(define MSG-ERROR-INVALID-FLOAT "this is not a valid floating-point number")
(define MSG-ERROR-INVALID-INEXACT "this is not a valid inexact number")
(define MSG-ERROR-INVALID-STRING "this is not a valid string")
(define MSG-ERROR-INVALID-TOKEN "this is not a valid piece of code")
(define MSG-ERROR-STRING-OCTAL-OVER-255 "this octal escape sequence is over 255")
(define MSG-ERROR-UNICODE-NUMBER-TOO-LARGE
(string-append "unicode numbers must be smaller than 0x10ffff"))
(define MSG-ERROR-BAD-UNICODE
"no hex digit after unicode escape sequence")
(define current-source-name (make-parameter #f))
(define-struct (exn:fail:pyret:lexer
exn:fail)
(a-srcloc)
#:property prop:exn:srclocs
(lambda (a-struct)
(match a-struct
[(struct exn:fail:pyret:lexer
(msg marks a-srcloc))
(list a-srcloc)])))
(define (start-and-end->srcloc src start end)
(let ([sn (if (current-source-name)
(current-source-name)
#f)])
(srcloc sn
(position-line start)
(position-col start)
(position-offset start)
(- (position-offset end) (position-offset start)))))
(define (pyret-lexer-error message sl)
(raise
(make-exn:fail:pyret:lexer
(string-append "lexer: " message)
(current-continuation-marks)
sl)))
(define (make-escaped-string c)
(case c
[(#\a) "\a"]
[(#\b) "\b"]
[(#\t) "\t"]
[(#\n) "\n"]
[(#\v) "\v"]
[(#\f) "\f"]
[(#\r) "\r"]
[(#\e) "\e"]
[(#\") "\""]
[(#\') "\'"]
[(#\\) "\\"]
[(#\newline) (string-append (string #\\) (string #\newline))]
[else (string-append (string #\\) (string c))]))
(define (get-hex-unicode ip start-pos)
(define (hexify str current-pos)
(let ([the-num (string->number str 16)])
(if the-num
(if (> the-num #x10ffff)
(pyret-lexer-error MSG-ERROR-UNICODE-NUMBER-TOO-LARGE
(start-and-end->srcloc (current-source-name)
start-pos
current-pos))
(string (integer->char (string->number str 16))))
(pyret-lexer-error MSG-ERROR-BAD-UNICODE
(start-and-end->srcloc (current-source-name)
start-pos
current-pos)))))
(define (char-hex? c)
(member c '(#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9
#\a #\A #\b #\B #\c #\C #\d #\D #\e #\E #\f #\F)))
(define (helper str seen stop current-pos)
(if (= seen stop)
(hexify str current-pos)
(let ([next (peek-char-or-special ip)])
(unless (char? next)
(pyret-lexer-error MSG-ERROR-INVALID-STRING
(start-and-end->srcloc (current-source-name)
start-pos
current-pos)))
(if (not (char-hex? next))
(hexify str current-pos)
(let ([next (read-char-or-special ip)])
(let-values ([(next-line next-col next-off)
(port-next-location ip)])
(let ([next-pos (make-position next-off next-line next-col)])
(helper (string-append str (string next))
(add1 seen)
stop
next-pos))))))))
(let ([start (read-char ip)])
(let-values ([(next-l next-c next-o)
(port-next-location ip)])
(let ([limit (case start
[(#\x) 2]
[(#\u) 4]
[(#\U) 8])])
(helper "" 0 limit (make-position next-o next-l next-c))))))
(define (get-octal-unicode ip start-pos)
(define (octalify str current-pos)
(let ([the-num (string->number str 8)])
(if (> the-num 255)
(pyret-lexer-error MSG-ERROR-STRING-OCTAL-OVER-255
(start-and-end->srcloc (current-source-name)
start-pos
(make-position
(add1 (position-offset current-pos))
(position-line current-pos)
(add1 (position-col current-pos)))))
(string (integer->char (string->number str 8))))))
(define (char-octal? c) (member c '(#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7)))
(define (helper str num current-pos)
(if (= num 3) (octalify str current-pos)
(let-values ([(next-line next-col next-pos)
(port-next-location ip)])
(let ([next (peek-char-or-special ip)])
(unless (char? next)
(pyret-lexer-error MSG-ERROR-INVALID-STRING
(start-and-end->srcloc (current-source-name)
start-pos
(make-position next-pos
next-line
next-col))))
(if (not (char-octal? next))
(octalify str (make-position next-pos next-line next-col))
(helper (string-append str (string (read-char ip)))
(add1 num)
(make-position next-pos next-line next-col)))))))
(helper "" 0 start-pos))
(define/contract (lex-string ip start-char start-pos)
(-> input-port?
(lambda (sc) (or (equal? #\' sc) (equal? #\" sc)))
position?
position-token?)
(lex-normal-string ip start-pos start-char))
(define/contract (lex-normal-string ip start-pos end-char)
(-> input-port?
position?
(lambda (t) (or (equal? t #\") (equal? t #\')))
position-token?)
(define (helper str)
(let-values ([(next-line next-col next-pos)
(port-next-location ip)])
(let ([next (read-char-or-special ip)])
(unless (char? next)
(pyret-lexer-error MSG-ERROR-INVALID-STRING
(start-and-end->srcloc (current-source-name)
start-pos
(make-position next-pos
next-line
next-col))))
(if (equal? next end-char)
(make-position-token (token-STRING str)
start-pos
(make-position next-pos
next-line
next-col))
(if (equal? next #\\)
(let-values ([(next2-line next2-col next2-pos)
(port-next-location ip)])
(let ([next2 (peek-char-or-special ip)])
(unless (char? next2)
(pyret-lexer-error MSG-ERROR-INVALID-STRING
(start-and-end->srcloc (current-source-name)
start-pos
(make-position next2-pos
next2-line
next-col))))
(cond
[(member next2 '(#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8))
(helper
(string-append str
(get-octal-unicode ip
(make-position
next2-pos
next2-line
next2-col))))]
[(member next2 '(#\x #\u #\U))
(helper
(string-append str
(get-hex-unicode ip
(make-position
next2-pos
next2-line
next2-col))))]
[else
(helper (string-append str (make-escaped-string (read-char ip))))])))
(helper (string-append str (string next))))))))
(helper ""))
(define-tokens value-tokens (NUMBER STRING IMAGE IDENTIFIER))
(define-empty-tokens op-tokens ( newline
OP CP OB CB
COMMA SEMI COLON DOT
+ -
* /
% **
< <=
= !=
>= >
TWO-DOTS THREE-DOTS FOUR-DOTS
FIVE-DOTS
SIX-DOTS
EOF))
(define-empty-tokens keyword-tokens ( EMPTY-KW TRUE-KW FALSE-KW DEF-KW AND-KW OR-KW NOT-KW IF-KW ELIF-KW ELSE-KW DONE-KW STRUCT-KW IN-KW FUN-KW
TEST-KW TEST-IS-KW TEST-WITHIN-KW TEST-ERROR-KW TEST-MATCHES-KW TEST-RANGE-KW TEST-FROM-KW TEST-TO-KW
BIG-BANG-KW )
)
(define INEXACT-PREFIX "0nx")
(define LENGTH-INEXACT-PREFIX (string-length INEXACT-PREFIX))
(define-lex-abbrevs
[line-break #\newline]
[comment (:: "#" (complement (:: any-string line-break any-string)) line-break)]
[delimiter blank]
[letter (:or (:/ #\a #\z) (:/ #\A #\Z))]
[digit (:/ #\0 #\9)]
[alpha-numeric (:or letter digit)]
[identifier (:: letter (:* (:or alpha-numeric #\_)))]
[nonzero-digit (:/ #\1 #\9)]
[maybe-decimal-integer (:or (:: nonzero-digit (:* alpha-numeric))
"0")]
[maybe-octal-integer (:: #\0
(:or #\o #\O)
(:: (:? #\-) (:* alpha-numeric)))]
[maybe-hex-integer (:: #\0
(:or #\x #\X)
(:: (:? #\-) (:* alpha-numeric)))]
[maybe-float (:or (:: (:* digit)
#\.
(:* digit)
(:? (:: (:or #\e #\E) (:or #\- (:? #\+)) (:* alpha-numeric))))
(:: (:+ digit)
(:or #\e #\E)
(:or #\- (:? #\+))
(:* digit)))]
[maybe-imag (:: (:or maybe-decimal-integer maybe-float)
#\i)]
[inexact-prefix-re "0nx"]
[maybe-real-inexact
(:: inexact-prefix-re
(:? (:or #\+ #\-))
(:or maybe-decimal-integer maybe-float))]
[maybe-imag-inexact-no-op
(:: inexact-prefix-re
maybe-imag)]
[maybe-imag-inexact-with-op
(:: inexact-prefix-re
(:or #\+ #\-)
maybe-imag)]
[maybe-complex-inexact
(:: inexact-prefix-re
(:? (:or #\+ #\-))
(:or maybe-decimal-integer maybe-float)
(:or #\+ #\-)
maybe-imag)]
)
(define expression-lexer
(lexer-src-pos
[(eof) 'EOF]
[blank
(return-without-pos (expression-lexer input-port))]
[#\newline
(return-without-pos (expression-lexer input-port))
'newline]
[#\; 'SEMI]
[#\, 'COMMA]
[#\( 'OP]
[#\) 'CP]
[#\[ 'OB]
[#\] 'CB]
[#\: 'COLON]
[#\. 'DOT]
[(:: #\\ #\newline)
(return-without-pos (expression-lexer input-port))]
[#\#
(begin
(let ([suck-up-line
(λ (myself ip)
(let ([next (read-char-or-special ip)])
(if (or (eof-object? next)
(and (char? next)
(equal? next #\newline)))
(return-without-pos (expression-lexer input-port))
(myself myself ip))))])
(suck-up-line suck-up-line input-port)))]
[#\'
(return-without-pos (lex-string input-port #\' start-pos))]
[#\"
(return-without-pos (lex-string input-port #\" start-pos))]
[maybe-imag-inexact-no-op
(let* ([nstr (string-append "0+" (substring lexeme LENGTH-INEXACT-PREFIX))]
[maybe-num (string->number nstr)])
(if maybe-num
(token-NUMBER (string->number (string-append "#i" nstr)))
(pyret-lexer-error MSG-ERROR-INVALID-INEXACT
(start-and-end->srcloc current-source-name
start-pos
end-pos))))]
[maybe-imag-inexact-with-op
(let* ([nstr (string-append "0" (substring lexeme LENGTH-INEXACT-PREFIX))]
[maybe-num (string->number nstr)])
(if maybe-num
(token-NUMBER (string->number (string-append "#i" nstr)))
(pyret-lexer-error MSG-ERROR-INVALID-INEXACT
(start-and-end->srcloc current-source-name
start-pos
end-pos))))]
[maybe-complex-inexact
(let* ([nstr (substring lexeme LENGTH-INEXACT-PREFIX)]
[maybe-num (string->number nstr)])
(if maybe-num
(token-NUMBER (string->number (string-append "#i" nstr)))
(pyret-lexer-error MSG-ERROR-INVALID-INEXACT
(start-and-end->srcloc current-source-name
start-pos
end-pos))))]
[maybe-real-inexact
(let* ([nstr (substring lexeme LENGTH-INEXACT-PREFIX)]
[maybe-num (string->number nstr)])
(if maybe-num
(token-NUMBER (string->number (string-append "#i" nstr)))
(pyret-lexer-error MSG-ERROR-INVALID-INEXACT
(start-and-end->srcloc current-source-name
start-pos
end-pos))))]
[maybe-imag
(let ([new-lexeme (if (equal? (string-ref lexeme 0) #\-)
(string-append "0" lexeme)
(string-append "0+" lexeme))])
(let ([maybe-num (string->number new-lexeme)])
(if maybe-num
(token-NUMBER maybe-num)
(pyret-lexer-error MSG-ERROR-INVALID-IMAG
(start-and-end->srcloc current-source-name
start-pos
end-pos)))))]
[maybe-decimal-integer
(let ([maybe-num (string->number lexeme)])
(if maybe-num
(token-NUMBER maybe-num)
(pyret-lexer-error MSG-ERROR-INVALID-NUM
(start-and-end->srcloc current-source-name
start-pos
end-pos))))]
[maybe-octal-integer
(let ([new-lexeme lexeme])
(string-set! new-lexeme 0 #\#)
(let ([maybe-num (string->number new-lexeme)])
(if maybe-num
(token-NUMBER maybe-num)
(pyret-lexer-error MSG-ERROR-INVALID-OCT
(start-and-end->srcloc current-source-name
start-pos
end-pos)))))]
[maybe-hex-integer
(let ([new-lexeme lexeme])
(string-set! new-lexeme 0 #\#)
(let ([maybe-num (string->number new-lexeme)])
(if maybe-num
(token-NUMBER maybe-num)
(pyret-lexer-error MSG-ERROR-INVALID-HEX
(start-and-end->srcloc current-source-name
start-pos
end-pos)))))]
[maybe-float
(let ([maybe-num (string->number lexeme)])
(if maybe-num
(token-NUMBER maybe-num)
(pyret-lexer-error MSG-ERROR-INVALID-FLOAT
(start-and-end->srcloc current-source-name
start-pos
end-pos))))]
[#\+ '+]
[#\- '-]
[#\* '*]
[#\/ '/]
[#\% '%]
[(:: #\* #\*) '**]
[#\< '<]
["<=" '<=]
["=" '=]
["!=" '!=]
[">=" '>=]
[">" '>]
["empty" 'EMPTY-KW]
["True" 'TRUE-KW]
["False" 'FALSE-KW]
["def" 'DEF-KW]
["and" 'AND-KW]
["or" 'OR-KW]
["not" 'NOT-KW]
["if" 'IF-KW]
["elif" 'ELIF-KW]
["else" 'ELSE-KW]
[":done" 'DONE-KW]
["struct" 'STRUCT-KW]
["in" 'IN-KW]
["fun" 'FUN-KW]
["test" 'TEST-KW]
["is" 'TEST-IS-KW]
["within" 'TEST-WITHIN-KW]
["test_error" 'TEST-ERROR-KW]
["matches" 'TEST-MATCHES-KW]
["test_range" 'TEST-RANGE-KW]
["from" 'TEST-FROM-KW]
["to" 'TEST-TO-KW]
["big_bang" 'BIG-BANG-KW]
[".." 'TWO-DOTS]
["..." 'THREE-DOTS]
["...." 'FOUR-DOTS]
["....." 'FIVE-DOTS]
["......" 'SIX-DOTS]
[(special)
(if (image? lexeme)
(token-IMAGE lexeme)
(pyret-lexer-error MSG-ERROR-INVALID-TOKEN
(start-and-end->srcloc current-source-name
start-pos
end-pos)))]
[identifier
(token-IDENTIFIER (string->symbol lexeme))]
)
)
(define-struct/contract newline-adding-lexer-state [(seen-eof? boolean?)]
#:transparent
#:mutable)
(define nal-state (make-parameter (newline-adding-lexer-state #f)))
(provide nal-state newline-adding-lexer-state)
(define (newline-adding-lexer ip)
(let ([token (expression-lexer ip)])
(if (equal? (position-token-token token) 'EOF)
(if (newline-adding-lexer-state-seen-eof? (nal-state))
token
(begin
(set-newline-adding-lexer-state-seen-eof?! (nal-state) #t)
(make-position-token 'newline
(position-token-start-pos token)
(position-token-end-pos token))))
token)))
(provide newline-adding-lexer)