(module lexer mzscheme
(require (lib "lex.ss" "parser-tools"))
(require (prefix : (lib "lex-sre.ss" "parser-tools")))
(require (lib "etc.ss"))
(define-empty-tokens Operators
(PIPE OR OREQUAL
= > < ! ~ ? :
== <= >= != && ++ --
+ - * / & ^ % << >> >>>
+= -= *= /= &= ^= %= <<= >>= >>>=))
(define-empty-tokens Separators
(O_PAREN C_PAREN O_BRACE C_BRACE O_BRACKET C_BRACKET SEMI_COLON PERIOD COMMA))
(define-empty-tokens EmptyLiterals
(EOF))
(define-empty-tokens Keywords
(break case char const continue default double else float for goto if do int long
return short static struct switch typedef union void while))
(define-tokens BasicTokens
(STRING_LIT CHAR_LIT INTEGER_LIT LONG_LIT FLOAT_LIT DOUBLE_LIT
IDENTIFIER STRING_ERROR NUMBER_ERROR READ_ERROR HEX_LIT OCT_LIT HEXL_LIT OCTL_LIT))
(define (trim-string s f l)
(substring s f (- (string-length s) l)))
(define-lex-abbrevs
(CR #\015)
(LF #\012)
(LineTerminator (:or CR
LF
(:seq CR LF)))
(InputCharacter (^ CR LF))
(FF #\014)
(TAB #\011)
(WhiteSpace (:or #\space
TAB
FF
LineTerminator))
(Comment (:or TraditionalComment
EndOfLineComment
DocumentationComment))
(TraditionalComment (:seq "/*" NotStar CommentTail))
(EndOfLineComment (:seq "//" (:* (:~ CR LF))))
(DocumentationComment (:seq "/**" CommentTailStar))
(CommentTail (:seq (:* (:seq (:* NotStar) (:+ "*") NotStarNotSlash))
(:* NotStar)
(:+ "*")
"/"))
(CommentTailStar (:seq (:* (:seq (:* "*") NotStarNotSlash (:* NotStar) "*"))
(:* "*")
"/"))
(NotStar (:or (:~ "*")))
(NotStarNotSlash (:or (:~ "*" "/")))
(SyntaxComment (:or TraditionalCommentEOF
EndOfLineComment))
(TraditionalCommentEOF (:seq "/*" CommentTailEOF))
(CommentTailEOF (:or (:seq (:* (:seq (:* NotStar) (:+ "*") NotStarNotSlash))
(:* NotStar)
(:+ "*")
"/")
(:seq (:* (:seq (:* NotStar) (:+ "*") NotStarNotSlash))
(:* NotStar)
(:* "*"))))
(Identifier (:seq IDLetter (:* IDLetterOrDigit)))
(IDLetter (:or (:/ "A" "Z")
(:/ "a" "z")
"_"
"$"))
(IDLetterOrDigit (:or IDLetter
(:/ "0" "9")))
(Keyword (:or "break" "case" "char" "const" "continue" "default" "double"
"else" "float" "for" "goto" "if" "do" "int" "long" "return"
"short" "static" "struct" "switch" "typedef" "union" "void" "while"))
(Digits (:+ (:/ #\0 #\9)))
(DigitsOpt (:* (:/ #\0 #\9)))
(IntegerTypeSuffix (:or "l" "L"))
(DecimalNumeral (:or #\0
(:seq (:/ #\1 #\9) (:* (:/ #\0 #\9)))))
(HexDigit (:or (:/ #\0 #\9)
(:/ #\a #\f)
(:/ #\A #\F)))
(HexNumeral (:or (:seq #\0 "x" (:+ HexDigit))
(:seq #\0 "X" (:+ HexDigit))))
(OctalNumeral (:seq #\0 (:+ (:/ #\0 #\7))))
(FloatTypeSuffix (:or "f" "F"))
(DoubleTypeSuffix (:or "d" "D"))
(FloatA (:seq Digits #\. DigitsOpt (:? ExponentPart)))
(FloatB (:seq #\. Digits (:? ExponentPart)))
(FloatC (:seq Digits ExponentPart))
(FloatD (:seq Digits (:? ExponentPart)))
(ExponentPart (:seq (:or "e" "E") (:? (:or "+" "-")) Digits))
(EscapeSequence (:or "\\b" "\\t" "\\n" "\\f" "\\r" "\\\"" "\\'" "\\\\"
(:seq #\\ (:/ #\0 #\3) (:/ #\0 #\7) (:/ #\0 #\7))
(:seq #\\ (:/ #\0 #\7) (:/ #\0 #\7))
(:seq #\\ (:/ #\0 #\7))))
(Operator (:or "=" ">" "<" "!" "~" "?" ":"
"==" "<=" ">=" "!=" "&&" "||" "++" "--"
"+" "-" "*" "/" "&" "|" "^" "%" "<<" ">>" ">>>"
"+=" "-=" "*=" "/=" "&=" "|=" "^=" "%=" "<<=" ">>=" ">>>=")))
(define-tokens str-tok (STRING_CHAR))
(define-empty-tokens StringErrors (STRING_END STRING_EOF STRING_NEWLINE))
(define-struct string-error (string error-token) #f)
(define (tokens->string toks)
(list->string (map (compose token-value position-token-token) toks)))
(define (string-lexer first-token-pos in)
(let* ([tokens (get-string-tokens in)]
[rev-tokens (reverse tokens)]
[last-token (car rev-tokens)]
[str (tokens->string (reverse (cdr rev-tokens)))])
(make-position-token
(if (eq? 'STRING_END (get-token-name last-token))
(token-STRING_LIT str)
(token-STRING_ERROR (make-string-error str (position-token-token last-token))))
first-token-pos
(position-token-end-pos last-token))))
(define (get-string-tokens in)
(let ((tok (get-string-token in)))
(case (get-token-name tok)
((STRING_EOF STRING_END STRING_NEWLINE) (list tok))
(else (cons tok (get-string-tokens in))))))
(define get-string-token
(lexer-src-pos
(#\" (token-STRING_END))
(EscapeSequence (token-STRING_CHAR (EscapeSequence->char lexeme)))
((:~ CR LF) (token-STRING_CHAR (string-ref lexeme 0)))
((:or CR LF) (token-STRING_NEWLINE))
(#\032 (token-STRING_EOF))
((eof) (token-STRING_EOF))))
(define (get-token-name tok)
(token-name (position-token-token tok)))
(define (EscapeSequence->char es)
(cond
((string=? es "\\b") #\010)
((string=? es "\\t") #\011)
((string=? es "\\n") #\012)
((string=? es "\\f") #\014)
((string=? es "\\r") #\015)
((string=? es "\\\"") #\")
((string=? es "\\'") #\')
((string=? es "\\\\") #\\)
(else (integer->char (string->number (trim-string es 1 0) 8)))))
(define (make-c-lexer nesting)
(lexer-src-pos
(Operator (let ((l lexeme))
(cond
((string=? l "|") (token-PIPE))
((string=? l "||") (token-OR))
((string=? l "|=") (token-OREQUAL))
(else (string->symbol l)))))
("(" (token-O_PAREN))
(")" (token-C_PAREN))
("{" (begin
(when nesting
(box++ nesting))
(token-O_BRACE)))
("}" (if nesting
(begin
(box-- nesting)
(if (zero? (unbox nesting))
'EOF
(token-C_BRACE)))
(token-C_BRACE)))
("[" (token-O_BRACKET))
("]" (token-C_BRACKET))
(";" (token-SEMI_COLON))
("," (token-COMMA))
("." (token-PERIOD))
(#\" (return-without-pos (string-lexer start-pos input-port)))
((:seq #\' (:~ CR LF #\' #\\) #\')
(token-CHAR_LIT (string-ref lexeme 1)))
((:seq #\' EscapeSequence #\')
(token-CHAR_LIT (EscapeSequence->char
(trim-string lexeme 1 1))))
((:or FloatA FloatB FloatC)
(token-DOUBLE_LIT (string->number lexeme)))
((:seq (:or FloatA FloatB FloatC FloatD) FloatTypeSuffix)
(token-FLOAT_LIT (string->number (trim-string lexeme 0 1))))
((:seq (:or FloatA FloatB FloatC FloatD) DoubleTypeSuffix)
(token-DOUBLE_LIT (string->number (trim-string lexeme 0 1))))
(DecimalNumeral
(token-INTEGER_LIT (string->number lexeme 10)))
((:seq DecimalNumeral IntegerTypeSuffix)
(token-LONG_LIT (string->number (trim-string lexeme 0 1) 10)))
(HexNumeral
(token-HEX_LIT (string->number (trim-string lexeme 2 0) 16)))
((:seq HexNumeral IntegerTypeSuffix)
(token-HEXL_LIT (string->number (trim-string lexeme 2 1) 16)))
(OctalNumeral
(token-OCT_LIT (string->number (trim-string lexeme 1 0) 8)))
((:seq OctalNumeral IntegerTypeSuffix)
(token-OCTL_LIT (string->number (trim-string lexeme 1 1) 8)))
(Keyword (string->symbol lexeme))
(Identifier (token-IDENTIFIER (string->symbol lexeme)))
(Comment (return-without-pos ((make-c-lexer nesting) input-port)))
((:+ WhiteSpace) (return-without-pos ((make-c-lexer nesting) input-port)))
(#\032 (if (and nesting (> (unbox nesting) 0))
(token-READ_ERROR "expected a `}' to close `{'")
'EOF))
((eof) (if (and nesting (> (unbox nesting) 0))
(token-READ_ERROR "expected a `}' to close `{'")
'EOF))
((:+ (:or (:/ #\0 #\9)(:/ #\a #\z)(:/ #\A #\Z))) (token-NUMBER_ERROR lexeme))
))
(define c-lexer (make-c-lexer #f))
(define (box++ box)
(let ([v (unbox box)])
(set-box! box (add1 v))
v))
(define (box-- box)
(let ([v (unbox box)])
(set-box! box (sub1 v))
v))
(provide (all-defined-except EscapeSequence->char
box++ box--
string-lexer get-string-tokens get-string-token get-token-name
trim-string)))