#lang scheme/base (require parser-tools/lex (prefix-in : parser-tools/lex-sre) "syntactic-context.ss") (provide Operators Separators EmptyLiterals Keywords BasicTokens str-tok StringErrors string-error tokens->string make-c-lexer c-lexer) ;; XXX: most of this was lifted from the ProfessorJ parser; modify for C ;; ============================================================================= ;; TOKENS ;; ============================================================================= ;; TODO: Java -> C (define-empty-tokens Operators (PIPE OR OREQUAL = > < ! ~ ? : == <= >= != && ++ -- -> + - * / & ^ % << >> >>> += -= *= /= &= ^= %= <<= >>= >>>=)) ;; XXX: use L/R instead of O/C (define-empty-tokens Separators (O_PAREN C_PAREN O_BRACE C_BRACE O_BRACKET C_BRACKET SEMI_COLON COLON PERIOD COMMA ELLIPSIS)) (define-empty-tokens EmptyLiterals (EOF)) (define-empty-tokens Keywords (auto break case char const continue default do double else enum extern float for goto if inline int long register restrict return short signed sizeof static struct switch typedef union unsigned void volatile while _Bool _Complex)) (define-tokens BasicTokens (STRING_LIT CHAR_LIT INTEGER_LIT LONG_LIT FLOAT_LIT DOUBLE_LIT IDENTIFIER TYPEDEF_NAME STRING_ERROR NUMBER_ERROR READ_ERROR HEX_LIT OCT_LIT HEXL_LIT OCTL_LIT)) (define (trim-string s f l) (substring s f (- (string-length s) l))) ;; ============================================================================= ;; REGULAR EXPRESSIONS ;; ============================================================================= (define-lex-abbrevs (CR #\015) (LF #\012) (LineTerminator (:or CR LF (:seq CR LF))) (InputCharacter (^ CR LF)) (FF #\014) (TAB #\011) (WhiteSpace (:or #\space TAB FF LineTerminator)) ;; (Had to transform CommentTail and CommentTailStar into one RE) (Comment (:or TraditionalComment EndOfLineComment DocumentationComment)) (TraditionalComment (:seq "/*" NotStar CommentTail)) (EndOfLineComment (:seq "//" (:* (:~ CR LF)))) (DocumentationComment (:seq "/**" CommentTailStar)) (CommentTail (:seq (:* (:seq (:* NotStar) (:+ "*") NotStarNotSlash)) (:* NotStar) (:+ "*") "/")) (CommentTailStar (:seq (:* (:seq (:* "*") NotStarNotSlash (:* NotStar) "*")) (:* "*") "/")) (NotStar (:or (:~ "*"))) (NotStarNotSlash (:or (:~ "*" "/"))) (SyntaxComment (:or TraditionalCommentEOF EndOfLineComment)) (TraditionalCommentEOF (:seq "/*" CommentTailEOF)) (CommentTailEOF (:or (:seq (:* (:seq (:* NotStar) (:+ "*") NotStarNotSlash)) (:* NotStar) (:+ "*") "/") (:seq (:* (:seq (:* NotStar) (:+ "*") NotStarNotSlash)) (:* NotStar) (:* "*")))) ;; (No need to worry about excluding keywords and such. They will ;; appear first in the lexer spec) (Identifier (:seq IDLetter (:* IDLetterOrDigit))) (IDLetter (:or (:/ "A" "Z") (:/ "a" "z") "_" "$")) (IDLetterOrDigit (:or IDLetter (:/ "0" "9"))) (Keyword (:or "break" "case" "char" "const" "continue" "default" "double" "else" "enum" "extern" "float" "for" "goto" "if" "do" "int" "long" "return" "short" "sizeof" "static" "struct" "switch" "typedef" "union" "void" "while")) (Digits (:+ (:/ #\0 #\9))) (DigitsOpt (:* (:/ #\0 #\9))) (IntegerTypeSuffix (:or "l" "L")) (DecimalNumeral (:or #\0 (:seq (:/ #\1 #\9) (:* (:/ #\0 #\9))))) (HexDigit (:or (:/ #\0 #\9) (:/ #\a #\f) (:/ #\A #\F))) (HexNumeral (:or (:seq #\0 "x" (:+ HexDigit)) (:seq #\0 "X" (:+ HexDigit)))) (OctalNumeral (:seq #\0 (:+ (:/ #\0 #\7)))) (FloatTypeSuffix (:or "f" "F")) (DoubleTypeSuffix (:or "d" "D")) (FloatA (:seq Digits #\. DigitsOpt (:? ExponentPart))) (FloatB (:seq #\. Digits (:? ExponentPart))) (FloatC (:seq Digits ExponentPart)) (FloatD (:seq Digits (:? ExponentPart))) (ExponentPart (:seq (:or "e" "E") (:? (:or "+" "-")) Digits)) (EscapeSequence (:or "\\b" "\\t" "\\n" "\\f" "\\r" "\\\"" "\\'" "\\\\" (:seq #\\ (:/ #\0 #\3) (:/ #\0 #\7) (:/ #\0 #\7)) (:seq #\\ (:/ #\0 #\7) (:/ #\0 #\7)) (:seq #\\ (:/ #\0 #\7)))) (Operator (:or ">" "<" "!" "~" "?" ":" "==" "<=" ">=" "!=" "&&" "||" "++" "--" "->" "+" "-" "*" "/" "&" "|" "^" "%" "<<" ">>" ">>>" "+=" "-=" "*=" "/=" "&=" "|=" "^=" "%=" "<<=" ">>=" ">>>="))) ;; String tokens (define-tokens str-tok (STRING_CHAR)) (define-empty-tokens StringErrors (STRING_END STRING_EOF STRING_NEWLINE)) (define-struct string-error (string error-token) #:transparent) ;; ============================================================================= ;; LEXERS ;; ============================================================================= ;; tokens->string : (listof position-token) -> string (define (tokens->string toks) (list->string (map (compose token-value position-token-token) toks))) ;; string-lexer : position input-port -> position-token (define (string-lexer first-token-pos in) (let* ([tokens (get-string-tokens in)] [rev-tokens (reverse tokens)] [last-token (car rev-tokens)] [str (tokens->string (reverse (cdr rev-tokens)))]) (make-position-token (if (eq? 'STRING_END (get-token-name last-token)) (token-STRING_LIT str) (token-STRING_ERROR (make-string-error str (position-token-token last-token)))) first-token-pos (position-token-end-pos last-token)))) ;; get-string-tokens : input-port -> (listof position-token) (define (get-string-tokens in) (let ((tok (get-string-token in))) (case (get-token-name tok) ((STRING_EOF STRING_END STRING_NEWLINE) (list tok)) (else (cons tok (get-string-tokens in)))))) ;; get-string-token : input-port -> position-token (define get-string-token (lexer-src-pos (#\" (token-STRING_END)) (EscapeSequence (token-STRING_CHAR (EscapeSequence->char lexeme))) ((:~ CR LF) (token-STRING_CHAR (string-ref lexeme 0))) ((:or CR LF) (token-STRING_NEWLINE)) (#\032 (token-STRING_EOF)) ((eof) (token-STRING_EOF)))) ;; get-token-name : position-token -> symbol (define (get-token-name tok) (token-name (position-token-token tok))) (define (EscapeSequence->char es) (case es [("\\b") #\010] [("\\t") #\011] [("\\n") #\012] [("\\f") #\014] [("\\r") #\015] [("\\\"") #\"] [("\\'") #\'] [("\\\\") #\\] [else (integer->char (string->number (trim-string es 1 0) 8))])) (define (make-c-lexer ps ls) (define (return x) (begin (save-token! ls x) x)) (lexer-src-pos ["=" (begin (return '=))] [Operator (return (case lexeme [("|") (token-PIPE)] [("||") (token-OR)] [("|=") (token-OREQUAL)] [else (string->symbol lexeme)]))] ["(" (begin (parenthesis++ ls) (return (token-O_PAREN)))] [")" (begin (parenthesis-- ls) (return (token-C_PAREN)))] ["{" (begin (brace++ ls) ;(push-scope! ps) (return (token-O_BRACE)))] ["}" (begin (brace-- ls) ;(pop-scope! ps) (if (and (lexer-state-read? ls) (zero? (lexer-state-brace-depth ls))) (return 'EOF) (return (token-C_BRACE))))] ["[" (return (token-O_BRACKET))] ["]" (return (token-C_BRACKET))] [":" (return (token-COLON))] [";" (begin (when (declaration-context? ps) (dequeue-declaration! ps ls) (lexer-declaration-depth-- ls) (printf "dequeued for semi\n") (dump-context ps ls)) (return (token-SEMI_COLON)))] ["," (begin (when (declaration-context? ps) (dequeue-declaration! ps ls) (lexer-declaration-depth-- ls) (printf "dequeued for comma\n") (dump-context ps ls)) (return (token-COMMA)))] ["..." (return (token-ELLIPSIS))] ["." (return (token-PERIOD))] [#\" (return-without-pos (return (string-lexer start-pos input-port)))] [(:seq #\' (:~ CR LF #\' #\\) #\') (return (token-CHAR_LIT (string-ref lexeme 1)))] [(:seq #\' EscapeSequence #\') (return (token-CHAR_LIT (EscapeSequence->char (trim-string lexeme 1 1))))] [(:or FloatA FloatB FloatC) (return (token-DOUBLE_LIT (string->number lexeme)))] [(:seq (:or FloatA FloatB FloatC FloatD) FloatTypeSuffix) (return (token-FLOAT_LIT (string->number (trim-string lexeme 0 1))))] [(:seq (:or FloatA FloatB FloatC FloatD) DoubleTypeSuffix) (return (token-DOUBLE_LIT (string->number (trim-string lexeme 0 1))))] [DecimalNumeral (return (token-INTEGER_LIT (string->number lexeme 10)))] [(:seq DecimalNumeral IntegerTypeSuffix) (return (token-LONG_LIT (string->number (trim-string lexeme 0 1) 10)))] [HexNumeral (return (token-HEX_LIT (string->number (trim-string lexeme 2 0) 16)))] [(:seq HexNumeral IntegerTypeSuffix) (return (token-HEXL_LIT (string->number (trim-string lexeme 2 1) 16)))] [OctalNumeral (return (token-OCT_LIT (string->number (trim-string lexeme 1 0) 8)))] [(:seq OctalNumeral IntegerTypeSuffix) (return (token-OCTL_LIT (string->number (trim-string lexeme 1 1) 8)))] [Keyword (return (string->symbol lexeme))] [Identifier (let ([id (string->symbol lexeme)]) (printf "looking at ~a~n" id) (dump-context ps ls) (return (cond [(lookup id (parser-state-env ps)) => (lambda (binding) (if (eq? binding 'type) (token-TYPEDEF_NAME id) (token-IDENTIFIER id)))] [else (token-IDENTIFIER id)])))] [Comment (return-without-pos ((make-c-lexer ps ls) input-port))] [(:+ WhiteSpace) (return-without-pos ((make-c-lexer ps ls) input-port))] [#\032 (if (and (lexer-state-read? ls) (> (lexer-state-brace-depth ls) 0)) (return (token-READ_ERROR "expected a `}' to close `{'")) (return 'EOF))] [(eof) (if (and (lexer-state-read? ls) (> (lexer-state-brace-depth ls) 0)) (return (token-READ_ERROR "expected a `}' to close `{'")) (return 'EOF))] [(:+ (:or (:/ #\0 #\9)(:/ #\a #\z)(:/ #\A #\Z))) (return (token-NUMBER_ERROR lexeme))] )) (define c-lexer (make-c-lexer (new-parser-state) (new-lexer-state #f)))