lexer.ss
(module lexer mzscheme
  (require (lib "lex.ss" "parser-tools"))
  (require (prefix : (lib "lex-sre.ss" "parser-tools")))
  (require (lib "etc.ss"))

  ;; ===========================================================================
  ;; TOKENS
  ;; ===========================================================================

  ;; TODO: Java -> C
  (define-empty-tokens Operators
    (PIPE OR OREQUAL
     =	> < !	~	?	:
     ==	<=	>=	!=	&&	++	--
     +	-	*	/	&	^	%	<< >> >>>
     +=	-=	*=	/=	&=	^=	%=	<<=	>>=	>>>=))

  (define-empty-tokens Separators
   (O_PAREN C_PAREN O_BRACE C_BRACE O_BRACKET C_BRACKET SEMI_COLON PERIOD COMMA))

  (define-empty-tokens EmptyLiterals
   (EOF))

  (define-empty-tokens Keywords
    (break case char const continue default double else float for goto if do int long
     return short static struct switch typedef union void while))

  (define-tokens BasicTokens
    (STRING_LIT CHAR_LIT INTEGER_LIT LONG_LIT FLOAT_LIT DOUBLE_LIT 
     IDENTIFIER STRING_ERROR NUMBER_ERROR READ_ERROR HEX_LIT OCT_LIT HEXL_LIT OCTL_LIT))

  (define (trim-string s f l)
    (substring s f (- (string-length s) l)))

  ;; ===========================================================================
  ;; REGULAR EXPRESSIONS
  ;; ===========================================================================

  (define-lex-abbrevs
    (CR #\015)
    (LF #\012)
    (LineTerminator (:or CR 
		       LF 
		       (:seq CR LF)))
    (InputCharacter (^ CR LF))

    (FF #\014)
    (TAB #\011)
    (WhiteSpace (:or #\space 
		   TAB
		   FF
		   LineTerminator))

    ;; (Had to transform CommentTail and CommentTailStar into one RE)
    (Comment (:or TraditionalComment 
		EndOfLineComment
                DocumentationComment))
    (TraditionalComment (:seq "/*" NotStar CommentTail))
    (EndOfLineComment (:seq "//" (:* (:~ CR LF))))
    (DocumentationComment (:seq "/**" CommentTailStar))
    (CommentTail (:seq (:* (:seq (:* NotStar) (:+ "*") NotStarNotSlash))
                    (:* NotStar)
		    (:+ "*")
		    "/"))
    (CommentTailStar (:seq (:* (:seq (:* "*") NotStarNotSlash (:* NotStar) "*"))
                        (:* "*")
                        "/"))
    (NotStar (:or (:~ "*")))
    (NotStarNotSlash (:or (:~ "*" "/")))
                     
    (SyntaxComment (:or TraditionalCommentEOF
                      EndOfLineComment))
    (TraditionalCommentEOF (:seq "/*" CommentTailEOF))
    (CommentTailEOF (:or (:seq (:* (:seq (:* NotStar) (:+ "*") NotStarNotSlash))
                        (:* NotStar)
                        (:+ "*")
                        "/")
                       (:seq (:* (:seq (:* NotStar) (:+ "*") NotStarNotSlash))
                        (:* NotStar)
                        (:* "*"))))

    ;; (No need to worry about excluding keywords and such.  They will
    ;;  appear first in the lexer spec)
    (Identifier (:seq IDLetter (:* IDLetterOrDigit)))
    (IDLetter (:or (:/ "A" "Z")
		   (:/ "a" "z")
		   "_"
		   "$"))
    (IDLetterOrDigit (:or IDLetter
			  (:/ "0" "9")))

    (Keyword (:or "break" "case" "char" "const" "continue" "default" "double"
                  "else" "float" "for" "goto" "if" "do" "int" "long" "return"
                  "short" "static" "struct" "switch" "typedef" "union" "void" "while"))

    (Digits (:+ (:/ #\0 #\9)))
    (DigitsOpt (:* (:/ #\0 #\9)))

    (IntegerTypeSuffix (:or "l" "L"))
    (DecimalNumeral (:or #\0
		       (:seq (:/ #\1 #\9) (:* (:/ #\0 #\9)))))
    (HexDigit (:or (:/ #\0 #\9)
		 (:/ #\a #\f)
		 (:/ #\A #\F)))
    (HexNumeral (:or (:seq #\0 "x" (:+ HexDigit))
		   (:seq #\0 "X" (:+ HexDigit))))
    (OctalNumeral (:seq #\0 (:+ (:/ #\0 #\7))))

    (FloatTypeSuffix (:or "f" "F"))
    (DoubleTypeSuffix (:or "d" "D"))

    (FloatA (:seq Digits #\. DigitsOpt (:? ExponentPart)))
    (FloatB (:seq #\. Digits (:? ExponentPart)))
    (FloatC (:seq Digits ExponentPart))
    (FloatD (:seq Digits (:? ExponentPart)))

    (ExponentPart (:seq (:or "e" "E") (:? (:or "+" "-")) Digits))

    (EscapeSequence (:or "\\b" "\\t" "\\n" "\\f" "\\r" "\\\"" "\\'" "\\\\"
		       (:seq #\\ (:/ #\0 #\3) (:/ #\0 #\7) (:/ #\0 #\7))
		       (:seq #\\ (:/ #\0 #\7) (:/ #\0 #\7))
		       (:seq #\\ (:/ #\0 #\7))))

    (Operator (:or "="	">" "<" "!"	"~"	"?"	":"
		 "=="	"<="	">="	"!="	"&&" "||"	"++"	"--"
		 "+"	"-"	"*"	"/"	"&" "|"	"^"	"%"	"<<" ">>" ">>>"
		 "+="	"-="	"*="	"/="	"&="	"|="	"^="	"%="	"<<="	">>="	">>>=")))

  ;; String tokens
  (define-tokens str-tok (STRING_CHAR))
  (define-empty-tokens StringErrors (STRING_END STRING_EOF STRING_NEWLINE))

  (define-struct string-error (string error-token) #f)

  ;; ===========================================================================
  ;; LEXERS
  ;; ===========================================================================

  ;; tokens->string : (listof position-token) -> string
  (define (tokens->string toks)
    (list->string (map (compose token-value position-token-token) toks)))

  ;; string-lexer : position input-port -> position-token
  (define (string-lexer first-token-pos in)
    (let* ([tokens (get-string-tokens in)]
           [rev-tokens (reverse tokens)]
           [last-token (car rev-tokens)]
           [str (tokens->string (reverse (cdr rev-tokens)))])
      (make-position-token
       (if (eq? 'STRING_END (get-token-name last-token))
           (token-STRING_LIT str)
           (token-STRING_ERROR (make-string-error str (position-token-token last-token))))
       first-token-pos
       (position-token-end-pos last-token))))

  ;; get-string-tokens : input-port -> (listof position-token)
  (define (get-string-tokens in)
    (let ((tok (get-string-token in)))
      (case (get-token-name tok)
        ((STRING_EOF STRING_END STRING_NEWLINE) (list tok))
        (else (cons tok (get-string-tokens in))))))

  ;; get-string-token : input-port -> position-token
  (define get-string-token
    (lexer-src-pos
     (#\" (token-STRING_END))
     (EscapeSequence (token-STRING_CHAR (EscapeSequence->char lexeme)))
     ((:~ CR LF) (token-STRING_CHAR (string-ref lexeme 0)))
     ((:or CR LF) (token-STRING_NEWLINE))
     (#\032 (token-STRING_EOF))
     ((eof) (token-STRING_EOF))))

  ;; get-token-name : position-token -> symbol
  (define (get-token-name tok)
    (token-name (position-token-token tok)))

  (define (EscapeSequence->char es)
    (cond
     ((string=? es "\\b") #\010)
     ((string=? es "\\t") #\011)
     ((string=? es "\\n") #\012)
     ((string=? es "\\f") #\014)
     ((string=? es "\\r") #\015)
     ((string=? es "\\\"") #\")
     ((string=? es "\\'") #\')
     ((string=? es "\\\\") #\\)
     (else (integer->char (string->number (trim-string es 1 0) 8)))))

  (define (make-c-lexer nesting)
    (lexer-src-pos
     (Operator (let ((l lexeme))
                 (cond
                   ((string=? l "|") (token-PIPE))
                   ((string=? l "||") (token-OR))
                   ((string=? l "|=") (token-OREQUAL))
                   (else (string->symbol l)))))

     ("(" (token-O_PAREN))
     (")" (token-C_PAREN))
     ("{" (begin
            (when nesting
              (box++ nesting))
            (token-O_BRACE)))
     ("}" (if nesting
              (begin
                (box-- nesting)
                (if (zero? (unbox nesting))
                    'EOF
                    (token-C_BRACE)))
              (token-C_BRACE)))
;          (token-C_BRACE))
     ("[" (token-O_BRACKET))
     ("]" (token-C_BRACKET))
     (";" (token-SEMI_COLON))
     ("," (token-COMMA))
     ("." (token-PERIOD))

     (#\" (return-without-pos (string-lexer start-pos input-port)))

     ((:seq #\' (:~ CR LF #\' #\\) #\')
      (token-CHAR_LIT (string-ref lexeme 1)))
     ((:seq #\' EscapeSequence #\') 
      (token-CHAR_LIT (EscapeSequence->char 
                       (trim-string lexeme 1 1))))

     ((:or FloatA FloatB FloatC)
      (token-DOUBLE_LIT (string->number lexeme)))
     ((:seq (:or FloatA FloatB FloatC FloatD) FloatTypeSuffix)
      (token-FLOAT_LIT (string->number (trim-string lexeme 0 1))))
     ((:seq (:or FloatA FloatB FloatC FloatD) DoubleTypeSuffix)
      (token-DOUBLE_LIT (string->number (trim-string lexeme 0 1))))

     (DecimalNumeral
      (token-INTEGER_LIT (string->number lexeme 10)))
     ((:seq DecimalNumeral IntegerTypeSuffix)
      (token-LONG_LIT (string->number (trim-string lexeme 0 1) 10)))
     (HexNumeral
      (token-HEX_LIT (string->number (trim-string lexeme 2 0) 16)))
     ((:seq HexNumeral IntegerTypeSuffix)
      (token-HEXL_LIT (string->number (trim-string lexeme 2 1) 16)))
     (OctalNumeral
      (token-OCT_LIT (string->number (trim-string lexeme 1 0) 8)))
     ((:seq OctalNumeral IntegerTypeSuffix)
      (token-OCTL_LIT (string->number (trim-string lexeme 1 1) 8)))

     (Keyword (string->symbol lexeme))

     (Identifier (token-IDENTIFIER (string->symbol lexeme)))

     (Comment (return-without-pos ((make-c-lexer nesting) input-port)))

     ((:+ WhiteSpace) (return-without-pos ((make-c-lexer nesting) input-port)))

     (#\032 (if (and nesting (> (unbox nesting) 0))
                (token-READ_ERROR "expected a `}' to close `{'")
                'EOF))
     ((eof) (if (and nesting (> (unbox nesting) 0))
                (token-READ_ERROR "expected a `}' to close `{'")
                'EOF))

     ((:+ (:or (:/ #\0 #\9)(:/ #\a #\z)(:/ #\A #\Z))) (token-NUMBER_ERROR lexeme))
     
     ))

  (define c-lexer (make-c-lexer #f))

  (define (box++ box)
    (let ([v (unbox box)])
      (set-box! box (add1 v))
      v))

  (define (box-- box)
    (let ([v (unbox box)])
      (set-box! box (sub1 v))
      v))

  (provide (all-defined-except EscapeSequence->char
                               box++ box--
                               string-lexer get-string-tokens get-string-token get-token-name
                               trim-string)))