lexer.ss
(module lexer mzscheme
  (require
   (lib "lex.ss" "parser-tools")
   (prefix : (lib "lex-sre.ss" "parser-tools"))
   (lib "match.ss"))
  (provide
   get-token
   get-comment-token
   get-string-token)
  (define (ret lexeme type paren start-pos end-pos)
    (values lexeme type paren (position-offset start-pos) (position-offset end-pos)))
  (define-lex-abbrevs
    [digit (:/ "0" "9")]
    [digit2 (:/ "0" "1")]
    [digit8 (:/ "0" "7")]
    [digit10 digit]
    [digit16 (:/ "af" "AF" "09")]
    [unicode
     (:or
      (:: "u" (:** 1 4 digit16))
      (:: "U" (:** 1 6 digit16)))]
    [keyword
     (:or
      "as" "assert" "do" "done" "downto" "else" "exception" "for" "fun"
      "function" "if" "lazy" "match" "mutable" "new" "private" "then"
      "to" "try" "when" "while" "with")]
    [governing-keyword
     (:or
      "and" "class" "constraint" "external" "functor" "in"
      "include" "inherit" "initializer" "let" "method"
      "module" "object" "open" "rec" "type" "val" "virtual")]
    [open-close (:or "begin" "end" "sig" "struct")]
    [true-false (:or "true" "false")]
    [ident (:: alphabetic (:* (:or alphabetic numeric "'" "_")))]
    [int-expr (:: numeric (:* (:or numeric "_")))]
    [int-literal (:: (:? "-") int-expr)]
    [decimal-expr (:: "." (:* (:or numeric "_")))]
    [bad-exponent-expr (:: (:or "e" "E") (:? (:or "+" "-")))]
    [exponent-expr (:: (:or "e" "E") (:? (:or "+" "-")) int-expr)]
    [float-literal (:: int-literal (:? decimal-expr) (:? exponent-expr))]
    [bad-float-literal (:: int-literal (:? decimal-expr) bad-exponent-expr)]
    [text-operator (:or "asr" "land" "lor" "lsl" "lsr" "lxor" "mod" "or" "of")]
    [operator
     (:or
      "=" "<" ">" "@" "^" "|" "&" "+" "-" "*" "/" "$" "%"
      "." "::" "_" ";" "," ":" "->")]
    [paren (:or "(" ")" "[" "]" "{" "}")]
    [comment-start (:or "(*" "(*)")]
    [comment-end "*)"]
    [non-comment-tag
     (:or
      (:~ "(" ")")
      (:: "(" (:~ "*"))
      (:: (:~ "*") ")")
      (:: "(" (:~ "*") ")"))]
    [comment-inside (:: (:+ non-comment-tag))]
    [comment-through-end (:: (:* non-comment-tag) "*)")]
    [double-semi ";;"]
    [char-elt
     (:or
      (:~ (:or "\"" "\\"))
      "\\\\"
      "\\\""
      "\\'"
      "\\n"
      "\\t"
      "\\b"
      "\\r"
      (:: "\\" (:** 1 3 digit8))
      (:: "\\x" (:** 1 2 digit16)))]
    [char-literal (:: "\'" char-elt "\'")]
    [bad-str (:: "\"" (:* (:~ "\"" "\\")
                     (:: "\\" any-char))
                 (:? "\\" "\""))]
    [str (:: "\"" (:* char-elt) "\"")])
  
  ; get-comment-token: input-port -> (values string symbol boolean number number)
  (define get-comment-token
    (lexer
     [comment-through-end (ret lexeme 'comment-end #f start-pos end-pos)]
     [comment-start (find-end-of-comment
                     input-port lexeme
                     (position-offset start-pos)
                     (position-offset end-pos))]
     [comment-inside (ret lexeme 'comment-inside #f start-pos end-pos)]
     [(eof) (values lexeme 'eof #f #f #f)]
     [any-char (ret lexeme 'error #f start-pos end-pos)]))
  
  ; find-end-of-comment: input-port string number number ->
  ;   (values string symbol boolean number number)
  (define (find-end-of-comment input-port lexeme start-pos end-pos)
    (define-values (result-lexeme result-status paren comment-start-pos comment-end-pos)
      (get-comment-token input-port))
    (cond [(eq? result-status 'eof)
           (values lexeme 'error paren start-pos end-pos)]
          [(eq? result-status 'comment-end)
           (values (string-append lexeme result-lexeme)
                   'comment
                   paren start-pos comment-end-pos)]
          [(eq? result-status 'comment-inside)
           (find-end-of-comment
            input-port
            (string-append lexeme result-lexeme)
            start-pos comment-end-pos)]
          [(eq? result-status 'comment)
           (find-end-of-comment
            input-port
            (string-append lexeme result-lexeme)
            start-pos comment-end-pos)]
          [(eq? result-status 'error)
           (values (string-append lexeme result-lexeme)
                   'error
                   paren start-pos comment-end-pos)]))
  
  (define get-string-token
    (lexer
     [str (ret lexeme 'string #f start-pos end-pos)]
     [bad-str (ret lexeme 'error #f start-pos end-pos)]
     [any-char (ret lexeme 'error #f start-pos end-pos)]
     [(special)
      (ret "" 'no-color #f start-pos end-pos)]
     [(special-comment)
      (ret "" 'no-color #f start-pos end-pos)]))

  (define ocaml-lexer
    (lexer
     [whitespace (ret lexeme 'white-space #f start-pos end-pos)]
     [(eof) (values lexeme 'eof #f #f #f)]
     [double-semi (ret lexeme 'double-semi #f start-pos end-pos)]
     [comment-start (find-end-of-comment
                     input-port lexeme
                     (position-offset start-pos) (position-offset end-pos))]
     [char-literal (ret lexeme 'string #f start-pos end-pos)]
     [str (ret lexeme 'string #f start-pos end-pos)]
     [text-operator (ret lexeme 'operator #f start-pos end-pos)]
     [paren (ret lexeme 'parenthesis (string->symbol lexeme) start-pos end-pos)]
     [open-close (ret lexeme 'governing-keyword #f start-pos end-pos)]
     [int-literal (ret lexeme 'number #f start-pos end-pos)]
     [float-literal (ret lexeme 'number #f start-pos end-pos)]
     [bad-float-literal (ret lexeme 'error #f start-pos end-pos)]
     [true-false (ret lexeme 'true-false #f start-pos end-pos)]
     [keyword (ret lexeme 'keyword #f start-pos end-pos)]
     [governing-keyword (ret lexeme 'governing-keyword #f start-pos end-pos)]
     [ident (ret lexeme 'identifier #f start-pos end-pos)]
     [operator (ret lexeme 'operator #f start-pos end-pos)]
     [bad-str (ret lexeme 'error #f start-pos end-pos)]
     [any-char (ret lexeme 'error #f start-pos end-pos)]
     [(special)
      (ret "" 'no-color #f start-pos end-pos)]
     [(special-comment)
      (ret "" 'no-color #f start-pos end-pos)]))
  
  (define get-token ocaml-lexer)
  
  )