syntax-color.ss
(module syntax-color mzscheme
  (require (lib "lex.ss" "parser-tools")
           (prefix : (lib "lex-sre.ss" "parser-tools"))
           "syntax/regexps.ss"
           "config.ss")

  ;; Unfortunately, tokenizing JavaScript is context-sensitive. The syntax of
  ;; regular expressions (similar to that of Perl) is ambiguous with division,
  ;; as the following example demonstrates:

  ;;     var g = 2;
  ;;     function f(re) { return "3".match(re); }
  ;;
  ;;     4  /3/g    /* evaluates to 0.66666666 */
  ;;     f( /3/g )  /* evaluates to "3"        */
  ;;        ^^^^

  ;; The language definition states that regular expression literals should only
  ;; be allowed in expression contexts, and division operators should only be
  ;; allowed in binary infix operator position.

  ;; However, for syntax coloring, we tokenize without parsing, so we cannot
  ;; no whether we are in an expression context or an operator context. So there
  ;; is no way to know the correct way to tokenize.

  ;; Moreover, in the PLT Framework manual, section 7.1, under the start-colorer
  ;; method of color:text<%>, the documented invariant for a syntax coloring
  ;; tokenizer is that "the tokenization of some part of the input cannot depend
  ;; on earlier parts of the input." So we cannot even perform a heuristic guess
  ;; based on the previous token.

  ;; We choose to prefer regular expression literals when it is possible to lex
  ;; them, and otherwise back off to the division operator. In practice, the
  ;; ambiguity should not arise often.

  (define (syn-val lex a b c d)
    (values lex a b (position-offset c) (position-offset d)))

  (define (colorize-string delimiter my-start-pos)
    (define lxr
      (lexer
       [(:or #\' #\")
        (if (string=? lexeme delimiter)
            (syn-val "" 'string #f my-start-pos end-pos)
            (lxr input-port))]
       [(eof) (syn-val "" 'error #f my-start-pos end-pos)]
       [(:seq #\\ (:or #\' #\")) (lxr input-port)]
       [any-char (lxr input-port)]))
    lxr)

  (define (colorize-block-comment my-start-pos)
    (define lxr
      (lexer
       [(:seq #\* #\/)
        (syn-val "" 'comment #f my-start-pos end-pos)]
       [(eof) (syn-val "" 'error #f my-start-pos end-pos)]
       [any-char (lxr input-port)]))
    lxr)

  ;; TODO: fix the taxonomy a little
  (define get-syntax-token
    (lexer
     [(:or "true" "false" "null")
      (syn-val lexeme 'literal #f start-pos end-pos)]     
     [lex:integer
      (syn-val lexeme 'literal #f start-pos end-pos)]
     [lex:float
      (syn-val lexeme 'literal #f start-pos end-pos)]
     [(:or "[" "]" "{" "}" "(" ")")
      (syn-val lexeme 'parenthesis (string->symbol lexeme) start-pos end-pos)]
     [(:or "," ":" ";" "=" ".")
      (syn-val lexeme 'default #f start-pos end-pos)]
;     [lex:keyword
;      (syn-val lexeme 'keyword #f start-pos end-pos)]
     [(:seq #\/ #\*)
      ((colorize-block-comment start-pos) input-port)]
     [lex:line-comment
      (syn-val lexeme 'comment #f start-pos end-pos)]
     [lex:assignment-operator
      (syn-val lexeme 'keyword #f start-pos end-pos)]
     [lex:operator
      (syn-val lexeme 'keyword #f start-pos end-pos)]
     [lex:identifier
      (if (memq (string->symbol lexeme) (lexical-keywords))
          (syn-val lexeme 'keyword #f start-pos end-pos)
          (syn-val lexeme 'identifier #f start-pos end-pos))]
     [(:or #\' #\")
      ((colorize-string lexeme start-pos) input-port)]
     [(:+ lex:whitespace)
      (syn-val lexeme 'whitespace #f start-pos end-pos)]
     [(eof)
      (syn-val lexeme 'eof #f start-pos end-pos)]
     [any-char
      (syn-val lexeme 'error #f start-pos end-pos)]
     ))

  (provide get-syntax-token))