stylesheet/lexer.scm
(module lexer mzscheme
  (provide parse-css-port parse-css-file parse-css-string)
  
  (require (lib "lex.ss" "parser-tools")
           (prefix : (lib "lex-sre.ss" "parser-tools"))
           (lib "cfg-parser.ss" "algol60")
           (lib "yacc.ss" "parser-tools")
           (lib "readerr.ss" "syntax"))
  
  ; (file-path "lexer.scm")
  
  ;;;
  ;;; LEXER ABBREVIATIONS
  ;;;
  
  (define-lex-abbrevs
    (atoz       (:or (:/ #\a #\z) (:/ #\A #\Z)))
    (atof       (:or (:/ #\a #\f) (:/ #\A #\F)))
    (digit      (:/ #\0 #\9))
    (hex        (:or digit atof))
    (non-ascii  (:/ #\u0080 #\u00ff))
    (unicode    (:seq #\\ (:** 1 6 hex)
                      (:? (:or (:seq #\return #\newline)
                               #\space #\tab #\return #\newline #\page))))
    (escape     (:or unicode  (:seq #\\ (:~ #\return #\newline #\page hex))))
    (nmstart    (:or #\_ atoz non-ascii escape))
    (nmchar     (:or #\_ atoz digit #\- non-ascii escape))
    (string1    (:seq #\" (:* (:or (:~ #\newline #\return #\page #\")
                                   (:seq #\\ nl)
                                   escape))
                      #\"))
    (string2    (:seq #\' (:* (:or (:~ #\newline #\return #\page #\')
                                   (:seq #\\ nl)
                                   escape))
                      #\'))
    (invalid1   (:seq #\" (:* (:or (:~ #\newline #\return #\page #\")
                                   (:seq #\\ nl)
                                   escape))))
    (invalid2   (:seq #\' (:* (:or (:~ #\newline #\return #\page #\')
                                   (:seq #\\ nl)
                                   escape))))
    (ident      (:seq (:? #\-) nmstart (:* nmchar)))
    (name       (:+ nmchar))
    (num        (:or (:+ digit)
                     (:seq (:* digit) #\. (:+ digit))))
    (String     (:or string1 string2))
    (invalid    (:or invalid1 invalid2))
    (url        (:* (:or #\! #\# #\$ #\% #\& 
                         (char-range #\* #\~)  ; this range contains . / digits lower and uper letter and more
                         non-ascii
                         escape)))
    (s          (:or #\space #\tab #\return #\newline #\page))
    (w          (:* s))
    (nl         (:or #\newline (:seq #\return #\newline) #\return #\page))
    
    (A (:or #\a #\A (:seq #\\ (:** 0 4 #\0) (:or (:seq #\4 #\1) (:seq #\6 #\1)) (:? (:or (:seq #\return #\newline) s)))))
    (B (:or #\b #\B (:seq #\\ (:** 0 4 #\0) (:or (:seq #\4 #\2) (:seq #\6 #\2)) (:? (:or (:seq #\return #\newline) s)))))
    (C (:or #\c #\C (:seq #\\ (:** 0 4 #\0) (:or (:seq #\4 #\3) (:seq #\6 #\3)) (:? (:or (:seq #\return #\newline) s)))))
    (D (:or #\d #\D (:seq #\\ (:** 0 4 #\0) (:or (:seq #\4 #\4) (:seq #\6 #\4)) (:? (:or (:seq #\return #\newline) s)))))
    (E (:or #\e #\E (:seq #\\ (:** 0 4 #\0) (:or (:seq #\4 #\5) (:seq #\6 #\5)) (:? (:or (:seq #\return #\newline) s)))))
    (F (:or #\f #\F (:seq #\\ (:** 0 4 #\0) (:or (:seq #\4 #\6) (:seq #\6 #\6)) (:? (:or (:seq #\return #\newline) s)))))
    (G (:or #\g #\G (:seq #\\ #\\ #\g) (:seq #\\ (:** 0 4 #\0) (:or (:seq #\4 #\7) (:seq #\6 #\7)) (:? (:or (:seq #\return #\newline) s)))))
    (H (:or #\h #\H (:seq #\\ #\\ #\h) (:seq #\\ (:** 0 4 #\0) (:or (:seq #\4 #\8) (:seq #\6 #\8)) (:? (:or (:seq #\return #\newline) s)))))
    (I (:or #\i #\I (:seq #\\ #\\ #\i) (:seq #\\ (:** 0 4 #\0) (:or (:seq #\4 #\9) (:seq #\6 #\9)) (:? (:or (:seq #\return #\newline) s)))))
    (J (:or #\j #\J (:seq #\\ #\\ #\j) (:seq #\\ (:** 0 4 #\0) (:or (:seq #\4 #\a) (:seq #\6 #\a)) (:? (:or (:seq #\return #\newline) s)))))
    (K (:or #\k #\K (:seq #\\ #\\ #\k) (:seq #\\ (:** 0 4 #\0) (:or (:seq #\4 #\b) (:seq #\6 #\b)) (:? (:or (:seq #\return #\newline) s)))))
    (L (:or #\l #\L (:seq #\\ #\\ #\l) (:seq #\\ (:** 0 4 #\0) (:or (:seq #\4 #\c) (:seq #\6 #\c)) (:? (:or (:seq #\return #\newline) s)))))
    (M (:or #\m #\M (:seq #\\ #\\ #\m) (:seq #\\ (:** 0 4 #\0) (:or (:seq #\4 #\d) (:seq #\6 #\d)) (:? (:or (:seq #\return #\newline) s)))))
    (N (:or #\n #\N (:seq #\\ #\\ #\n) (:seq #\\ (:** 0 4 #\0) (:or (:seq #\4 #\e) (:seq #\6 #\e)) (:? (:or (:seq #\return #\newline) s)))))
    (O (:or #\o #\O (:seq #\\ #\\ #\o) (:seq #\\ (:** 0 4 #\0) (:or (:seq #\4 #\f) (:seq #\6 #\f)) (:? (:or (:seq #\return #\newline) s)))))
    (P (:or #\p #\P (:seq #\\ #\\ #\p) (:seq #\\ (:** 0 4 #\0) (:or (:seq #\5 #\0) (:seq #\7 #\0)) (:? (:or (:seq #\return #\newline) s)))))
    (Q (:or #\q #\Q (:seq #\\ #\\ #\q) (:seq #\\ (:** 0 4 #\0) (:or (:seq #\5 #\1) (:seq #\7 #\1)) (:? (:or (:seq #\return #\newline) s)))))
    (R (:or #\r #\R (:seq #\\ #\\ #\r) (:seq #\\ (:** 0 4 #\0) (:or (:seq #\5 #\2) (:seq #\7 #\2)) (:? (:or (:seq #\return #\newline) s)))))
    (S (:or #\s #\S (:seq #\\ #\\ #\s) (:seq #\\ (:** 0 4 #\0) (:or (:seq #\5 #\3) (:seq #\7 #\3)) (:? (:or (:seq #\return #\newline) s)))))
    (T (:or #\t #\T (:seq #\\ #\\ #\t) (:seq #\\ (:** 0 4 #\0) (:or (:seq #\5 #\4) (:seq #\7 #\4)) (:? (:or (:seq #\return #\newline) s)))))
    (U (:or #\u #\U (:seq #\\ #\\ #\u) (:seq #\\ (:** 0 4 #\0) (:or (:seq #\5 #\5) (:seq #\7 #\5)) (:? (:or (:seq #\return #\newline) s)))))
    (V (:or #\v #\V (:seq #\\ #\\ #\v) (:seq #\\ (:** 0 4 #\0) (:or (:seq #\5 #\6) (:seq #\7 #\6)) (:? (:or (:seq #\return #\newline) s)))))
    (W (:or #\w #\W (:seq #\\ #\\ #\w) (:seq #\\ (:** 0 4 #\0) (:or (:seq #\5 #\7) (:seq #\7 #\7)) (:? (:or (:seq #\return #\newline) s)))))
    (X (:or #\x #\X (:seq #\\ #\\ #\x) (:seq #\\ (:** 0 4 #\0) (:or (:seq #\5 #\8) (:seq #\7 #\8)) (:? (:or (:seq #\return #\newline) s)))))
    (Y (:or #\y #\Y (:seq #\\ #\\ #\y) (:seq #\\ (:** 0 4 #\0) (:or (:seq #\5 #\9) (:seq #\7 #\9)) (:? (:or (:seq #\return #\newline) s)))))
    (Z (:or #\z #\Z (:seq #\\ #\\ #\z) (:seq #\\ (:** 0 4 #\0) (:or (:seq #\5 #\a) (:seq #\7 #\a)) (:? (:or (:seq #\return #\newline) s)))))
    )
  
  ;;;
  ;;; TOKENS
  ;;;
  
  (define-tokens non-terminals  (<angle> <dimension> <ems> <exs> <freq> 
                                         <function>  <hash> <ident> <invalid> 
                                         <length> <number> <percentage> 
                                         <string> <time> <uri>
                                         
                                         CDO           ; comment delimeter open
                                         CDC           ; comment delimeter close
                                         CHARSET-SYM
                                         COLON
                                         COMMA
                                         DASHMATCH 
                                         DOT
                                         EOF
                                         EQUAL
                                         IMPORT-SYM
                                         IMPORTANT-SYM
                                         INCLUDES
                                         GREATER
                                         LBRACE
                                         LBRACKET
                                         MEDIA-SYM
                                         MINUS
                                         PAGE-SYM
                                         PLUS
                                         S         ; space
                                         SLASH
                                         RBRACE
                                         RBRACKET
                                         RPAREN
                                         SEMICOLON
                                         STAR
                                         UNPARSEABLE))
  
  (define stx-for-original-property (read-syntax #f (open-input-string "original")))
  
  (define-syntax (token stx)
    (syntax-case stx ()
      [(_ name val)
       (identifier? (syntax name))
       (let ([name (syntax name)])
         (with-syntax ([token-name (datum->syntax-object
                                    name
                                    (string->symbol
                                     (format "token-~a" (syntax-e name))))]
                       [source-name (datum->syntax-object name 'source-name)]
                       [start-pos (datum->syntax-object name 'start-pos)]
                       [end-pos (datum->syntax-object name 'end-pos)])
           (syntax 
            (token-name 
             (datum->syntax-object #f val
                                   (list
                                    source-name
                                    (position-line start-pos)
                                    (position-col start-pos)
                                    (position-offset start-pos)
                                    (- (position-offset end-pos)
                                       (position-offset start-pos)))
                                   stx-for-original-property)))))]))
  (define-syntax (ttoken stx)
    (syntax-case stx ()
      [(_ name)
       (identifier? (syntax name))
       (syntax (token name 'name))]))
  
  ;;;
  ;;; LEXER RULES
  ;;;
  
  (define (lex source-name)
    (letrec ([loop 
              (lexer
               ((:seq "url(" w String w ")") (token <uri> lexeme))
               ((:seq "url(" w url w ")")    (token <uri> lexeme))
               
               ; comment
               ; \/\* [^*]* \*+ ([^/*][^*]*\*+)* \/
               ((:: "/*" (complement (:: any-string "*/" any-string)) "*/")
                (loop input-port))
               ((:: (:+ s) "/*" (complement (:: any-string "*/" any-string)) "*/")
                (token S lexeme))
               
               #;
               ((:seq "/*"   
                      (:* (:~ #\*)) (:+ #\*)
                      (:* (:~ #\/ #\*) (:* (:~ #\*)) (:+ #\*))
                      "/")
                (loop input-port))  ; ignore
               ; space+ comment
               #;
               ((:seq (:+ s) "/*" 
                      (:~ #\*) (:+ #\*)
                      (:* (:~ #\/ #\*) (:* (:~ #\*)) (:+ #\*))
                      "/")         (token S lexeme))
               
               
               ((:+ s)             (token S lexeme))
               
               ("<!--"             (token CDO '<!--))
               ("--!>"             (token CDC '--!>))
               ("~="               (token INCLUDES '~=))
               ("|="               (token DASHMATCH (string->symbol "|=")))
               
               ((:seq w #\{)       (token LBRACE  '|{|))
               ((:seq w #\})       (token RBRACE  '|}|))
               ((:seq w #\+)       (token PLUS    '+))
               ((:seq w #\>)       (token GREATER '>))
               ((:seq w #\,)       (token COMMA   '|,|))
               
               (String             (token <string> lexeme))
               (invalid            (token <invalid> lexeme))
               (ident              (token <ident> lexeme))
               ((:seq #\# name)    (token <hash> lexeme))
               
               ("@import"          (token IMPORT-SYM  '@import))
               ("@page"            (token PAGE-SYM    '@page))
               ("@media"           (token MEDIA-SYM   '@media))
               ("@charset"         (token CHARSET-SYM '@charset))
               
               ((:seq #\! w 
                      "important") (token IMPORTANT-SYM '!important))
               
               ((:seq num E M)     (token <ems> lexeme))
               ((:seq num E X)     (token <exs> lexeme))
               
               ((:seq num P X)     (token <length> lexeme))
               ((:seq num C M)     (token <length> lexeme))
               ((:seq num M M)     (token <length> lexeme))
               ((:seq num I N)     (token <length> lexeme))
               ((:seq num P T)     (token <length> lexeme))
               ((:seq num P C)     (token <length> lexeme))
               
               ((:seq num D E G)   (token <angle> lexeme))
               ((:seq num R A D)   (token <angle> lexeme))
               ((:seq num G R A D) (token <angle> lexeme))
               
               ((:seq num M S)     (token <time> lexeme))
               ((:seq num S)       (token <time> lexeme))
               
               ((:seq num H Z)     (token <freq> lexeme))
               ((:seq num K H Z)   (token <freq> lexeme))
               
               ((:seq num ident)   (token <dimension> lexeme))
               
               ((:seq num #\%)     (token <percentage> lexeme))
               ((:seq num)         (token <number> lexeme))
               
               
               ((:seq ident "(")             (token <function> lexeme))
               
               ((eof) (ttoken EOF))
               (#\;   (token SEMICOLON '|;|))
               (#\:   (token COLON    '|:|))
               (#\/   (token SLASH    '/))
               (#\-   (token MINUS    '-))
               (#\+   (token PLUS     '+))
               (#\>   (token GREATER  '>))
               (#\.   (token DOT      '|.|))
               (#\*   (token STAR     '*))
               (#\[   (token LBRACKET '|[|))
               (#\]   (token RBRACKET '|]|))
               (#\=   (token EQUAL    '=))
               (#\)   (token RPAREN   '|)|))
               ; catch all
               (any-char (token UNPARSEABLE (string->symbol lexeme)))
               )])
      loop))
  
  ; css-parser : (-> token) -> sexp
  (define parse
    (cfg-parser
     (tokens   non-terminals)
     (start    stylesheet)
     (end      EOF)
     (error    (lambda (a b stx)
                 (display stx)
                 (raise-read-error (format "parse error near ~a" (syntax-e stx))
                                   (syntax-source stx)
                                   (syntax-line stx)
                                   (syntax-column stx)
                                   (syntax-position stx)
                                   (syntax-span stx))))
     (suppress)
     
     (grammar  
      (S*            ((S* S)                                   'skip)
                     (()                                       'skip))
      (S+            ((S+ S)                                   'skip)
                     ((S)                                      'skip))
      
      ;; stylesheet
      (stylesheet    ((stylesheet1 stylesheet2* stylesheet3*)  `(css ,$1 . ,(append (reverse $2) (reverse $3))))
                     ((            stylesheet2* stylesheet3*)  `(css . ,(append (reverse $1) (reverse $2)))))
      (stylesheet1   ((CHARSET-SYM S* <string> S* SEMICOLON)   `(@charset ,$3)))
      (stylesheet2*  ((s-cdo-cds* stylesheet22*)               (reverse $2))) ; ignore the s-cdo-cds*
      (s-cdo-cds     ((S)                                      'S)
                     ((CDO)                                    'CDO)
                     ((CDC)                                    'CDC))
      (s-cdo-cds*    ((s-cdo-cds* s-cdo-cds)                   (cons $2 $1))
                     (()                                       '()))
      (stylesheet22  ((import s-cdo-cds*)                      $1)
                     ((import)                                 $1))
      (stylesheet22* ((stylesheet22* stylesheet22)             (cons $2 $1))
                     (()                                       '()))
      (stylesheet3   ((ruleset s-cdo-cds*)                     $1)
                     ((media s-cdo-cds*)                       $1)
                     ((page s-cdo-cds*)                        $1))
      (stylesheet3*  ((stylesheet3* stylesheet3)               (cons $2 $1))
                     (()                                       '()))
      ;; import
      (import            ((IMPORT-SYM S* string-or-uri S* import1 SEMICOLON S*) `((import $3) ,@(reverse $5)))
                         ((IMPORT-SYM S* string-or-uri S*         SEMICOLON S*) `((import $3))))
      (string-or-uri     ((<string>) $1)
                         ((<uri>)    $1))
      (import1           ((medium comma-medium-list) (cons $1 $2)))
      (comma-medium-list ((comma-medium-list COMMA S* medium)  (cons $4 $1))
                         (()                                   '()))
      ;; media
      (media             ((MEDIA-SYM S* medium comma-medium-list LBRACE S* ruleset-start RBRACE S*)  
                          `(MEDIA ,(cons $3 (reverse $4)) ,(reverse $7))))
      ;; medium
      (medium            ((<ident> S*) $1))
      ;; page
      (page              ((PAGE-SYM S* pseudo-page S* LBRACE S* declaration semi-declaration-star RBRACE S*)
                          `(PAGE ,$3 ,(cons $7 (reverse $8))))
                         ((PAGE-SYM S* S* LBRACE S* declaration semi-declaration-star RBRACE S*)
                          `(PAGE ,(cons $6 (reverse $7)))))
      (pseudo-page       ((COLON <ident>)  $2))
      (operator          ((SLASH S*)      '/)
                         ((COMMA S*)      'COMMA)
                         (()              '()))
      (combinator        ((PLUS S*)       '+)
                         ((GREATER S*)    '>)
                         ((S+)            'SPACE-COMBINATOR))
      (unary-operator    ((MINUS)         '-)
                         ((PLUS)          '+))
      (property          ((<ident> S*)      $1))
      (ruleset           ((selector comma-selector-star LBRACE S* declaration semi-declaration-star RBRACE S*)
                          `(RULESET ,$1 ,$2 ,(cons $5 (reverse $6)))))
      (comma-selector-star ((comma-selector-star COMMA S* selector) (cons $4 $1))
                           (()                                   '()))
      
      (semi-declaration-star    ((semi-declaration-star SEMICOLON S* declaration) (cons $4 $1))
                                (()                                               ()))
      
      (ruleset-start            ((ruleset-start ruleset) (cons $1 $2))
                                (()                      '()))
      (selector                 ((simple-selector combinator-simple-selector-star) `(SELECTOR ,@(cons $1 (reverse $2)))))
      (combinator-simple-selector-star ((combinator-simple-selector-star combinator simple-selector) (cons $3 (cons $2 $1)))
                                       (()                                                           '()))
      (simple-selector          ((element-name HASH/class/attrib/pseudo-star) `(SIMPLE-SELECTOR ,$1 ,(reverse $2)))
                                ((HASH/class/attrib/pseudo-plus)              `(SIMPLE-SELECTOR ,(reverse $1))))
      (HASH/class/attrib/pseudo ((<hash>)   $1)
                                ((class)  $1)
                                ((attrib) $1)
                                ((pseudo) $1))
      (HASH/class/attrib/pseudo-star ((HASH/class/attrib/pseudo-star HASH/class/attrib/pseudo) (cons $2 $1))
                                     (()                                                       '()))
      (HASH/class/attrib/pseudo-plus ((HASH/class/attrib/pseudo-star HASH/class/attrib/pseudo) (cons $2 $1))
                                     ((HASH/class/attrib/pseudo)                               (list $1)))
      (class                    ((DOT <ident>) `(CLASS ,$2)))
      (element-name             ((<ident>) $1)
                                ((STAR)  '*))
      (attrib                   ((LBRACKET <ident> S* RBRACKET) `(ATTRIB $2))
                                ((LBRACKET <ident> S* attrib1 RBRACKET) `(ATTRIB $2 ,@$4)))
      (attrib1                  ((equal/includes/dashmatch S* ident/string S*)  (list $1 $3)))
      (equal/includes/dashmatch ((EQUAL)     '=)
                                ((INCLUDES)  'INCLUDES)
                                ((DASHMATCH) 'DASHMATCH))
      (ident/string             ((<ident>) $1)
                                ((<string>) $1))
      (pseudo                   ((COLON <ident>)                                `(PSEUDO ,$2))
                                ((COLON <function> S* RPAREN)                   `(PSEUDO ,$2))
                                ((COLON <function> S* <ident> S* RPAREN)        `(PSEUDO ,$2 ,$4)))
      (declaration              ((property COLON S* expr)                       `(DECLARATION ,$1 ,$4))
                                ((property COLON S* expr prio)                  `(DECLARATION ,$1 ,$4 ,$5))
                                (()                                             '()))
      (prio                     ((IMPORTANT-SYM S*)                             'IMPORTANT))
      (expr                     ((term operator-term-star)                      (cons $1 (reverse $2))))
      (operator-term-star       ((operator-term-star operator term)             (cons $3 (cons $2 $1)))
                                (()                                             '()))
      (term                     ((term2) `(TERM2 ,$1))
                                ((unary-operator term1)                         `(TERM1 ,(cons $1 $2)))
                                ((term1) `(TERM1 ,$1)))
      (term1                    ((<number> S*) $1) ((<percentage> S*) $1) ((<length> S*) $1) ((<ems> S*) $1) ((<exs> S*) $1)
                                ((<angle> S*) $1)  ((<time> S*) $1)       ((<freq> S*) $1))
      (term2                    ((<string> S*) $1) ((<ident> S*) $1)      ((<uri> S*) `(URI ,$1))    ((hexcolor) $1) ((function) $1))
      (function                 ((<function> S* expr RPAREN S*) `(FUNCTION ,$1 ,$3)))
      (hexcolor                 ((<hash> S*) `(HEXCOLOR ,$1))))))
  
  
  (define (parse-css-port port file)
    (let ([lexer (lex file)])
      (port-count-lines! port)
      (parse
       (lambda () 
         (let loop ()
           (let ([v (lexer port)])
             (if (void? v)
                 (loop)
                 v)))))))
  
  (define (parse-css-file file)
    (with-input-from-file file
      (lambda ()
        (parse-css-port (current-input-port)
                        (path->complete-path file)))))
  
  (define (parse-css-string string)
    (let ([port (open-input-string string)])
      (parse-css-port port (object-name port))))
  

  
  )
   
   #| PROBLEMATISKE
   
     (t140201-c536-bgpos-00-b-ag.htm t140201-c536-bgpos-00-b-ag.htm)

   html,body,div { margin: 0; border: 0; padding: 0; }
   div { font: 15px/1 Ahem; color: white; width: 11em; }
   .right { width: auto; text-align: right; }

   /* tests */
   body {background-image: url(support/swatch-red.png); background-position: right top;
         background-repeat: no-repeat;}
   .one {background-image: url(support/swatch-red.png); background-position: center;
         background-repeat: no-repeat;}
   .two {background-image: url(support/swatch-red.png); background-position: 50% 50%;
         background-repeat: no-repeat;}
   .three {background-image: url(support/swatch-red.png); background-position: bottom right;
           background-repeat: no-repeat;}


(t140201-c536-bgpos-01-b-ag.htm t140201-c536-bgpos-01-b-ag.htm)

   html,body,div { margin: 0; border: 0; padding: 0; }
   div { font: 15px/1 Ahem; color: white; width: 11em; }

   /* tests */
   .four {background-image: url(support/swatch-red.png); background-position: 100% 100%;
          background-repeat: no-repeat;}
   .five {background-image: url(support/swatch-red.png); background-position: 0% 50%;
          background-repeat: no-repeat;}
   .six {background-image: url(support/swatch-red.png); background-position: 80% 25%;
          background-repeat: no-repeat;}
   .seven {background-image: url(support/swatch-red.png); background-position: 30px 30px;
          background-repeat: no-repeat;}


|#