phrase.ss
#lang scheme/base
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; NET.plt
;;
;; abstraction of common network behaviors and services
;;
;; Bonzai Lab, LLC.  All rights reserved.
;;
;; Licensed under LGPL.
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; phrase.ss - parsing the phrase as defined in rfc2822
;; yc 2/13/2010 - first version
(require "depend.ss"
         "encoded-word.ss"
         "quoted-string.ss"
         )
;; this is the atext... hmm... most of the characters here will be *rejected*...
(define p:atext/unicode (char-when (lambda (c)
                                     (and (not (char-whitespace? c))
                                          (not (memq c '(#\, #\< #\> #\: #\;)))))))

(define p:atom (seq atom <- (one-many p:atext/unicode)
                    (return (list->string atom))))

(define p:word (choice p:encoded-word p:qstring p:atom))

;; what we need is something to alternatively picking things up...
;; token WORD will suck in the tailing
(define p:phrase (seq (zero-one whitespaces (return #t)) 
                      w <- p:word
                      words <- (zero-many (seq ws <- whitespaces
                                               word <- p:word
                                               (return (list (list->string ws) word))))
                    (return (string-join (flatten (cons w words)) ""))))

(define read-phrase (make-reader p:phrase #:eof? #f))

;; another way of reading phrase would be to read anything, and an
(define p:text (char-when (lambda (c)
                            (and (not (char-whitespace? c))))))

(define p:text* (seq v <- (one-many p:text)
                     (return (list->string v))))

(define p:TEXT (choice p:encoded-word p:text*))

(define p:whitespaces/str (seq v <- whitespaces 
                               (return (list->string v))))

(define p:freeform (seq ws <- p:whitespaces/str 
                        t <- p:TEXT
                        texts <- (zero-many (seq ws <- p:whitespaces/str
                                                 t <- p:TEXT
                                                 (return (string-append ws t))))
                        (return (string-join (list* ws t texts) ""))))
;; subject should use this one...
(define read-freeform (make-reader p:freeform #:eof? #f))



;; what about the 3rd form?  the parameters are not a full phrase, instead
;; they are just tokens...
;; they must not be anything that have spaces unless it is incorporated into a
;;
#|
NO-WS-CTL       =       %d1-8 /         ; US-ASCII control characters
                        %d11 /          ;  that do not include the
                        %d12 /          ;  carriage return, line feed,
                        %d14-31 /       ;  and white space characters
                        %d127

CTL -> d1-d32,d127
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,127

     tspecials :=  "(" / ")" / "<" / ">" / "@" /
                   "," / ";" / ":" / "\" / <">
                   "/" / "[" / "]" / "?" / "="
                   ; Must be in quoted-string,
                   ; to use within parameter values


;;|#

(define (token-special-char? c)
  (memq c '(#\( #\) #\< #\> #\@ 
                #\, #\; #\: #\\ #\" 
                #\/ #\[ #\] #\? #\=)))

(define p:token-char (char-when (lambda (c)
                                  (and (< 32 (char->integer c) 128)
                                       (not (token-special-char? c))))))

(define p:atom/token (seq v <- (one-many p:token-char)
                          (return (list->string v))))

;; in this particular case we can also embed encoded-word into quoted-string...
(define p:param-value (choice p:quoted-encoded-word 
                              p:encoded-word 
                              quoted-string 
                              p:atom/token)) 

(define read-param-value (make-reader p:param-value #:eof? #f))
;; encode-phrase
;; do the following...
;; if it's all ascii, check to see if it has one of the tspecial chars, and if so
;; make it into quoted-string...

;; if it's not all ascii, then convert it into encoded word... unless it's more than
;; 48 characters, do not split it into 2...


(define (encode-phrase phrase) 
  (let-values (((ascii latin-1 unicode)
                (string-char-ratios phrase))) 
    (cond ((= ascii 1) 
           (if (string-char-or? phrase token-special-char?) 
               (encode-qstring phrase)
               phrase))
          ((> ascii 0.7) 
           (encode-encoded-word* "utf-8" "Q" phrase))
          (else 
           (encode-encoded-word* "utf-8" "B" phrase)))))

(define (encode-param-value param) 
  (let-values (((ascii latin-1 unicode)
                (string-char-ratios param))) 
    (cond ((= ascii 1)
           (if (string-char-or? param char-whitespace?)
               (encode-qstring param)
               param))
          ((> ascii 0.7)
           (encode-encoded-word "utf-8" "Q" param))
          (else 
           (encode-encoded-word "utf-8" "B" param)))))

(provide/contract (p:phrase Parser/c)
                  (read-phrase Reader/c)
                  (p:freeform Parser/c)
                  (read-freeform Reader/c)
                  (p:param-value Parser/c)
                  (read-param-value Reader/c)
                  (encode-phrase (-> string? string?))
                  (encode-param-value (-> string? string?))
                  )



#|

let's fix the phrase issue.

currently phrase is not working correctly... 

1 - it's unable to parse all of the words 
2 - phrase as defined in 2822 are not used *everywhere* - it's mainly used with email 
    addresses 
3 - phrase as defined in 2822 cannot be used within quoted-strings, but in real-life 
    they exists... (specifically as a filename in content-disposition for CJK filenames) 
4 - phrase does not really exist for header such as subject, because subject does not 
    make use of quoted-strings (but it does make use of encoded-words) 

so most likely I will need to break things up.

in email address we can use the following:
phrase as defined in 2822 (ATOM & quoted-string) & encoded-word as defined in 2047 

in subject we can use regular text (no quoted-string), but we need to look for encoded-word token

in content-disposition params we need to parse for quoted-string, but also encoded-word
within quoted-string... 

so it's okay to help parse for encoded-word within quoted string... 

but within subject we should not parse for that... 

remember that if there are multiple of them together then they should not 

;;|#