mail-parse.ss
;; Mike Burns 2004-08-22 mike@mike-burns.com
;; Mike Burns 2005-12-10 mike@mike-burns.com
;; Copyright 2004, 2005 Mike Burns

;; Parse emails
(module mail-parse mzscheme
  (require (lib "contract.ss")
           (lib "etc.ss")
           (lib "pregexp.ss")
           (lib "list.ss")
           (lib "mime.ss" "net"))

  ;; An email is a (make-email Assoc (listof (listof String)))
  ;; The headers currently do not include the MIME info, due to net/mime.ss .
  ;; The messages are all the messages in the email. If there are no
  ;; attachments, then there is only one message in the list. A message is a list
  ;; of string, each string representing one line.
  (define-struct email (headers messages) (make-inspector))

  (define-struct (exn:malformed-email exn) () (make-inspector))

  (provide/contract
    (struct email ((headers (listof (cons/c symbol? string?)))
                   (messages (listof (listof  string?)))))
    (parse-emails (() (input-port?) . opt-> . (listof email?)))
    (parse-email  (() (input-port?) . opt-> . email?)))

  (provide (struct exn:malformed-email ()))

  ;; Parse a stream of emails.  Emails are separated by lines that begin with
  ;; "From " (note the lack of ":").
  (define parse-emails
    (opt-lambda ((ip (current-input-port)))
      (let loop ((line (peek-line ip)))
        (cond ((eof-object? line) '())
              ((new-email? line)
               (let* ((parsed (parse-email
                                (open-input-string (get-first-email ip))))
                      (parsed-rest (loop (peek-line ip))))
                 (cons parsed parsed-rest)))
              (else (raise
                      (make-exn:malformed-email
                        (string->immutable-string
                          (format "~a: ~a"
                                "Expected a \"From ...\", got"
                                line))
                        (current-continuation-marks))))))))

  ;; Parse an email. It either does or does not have an attachment.
  (define parse-email
    (opt-lambda ((ip (current-input-port)))
      (let ((analysis (mime-analyze ip)))
        (if (multi-message? analysis)
          (parse-email-multi analysis)
          (parse-email-single analysis)))))

  ;; Parse an email with an attachment
  (define/contract parse-email-multi
    (message? . -> . email?)
    (lambda (analysis)
      (make-email
        (message-fields->assoc (message-fields analysis))
        (map message->body
             (entity-parts (message-entity analysis))))))

  ;; Parse an email with no attachment
  (define/contract parse-email-single
    (message? . -> . email?)
    (lambda (analysis)
      (make-email
        (message-fields->assoc (message-fields analysis))
        (list (message->body analysis)))))

  (define (multi-message? analysis)
    (symbol=? (entity-type (message-entity analysis))
              'multipart))

  ;; Produces a list of strings, each string representing a line in the email,
  ;; from a message.
  (define (message->body message)
    (entity-body->body (entity-body (message-entity message))))

  ;; Uses the entity-body procedure to produce a list of strings, each string
  ;; representing a line in the message.
  (define (entity-body->body body)
    (let ((o (open-output-string)))
      ;; Print to o
      (body o)
      ;; cdr because of a leading newline
      (cdr (string->los (get-output-string o)))))

  ;; Break a string with embedded newlines into a list of strings, each string
  ;; representing one line.
  (define (string->los s)
    (pregexp-split "\n" s))

  ;; Show the next line, without consuming anything.
  (define/contract peek-line
    (() (input-port?) . opt-> . (union eof-object? string?))
    (opt-lambda ((ip (current-input-port)))
      (let loop ((acc "")
                 (c (peek-char ip))
                 (col 1))
        (cond
          ((eof-object? c) (if (string=? acc "") c acc))
          ((char=? c #\newline) acc)
          (else (loop (string-append acc (string c))
                (peek-char ip col)
                (+ col 1)))))))

  ;; Is this line the start of  a new email?
  (define (new-email? line)
    (pregexp-match "^From " line))

  ;; Consume a list of strings of colon-separated values, and produce an assoc
  ;; of string, which are those values.
  (define/contract message-fields->assoc
    ((listof string?) . -> . (listof (cons/c symbol? string?)))
    (lambda (fields)
      (filter ;; Get rid of '()s
        pair?
        (map
          (lambda (field)
            (let ((n-v (pregexp-match "^(\\S+): *(.*)" field)))
              (if n-v
                (cons (string->symbol (cadr n-v))
                      (apply string-append (cddr n-v)))
                '())))
          fields))))

  ;; Produce the first email in a stream of emails.
  (define/contract get-first-email
    (input-port? . -> . string?)
    (lambda (ip)
      (let loop ((acc "")
                 (line (peek-line ip))
                 (seen-first #f))
        (cond
          ((eof-object? line) acc)
          ((new-email? line) (if seen-first
                               acc
                               (let* ((a (read-line ip))
                                      (l (peek-line ip)))
                                 (loop a l #t))))
          (else (let ((a (read-line ip))
                      (l (peek-line ip)))
                  (loop (format "~a~n~a" acc a) l seen-first)))))))



  )