mail-parse.ss
;; Mike Burns 2004-08-22 mike@mike-burns.com
;; Mike Burns 2005-12-10 mike@mike-burns.com
;; Copyright 2004, 2005 Mike Burns

;; Parse emails
#lang scheme
(require ;mzscheme
;mzlib/contract ;(lib "contract.ss")
 ;mzlib/etc ; (lib "etc.ss")
 ;mzlib/pregexp ;(lib "pregexp.ss")
 ;(lib "list.ss")
 net/mime ; (lib "mime.ss" "net")
 )

;; An email is a (make-email Assoc (listof (listof String)))
;; The headers currently do not include the MIME info, due to net/mime.ss .
;; The messages are all the messages in the email. If there are no
;; attachments, then there is only one message in the list. A message is a list
;; of string, each string representing one line.gnificant
(define-struct email (headers messages) #:inspector #f) ;(make-inspector))

(define-struct (exn:malformed-email exn) () #:inspector #f) ;(make-inspector))

(provide/contract
 (struct email ((headers (listof (cons/c symbol? string?)))
                (messages (listof (listof  string?)))))
 (parse-archive (path? . -> . (listof email?)))
 (parse-emails ((or/c none/c input-port?) . -> . (listof email?)))
 (parse-email  ((or/c none/c input-port?) . -> . email?))
 (write-email (->* (email?) (output-port?) any))
 )

(provide (struct-out exn:malformed-email))


;> parse-archive : path -> (listof Email)
;
;Parse each file using _parse-emails_ and produce a list of all the
;parsed emails.
(define (parse-archive email-archive-path)
  (append-map 
   (lambda (email-file-path) 
     (call-with-input-file email-file-path parse-emails))
   (find-files an-email-file? email-archive-path)))


;; Parse a stream of emails.  Emails are separated by lines that begin with
;; "From " (note the lack of ":").
(define parse-emails
  ;(opt-
  (lambda ((ip (current-input-port)))
    ; (do ((i (peek-line ip) (+ i 1))) ((= i vsides))
    (let loop ((line (peek-line ip)))
      (cond ((eof-object? line) '())
            ((new-email? line) ;; collection of emails
             (let* ((parsed (parse-email
                             (open-input-string (get-first-email ip))))
                    (parsed-rest (loop (peek-line ip))))
               ; (printf "an mbox collection: ~V ~N" 'parsed )
               (cons parsed parsed-rest)))
            ((an-email? line) (list (parse-email ip))) ;; a whole email per message
            (else (raise
                   (make-exn:malformed-email
                    (string->immutable-string
                     (format "~a: ~a"
                             "Expected a \"From ...\", got"
                             line))
                    (current-continuation-marks))))))))

;; Parse an email. It either does or does not have an attachment.
(define parse-email
  ;(opt-
  (lambda ((ip (current-input-port)))
    (let ((analysis (mime-analyze ip)))
      (if (multi-message? analysis)
          (parse-email-multi analysis)
          (parse-email-single analysis)))))

;; Parse an email with an attachment
(define/contract parse-email-multi
  (message? . -> . email?)
  (lambda (analysis)
    (make-email
     (message-fields->assoc (message-fields analysis))
     (map message->body
          (entity-parts (message-entity analysis))))))

;; Parse an email with no attachment
(define/contract parse-email-single
  (message? . -> . email?)
  (lambda (analysis)
    (make-email
     (message-fields->assoc (message-fields analysis))
     (list (message->body analysis)))))

(define (multi-message? analysis)
  (symbol=? (entity-type (message-entity analysis))
            'multipart))

;; Produces a list of strings, each string representing a line in the email,
;; from a message.
(define (message->body message)
  (entity-body->body (entity-body (message-entity message))))

;; Uses the entity-body procedure to produce a list of strings, each string
;; representing a line in the message.
(define (entity-body->body body)
  (let ((o (open-output-string)))
    ;; Print to o
    (body o)
    ;; cdr because of a leading newline
    (cdr (string->los (get-output-string o)))))

;; Break a string with embedded newlines into a list of strings, each string
;; representing one line.
(define (string->los s)
  (regexp-split #px"\n" s))

;; Show the next line, without consuming anything.
(define/contract peek-line
  (->* () (input-port?) (or/c eof-object? string?))
 ;(() (input-port?) . opt-> . (union eof-object? string?))
  ;(opt-
  (lambda ((ip (current-input-port)))
    (let loop ((acc "")
               (c (peek-char ip))
               (col 1))
      (cond
        ((eof-object? c) (if (string=? acc "") c acc))
        ((char=? c #\newline) acc)
        (else (loop (string-append acc (string c))
                    (peek-char ip col)
                    (+ col 1)))))))

;; Is this line the start of  a new email?
(define (new-email? line)
  (regexp-match #px"^From .*\\d{4}$" line))

;; Is this line the start of  a new email?
(define (an-email? line)
  (regexp-match #px"^(?mi:From|To|Envelope.*|Received|Return-Path|Date|Subject|Content\\-.*|MIME-Version|Forwarded|Message.*)" line))

;; Is this file an email?
(define (an-email-file? path)
  (let ((file (file-name-from-path path)))
    (if (not (directory-exists? path)) 
        (regexp-match
              #px"(?mi:\\d+|\\.(mbx|mbox|email|eml))$" 
              (path->string (file-name-from-path path)))
        #f)))

;; Consume a list of strings of colon-separated values, and produce an assoc
;; of string, which are those values.
(define/contract message-fields->assoc
  ((listof string?) . -> . (listof (cons/c symbol? string?)))
  (lambda (fields)
    (filter ;; Get rid of '()s
     pair?
     (map
      (lambda (field)
        (let ((n-v (regexp-match #px"^(\\S+): *(.*)" field)))
          (if n-v
              (cons (string->symbol (cadr n-v))
                    (apply string-append (cddr n-v)))
              '())))
      fields))))

;; Produce the first email in a stream of emails.
(define/contract get-first-email
  (input-port? . -> . string?)
  (lambda (ip)
    (let loop ((acc "")
               (line (peek-line ip))
               (seen-first #f))
      (cond
        ((eof-object? line) acc)
        ((new-email? line) (if seen-first
                               acc
                               (let* ((a (read-line ip))
                                      (l (peek-line ip)))
                                 (loop a l #t))))
        (else (let ((a (read-line ip))
                    (l (peek-line ip)))
                (loop (format "~a~n~a" acc a) l seen-first)))))))


;;;; new section (c)2008 spdegabrielle
;; write-email : message [output-port] -> void
(define (write-email message (out-port (current-output-port)))
    (printf "THIS IS NOT WORKING ~V:~V~N" message out-port))