;; Mike Burns 2004-08-22
;; Mike Burns 2005-12-10
;; Copyright 2004, 2005 Mike Burns

;; Parse emails
#lang scheme
(require ;mzscheme
 mzlib/contract ;(lib "")
 mzlib/etc ; (lib "")
 mzlib/pregexp ;(lib "")
 ;(lib "")
 net/mime ; (lib "" "net")

;; An email is a (make-email Assoc (listof (listof String)))
;; The headers currently do not include the MIME info, due to net/ .
;; The messages are all the messages in the email. If there are no
;; attachments, then there is only one message in the list. A message is a list
;; of string, each string representing one line.gnificant
(define-struct email (headers messages) #:inspector #f) ;(make-inspector))

(define-struct (exn:malformed-email exn) () #:inspector #f) ;(make-inspector))

 ; (struct id  ((id       contract-expr) ...))
 (struct email ((headers (listof (cons/c symbol? string?)))
                (messages (listof (listof  string?)))))
 (parse-archive (path? . -> . (listof email?)))
 (parse-emails ((or/c none/c input-port?) . -> . (listof email?)))
 (parse-email  ((or/c none/c input-port?) . -> . email?)))

(provide (struct-out exn:malformed-email))

;> parse-archive : path -> (listof Email)
;Parse each file using _parse-emails_ and produce a list of all the
;parsed emails.
(define (parse-archive email-archive-path)
   (lambda (email-file-path) 
     (call-with-input-file email-file-path parse-emails))
   (find-files an-email-file? email-archive-path)))

;; Parse a stream of emails.  Emails are separated by lines that begin with
;; "From " (note the lack of ":").
(define parse-emails
  (lambda ((ip (current-input-port)))
    ; (do ((i (peek-line ip) (+ i 1))) ((= i vsides))
    (let loop ((line (peek-line ip)))
      (cond ((eof-object? line) '())
            ((new-email? line) ;; collection of emails
             (let* ((parsed (parse-email
                             (open-input-string (get-first-email ip))))
                    (parsed-rest (loop (peek-line ip))))
              ; (printf "an mbox collection: ~V ~N" 'parsed )
               (cons parsed parsed-rest)))
            ((an-email? line) (list (parse-email ip))) ;; a whole email per message
            (else (raise
                     (format "~a: ~a"
                             "Expected a \"From ...\", got"

;; Parse an email. It either does or does not have an attachment.
(define parse-email
  (lambda ((ip (current-input-port)))
    (let ((analysis (mime-analyze ip)))
      (if (multi-message? analysis)
          (parse-email-multi analysis)
          (parse-email-single analysis)))))

;; Parse an email with an attachment
(define/contract parse-email-multi
  (message? . -> . email?)
  (lambda (analysis)
     (message-fields->assoc (message-fields analysis))
     (map message->body
          (entity-parts (message-entity analysis))))))

;; Parse an email with no attachment
(define/contract parse-email-single
  (message? . -> . email?)
  (lambda (analysis)
     (message-fields->assoc (message-fields analysis))
     (list (message->body analysis)))))

(define (multi-message? analysis)
  (symbol=? (entity-type (message-entity analysis))

;; Produces a list of strings, each string representing a line in the email,
;; from a message.
(define (message->body message)
  (entity-body->body (entity-body (message-entity message))))

;; Uses the entity-body procedure to produce a list of strings, each string
;; representing a line in the message.
(define (entity-body->body body)
  (let ((o (open-output-string)))
    ;; Print to o
    (body o)
    ;; cdr because of a leading newline
    (cdr (string->los (get-output-string o)))))

;; Break a string with embedded newlines into a list of strings, each string
;; representing one line.
(define (string->los s)
  (pregexp-split "\n" s))

;; Show the next line, without consuming anything.
(define/contract peek-line
  (() (input-port?) . opt-> . (union eof-object? string?))
  (lambda ((ip (current-input-port)))
    (let loop ((acc "")
               (c (peek-char ip))
               (col 1))
        ((eof-object? c) (if (string=? acc "") c acc))
        ((char=? c #\newline) acc)
        (else (loop (string-append acc (string c))
                    (peek-char ip col)
                    (+ col 1)))))))

;; Is this line the start of  a new email?
(define (new-email? line)
  (pregexp-match #px"^From .*\\d{4}$" line))

;; Is this line the start of  a new email?
(define (an-email? line)
  (pregexp-match #px"^(?mi:From|To|Envelope.*|Received|Return-Path|Date|Subject|Content\\-.*|MIME-Version|Forwarded|Message.*)" line))

;; Is this file an email?
(define (an-email-file? path)
  (let ((file (file-name-from-path path)))
    (if file (pregexp-match
              (path->string (file-name-from-path path)))

;; Consume a list of strings of colon-separated values, and produce an assoc
;; of string, which are those values.
(define/contract message-fields->assoc
  ((listof string?) . -> . (listof (cons/c symbol? string?)))
  (lambda (fields)
    (filter ;; Get rid of '()s
      (lambda (field)
        (let ((n-v (pregexp-match "^(\\S+): *(.*)" field)))
          (if n-v
              (cons (string->symbol (cadr n-v))
                    (apply string-append (cddr n-v)))

;; Produce the first email in a stream of emails.
(define/contract get-first-email
  (input-port? . -> . string?)
  (lambda (ip)
    (let loop ((acc "")
               (line (peek-line ip))
               (seen-first #f))
        ((eof-object? line) acc)
        ((new-email? line) (if seen-first
                               (let* ((a (read-line ip))
                                      (l (peek-line ip)))
                                 (loop a l #t))))
        (else (let ((a (read-line ip))
                    (l (peek-line ip)))
                (loop (format "~a~n~a" acc a) l seen-first)))))))