;; Mike Burns 2004-08-22
;; Mike Burns 2005-12-10
;; Copyright 2004, 2005 Mike Burns

;; Parse emails
#lang scheme
(require ;mzscheme
;mzlib/contract ;(lib "")
 ;mzlib/etc ; (lib "")
 ;mzlib/pregexp ;(lib "")
 ;(lib "")
 net/mime ; (lib "" "net")

;; An email is a (make-email Assoc (listof (listof String)))
;; The headers currently do not include the MIME info, due to net/ .
;; The messages are all the messages in the email. If there are no
;; attachments, then there is only one message in the list. A message is a list
;; of string, each string representing one line.gnificant
(define-struct email (headers messages) #:inspector #f) ;(make-inspector))

(define-struct (exn:malformed-email exn) () #:inspector #f) ;(make-inspector))

 (struct email ((headers (listof (cons/c symbol? string?)))
                (messages (listof (listof  string?)))))
 (parse-archive (path? . -> . (listof email?)))
 (parse-emails ((or/c none/c input-port?) . -> . (listof email?)))
 (parse-email  ((or/c none/c input-port?) . -> . email?))
 (write-email (->* (email?) (output-port?) any))
 (an-email-file? (path? . -> . any))
 (get-email-header-val (->* any (email?) any))

(provide (struct-out exn:malformed-email))

;; Parse a stream of emails.  Emails are separated by lines that begin with
;; "From " (note the lack of ":").
(define parse-emails
  (lambda ((ip (current-input-port)))
    ; (do ((i (peek-line ip) (+ i 1))) ((= i vsides))
    (let loop ((line (peek-line ip)))
      (cond ((eof-object? line) '())
            ((new-email? line) ;; collection of emails
             (let* ((parsed (parse-email
                             (open-input-string (get-first-email ip))))
                    (parsed-rest (loop (peek-line ip))))
               ; (printf "an mbox collection: ~V ~N" 'parsed )
               (cons parsed parsed-rest)))
            ((an-email? line) (list (parse-email ip))) ;; a whole email per message
            (else (raise
                     (format "~a: ~a"
                             "Expected a \"From ...\", got"

;; Parse an email. It either does or does not have an attachment.
(define parse-email
  (lambda ((ip (current-input-port)))
    (let ((analysis (mime-analyze ip)))
      (if (multi-message? analysis)
          (parse-email-multi analysis)
          (parse-email-single analysis)))))

;; Parse an email with an attachment
(define/contract parse-email-multi
  (message? . -> . email?)
  (lambda (analysis)
     (message-fields->assoc (message-fields analysis))
     (map message->body
          (entity-parts (message-entity analysis))))))

;; Parse an email with no attachment
(define/contract parse-email-single
  (message? . -> . email?)
  (lambda (analysis)
     (message-fields->assoc (message-fields analysis))
     (list (message->body analysis)))))

(define (multi-message? analysis)
  (symbol=? (entity-type (message-entity analysis))

;; Produces a list of strings, each string representing a line in the email,
;; from a message.
(define (message->body message)
  (entity-body->body (entity-body (message-entity message))))

;; Uses the entity-body procedure to produce a list of strings, each string
;; representing a line in the message.
(define (entity-body->body body)
  (let ((o (open-output-string)))
    ;; Print to o
    (body o)
    ;; cdr because of a leading newline
    (cdr (string->los (get-output-string o)))))

;; Break a string with embedded newlines into a list of strings, each string
;; representing one line.
(define (string->los s)
  (regexp-split #px"\n" s))

;; Show the next line, without consuming anything.
(define/contract peek-line
  (->* () (input-port?) (or/c eof-object? string?))
 ;(() (input-port?) . opt-> . (union eof-object? string?))
  (lambda ((ip (current-input-port)))
    (let loop ((acc "")
               (c (peek-char ip))
               (col 1))
        ((eof-object? c) (if (string=? acc "") c acc))
        ((char=? c #\newline) acc)
        (else (loop (string-append acc (string c))
                    (peek-char ip col)
                    (+ col 1)))))))

;; Is this line the start of  a new email?
(define (new-email? line)
  (regexp-match #px"^From .*\\d{4}$" line))

;; Is this line the start of  a new email?
(define (an-email? line)
  (regexp-match #px"^(?mi:From|To|Envelope.*|Received|Return-Path|Date|Subject|Content\\-.*|MIME-Version|Forwarded|Message.*)" line))

;; Consume a list of strings of colon-separated values, and produce an assoc
;; of string, which are those values.
(define/contract message-fields->assoc
  ((listof string?) . -> . (listof (cons/c symbol? string?)))
  (lambda (fields)
    (filter ;; Get rid of '()s
      (lambda (field)
        (let ((n-v (regexp-match #px"^(\\S+): *(.*)" field)))
          (if n-v
              (cons (string->symbol (cadr n-v))
                    (apply string-append (cddr n-v)))

;; Produce the first email in a stream of emails.
(define/contract get-first-email
  (input-port? . -> . string?)
  (lambda (ip)
    (let loop ((acc "")
               (line (peek-line ip))
               (seen-first #f))
        ((eof-object? line) acc)
        ((new-email? line) (if seen-first
                               (let* ((a (read-line ip))
                                      (l (peek-line ip)))
                                 (loop a l #t))))
        (else (let ((a (read-line ip))
                    (l (peek-line ip)))
                (loop (format "~a~n~a" acc a) l seen-first)))))))

;;;; new section (c)2008 spdegabrielle

;> parse-archive : path -> (listof Email)
;Parse each file using _parse-emails_ and produce a list of all the
;parsed emails.
(define (parse-archive email-archive-path)
   (lambda (email-file-path) 
     (call-with-input-file email-file-path parse-emails))
   (find-files an-email-file? email-archive-path)))

; not exported yet ;

;; Is this file an email?
;; an-email-file? : path -> boolean
(define (an-email-file? path)
  (let ((file (file-name-from-path path)))
    (if (not (directory-exists? path)) 
              (path->string (file-name-from-path path)))

;; write-email : message [output-port] -> void
(define (write-email message (out-port (current-output-port)))
    (printf "THIS IS NOT WORKING ~V:~V~N" message out-port))

;;; tokeniser for email address fields  (to from cc)
;; addresses-string-split : string -> (listof string?)
(define (tokenise-email-addresses string)
  (if (string? string) 
       (lambda (a-string)
          (car (regexp-match 
                (car (regexp-match #px"\\S+@\\S+" a-string)))))) ;; [A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}
       (regexp-split #px",\\s*" string))

;; get-email-header-val : key message -> value
;; return value associated with key in EMAIL HEADER
(define (get-email-header-val key message)
  (let* ((headers-alist (email-headers message))
         (result (assoc key headers-alist)))
    (if (pair? result) (cdr result) #f)))