mediafile.rkt
#lang racket/base
;; Copyright Neil Van Dyke. See file "info.rkt".

(require racket/match
         (planet neil/mcfly)
         (planet neil/path-misc)
         "mediafile-misc.rkt")

(module+ test
  (require "planet-overeasy.rkt"))

(doc (section "Introduction")

     (para (italic "Note: This package is in alpha-testing.  Please see the
``Contributing Test Files'' subsection below.  Thanks."))
     
     (para "The "
           (tt "mediafile")
           " package provides utilities for dealing with collections of media
files (still image, audio, video) and the metadata properties of those files.
Currently, this package provides procedures for extracting metadata from a few
popular media file formats, and procedures for maintaining a database of media
files currently in various filesystem directory trees.  This functionality is
useful for media-player applications, and for managing collections of media
files.")
     
     (para "Currently, this package is implemented in pure Racket code, without
linking any new native code into the Racket process, nor running external
programs."))

(doc (section "Types"))

(doc (defproc (mediafile-type? (x any/c))
       boolean?
       (para "Predicate for whether or not "
             (racket x)
             " is a "
             (racket mediafile-type)
             ".")
       (para "A valid type is either a symbol, of a MIME content-type name, or
a list of symbols, in which the last symbol is the MIME content-type and the
one-or-more preceeding symbols are encodings atop the content-type.  For
example, file "
             (filepath "foo.tif")
             " might have type "
             (racket 'image/tiff)
             ", and file "
             (filepath "foo.tif.gz")
             " might have type "
             (racket '(gzip image/tiff))
             ".")))
(provide mediafile-type?)
(define (mediafile-type? x)
  (match x
    ((? symbol?)                  #true)
    ((list 'gzip ..1 (? symbol?)) #true)
    (_                            #false)))

(doc (defproc (mediafile-props? (x any/c))
       boolean?
       (para "Predicate for whether or not "
             (racket x)
             " is a "
             (racket mediafile-props)
             ", which is used to represent properties of a media file.")
       (para "A props is an alist of alists of symbols to datums.  In other words, following this contract:")
       (racketblock (listof (cons/c any/c
                                    (listof (cons/c symbol?
                                                    any/c)))))
       (para "The top level alist is for ``parts'', such as for distinguishing
multiple media objects in a single container file.  The "
             (racket car)
             " of each of these top level alist pairs can be any datum, although will often be a number representing the sequence of the part in the container, unless there is a better unique key.  A special "
             (racket car)
             " is "
             (racket #f)
             ", which means properties of the entire container file.")
       (para "The "
             (racket cdr)
             " of these top-level pairs is the second-level alist, which is
symbol-to-datum pairs specific to the part.  The names of the symbols are often
specific to the type of either the file or the part.  The datum values
corresponding to the names in the part can be of any type; an application
wishing to do more with the value than display it in raw form must have "
             (italic "a priori")
             " knowledge of the type, such as that "
             (racket 'exif:metering-mode)
             " typically has values like "
             (racket 'center-weighted-average)
             " and "
             (racket 'spot)
             ", and what those values mean for the application.")
       ;; TODO: Talk more about this, including meaning of #(decoded X) vector.
       ))
(provide mediafile-props?)
(define (mediafile-props? x)
  (match x
    ((list (cons #f (list (cons (? symbol?) _) ...))
           (cons _ (list (cons (? symbol?) _) ...)) ...)
     #true)
    ((list (cons _ (list (cons (? symbol?) _) ...)) ...)
     #true)
    (_ #false)))

(module+ test
  (test (mediafile-props? '()) #true)
  (test (mediafile-props? '((#f . ((a . 1) (b . 2)))
                            (0  . ((c . 3) (d . 4)))
                            (1  . ())
                            (2  . ((e . 5) (f . 6)))))
        #true)
  (test (mediafile-props? 'nickelback) #false))

(module+ test
  (test (mediafile-type? 'image/jpeg) #true)
  (test (mediafile-type? '(gzip image/jpeg)) #true)
  (test (mediafile-type? '(gzip gzip image/jpeg)) #true)
  (test (mediafile-type? 69) #false)
  (test (mediafile-type? '(image/jpeg image/jpeg)) #false)
  (test (mediafile-type? '(image/jpeg)) #false)
  (test (mediafile-type? '(69 image/jpeg)) #false))

(doc (defstruct mediafile
       ((path     path?)
        (type     mediafile-type?)
        (identity any/c)
        (size     any/c)
        (mtime    any/c)
        (props    mediafile-props?))
       #:transparent)
     (para "Struct representing a "
           (racket mediafile)
           ".  The "
           (racket identity)
           ", "
           (racket size)
           ", and "
           (racket mtime)
           " values are intended to help determine whether a file has been
modified since it was last scanned for properties."))
(provide (struct-out mediafile))
(define-struct mediafile
  (path
   type
   identity
   size
   mtime
   props)
  #:transparent)

(doc (section "Content Types")
     
     (para "This package currently supports a few different MIME content-types,
listed in the following subsections, along with lists of references that were
used in the implementation for each content-type."))

(doc (subsection "TIFF (image/tiff)")
     
     (itemlist
      
      (item "Adobe Developers Association, "
            (hyperlink "http://www.exif.org/TIFF6.pdf"
                       "TIFF Revision 6.0")
            ", 1992-06-03")
      
      (item (hyperlink "http://www.fileformat.info/format/tiff/sample/index.htm"
                       "FileFormat.Info TIFF Sample Files"))))

(doc (subsection "JPEG/Exif (image/jpeg)")
     
     (itemlist
      
      (item (hyperlink "http://www.w3.org/Graphics/JPEG/itu-t81.pdf"
                       "ITU CCITT T.81")
            ", Terminal Equipment and Protocols for Telematic Services -
Information Technology - Digital Compression and Coding of Continuous-Tone
Still Images - Requirements and Guidelines, 1992-09")
      
      (item (hyperlink "http://www.cipa.jp/english/hyoujunka/kikaku/pdf/DC-008-2010_E.pdf"
                       "CIPA DC-008-Translation-2010: Exchangeable image file
format for digital still cameras: Exif Version 2.3")
            ", 2010-04-26")
      
      (item (hyperlink "http://www.burren.cx/david/canon.html"
                       "Davis Burren, EXIF MakerNote of Canon")
            ", Revision 1.15, 2001-06-03")
      
      (item (hyperlink "http://www.ozhiker.com/electronics/pjmt/jpeg_info/canon_mn.html"
                       "Evan Hunter, Canon Makernote information")
            ", viewed 2012-11-17")
      
      (item (hyperlink "http://www.exiv2.org/tags-canon.html"
                       "Canon MakerNote Tags defined in Exiv2")
            ", viewed 2012-11-17")
      
      (item (hyperlink "http://gvsoft.homedns.org/exif/"
                       "GVsoft Exif MakerNote information")
            ", viewed 2012-11-23")))

(require "mediafile-exif.rkt")

(doc (subsection "Ogg Vorbis (audio/ogg)")
     
     (itemlist
      
      (item (hyperlink "http://xiph.org/vorbis/doc/Vorbis_I_spec.html"
                       "Vorbis I specification")
            ", 2012-02-03")
      
      (item (hyperlink "http://xiph.org/vorbis/doc/v-comment.html"
                       "Ogg Vorbis I format specification: comment field and
header specification"))
      
      (item (hyperlink "http://xiph.org/vorbis/doc/oggstream.html"
                       "Ogg logical and physical bitstream overview"))
      
      (item (hyperlink "http://xiph.org/vorbis/doc/framing.html"
                       "Ogg logical bitstream framing"))
      
      (item (hyperlink "http://xiph.org/ogg/doc/ogg-multiplex.html"
                       "Page Multiplexing and Ordering in a Physical Ogg Stream"))))

(require "mediafile-ogg.rkt")

(doc (section "Files and Scanning")
     
     (para "This section lists procedures for maintaining a database of "
           (racket "mediafile")
           " objects corresponding to files in filesystem directory trees."))

(define %mediafile:extension-to-mime-type-symbol-hash
  #hash(("jpe"  . image/jpeg)
        ("jpeg" . image/jpeg)
        ("jpg"  . image/jpeg)
        ("oga"  . audio/ogg)
        ("ogg"  . audio/ogg)
        ;; ("ogv"  . video/ogg)
        ;; ("ogx"  . application/ogg)
        ("tif"  . image/tiff)
        ("tiff" . image/tiff)))

(define %mediafile:file-type-to-get-props-proc-hash
  (make-immutable-hash
   `((audio/ogg  . ,get-vorbis-file-props)
     (image/jpeg . ,get-jpeg-file-props)
     (image/tiff . ,get-tiff-file-props))))

(define (%mediafile:path->file-type path)
  (let ((str (cond ((string? path) path)
                   ((path?   path) (path->string path))
                   (else (raise-type-error '%mediafile:path->extensions
                                           "path-string?"
                                           path)))))
    ;; TODO: Maybe make this handle arbitrary stacking of compression
    ;; encodings.  Maybe do split-path first, then do in a loop on the string.
    (cond ((regexp-match #rx"\\.([a-zA-Z][a-zA-Z]*)(\\.[gG][zZ])?$" str)
           => (lambda (m)
                (apply (lambda (all content-ext compression-ext)
                         (let ((content-type (hash-ref %mediafile:extension-to-mime-type-symbol-hash
                                                       (string-downcase content-ext)
                                                       #f)))
                           (if compression-ext
                               (list 'gzip content-type)
                               content-type)))
                       m)))
          (else #f))))

(module+ test
  (test (%mediafile:path->file-type "foo.ogg")    'audio/ogg)
  (test (%mediafile:path->file-type "foo.tif")    'image/tiff)
  (test (%mediafile:path->file-type "foo.tif.gz") '(gzip image/tiff))
  (test (%mediafile:path->file-type "foo.jpg")    'image/jpeg)
  (test (%mediafile:path->file-type "foo.JPEG")   'image/jpeg)
  (test (%mediafile:path->file-type "foo.c")      '#f)
  (test (%mediafile:path->file-type "foo.c.gz")   '(gzip #f)))

(doc (defproc (path->mediafile
               (path path-string?)
               (#:canonicalize-path? canonicalize-path? boolean?             #true)
               (#:old-mediafile      old-mediafile      (or/c #f mediafile?) #f)
               (#:type-mandatory?    type-mandatory?    boolean?             #false)
               (#:props-mandatory?   props-mandatory?   boolean?             #false)
               (#:exception?         exception?         boolean?             #true))
       mediafile?
       (para "Yields a "
             (racket mediafile)
             ", given a path to the file.  If "
             (racket #:old-mediafile)
             " is given, then that value will be returned if the file does not seem to have changed since that "
             (racket mediafile)
             " was created, which potentially saves the cost of scanning for properties.")
       (para "If there is a problem creating a "
             (racket mediafile)
             ", then the behavior depends on "
             (racket #:exception?)
             " -- if true, then an exception is raised; if false, then this procedure returns "
             (racket #false)
             " rather than a "
             (racket mediafile)
             ".  The "
             (racket #:type-mandatory?)
             " and "
             (racket #:props-mandatory?)
             " arguments specify what should be considered a ``problem'' for this purpose.")
       (para "The "
             (racket #:canonicalize-path?)
             " specifies whether or not to store a canonicalized path in the "
             (racket mediafile)
             ", rather than the "
             (racket path)
             " argument verbatim.  Most applications will want to have a
canonicalized path, which is the default behavior.")))
(provide path->mediafile)
(define (path->mediafile path
                         #:canonicalize-path? (canonicalize-path? #true)
                         #:old-mediafile      (old-mediafile      #f)
                         #:type-mandatory?    (type-mandatory?    #true)
                         #:props-mandatory?   (props-mandatory?   #false)
                         #:exception?         (exception?         #true))
  (let*-values (((path)
                 (if canonicalize-path?
                     (canonicalize-path path)
                     (cleanse-path      path)))
                ((identity+size+mtime)
                 (lambda ()
                   (values (or (file-or-directory-identity path #true)
                               (error 'path->mediafile "identity #f of file ~S" path))
                           (or (file-size path)
                               (error 'path->mediafile "size #f of file ~S" path))
                           (or (file-or-directory-modify-seconds path)
                               (error 'path->mediafile "mtime #f of file ~S" path)))))
                ((identity size mtime)
                 (if exception?
                     (identity+size+mtime)
                     (with-handlers ((exn:fail? (lambda (e) (values #f #f #f))))
                       (identity+size+mtime)))))
    (cond ((or (not identity) (not size) (not mtime)) #f)
          ((and old-mediafile
                (equal? path     (mediafile-path     old-mediafile))
                (equal? identity (mediafile-identity old-mediafile))
                (equal? size     (mediafile-size     old-mediafile))
                (equal? mtime    (mediafile-mtime    old-mediafile)))
           old-mediafile)
          (else
           ;; Note: We are returning file-type below from a lot of places because
           ;; we might make get-props-proc also return better file-type.
           (let-values (((file-type props)
                         (cond ((%mediafile:path->file-type path)
                                ;; Got initial file-type from filename.
                                => (lambda (file-type)
                                     (cond ((hash-ref %mediafile:file-type-to-get-props-proc-hash
                                                      file-type
                                                      #f)
                                            ;; Know how to get props.
                                            => (lambda (get-props-proc)
                                                 (if props-mandatory?
                                                     (if exception?
                                                         (values file-type
                                                                 (get-props-proc path))
                                                         (with-handlers ((exn:fail? (lambda (e)
                                                                                      (values file-type
                                                                                              #f))))
                                                           (values file-type
                                                                   (get-props-proc path))))
                                                     (with-handlers ((exn:fail? (lambda (e)
                                                                                  (values file-type
                                                                                          `((#f . ((error . ,(string-append "could not get props: "
                                                                                                                            (exn-message e))))))))))
                                                       (values file-type
                                                               (get-props-proc path))))))
                                           ;; Don't know how to get props.
                                           (else (if props-mandatory?
                                                     (if exception?
                                                         (error 'path->mediafile
                                                                "do not know how to get get props for file type ~S for path ~S"
                                                                file-type
                                                                path)
                                                         (values file-type #f))
                                                     (values #f
                                                             '((#f . ((error . "do not know how to get props"))))))))))
                               ;; Didn't get initial file-type from filename.
                               (else (if (or type-mandatory? props-mandatory?)
                                         (if exception?
                                             (error 'path->mediafile
                                                    "could not determine file type for path ~S"
                                                    path)
                                             ;; TODO: !!! Is this what we want to return here, and elsewhere in this procedure?
                                             ;; Maybe we need to separate the error message from
                                             ;; the information about whether or not we got props.
                                             (values #f #f))
                                         (values #f '((#f . ((error . "unknown file type"))))))))))
             (and props
                  (make-mediafile path
                                  file-type
                                  identity
                                  size
                                  mtime
                                  props)))))))

(doc (defproc (scan-mediafiles
               (start-path-or-paths (or/c path-string? (list-of path-string?)))
               (#:canonicalize-paths? canonicalize-paths? boolean?        #true)
               (#:type-mandatory?     type-mandatory?     boolean?        #false)
               (#:props-mandatory?    props-mandatory?    boolean?        #false)
               (#:old-hash            old-hash            immutable-hash? #f)
               (#:remove-other-paths? remove-other-paths? boolean?        #true))
       immutable-hash?
       (para "Scans filesystems recursively, beneath the paths given as "
             (racket start-path-or-paths)
             ", and returns a hash of paths to "
             (racket mediafile)
             " objects.")
       (para "If "
             (racket #:old-hash)
             " is provided, then this hash is used as a starting point for the hash that will ultimately be returned, such as for updating from a previous run of "
             (racket scan-mediafiles)
             ".  If "
             (racket #:old-hash)
             " is provided, then "
             (racket #:remove-other-paths?)
             " determines whether paths in the old hash that are not within the scope of "
             (racket start-path-or-paths)
             " should be removed before returning the new hash.")
       (para "The "
             (racket #:canonicalize-paths?)
             ", "
             (racket #:type-mandatory?)
             ", and "
             (racket #:props-mandatory?)
             " arguments are passed to "
             (racket path->mediafile)
             ".")))
(provide scan-mediafiles)
(define (scan-mediafiles start-path-or-paths
                         #:canonicalize-paths? (canonicalize-paths? #true)
                         #:type-mandatory?     (type-mandatory?    #true)
                         #:props-mandatory?    (props-mandatory?    #false)
                         #:old-hash            (old-hash            #f)
                         #:remove-other-paths? (remove-other-paths? #true))
  (let loop-paths ((paths          (map (lambda (path)
                                          (if canonicalize-paths?
                                              (simplify-path (path->complete-path path))
                                              (canonicalize-path path)))
                                        (if (path-string? start-path-or-paths)
                                            (list start-path-or-paths)
                                            start-path-or-paths)))
                   (mediafile-hash (or old-hash
                                       (make-immutable-hash)))
                   (visited-hash   (make-immutable-hash)))
    (log-mediafile-debug "scan-mediafiles: loop-paths paths ~S mediafile-hash ~S visited-hash ~S"
                         paths
                         mediafile-hash
                         visited-hash)
    (if (null? paths)
        (if remove-other-paths?
            (let loop-remove-others ((keys           (hash-keys mediafile-hash))
                                     (mediafile-hash mediafile-hash))
              (if (null? keys)
                  mediafile-hash
                  (let ((key (car keys)))
                    (loop-remove-others (cdr keys)
                                        (if (hash-has-key? visited-hash key)
                                            mediafile-hash
                                            (hash-remove mediafile-hash key))))))
            mediafile-hash)
        (let ((path (car paths)))
          (cond ((hash-has-key? visited-hash path)
                 (log-mediafile-debug "scan-mediafiles: already visited ~S" path)
                 (loop-paths (cdr paths)
                             mediafile-hash
                             visited-hash))
                ((link-exists? path)
                 (log-mediafile-debug "scan-mediafiles: symlink ~S" path)
                 (loop-paths (cdr paths)
                             (hash-remove mediafile-hash path)
                             (hash-set    visited-hash   path #true)))
                ((directory-exists? path)
                 (log-mediafile-debug "scan-mediafiles: directory ~S" path)
                 (loop-paths (let loop-subs ((subs  (directory-list path))
                                             (paths (cdr paths)))
                               (if (null? subs)
                                   paths
                                   (loop-subs (cdr subs)
                                              (let* ((sub     (car subs))
                                                     (sub-str (path->string sub)))
                                                (if (regexp-match? #rx"^\\." sub-str)
                                                    paths
                                                    (cons (build-path path sub)
                                                          paths))))))
                             (hash-remove mediafile-hash path)
                             (hash-set    visited-hash   path #true)))
                ;; TODO: !!! rule out fifos and other special files. do we have to do anything else?
                ((file-exists? path)
                 (log-mediafile-debug "scan-mediafiles: normal file ~S" path)
                 (cond ((path->mediafile path
                                         #:canonicalize-path? #false
                                         #:old-mediafile      #f
                                         #:type-mandatory?    type-mandatory?
                                         #:props-mandatory?   props-mandatory?
                                         #:exception?         #false)
                        => (lambda (mediafile)
                             (loop-paths (cdr paths)
                                         (hash-set mediafile-hash path mediafile)
                                         (hash-set visited-hash   path #true))))
                       (else (loop-paths (cdr paths)
                                         (hash-remove mediafile-hash path)
                                         (hash-set    visited-hash   path #true)))))
                (else
                 (log-mediafile-debug "scan-mediafiles: weird file ~S" path)
                 (loop-paths (cdr paths)
                             (hash-remove mediafile-hash path)
                             (hash-set    visited-hash   path #true))))))))

(module+ test
  (test (scan-mediafiles "test-files/exif-org"
                         #:canonicalize-paths? #true
                         #:type-mandatory?     #false
                         #:props-mandatory?    #false
                         #:old-hash            #f
                         #:remove-other-paths? #true)
        ;; TODO: !!! Fill in the expected value here.  We'll need to have
        ;; canonicalize-paths? false, or to provide a filter or our own test
        ;; comparison procedure.
        '!!!))

(doc (section "Test Files")
     
     (para "This package contains some files that are used for test data.
Contributions of particular kinds of additional files are welcome."))

(doc (subsection "Current Test Files")
     
     (para "The following directory structure exists in the source code
distribution for this package.")
     
     (itemlist
      
      (item (filepath "test-files/")
            (itemlist
             
             (item (filepath "exif-org/")
                   " -- JPEG/Exif and other files, from "
                   (url "http://exif.org/samples.html")
                   ", courtesy of John Hawkins.")
             
             (item (filepath "public-domain/")
                   " -- Files known to be in the legal public domain, for
testing with a breadth of file creators (e.g., different camera models) and
situations (e.g., different Ogg container layouts)."
                   (itemlist
                    
                    (item (filepath "jpeg/")
                          " -- JPEG/Exif and JPEG/JFIF files from public
domain, especially verbatim as saved by particular camera models.")))))))

(doc (subsection "Contributing Test Files")
     
     (para "If you'd like to contribute a JPEG file from a particular camera
model, that would be very welcome.  Here's how:")
     
     (itemlist #:style 'ordered
               
               (item "Set camera to capture the image in a relatively "
                     (italic "small")
                     " file size.  This means setting camera to low resolution,
high compression, low quality, etc.  (The small size is to make including files
with the package more practical.)")
               
               (item "Choose a photographic subject (e.g., stop sign, cloud,
thumbtack, light switch) that:"
                     
                     (itemlist
                      
                      (item "Does not contain any trademarks or copyrighted
material (no brand names, logos, book pages, etc.).")
                      
                      (item "Does not contain anything personally-identifiable,
such as faces.")
                      
                      (item "Is G-rated.  (No showing off Racket programmer abs.)")
                      
                      (item "Is not too complicated, so should compress well.")))
               
               (item "Take photo with camera.")
               
               (item "Do "
                     (italic "not")
                     " edit the photo in any way at all -- it must be
byte-for-byte identical to how the camera first wrote it to your memory card.")
               
               (item "Email the photo to: "
                     (tt "neil")
                     (bold (tt "@"))
                     (tt "neilvandyke.org")
                     (linebreak)
                     "In the text of the email, please state ``This image is in
the public domain.''  Note that you are legally giving up all copyright to this
image, to make including it in a regression test suite more practical.")))

;; TODO: "When Exif is employed for JPEG files, the Exif data are stored in
;; one of JPEG's defined utility Application Segments, the APP1 (segment marker
;; 0xFFE1), which in effect holds an entire TIFF file within. When Exif is
;; employed in TIFF files (also when used as "an embedded TIFF file" mentioned
;; earlier), the TIFF Private Tag 0x8769 defines a sub-Image File Directory
;; (IFD) that holds the Exif specified TIFF Tags. In addition, Exif also
;; defines a Global Positioning System sub-IFD using the TIFF Private Tag
;; 0x8825, holding location information, and an "Interoperability IFD"
;; specified within the Exif sub-IFD, using the Exif tag 0xA005."

(doc (section "Known Issues")
     
     (itemlist
      
      (item "Assemble a suite of test input files, without legal encumbrances.
Preferrably small enough file sizes to include in PLaneT package, as part of
built-in unit tests.")
      
      (item "Needs more real-world testing with diversity of files.")
      
      (item "Malformed or insufficiently supported TIFF and Exif files can
result in infinite loops or use excessive resources.")
      
      (item "Support for additional Exif MakerNotes, especially Nikon ones.")
      
      (item "A little bit more support for Canon Exif MakerNotes is possible,
such as decoding Custom Functions.")
      
      (item "Add support for getting info about multiple streams in Ogg
files.")
      
      (item "Support additional file types, especially JPEG/JFIF, PNG, and the
multiple MP3 ID3 variants."
            ;; http://id3.org/Developer%20Information
            )
      
      (item "Add feature to map properties of different formats to common ontology.")

      (item "Make a variation on "
            (racket scan-mediafiles)
            " with a fold interface, such as for various ways of getting
incremental results, including if the scanning is in concurrent thread or
process.")))

(doc history
     
     (#:planet 1:0 #:date "2012-11-27"
               (itemlist
                
                (item "Initial version, for alpha testing and soliciting
additional test files.  Some subsequent releases are likely to have
backward-incompatible changes, such as changes to types of properties."))))