unzip.ss
(module unzip mzscheme
  (require (planet "io.ss" ("dherman" "io.plt" 1)))
  (require (planet "file.ss" ("dherman" "io.plt" 1)))
  (require (lib "contract.ss"))
  (require (lib "etc.ss"))
  (require (lib "inflate.ss"))
  (require (lib "port.ss"))
  (require (lib "file.ss"))
  (require "private/zip-constants.ss")

  ;; ===========================================================================
  ;; DATATYPES AND UTILITIES
  ;; ===========================================================================

  ;; (alistof bytes zip-entry)
  (define-struct zip-directory (contents))

  ;; nat * boolean
  (define-struct zip-entry (offset dir?))

  (define-struct (exn:fail:unzip exn:fail) ())
  (define-struct (exn:fail:unzip:no-such-entry exn:fail:unzip) (entry))
  
  (define (raise-unzip-error message)
    (raise
     (make-exn:fail:unzip (string->immutable-string (format "unzip: ~a" message))
                          (current-continuation-marks))))
  
  (define (raise-entry-not-found entry)
    (raise
     (make-exn:fail:unzip:no-such-entry
      (string->immutable-string (format "unzip: entry not found: \"~a\"" (bytes->string/latin-1 entry)))
      (current-continuation-marks)
      entry)))

  ;; zip-directory-entries : zip-directory -> (listof bytes)
  (define (zip-directory-entries zipdir)
    (map car (zip-directory-contents zipdir)))

  ;; zip-directory-lookup : bytes zip-directory -> (option zip-entry)
  (define (zip-directory-lookup entry zipdir)
    (let loop ([contents (zip-directory-contents zipdir)])
      (cond
        [(null? contents) #f]
        [(or (bytes=? entry (caar contents))
             (bytes=? (bytes-append entry #"/") (caar contents)))
         (cdar contents)]
        [else (loop (cdr contents))])))

  ;; zip-directory-contains? : (union string path bytes) zip-directory -> boolean
  (define (zip-directory-contains? entry zipdir)
    (if (bytes? entry)
        (and (zip-directory-lookup entry zipdir) #t)
        (zip-directory-contains? (path->zip-path entry) zipdir)))

  ;; matches-directory? : bytes bytes -> boolean
  (define (bytes-prefix? dirname entry-name)
    (let ([dirname-len (bytes-length dirname)]
          [entry-name-len (bytes-length entry-name)])
      (and (>= entry-name-len dirname-len)
           (bytes=? (subbytes entry-name 0 dirname-len) dirname))))

  ;; zip-directory-includes-directory? : (union string path bytes) zip-directory -> boolean
  (define (zip-directory-includes-directory? dirname zipdir)
    (if (bytes? dirname)
        (ormap (lambda (pair)
                 (bytes-prefix? dirname (car pair)))
               (zip-directory-contents zipdir))
        (zip-directory-includes-directory? (path->zip-path dirname) zipdir)))

  ;; path->zip-path : (union path string) -> bytes
  (define (path->zip-path p)
    (if (path? p)
        (bytes->zip-bytes (path->bytes p))
        (bytes->zip-bytes (string->bytes/latin-1 p))))
  
  (define (bytes->zip-bytes b)
    (regexp-replace* *os-specific-separator-regexp* b #"/"))

  ;; ===========================================================================
  ;; UNZIPPING ENGINE
  ;; ===========================================================================

  (define *slash-byte* (char->integer #\/))

  (define (directory-entry? name)
    (= (bytes-ref name (sub1 (bytes-length name))) *slash-byte*))

  ;; unzip-one-entry : input-port (bytes boolean -> (option output-port)) boolean -> any
  ;; unzips a single file and positions the input port at the next section
  (define (unzip-one-entry in build-port close-when-done?)
    (let ([read-int (lambda (count) (read-integer count #f in #f))])
      (let* ([signature            (read-int 4)]
             [version              (read-bytes 2 in)]
             [bits                 (read-int 2)]
             [compression          (read-int 2)]
             [time                 (read-int 2)]
             [date                 (read-int 2)]
             [crc-32               (read-int 4)]
             [compressed           (read-int 4)]
             [uncompressed         (read-int 4)]
             [filename-length      (read-int 2)]
             [extra-length         (read-int 2)]
             [filename             (read-bytes filename-length in)]
             [extra                (read-bytes extra-length in)])
        (let* ([dir? (directory-entry? filename)]
               [out (build-port filename dir?)])
          (dynamic-wind
           void
           (lambda ()
             (let ([in0 (if (bit-set? 3 bits)
                            in
                            (make-limited-input-port in compressed #f))])
               (cond
                 [(not out) (skip-bytes compressed in0)]
                 [(zero? compression) (copy-port in0 out)]
                 [else (inflate in0 out)])
               (when (bit-set? 3 bits)
                 ;; TODO: how does Java do this?
                 ;; it seems the EXT header here is optional (weird)
                 (when (bytes=? (peek-bytes 2 0 in) #"PK")
                   (skip-bytes 4 in))
                 (skip-bytes 12 in))))
           (lambda ()
             (when (and out close-when-done?)
               (close-output-port out))))))))

  ;; find-central-directory : input-port nat -> nat nat nat
  (define (find-central-directory in size)
    (let loop ([pos (- size 18)])
      (unless (positive? pos)
        (raise-unzip-error "no central directory"))
      (file-position in pos)
      (let* ([read-int (lambda (count) (read-integer count #f in #f))]
             [signature (read-int 4)])
        (if (= signature *end-of-central-directory-record*)
            (let ([disk-number       (read-int 2)]
                  [directory-disk    (read-int 2)]
                  [disk-entries      (read-int 2)]
                  [entry-count       (read-int 2)]
                  [directory-length  (read-int 4)]
                  [directory-offset  (read-int 4)]
                  [comment-length    (read-int 2)])
              (if (= (- size (file-position in)) comment-length)
                  (values directory-offset directory-length entry-count)
                  (loop (sub1 pos))))
            (loop (sub1 pos))))))
  
  ;; read-central-directory : input-port nat -> (alistof bytes zip-entry)
  (define (read-central-directory in size)
    (let-values ([(offset length count) (find-central-directory in size)])
      (file-position in offset)
      (build-list count
                  (lambda (i)
                    (let* ([read-int (lambda (count)
                                       (read-integer count #f in #f))]
                           [signature (read-int 4)])
                      (unless (= signature *central-file-header*)
                        (raise-unzip-error
                         (format "bad central file header signature: ~a"
                                 signature)))
                      (let ([version             (read-int 2)]
                            [required            (read-int 2)]
                            [bits                (read-int 2)]
                            [compression         (read-int 2)]
                            [time                (read-int 2)]
                            [date                (read-int 2)]
                            [crc-32              (read-int 4)]
                            [compressed          (read-int 4)]
                            [uncompressed        (read-int 4)]
                            [filename-length     (read-int 2)]
                            [extra-length        (read-int 2)]
                            [comment-length      (read-int 2)]
                            [disk-number         (read-int 2)]
                            [internal-attributes (read-int 2)]
                            [external-attributes (read-int 4)]
                            [relative-offset     (read-int 4)])
                        (let* ([filename (read-bytes filename-length)]
                               [dir? (directory-entry? filename)])
                          (skip-bytes (+ extra-length comment-length) in)
                          (cons filename (make-zip-entry relative-offset dir?)))))))))

  ;; ===========================================================================
  ;; FRONT END
  ;; ===========================================================================

  ;; unzip : [input-port (bytes boolean -> (option output-port)) boolean] -> any
  (define unzip
    (opt-lambda ([in (current-input-port)] [build-port *default-entry-parser*] [close-when-done? #t])
      (when (= (peek-integer 4 #f in #f) *local-file-header*)
        (unzip-one-entry in build-port close-when-done?)
        (unzip in build-port close-when-done?))))
  
  ;; read-zip-directory : (union string path) -> zip-directory
  (define (read-zip-directory path)
    (make-zip-directory
     (with-input-from-file path
       (lambda ()
         (read-central-directory (current-input-port)
                                 (file-size path))))))

  ;; unzip-entry : (union string path) zip-directory bytes [(bytes boolean input-port -> a)] -> a
  (define unzip-entry
    (opt-lambda (path dir entry-name [parse-entry *default-entry-parser*])
      (cond
        [(zip-directory-lookup entry-name dir)
         => (lambda (entry)
              (let-values ([(entry-in entry-out) (make-pipe)])
                (let ([saved-exn #f])
                  ;; TODO: BLEACH!!!!
                  (thread
                   (lambda ()
                     (with-handlers ([(lambda (exn) #t)
                                      (lambda (exn)
                                        (set! saved-exn exn)
                                        (close-output-port entry-out))])
                       (with-input-from-file path
                         (lambda ()
                           (file-position (current-input-port)
                                          (zip-entry-offset entry))
                           (unzip-one-entry (current-input-port)
                                            (lambda (n d?) entry-out)
                                            #t))))))
                  (let ([result (with-handlers ([(lambda (exn) #t)
                                                 (lambda (exn)
                                                   (raise (or saved-exn exn)))])
                                  (parse-entry entry-name
                                               (zip-entry-dir? entry)
                                               entry-in))])
                    (when saved-exn (raise saved-exn))
                    result))))]
        [else (raise-entry-not-found entry-name)])))
  
  ;; ===========================================================================
  ;; ENTRY PARSERS
  ;; ===========================================================================

  ;; build-filesystem-entry-parser : [output-flag] -> entry-parser
  (define build-filesystem-entry-parser
    (opt-lambda ([flag 'error])
      (opt-lambda (name dir? [in #f])
        (cond
          [dir? (let ([dir (bytes->path name)])
                  (unless (directory-exists? dir)
                    (make-directory* dir))
                  #f)]
          [in (let* ([path (bytes->path name)]
                     [parent (dirname path)])
                (unless (directory-exists? parent)
                  (make-directory* parent))
                (with-output-to-file path
                  (lambda ()
                    (copy-port in (current-output-port)))
                  flag))]
          [else (let* ([path (bytes->path name)]
                       [parent (dirname path)])
                  (unless (directory-exists? parent)
                    (make-directory* parent))
                  (open-output-file path flag))]))))
  
  (define *default-entry-parser* (build-filesystem-entry-parser))

  ;; build-piped-entry-parser : output-port -> entry-parser
  (define (build-piped-entry-parser out)
    (opt-lambda (name dir? [in #f])
      (cond
        [dir? #f]
        [in (copy-port in out)]
        [else out])))
  
  (define output-flag/c
    (symbols 'error 'replace 'truncate 'truncate/replace 'append 'update))
  
  (define entry-parser/c
    (case->
     (bytes? boolean? . -> . (union output-port? false/c))
     (bytes? boolean? input-port? . -> . any)))
  
  (provide/contract
   [exn:fail:unzip? (any/c . -> . boolean?)]
   [exn:fail:unzip:no-such-entry? (any/c . -> . boolean?)]
   [make-exn:fail:unzip (string? continuation-mark-set? . -> . exn:fail:unzip?)]
   [make-exn:fail:unzip:no-such-entry (string? continuation-mark-set? bytes? . -> . exn:fail:unzip:no-such-entry?)]
   [exn:fail:unzip:no-such-entry-entry (exn:fail:unzip:no-such-entry? . -> . bytes?)]
   [zip-directory? (any/c . -> . boolean?)]
   [zip-directory-entries (zip-directory? . -> . (listof bytes?))]
   [zip-directory-contains? ((union string? path? bytes?) zip-directory? . -> . boolean?)]
   [zip-directory-includes-directory? ((union string? path? bytes?) zip-directory? . -> . boolean?)])

  (provide/contract
   [output-flag/c contract?]
   [entry-parser/c contract?])

  (provide/contract
   [unzip (() (input-port? (bytes? boolean? . -> . (union output-port? false/c)) boolean?) . opt-> . any)]
   [read-zip-directory ((union string? path?) . -> . zip-directory?)]
   [unzip-entry (((union string? path?) zip-directory? bytes?)
                 ((bytes? boolean? input-port? . -> . any))
                 . opt-> .
                 any)]
   [path->zip-path ((union string? path?) . -> . bytes?)]
   [build-filesystem-entry-parser (() (output-flag/c) . opt-> . entry-parser/c)]
   [build-piped-entry-parser (output-port? . -> . entry-parser/c)]))