scgi.rkt
#lang scheme/base
;;; @Package     scgi
;;; @Subtitle    Web Server HTTP SCGI and CGI in Racket
;;; @HomePage    http://www.neilvandyke.org/racket-scgi/
;;; @Author      Neil Van Dyke
;;; @Version     0.6
;;; @Date        2011-08-22
;;; @PLaneT      neil/scgi:1:=5

;; $Id: scgi.rkt,v 1.69 2011/08/22 11:06:20 neilpair Exp $

;;; @legal
;;; Copyright @copyright{} 2010--2011 Neil Van Dyke.  This program is Free
;;; Software; Software; you can redistribute it and/or modify it under the
;;; terms of the GNU Lesser General Public License as published by the Free
;;; Software Foundation; either version 3 of the License (LGPL 3), or (at your
;;; option) any later version.  This program is distributed in the hope that it
;;; will be useful, but without any warranty; without even the implied warranty
;;; of merchantability or fitness for a particular purpose.  See
;;; @indicateurl{http://www.gnu.org/licenses/} for details.  For other licenses
;;; and consulting, please contact the author.
;;; @end legal

(require (only-in scheme/port make-limited-input-port)
         (only-in scheme/tcp  tcp-accept/enable-break tcp-close tcp-listen))

;;; @section Introduction
;;;
;;; The @b{scgi} library implements fast Web CGI using the SCGI protocol.  This
;;; library is used in conjunction with an HTTP server supporting SCGI, such as
;;; Apache Server with the @code{mod_scgi} module.
;;;
;;; The @b{scgi} library also supports running as normal Web CGI without any
;;; change to the source code of the app, such as during development of an
;;; application intended to be deployed using SCGI.  This also gives
;;; flexibility in deployment, allowing a system administrator to switch
;;; between either mode just by editing the HTTP server configuration.
;;;
;;; The SCGI protocol was specified by Neil Schemenauer in ``SCGI: A Simple
;;; Common Gateway Interface alternative,'' dated 2008-06-23
;;; (@uref{http://python.ca/scgi/protocol.txt}).
;;;
;;; An example usage of this library:
;;;
;;; @lisp
;;; (require (planet neil/scgi))
;;;
;;; (cgi (lambda ()
;;;        (and (eq? (cgi-type) 'scgi)
;;;             (my-do-some-init-only-for-scgi)))
;;;
;;;      (lambda ()
;;;        (display "Content-type: text/html\r\n\r\n")
;;;        (display "<p>Hello, world</p>"))
;;;
;;;      (lambda ()
;;;        (my-shutdown-stuff)))
;;; @end lisp
;;;
;;; @subsection Apache mod_scgi
;;;
;;; Apache Server is one way to run SCGI, though not the only way.  Note that
;;; your Apache installation might not have the @code{mod_scgi} module
;;; installed or enabled by default.  If you happen to be running Debian Lenny
;;; (@code{stable}) or similar, this module can be installed via the Debian
;;; package @code{libapache2-mod-scgi}.
;;;
;;; Once you've installed @code{mod_scgi}, you need some standard SCGI
;;; directives to end up in your Apache config files, whether you accomplish
;;; that by editing config files manually, making symbolic links in a
;;; @code{mods-enabled} directory, or clicking in a GUI.
;;;
;;; For example, the following loads @code{mod_scgi}, maps URL paths under
;;; @code{/mypath} to the SCGI server on the local machine at the standard SCGI
;;; TCP port, and sets a 60-second timeout for the SCGI server to respond to a
;;; request before Apache drops the connection:
;;;
;;; @example
;;; LoadModule scgi_module /usr/lib/apache2/modules/mod_scgi.so
;;; SCGIMount /mypath 127.0.0.1:4000
;;; SCGIServerTimeout 60
;;; @end example
;;;
;;; There are additional @code{mod_scgi} Apache config directives, including
;;; @code{SCGIHandler} and @code{SCGIServer}.

(define-syntax %debug
  (syntax-rules ()
    ((_ FMT ARG ...)
     (log-debug (format (string-append "scgi: ~A " FMT)
                        (current-milliseconds)
                        ARG ...)))))

(define %cur-cgi-request-id               (make-parameter #f))
(define %cur-cgi-type                     (make-parameter #f))
(define %cur-end-cgi-request-ec           (make-parameter #f))
(define %cur-scgi-content-length          (make-parameter #f))
(define %cur-scgi-variables               (make-parameter #f))
(define %cur-stop-cgi-service-immediately (make-parameter #f))

(define (%invalid-cgi-type-error sym)
  (error sym "Invalid CGI type: ~S" (%cur-cgi-type)))

(define (%cgi-context-only-error sym)
  (error sym "Only works in CGI context"))

(define (%cgi-request-context-only-error sym)
  (error sym "Only works in CGI request context"))

(define (%assert-cgi-request-context sym)
  (or (%cur-cgi-request-id)
      (%cgi-request-context-only-error sym)))

(define (%bytes->number x)
  ;; TODO: Maybe make this more efficient.
  (string->number (bytes->string/latin-1 x)))

(define (%read-scgi-variables in)
  (begin0
      (cond
       ;; TODO: See whether these regexp operations on the port are sucking in
       ;; our entire input, including POST data.  If we limit this regexp-match
       ;; on input port to 5 chars or so, and also address the regexp below.
       ;; UPDATE: In stress-testing with doing an HTTP POST of approx 100MB
       ;; form file attachment, hundreds of times, under Racket 5.1.1 and PLT
       ;; Scheme 4.2.5, process size did not grow.  Additionally, at least
       ;; under 4.2.5, it appears that the regexps alone did *not* cause all
       ;; the POST data to be read.  We probably still want to limit or get rid
       ;; of these regexps, but the successful test results are reassuring for
       ;; now.
       ((regexp-match #rx"^[0-9]+" in)
        => (lambda (m)
             (or (eqv? 58 (read-byte in)) ; colon
                 (error '%read-scgi-variables
                        "Expected colon after ~S."
                        (car m)))
             ;; Note: We do the read non-tail-recursively, so that we don't
             ;; have to reverse the result.
             (let loop ((size-left (%bytes->number (car m))))
               (if (zero? size-left)
                   '()
                   (cond
                    ;; TODO: Should we limit this regexp as well?  Or rewrite
                    ;; it to do char-by-char I/O and not use regexp?
                    ((regexp-match #rx"^([A-Za-z0-9_]+)\000([^\000]*)\000"
                                   in
                                   0
                                   size-left)
                     =>
                     (lambda (m)
                       (cons (cons (list-ref m 1)
                                   (list-ref m 2))
                             (loop (- size-left
                                      (bytes-length (list-ref m 0)))))))
                    (else
                     (error
                      '%read-scgi-variables
                      "Could not read SCGI header with ~S bytes remaining."
                      size-left)))))))
       (else (error '%read-scgi-variables
                    "Did not read size number of SCGI header.")))
    (or (eqv? 44 (read-byte in))
        (error '%read-scgi-variables
               "Could not read comma in SCGI header."))))

(define (%handle-scgi-accept in out proc request-id)
  (%debug "%handle-scgi-accept request-id=~S begin"
          request-id)
  (let ((request-exit-status 0))
    (let* ((variables (%read-scgi-variables in))
           (content-length
            (%bytes->number
             (cond ((assoc #"CONTENT_LENGTH" variables) => cdr)
                   (error '%handle-scgi-accept
                          "~S missing CONTENT_LENGTH in ~S"
                          request-id
                          variables))))
           (content-in (make-limited-input-port in content-length)))
      (%debug "%handle-scgi-accept request-id=~S variables=~S"
              request-id
              variables)
      (parameterize ((current-input-port       content-in)
                     (current-output-port      out)
                     (%cur-scgi-variables      variables)
                     (%cur-scgi-content-length content-length)
                     (%cur-cgi-request-id      request-id))
        (%debug "%handle-scgi-accept request-id=~S proc" request-id)
        (proc)
        (%debug "%handle-scgi-accept request-id=~S flushing" request-id)
        (flush-output out)
        (%debug "%handle-scgi-accept request-id=~S end" request-id)))))

;;; @section Interface

;;; @defproc cgi startup-proc request-proc shutdown-proc
;;;
;;; Implement CGI.  Normal CGI is used if the @code{REQUEST_URI} environment
;;; variable is defined (which suggests that the code is being called in a CGI
;;; context); otherwise, SCGI is used.
;;;
;;; @var{startup-proc} is a thunk that is evaluated once (before listener
;;; starts).  @var{request-proc} is evaluated once for each request (which, in
;;; normal CGI, is once).  @var{shutdown-proc} is evaluated once, as processing
;;; of all CGI requests has finished.
;;;
;;; For evaluation of @var{request-proc}, the default input and output ports
;;; are as with normal CGI, regardless of whether normal CGI or SCGI is in use.
;;;
;;; This procedure also accepts a few optional keyword arguments, all of which
;;; apply to SCGI mode only:
;;;
;;; @lisp
;;; #:scgi-hostname       (scgi-hostname       "127.0.0.1")
;;; #:scgi-portnum        (scgi-portnum        4000)
;;; #:scgi-max-allow-wait (scgi-max-allow-wait 4)
;;; #:reuse-scgi-port?    (reuse-scgi-port?    #t)
;;; @end lisp
;;;
;;; @code{scgi-hostname} is the hostname or IP address (as a string) for the
;;; interface on which to listen.  @code{scgi-portnum} is the TCP port number
;;; on that interface.  @code{scgi-max-allow-wait} is the maximum number of
;;; unaccepted connections to permit waiting.  @code{reuse-scgi-port?} is
;;; whether or not to reuse the TCP port number, such as if a previous server
;;; exited and the port is in a @code{TIME_WAIT} state.

(define (cgi startup-proc
             request-proc
             shutdown-proc
             #:scgi-hostname       (scgi-hostname       "127.0.0.1")
             #:scgi-max-allow-wait (scgi-max-allow-wait 4)
             #:scgi-portnum        (scgi-portnum        4000)
             #:reuse-scgi-port?    (reuse-scgi-port?    #t))
  (let ((type (if (getenv "REQUEST_URI")
                  'normal
                  'scgi)))
    (parameterize ((%cur-cgi-type type))
      (startup-proc)

      (dynamic-wind

        (lambda () #f)

        (lambda ()
          (let/ec stop-cgi-service-immediately-ec
            (parameterize ((%cur-stop-cgi-service-immediately
                            stop-cgi-service-immediately-ec))
              (case type

                ((scgi)
                 (let ((listener-cust (make-custodian)))
                   (parameterize ((current-custodian listener-cust))
                     (let ((listener (tcp-listen scgi-portnum
                                                 scgi-max-allow-wait
                                                 reuse-scgi-port?
                                                 scgi-hostname)))
                       (dynamic-wind
                         (lambda () #f)
                         (lambda ()
                           (let loop ((request-id 1))
                             (let ((request-cust (make-custodian
                                                  listener-cust)))
                               (parameterize ((current-custodian request-cust))
                                 (let-values (((in out) (tcp-accept/enable-break
                                                         listener)))
                                   (%debug "cgi accepted request-id=~S"
                                           request-id)
                                   (thread (lambda ()
                                             (dynamic-wind
                                               (lambda () #f)
                                               (lambda ()
                                                 (%handle-scgi-accept
                                                  in
                                                  out
                                                  request-proc
                                                  request-id))
                                               (lambda ()
                                                 (custodian-shutdown-all
                                                  request-cust)))))))
                               (loop (+ 1 request-id)))))
                         (lambda ()
                           (%debug "scgi listener custodian shutdown")
                           (custodian-shutdown-all listener-cust)
                           ;; Note: We have verified under Racket 5.1.1 that
                           ;; the custodian already closes the listener, so
                           ;; this code is not necessary.
                           ;;
                           ;; (%debug "scgi listener tcp-close")
                           ;; (with-handlers
                           ;;     ((exn:fail?
                           ;;       (lambda (e)
                           ;;         (%debug
                           ;;          "scgi-listener tcp-close failed: ~S"
                           ;;          (exn-message e)))))
                           ;;   (tcp-close listener))
                           (%debug "scgi listener shutdown sleep")
                           (sleep 1)))))))

                ((normal)
                 ;; TODO: !!! Set up end-cgi-request
                 (parameterize ((%cur-cgi-request-id 0))
                   (request-proc)))

                (else (%invalid-cgi-type-error 'cgi))))))

        (lambda ()
          (parameterize-break #t
            (shutdown-proc)))))))

;;; @defproc cgi-content-length
;;;
;;; In a CGI request context, returns the CGI content length -- the number of
;;; bytes that can be read from the default input port -- as integer.

(define (cgi-content-length)
  (case (or (%cur-cgi-type)
            ;; Note: Error message here complains about CGI request context,
            ;; even though the test is only for any CGI context.
            (%cgi-request-context-only-error 'cgi-content-length))

    ((scgi)
     (%cur-scgi-content-length))

    ((normal)
     (%assert-cgi-request-context 'cgi-content-length)
     (string->number (getenv "CONTENT_LENGTH")))

    (else (%invalid-cgi-type-error 'cgi-content-length))))

;;; @defproc make-cgi-variable-proc sym name-bytes
;;;
;;; Produces a procedure for getting a CGI environment variable value as a
;;; string.  Works whether in normal CGI or SCGI.  This is useful for accessing
;;; non-standard variables, such as might be provided by an unusual Apache
;;; module.  Argument @var{sym} is a symbol for the name of the procedure,
;;; which is used in error reporting.  Argument @var{name-bytes} is the name of
;;; the environment variable, as a byte string.
;;;
;;; For example, the @code{cgi-remote-user} procedure could be defined as:
;;;
;;; @lisp
;;; (define cgi-remote-user
;;;   (make-cgi-variable-proc 'cgi-remote-user #"REMOTE_USER"))
;;; @end lisp

(define (make-cgi-variable-proc sym name-bytes)
  (let ((name-string (bytes->string/latin-1 name-bytes)))
    (lambda ()
      (case (%cur-cgi-type)

        ((scgi)
         (cond ((assoc name-bytes (or (%cur-scgi-variables)
                                      (%cgi-request-context-only-error sym)))
                => (lambda (pair)
                     (bytes->string/latin-1 (cdr pair))))
               (else #f)))

        ((normal)
         (%assert-cgi-request-context sym)
         (getenv name-string))

        (else (%invalid-cgi-type-error sym))))))

;;; @defproc  cgi-content-type
;;; @defprocx cgi-document-root
;;; @defprocx cgi-http-cookie
;;; @defprocx cgi-http-host
;;; @defprocx cgi-http-referer
;;; @defprocx cgi-http-user-agent
;;; @defprocx cgi-https
;;; @defprocx cgi-path-info
;;; @defprocx cgi-path-translated
;;; @defprocx cgi-query-string
;;; @defprocx cgi-remote-addr
;;; @defprocx cgi-remote-host
;;; @defprocx cgi-remote-user
;;; @defprocx cgi-request-method
;;; @defprocx cgi-request-uri
;;; @defprocx cgi-script-name
;;; @defprocx cgi-server-name
;;; @defprocx cgi-server-port
;;;
;;; In a CGI request context, returns the corresponding CGI value as a string.
;;; Note that @code{cgi-content-length} is @emph{not} in this list, and it
;;; returns a number rather than a string.

(define-syntax %define-cgi-variable-proc
  (syntax-rules ()
    ((_ ID BYTES)
     (define ID (make-cgi-variable-proc (quote ID) BYTES)))))

(%define-cgi-variable-proc cgi-content-type    #"CONTENT_TYPE")
(%define-cgi-variable-proc cgi-document-root   #"DOCUMENT_ROOT")
(%define-cgi-variable-proc cgi-http-cookie     #"HTTP_COOKIE")
(%define-cgi-variable-proc cgi-http-host       #"HTTP_HOST")
(%define-cgi-variable-proc cgi-http-referer    #"HTTP_REFERER")
(%define-cgi-variable-proc cgi-http-user-agent #"HTTP_USER_AGENT")
(%define-cgi-variable-proc cgi-https           #"HTTPS")
(%define-cgi-variable-proc cgi-path-info       #"PATH_INFO")
(%define-cgi-variable-proc cgi-path-translated #"PATH_TRANSLATED")
(%define-cgi-variable-proc cgi-query-string    #"QUERY_STRING")
(%define-cgi-variable-proc cgi-remote-addr     #"REMOTE_ADDR")
(%define-cgi-variable-proc cgi-remote-host     #"REMOTE_HOST")
(%define-cgi-variable-proc cgi-remote-user     #"REMOTE_USER")
(%define-cgi-variable-proc cgi-request-method  #"REQUEST_METHOD")
(%define-cgi-variable-proc cgi-request-uri     #"REQUEST_URI")
(%define-cgi-variable-proc cgi-script-name     #"SCRIPT_NAME")
(%define-cgi-variable-proc cgi-server-name     #"SERVER_NAME")
(%define-cgi-variable-proc cgi-server-port     #"SERVER_PORT")
     
;;; @defproc scgi-variables
;;;
;;; When called in SCGI mode, this procedure yields an alist of SCGI variables
;;; with both the key and value of each pair being byte strings.  Calling this
;;; procedure in normal CGI mode is an error.
;;;
;;; Note that normally you will not need to use this procedure, and will
;;; instead use procedures like @code{cgi-request-uri}, which work in both SCGI
;;; and normal CGI modes.

(define (scgi-variables)
  (or (%cur-scgi-variables)
      (if (eq? (%cur-cgi-type) 'scgi)
          (%cgi-request-context-only-error 'scgi-variables)
          (error 'scgi-variables
                 "Only works in SCGI type CGI request context. Type: ~S"
                 (%cur-cgi-type)))))

;;; @defproc cgi-type
;;;
;;; Returns a symbol indicating the CGI type: @code{normal} or @code{scgi}.
;;; Behavior outside of the @code{cgi} form is undefined.

(define (cgi-type)
  (or (%cur-cgi-type)
      (%cgi-context-only-error 'cgi-type)))

;;; @defproc cgi-request-id
;;;
;;; In CGI request context, yields a printable identifying object for the
;;; current request that is unique at least for the current requests being
;;; handled.  This identifying object is intended to be used in debugging
;;; messages.

(define (cgi-request-id)
  (or (%cur-cgi-request-id)
      (%cgi-request-context-only-error 'cgi-request-id)))

;; TODO: !!! finish implementing end-cgi-request

;; @defproc  end-cgi-request
;; @defprocx end-cgi-request/error
;;
;; (NOTE: THIS IS NOT YET FULLY IMPLEMENTED AND TESTED.)

(define (%end-cgi-request/sym/status sym status)
  ((or (%cur-end-cgi-request-ec)
       (%cgi-request-context-only-error sym))
   status))

(define (end-cgi-request)
  (%end-cgi-request/sym/status 'end-cgi-request 0))

(define (end-cgi-request/error)
  (%end-cgi-request/sym/status 'end-cgi-request/error 1))

;;; @defproc stop-cgi-service-immediately
;;;
;;; Stops processing all CGI requests.  This works only within the
;;; @var{request-proc} of the @code{cgi} form.

(define (stop-cgi-service-immediately)
  ((or (%cur-stop-cgi-service-immediately)
       (%cgi-context-only-error 'stop-cgi-service-immediately))))

;;; @section Troubleshooting

;;; This section has some troubleshooting tips.  Currently, these come from use
;;; with @code{mod_scgi} atop Apache 2.2.9 atop Debian GNU/Linux.
;;;
;;; @itemize
;;;
;;; @item
;;; Racket error ``tcp-write: error writing (Broken pipe; errno=32)'' or
;;; ``tcp-write: error writing (Connection reset by peer; errno=104)'' is
;;; likely due to the HTTP request having been dropped by the HTTP client
;;; (e.g., user stops a page load in their browser before page finishes
;;; loading) or by Apache hitting @code{SCGIServerTimeout} for the request.
;;; Note that buffered I/O means that you won't necessarily get this error even
;;; if the request is aborted this way.
;;;
;;; @item
;;; Apache error log entry ``Premature end of script headers: @i{PATH}''
;;; followed by ``(500)Unknown error 500: scgi: Unknown error 500: error
;;; reading response headers'' can mean that @code{SCGIServerTimeout} was hit
;;; before any HTTP headers from the SCGI request handler were started or
;;; completed.  Note that buffered I/O can mean that some of the Racket code of
;;; the handler wrote some text, but it was not yet flushed to the SCGI client.
;;;
;;; @item
;;; Apache error log entry ``(70007)The timeout specified has expired:
;;; ap_content_length_filter: apr_bucket_read() failed'' followed by
;;; ``(70007)The timeout specified has expired: scgi: The timeout specified has
;;; expired: ap_pass_brigade()'' can mean that @code{SCGIServerTimeout} was hit
;;; after HTTP headers from the request handler had been received by the SCGI
;;; client.
;;;
;;; @item
;;; Apache error log entry ``(32)Broken pipe: scgi: Broken pipe: error sending
;;; request body'' might be due to a request handler finished without consuming
;;; all of the HTTP @code{POST} data.
;;;
;;; @item
;;; Apache error log entry ``(103)Software caused connection abort: scgi:
;;; Software caused connection abort: ap_pass_brigade()'' is another one that
;;; can be caused by the HTTP client dropping the connection before the handler
;;; finishes.
;;;
;;; @end itemize

;;; @unnumberedsec History

;;; @table @asis
;;;
;;; @item Version 0.6 --- 2011-08-22 --- PLaneT @code{(1 5)}
;;; Small documentation fixes.
;;;
;;; @item Version 0.5 --- 2011-05-16 --- PLaneT @code{(1 4)}
;;; For closer to backward compatibility with PLT Scheme 4.x, changed
;;; references to @code{racket} modules to @code{scheme}.
;;; More documentation.
;;;
;;; @item Version 0.4 --- 2011-05-16 --- PLaneT @code{(1 3)}
;;; Added @code{#:reuse-scgi-port?} argument to procedure @code{cgi}.
;;; Added several new CGI environment variable procedures.
;;; Added @code{make-cgi-variable-proc}.
;;; The @code{cgi} procedure now uses @code{tcp-accept/enable-break} rather than
;;; @code{tcp-accept}.
;;; Added more @code{log-debug}.
;;; Added documentation about @code{mod_scgi} configuration, troubleshooting,
;;; and the keyword arguments to the @code{cgi} procedure.
;;; Removed documentation placeholders for @code{end-cgi-request} for now.
;;; Various additional quality assurance testing has been done, and more is the
;;; works.
;;;
;;; @item Version 0.3 --- 2010-11-14 --- PLaneT @code{(1 2)}
;;; Added @code{cgi-http-user-agent}.
;;;
;;; @item Version 0.2 --- 2010-10-11 --- PLaneT @code{(1 1)}
;;; Documentation changes to reflect that it is successfully in use in a real
;;; system, and some work remains.
;;;
;;; @item Version 0.1 --- 2010-05-25 --- PLaneT @code{(1 0)}
;;; Initial release.  Preliminary.
;;;
;;; @end table

(provide
 end-cgi-request
 end-cgi-request/error
 cgi
 cgi-content-length
 cgi-content-type
 cgi-document-root
 cgi-http-cookie
 cgi-http-host
 cgi-http-referer
 cgi-http-user-agent
 cgi-https
 cgi-path-info
 cgi-path-translated
 cgi-query-string
 cgi-remote-addr
 cgi-remote-host
 cgi-remote-user
 cgi-request-method
 cgi-request-id
 cgi-request-uri
 cgi-script-name
 cgi-server-name
 cgi-server-port
 cgi-type
 make-cgi-variable-proc
 scgi-variables
 stop-cgi-service-immediately)