statistics.ss
#lang scheme/base
;;; PLT Scheme Science Collection
;;; statistics.ss
;;; Copyright (c) 2004-2008 M. Douglas Williams
;;;
;;; This library is free software; you can redistribute it and/or
;;; modify it under the terms of the GNU Lesser General Public
;;; License as published by the Free Software Foundation; either
;;; version 2.1 of the License, or (at your option) any later version.
;;;
;;; This library is distributed in the hope that it will be useful,
;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;;; Lesser General Public License for more details.
;;;
;;; You should have received a copy of the GNU Lesser General Public
;;; License along with this library; if not, write to the Free
;;; Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
;;; 02111-1307 USA.
;;;
;;; -------------------------------------------------------------------
;;;
;;; This modules implements various statistical functions.  It is based
;;; on the Statistics in the GNU Scientific Library, which is licensed
;;; under the GPL.
;;;
;;; Version  Date      Description
;;; 1.0.0    09/24/04  Marked as ready for Release 1.0.  (Doug
;;;                    Williams)
;;; 2.0.0    11/19/07  Added unchecked versions of functions.  (Doug
;;;                    Williams)
;;; 2.1.0    06/07/08  More V4.0 changes.  (Doug Williams)

(require (lib "contract.ss"))

;;; Predicates and/or flat contracts for contracts

;;; nonempty-vector-of-reals? : flat contract
;;; #t for nonempty vectors of real numbers, #f otherwise.  Safe for
;;; all values.
(define nonempty-vector-of-reals?
  (flat-named-contract
   "nonempty-vector-of-reals?"
   (lambda (x)
     (and (vector? x)
          (> (vector-length x) 0)
          (let ((n (vector-length x)))
            (let/ec exit
              (do ((i 0 (+ i 1)))
                  ((= i n) #t)
                (when (not (real? (vector-ref x i)))
                  (exit #f)))))))))

;;; sorted?: flat contract
;;; #t for sorted vectors, #f otherwise.  Will fail for vectors
;;; containing any element that is not a real number (because > will
;;; fail).
(define sorted?
  (flat-named-contract
   "sorted vector"
   (lambda (x)
     (and (vector? x)
          (let ((n (vector-length x)))
            (let/ec exit
              (do ((i 0 (+ i 1)))
                  ((>= i (- n 1)) #t)
                (when (> (vector-ref x i)
                         (vector-ref x (+ i 1)))
                  (exit #f)))))))))

(provide
 (rename-out (mean unchecked-mean)
             (variance unchecked-variance)
             (standard-deviation unchecked-standard-deviation)
             (variance-with-fixed-mean unchecked-variance-with-fixed-mean)
             (standard-deviation-with-fixed-mean unchecked-standard-deviation-with-fixed-mean)
             (absolute-deviation unchecked-absolute-deviation)
             (skew unchecked-skew)
             (kurtosis unchecked-kurtosis)
             (lag-1-autocorrelation unchecked-lag-1-autocorrelation)
             (covariance unchecked-covariance)
             (covariance-with-fixed-means unchecked-covariance-with-fixed-means)
             (weighted-mean unchecked-weighted-mean)
             (weighted-variance unchecked-weighted-variance)
             (weighted-standard-deviation unchecked-weighted-standard-deviation)
             (weighted-variance-with-fixed-mean unchecked-weighted-variance-with-fixed-mean)
             (weighted-standard-deviation-with-fixed-mean unchecked-weighted-standard-deviation-with-fixed-mean)
             (weighted-absolute-deviation unchecked-weighted-absolute-deviation)
             (weighted-skew unchecked-weighted-skew)
             (weighted-kurtosis unchecked-weighted-kurtosis)
             (maximum unchecked-maximum)
             (minimum unchecked-minimum)
             (minimum-maximum unchecked-minimum-maximum)
             (minimum-index unchecked-minimum-index)
             (maximum-index unchecked-maximum-index)
             (minimum-maximum-index unchecked-minimum-maximum-index)
             (median-from-sorted-data unchecked-median-from-sorted-data)
             (quantile-from-sorted-data unchecked-quantile-from-sorted-data)))

(provide/contract
 (mean
  (-> (vectorof real?) real?))
 (variance
  (case-> (-> (vectorof real?) real? (>=/c 0.0))
          (-> (vectorof real?) (>=/c 0.0))))
 (standard-deviation
  (case-> (-> (vectorof real?) real? (>=/c 0.0))
          (-> (vectorof real?) (>=/c 0.0))))
 (variance-with-fixed-mean 
  (-> (vectorof real?) real? (>=/c 0.0)))
 (standard-deviation-with-fixed-mean
  (-> (vectorof real?) real? (>=/c 0.0)))
 (absolute-deviation
  (case-> (-> (vectorof real?) real? (>=/c 0.0))
          (-> (vectorof real?) (>=/c 0.0))))
 (skew
  (case-> (-> (vectorof real?) real? (>=/c 0.0) real?)
          (-> (vectorof real?) real?)))
 (kurtosis
  (case-> (-> (vectorof real?) real? (>=/c 0.0) real?)
          (-> (vectorof real?) real?)))
 (lag-1-autocorrelation
  (case-> (-> nonempty-vector-of-reals? real? real?)
          (-> nonempty-vector-of-reals? real?)))
 (covariance
  (case-> (->r ((data1 (vectorof real?))
                (data2 (and/c (vectorof real?)
                              (lambda (x)
                                (= (vector-length data1)
                                   (vector-length data2)))))
                (mu1 real?)
                (mu2 real?))
               real?)
          (->r ((data1 (vectorof real?))
                (data2 (and/c (vectorof real?)
                              (lambda (x)
                                (= (vector-length data1)
                                   (vector-length data2))))))
               real?)))
 (covariance-with-fixed-means
  (->r ((data1 (vectorof real?))
        (data2 (and/c (vectorof real?)
                      (lambda (x)
                        (= (vector-length data1)
                           (vector-length data2)))))
        (mu1 real?)
        (mu2 real?))
       real?))
 (weighted-mean
  (->r ((w (vectorof real?))
        (data (and/c (vectorof real?)
                     (lambda (x)
                       (= (vector-length w)
                          (vector-length data))))))
       real?))
 (weighted-variance
  (case-> (->r ((w (vectorof real?))
                (data (and/c (vectorof real?)
                             (lambda (x)
                               (= (vector-length w)
                                  (vector-length data)))))
                (mu real?))
               (>=/c 0.0))
          (->r ((w (vectorof real?))
                (data (and/c (vectorof real?)
                             (lambda (x)
                               (= (vector-length w)
                                  (vector-length data))))))
               (>=/c 0.0))))
 (weighted-standard-deviation
  (case-> (->r ((w (vectorof real?))
                (data (and/c (vectorof real?)
                             (lambda (x)
                               (= (vector-length w)
                                  (vector-length data)))))
                (mu real?))
               (>=/c 0.0))
          (->r ((w (vectorof real?))
                (data (and/c (vectorof real?)
                             (lambda (x)
                               (= (vector-length w)
                                  (vector-length data))))))
               (>=/c 0.0))))
 (weighted-variance-with-fixed-mean
  (->r ((w (vectorof real?))
        (data (and/c (vectorof real?)
                     (lambda (x)
                       (= (vector-length w)
                          (vector-length data)))))
        (mu real?))
       (>=/c 0.0)))
 (weighted-standard-deviation-with-fixed-mean
  (->r ((w (vectorof real?))
        (data (and/c (vectorof real?)
                     (lambda (x)
                       (= (vector-length w)
                          (vector-length data)))))
        (mu real?))
       (>=/c 0.0)))
 (weighted-absolute-deviation
  (case-> (->r ((w (vectorof real?))
                (data (and/c (vectorof real?)
                             (lambda (x)
                               (= (vector-length w)
                                  (vector-length data)))))
                (mu real?))
               (>=/c 0.0))
          (->r ((w (vectorof real?))
                (data (and/c (vectorof real?)
                             (lambda (x)
                               (= (vector-length w)
                                  (vector-length data))))))
               (>=/c 0.0))))
 (weighted-skew
  (case-> (->r ((w (vectorof real?))
                (data (and/c (vectorof real?)
                             (lambda (x)
                               (= (vector-length w)
                                  (vector-length data)))))
                (mu real?)
                (sigma (>=/c 0.0)))
               real?)
          (->r ((w (vectorof real?))
                (data (and/c (vectorof real?)
                             (lambda (x)
                               (= (vector-length w)
                                  (vector-length data))))))
               real?)))
 (weighted-kurtosis
  (case-> (->r ((w (vectorof real?))
                (data (and/c (vectorof real?)
                             (lambda (x)
                               (= (vector-length w)
                                  (vector-length data)))))
                (mu real?)
                (sigma (>=/c 0.0)))
               real?)
          (->r ((w (vectorof real?))
                (data (and/c (vectorof real?)
                             (lambda (x)
                               (= (vector-length w)
                                  (vector-length data))))))
               real?)))
 (maximum
  (-> nonempty-vector-of-reals? real?))
 (minimum
  (-> nonempty-vector-of-reals? real?))
 (minimum-maximum
  (-> nonempty-vector-of-reals? (values real? real?)))
 (maximum-index
  (-> nonempty-vector-of-reals? natural-number/c))
 (minimum-index
  (-> nonempty-vector-of-reals? natural-number/c))
 (minimum-maximum-index
  (-> nonempty-vector-of-reals? (values natural-number/c natural-number/c)))
 (median-from-sorted-data
  (-> (and/c nonempty-vector-of-reals? sorted?) real?))
 (quantile-from-sorted-data
  (-> (and/c nonempty-vector-of-reals? sorted?) (real-in 0.0 1.0) real?)))

;;; Mean, Standard Deviation, and Variance

;;; mean: vector -> real
(define (mean data)
  (let ((n (vector-length data))
        (mu 0.0))
    (do ((i 0 (+ i 1)))
        ((= i n) mu)
      (set! mu (+ mu (/ (- (vector-ref data i) mu) (+ i 1)))))))

;;; variance: vector x real -> real
;;; variance: vector -> real
(define variance
  (case-lambda
    ((data mu)
     (let ((n (vector-length data)))
       (* (variance-with-fixed-mean data mu)
          (exact->inexact (/ n (- n 1))))))
    ((data)
     (variance data (mean data)))))

;;; standard-deviation: vector x real -> real
;;; standard-deviation: vector -> real
(define standard-deviation
  (case-lambda
    ((data mu)
     (sqrt (variance data mu)))
    ((data)
     (sqrt (variance data)))))

;;; variance-with-fixed-mean: vector x real -> real
(define (variance-with-fixed-mean data mu)
  (let ((n (vector-length data))
        (var 0.0))
    (do ((i 0 (+ i 1)))
        ((= i n) var)
      (let ((delta (- (vector-ref data i) mu)))
        (set! var (+ var (/ (- (* delta delta) var) (+ i 1))))))))

;;; standard-deviation-with-fixed-mean: vector x real -> real
(define (standard-deviation-with-fixed-mean data mu)
  (sqrt (variance-with-fixed-mean data mu)))

;;; Absolute Deviation

;;; absolute-deviation: vector x real -> real
;;; absolute-deviation: vector -> real
(define absolute-deviation
  (case-lambda
    ((data mu)
     (let ((n (vector-length data))
           (sum 0.0))
       (do ((i 0 (+ i 1)))
           ((= i n) (/ sum n))
         (let ((delta (abs (- (vector-ref data i) mu))))
           (set! sum (+ sum delta))))))
    ((data)
     (absolute-deviation data (mean data)))))

;;; Higher Moments (Skewness and Kurtosis)

;;; skew: vector x real x real -> real
;;; skew: vector -> real
(define skew
  (case-lambda
    ((data mu sigma)
     (let ((n (vector-length data))
           (skew 0.0))
       (do ((i 0 (+ i 1)))
           ((= i n) skew)
         (let ((x (/ (- (vector-ref data i) mu) sigma)))
           (set! skew (+ skew (/ (- (* x x x) skew) (+ i 1))))))))
    ((data)
     (let* ((mu (mean data))
            (sigma (standard-deviation data mu)))
       (skew data mu sigma)))))

;;; kurtosis: vector x real x real -> real
;;; kurtosis: vector -> real
(define kurtosis
  (case-lambda
    ((data mu sigma)
     (let ((n (vector-length data))
           (avg 0.0))
       (do ((i 0 (+ i 1)))
           ((= i n) (- avg 3.0))
         (let ((x (/ (- (vector-ref data i) mu) sigma)))
           (set! avg (+ avg (/ (- (* x x x x) avg) (+ i 1))))))))
    ((data)
     (let* ((mu (mean data))
            (sigma (standard-deviation data mu)))
       (kurtosis data mu sigma)))))

;;; Autocorrelation

;;; lag-1-autocorrelation: vector x real -> real
;;; lag-1-autocorrelation: vector -> real
(define lag-1-autocorrelation
  (case-lambda
    ((data mu)
     (let ((n (vector-length data))
           (q 0.0)
           (v (* (- (vector-ref data 0) mu) (- (vector-ref data 0) mu))))
       (do ((i 1 (+ i 1)))
           ((= i n) (/ q v))
         (let ((delta0 (- (vector-ref data (- i 1)) mu))
               (delta1 (- (vector-ref data i) mu)))
           (set! q (+ q (/ (- (* delta0 delta1) q) (+ i 1))))
           (set! v (+ v (/ (- (* delta1 delta1) v) (+ i 1))))))))
    ((data)
     (lag-1-autocorrelation data (mean data)))))

;;; Covariance

;;; covariance: vector x vector x real x real -> real
;;; covariance: vector x vector -> real
(define covariance
  (case-lambda
    ((data1 data2 mu1 mu2)
     (let ((n (vector-length data1)))
       (* (covariance-with-fixed-means data1 data2 mu1 mu2)
          (exact->inexact (/ n (- n 1))))))
    ((data1 data2)
     (covariance data1 data2 (mean data1) (mean data2)))))

;;; covariance-with-fixed-means: vector x vector x real x real -> real
(define (covariance-with-fixed-means data1 data2 mu1 mu2)
  (let ((n (vector-length data1))
        (covar 0.0))
    (do ((i 0 (+ i 1)))
        ((= i n) covar)
      (let ((delta1 (- (vector-ref data1 i) mu1))
            (delta2 (- (vector-ref data2 i) mu2)))
        (set! covar (+ covar (/ (- (* delta1 delta2) covar) (+ i 1))))))))

;;; Weighted Samples

;;; weighted-mean: vector x vector -> real
(define (weighted-mean w data)
  (let ((n (vector-length data))
        (wmu 0.0)
        (wsum 0.0))
    (do ((i 0 (+ i 1)))
        ((= i n) wmu)
      (let ((wi (vector-ref w i)))
        (when (> wi 0.0)
          (set! wsum (+ wsum wi))
          (set! wmu (+ wmu (* (- (vector-ref data i) wmu)
                              (/ wi wsum)))))))))

;;; scale-factor: vector
;;; Internal function to compute the scale factor to be applied to
;;; weighted sample statistics.
(define (scale-factor w)
  (let ((n (vector-length w))
        (a 0.0)
        (b 0.0))
    (do ((i 0 (+ i 1)))
        ((= i n) (/ (* a a) (- (* a a) b)))
      (let ((wi (vector-ref w i)))
        (when (> wi 0.0)
          (set! a (+ a wi))
          (set! b (+ b (* wi wi))))))))

;;; weighted-variance: vector x vector x real -> real
;;; weighted-variance: vector x vector -> real
(define weighted-variance
  (case-lambda
    ((w data wmu)
     (* (scale-factor w)
        (weighted-variance-with-fixed-mean w data wmu)))
    ((w data)
     (weighted-variance w data (weighted-mean w data)))))

;;; weighted-standard-deviation: vector x vector x real -> real
;;; weighted-standard-deviation: vector x vector -> real
(define weighted-standard-deviation
  (case-lambda
    ((w data wmu)
     (sqrt (weighted-variance w data wmu)))
    ((w data)
     (sqrt (weighted-variance w data)))))

;;; weighted-variance-with-fixed-mean: vector x vector x real -> real
(define (weighted-variance-with-fixed-mean w data wmu)
  (let ((n (vector-length data))
        (wvar 0.0)
        (wsum 0.0))
    (do ((i 0 (+ i 1)))
        ((= i n) wvar)
      (let ((wi (vector-ref w i)))
        (when (> wi 0.0)
          (let ((delta (- (vector-ref data i) wmu)))
            (set! wsum (+ wsum wi))
            (set! wvar (+ wvar (* (- (* delta delta) wvar)
                                  (/ wi wsum))))))))))

;;; weighted-standard-deviation-with-fixed-mean:
;;;   vector x vector x real -> real
(define (weighted-standard-deviation-with-fixed-mean w data wmu)
  (sqrt (weighted-variance-with-fixed-mean w data wmu)))

;;; weighted-absolute-deviation: vector x vector x real -> real
;;; weighted-absoulte-deviation: vector x vector -> real
(define weighted-absolute-deviation
  (case-lambda
    ((w data wmu)
     (let ((n (vector-length data))
           (wabsdev 0.0)
           (wsum 0.0))
       (do ((i 0 (+ i 1)))
           ((= i n) wabsdev)
         (let ((wi (vector-ref w i)))
           (when (> wi 0.0)
             (let ((delta (abs (- (vector-ref data i) wmu))))
               (set! wsum (+ wsum wi))
               (set! wabsdev (+ wabsdev (* (- delta wabsdev)
                                           (/ wi wsum))))))))))
    ((w data)
     (weighted-absolute-deviation w data (weighted-mean w data)))))

;;; weighted-skew: vector x vector x real x real -> real
;;; weighted-skew: vector x vector -> real
(define weighted-skew
  (case-lambda
    ((w data wmu wsigma)
     (let ((n (vector-length data))
           (wskew 0.0)
           (wsum 0.0))
       (do ((i 0 (+ i 1)))
           ((= i n) wskew)
         (let ((wi (vector-ref w i)))
           (when (> wi 0.0)
             (let ((x (/ (- (vector-ref data i) wmu) wsigma)))
               (set! wsum (+ wsum wi))
               (set! wskew (+ wskew (* (- (* x x x) wskew)
                                       (/ wi wsum))))))))))
    ((w data)
     (let* ((wmu (weighted-mean w data))
            (wsigma (weighted-standard-deviation w data wmu)))
       (weighted-skew w data wmu wsigma)))))

;;; weighted-kurtosis: vector x vector x real x real -> real
;;; weighted-kurtosis: vector x vector -> real
(define weighted-kurtosis
  (case-lambda
    ((w data wmu wsigma)
     (let ((n (vector-length data))
           (wavg 0.0)
           (wsum 0.0))
       (do ((i 0 (+ i 1)))
           ((= i n) (- wavg 3.0))
         (let ((wi (vector-ref w i)))
           (when (> wi 0.0)
             (let ((x (/ (- (vector-ref data i) wmu) wsigma)))
               (set! wsum (+ wsum wi))
               (set! wavg (+ wavg (* (- (* x x x x) wavg)
                                     (/ wi wsum))))))))))
    ((w data)
     (let* ((wmu (weighted-mean w data))
            (wsigma (weighted-standard-deviation w data wmu)))
       (weighted-kurtosis w data wmu wsigma)))))

;;; Maximum and Minimum Values

;;; minimum-maximum-and-indices:
;;;   vector -> number x number x integer x integer
(define (minimum-maximum-and-indices data)
  (let ((n (vector-length data))
        (dmin (vector-ref data 0))
        (dmax (vector-ref data 0))
        (dmin-ndx 0)
        (dmax-ndx 0))
    (do ((i 0 (+ i 1)))
        ((= i n) (values dmin dmax dmin-ndx dmax-ndx))
      (let ((di (vector-ref data i)))
        (when (< di dmin)
          (set! dmin di)
          (set! dmin-ndx i))
        (when (> di dmax)
          (set! dmax di)
          (set! dmax-ndx i))))))

;;; maximum: vector -> number
(define (maximum data)
  (let-values (((dmin dmax dmin-ndx dmax-ndx)
                (minimum-maximum-and-indices data)))
    dmax))

;;; minimum: vector -> number
(define (minimum data)
  (let-values (((dmin dmax dmin-ndx dmax-ndx)
                (minimum-maximum-and-indices data)))
    dmin))

;;; minimum-maximum: vector -> number x number
(define (minimum-maximum data)
  (let-values (((dmin dmax dmin-ndx dmax-ndx)
                (minimum-maximum-and-indices data)))
    (values dmin dmax)))

;;; maximum-index: vector -> integer
(define (maximum-index data)
  (let-values (((dmin dmax dmin-ndx dmax-ndx)
                (minimum-maximum-and-indices data)))
    dmax-ndx))

;;; minimum-index: vector -> integer
(define (minimum-index data)
  (let-values (((dmin dmax dmin-ndx dmax-ndx)
                (minimum-maximum-and-indices data)))
    dmin-ndx))

;;; minimum-maximum-index: vector -> integer x integer
(define (minimum-maximum-index data)
  (let-values (((dmin dmax dmin-ndx dmax-ndx)
                (minimum-maximum-and-indices data)))
    (values dmin-ndx dmax-ndx)))

;;; Median and Percentiles

;;; median-from-sorted-data: sorted-vector -> real
(define (median-from-sorted-data sorted-data)
  (let* ((n (vector-length sorted-data))
         (lhs (quotient (- n 1) 2))
         (rhs (quotient n 2)))
    (if (= lhs rhs)
        (vector-ref sorted-data lhs)
        (/ (+ (vector-ref sorted-data lhs)
              (vector-ref sorted-data rhs))
           2.0))))

;;; quantile: sorted-vector x real -> real
(define (quantile-from-sorted-data sorted-data f)
  (let* ((n (vector-length sorted-data))
         (index (* f (- n 1)))
         (lhs (inexact->exact (truncate index)))
         (delta (- index lhs)))
    (if (= lhs (- n 1))
        (vector-ref sorted-data lhs)
        (+ (* (- 1.0 delta) (vector-ref sorted-data lhs))
           (* delta (vector-ref sorted-data (+ lhs 1)))))))