statistics.ss
;;; PLT Scheme Science Collection
;;; statistics.ss
;;; Copyright (c) 2004 M. Douglas Williams
;;;
;;; This library is free software; you can redistribute it and/or
;;; modify it under the terms of the GNU Lesser General Public
;;; License as published by the Free Software Foundation; either
;;; version 2.1 of the License, or (at your option) any later version.
;;;
;;; This library is distributed in the hope that it will be useful,
;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;;; Lesser General Public License for more details.
;;;
;;; You should have received a copy of the GNU Lesser General Public
;;; License along with this library; if not, write to the Free
;;; Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
;;; 02111-1307 USA.
;;;
;;; -------------------------------------------------------------------
;;;
;;; This modules implements various statistical functions.  It is based
;;; on the Statistics in the GNU Scientific Library, which is licensed
;;; under the GPL.
;;;
;;; Version  Date      Description
;;; 1.0.0    09/24/04  Marked as ready for Release 1.0.  (Doug
;;;                    Williams)

(module statistics mzscheme
  
  (require (lib "contract.ss"))
  
  ;; Predicates and/or flat contracts for contracts
  
  ;; nonempty-vector-of-reals? : flat contract
  ;; #t for nonempty vectors of real numbers, #f otherwise.  Safe for
  ;; all values.
  (define nonempty-vector-of-reals?
    (flat-named-contract "nonempty-vector-of-reals?"
      (lambda (x)
       (and (vector? x)
            (> (vector-length x) 0)
            (let ((n (vector-length x)))
              (let/ec exit
                (do ((i 0 (+ i 1)))
                    ((= i n) #t)
                  (if (not (real? (vector-ref x i)))
                      (exit #f)))))))))
  
  ;; sorted?: flat contract
  ;; #t for sorted vectors, #f otherwise.  Will fail for vectors
  ;; containing any element that is not a real number (because > will
  ;; fail).
  (define sorted?
    (flat-named-contract "sorted vector"
      (lambda (x)
        (and (vector? x)
             (let ((n (vector-length x)))
               (let/ec exit
                 (do ((i 0 (+ i 1)))
                     ((>= i (- n 1)) #t)
                   (if (> (vector-ref x i)
                          (vector-ref x (+ i 1)))
                       (exit #f)))))))))
  
  (provide/contract
   (mean
    (-> (vectorof real?) real?))
   (variance
    (case-> (-> (vectorof real?) real? (>=/c 0.0))
            (-> (vectorof real?) (>=/c 0.0))))
   (standard-deviation
    (case-> (-> (vectorof real?) real? (>=/c 0.0))
            (-> (vectorof real?) (>=/c 0.0))))
   (variance-with-fixed-mean 
    (-> (vectorof real?) real? (>=/c 0.0)))
   (standard-deviation-with-fixed-mean
    (-> (vectorof real?) real? (>=/c 0.0)))
   (absolute-deviation
    (case-> (-> (vectorof real?) real? (>=/c 0.0))
            (-> (vectorof real?) (>=/c 0.0))))
   (skew
    (case-> (-> (vectorof real?) real? (>=/c 0.0) real?)
            (-> (vectorof real?) real?)))
   (kurtosis
    (case-> (-> (vectorof real?) real? (>=/c 0.0) real?)
            (-> (vectorof real?) real?)))
   (lag-1-autocorrelation
    (case-> (-> nonempty-vector-of-reals? real? real?)
            (-> nonempty-vector-of-reals? real?)))
   (covariance
    (case-> (->r ((data1 (vectorof real?))
                  (data2 (and/c (vectorof real?)
                                (lambda (x)
                                  (= (vector-length data1)
                                     (vector-length data2)))))
                  (mu1 real?)
                  (mu2 real?))
                 real?)
            (->r ((data1 (vectorof real?))
                  (data2 (and/c (vectorof real?)
                                (lambda (x)
                                  (= (vector-length data1)
                                     (vector-length data2))))))
                 real?)))
   (covariance-with-fixed-means
    (->r ((data1 (vectorof real?))
          (data2 (and/c (vectorof real?)
                        (lambda (x)
                          (= (vector-length data1)
                             (vector-length data2)))))
          (mu1 real?)
          (mu2 real?))
         real?))
   (weighted-mean
    (->r ((w (vectorof real?))
          (data (and/c (vectorof real?)
                       (lambda (x)
                         (= (vector-length w)
                            (vector-length data))))))
         real?))
   (weighted-variance
    (case-> (->r ((w (vectorof real?))
                  (data (and/c (vectorof real?)
                               (lambda (x)
                                 (= (vector-length w)
                                    (vector-length data)))))
                  (mu real?))
                 (>=/c 0.0))
            (->r ((w (vectorof real?))
                  (data (and/c (vectorof real?)
                               (lambda (x)
                                 (= (vector-length w)
                                    (vector-length data))))))
                 (>=/c 0.0))))
   (weighted-standard-deviation
    (case-> (->r ((w (vectorof real?))
                  (data (and/c (vectorof real?)
                               (lambda (x)
                                 (= (vector-length w)
                                    (vector-length data)))))
                  (mu real?))
                 (>=/c 0.0))
            (->r ((w (vectorof real?))
                  (data (and/c (vectorof real?)
                               (lambda (x)
                                 (= (vector-length w)
                                    (vector-length data))))))
                 (>=/c 0.0))))
   (weighted-variance-with-fixed-mean
    (->r ((w (vectorof real?))
          (data (and/c (vectorof real?)
                       (lambda (x)
                         (= (vector-length w)
                            (vector-length data)))))
          (mu real?))
         (>=/c 0.0)))
   (weighted-standard-deviation-with-fixed-mean
    (->r ((w (vectorof real?))
          (data (and/c (vectorof real?)
                       (lambda (x)
                         (= (vector-length w)
                            (vector-length data)))))
          (mu real?))
         (>=/c 0.0)))
   (weighted-absolute-deviation
    (case-> (->r ((w (vectorof real?))
                  (data (and/c (vectorof real?)
                               (lambda (x)
                                 (= (vector-length w)
                                    (vector-length data)))))
                  (mu real?))
                 (>=/c 0.0))
            (->r ((w (vectorof real?))
                  (data (and/c (vectorof real?)
                               (lambda (x)
                                 (= (vector-length w)
                                    (vector-length data))))))
                 (>=/c 0.0))))
   (weighted-skew
    (case-> (->r ((w (vectorof real?))
                  (data (and/c (vectorof real?)
                               (lambda (x)
                                 (= (vector-length w)
                                    (vector-length data)))))
                  (mu real?)
                  (sigma (>=/c 0.0)))
                 real?)
            (->r ((w (vectorof real?))
                  (data (and/c (vectorof real?)
                               (lambda (x)
                                 (= (vector-length w)
                                    (vector-length data))))))
                 real?)))
   (weighted-kurtosis
    (case-> (->r ((w (vectorof real?))
                  (data (and/c (vectorof real?)
                               (lambda (x)
                                 (= (vector-length w)
                                    (vector-length data)))))
                  (mu real?)
                  (sigma (>=/c 0.0)))
                 real?)
            (->r ((w (vectorof real?))
                  (data (and/c (vectorof real?)
                               (lambda (x)
                                 (= (vector-length w)
                                    (vector-length data))))))
                 real?)))
   (maximum
    (-> nonempty-vector-of-reals? real?))
   (minimum
    (-> nonempty-vector-of-reals? real?))
   (minimum-maximum
    (-> nonempty-vector-of-reals? (values real? real?)))
   (maximum-index
    (-> nonempty-vector-of-reals? natural-number/c))
   (minimum-index
    (-> nonempty-vector-of-reals? natural-number/c))
   (minimum-maximum-index
    (-> nonempty-vector-of-reals? (values natural-number/c natural-number/c)))
   (median-from-sorted-data
    (-> (and/c nonempty-vector-of-reals? sorted?) real?))
   (quantile-from-sorted-data
    (-> (and/c nonempty-vector-of-reals? sorted?) (real-in 0.0 1.0) real?)))
  
  ;; Mean, Standard Deviation, and Variance
  
  ;; mean: vector -> real
  (define (mean data)
    (let ((n (vector-length data))
          (mu 0.0))
      (do ((i 0 (+ i 1)))
          ((= i n) mu)
        (set! mu (+ mu (/ (- (vector-ref data i) mu) (+ i 1)))))))
  
  ;; variance: vector x real -> real
  ;; variance: vector -> real
  (define variance
    (case-lambda
      ((data mu)
       (let ((n (vector-length data)))
         (* (variance-with-fixed-mean data mu)
            (exact->inexact (/ n (- n 1))))))
      ((data)
       (variance data (mean data)))))
  
  ;; standard-deviation: vector x real -> real
  ;; standard-deviation: vector -> real
  (define standard-deviation
    (case-lambda
      ((data mu)
       (sqrt (variance data mu)))
      ((data)
       (sqrt (variance data)))))
  
  ;; variance-with-fixed-mean: vector x real -> real
  (define (variance-with-fixed-mean data mu)
    (let ((n (vector-length data))
          (var 0.0))
      (do ((i 0 (+ i 1)))
          ((= i n) var)
        (let ((delta (- (vector-ref data i) mu)))
          (set! var (+ var (/ (- (* delta delta) var) (+ i 1))))))))
  
  ;; standard-deviation-with-fixed-mean: vector x real -> real
  (define (standard-deviation-with-fixed-mean data mu)
    (sqrt (variance-with-fixed-mean data mu)))
  
  ;; Absolute Deviation
  
  ;; absolute-deviation: vector x real -> real
  ;; absolute-deviation: vector -> real
  (define absolute-deviation
    (case-lambda
      ((data mu)
       (let ((n (vector-length data))
             (sum 0.0))
         (do ((i 0 (+ i 1)))
             ((= i n) (/ sum n))
           (let ((delta (abs (- (vector-ref data i) mu))))
             (set! sum (+ sum delta))))))
      ((data)
       (absolute-deviation data (mean data)))))
  
  ;; Higher Moments (Skewness and Kurtosis)
  
  ;; skew: vector x real x real -> real
  ;; skew: vector -> real
  (define skew
    (case-lambda
      ((data mu sigma)
       (let ((n (vector-length data))
             (skew 0.0))
         (do ((i 0 (+ i 1)))
             ((= i n) skew)
           (let ((x (/ (- (vector-ref data i) mu) sigma)))
             (set! skew (+ skew (/ (- (* x x x) skew) (+ i 1))))))))
      ((data)
       (let* ((mu (mean data))
              (sigma (standard-deviation data mu)))
         (skew data mu sigma)))))
  
  ;; kurtosis: vector x real x real -> real
  ;; kurtosis: vector -> real
  (define kurtosis
    (case-lambda
      ((data mu sigma)
       (let ((n (vector-length data))
             (avg 0.0))
         (do ((i 0 (+ i 1)))
             ((= i n) (- avg 3.0))
           (let ((x (/ (- (vector-ref data i) mu) sigma)))
             (set! avg (+ avg (/ (- (* x x x x) avg) (+ i 1))))))))
      ((data)
       (let* ((mu (mean data))
              (sigma (standard-deviation data mu)))
         (kurtosis data mu sigma)))))
  
  ;; Autocorrelation
  
  ;; lag-1-autocorrelation: vector x real -> real
  ;; lag-1-autocorrelation: vector -> real
  (define lag-1-autocorrelation
    (case-lambda
      ((data mu)
       (let ((n (vector-length data))
             (q 0.0)
             (v (* (- (vector-ref data 0) mu) (- (vector-ref data 0) mu))))
         (do ((i 1 (+ i 1)))
             ((= i n) (/ q v))
           (let ((delta0 (- (vector-ref data (- i 1)) mu))
                 (delta1 (- (vector-ref data i) mu)))
             (set! q (+ q (/ (- (* delta0 delta1) q) (+ i 1))))
             (set! v (+ v (/ (- (* delta1 delta1) v) (+ i 1))))))))
      ((data)
       (lag-1-autocorrelation data (mean data)))))
  
  
  ;; Covariance
  
  ;; covariance: vector x vector x real x real -> real
  ;; covariance: vector x vector -> real
  (define covariance
    (case-lambda
      ((data1 data2 mu1 mu2)
       (let ((n (vector-length data1)))
         (* (covariance-with-fixed-means data1 data2 mu1 mu2)
            (exact->inexact (/ n (- n 1))))))
      ((data1 data2)
       (covariance data1 data2 (mean data1) (mean data2)))))
  
  ;; covariance-with-fixed-means: vector x vector x real x real -> real
  (define (covariance-with-fixed-means data1 data2 mu1 mu2)
    (let ((n (vector-length data1))
          (covar 0.0))
       (do ((i 0 (+ i 1)))
           ((= i n) covar)
         (let ((delta1 (- (vector-ref data1 i) mu1))
               (delta2 (- (vector-ref data2 i) mu2)))
           (set! covar (+ covar (/ (- (* delta1 delta2) covar) (+ i 1))))))))
  
  ;; Weighted Samples
  
  ;; weighted-mean: vector x vector -> real
  (define (weighted-mean w data)
    (let ((n (vector-length data))
          (wmu 0.0)
          (wsum 0.0))
      (do ((i 0 (+ i 1)))
          ((= i n) wmu)
        (let ((wi (vector-ref w i)))
          (if (> wi 0.0)
              (begin
                (set! wsum (+ wsum wi))
                (set! wmu (+ wmu (* (- (vector-ref data i) wmu)
                                    (/ wi wsum))))))))))
  
  ;; scale-factor: vector
  ;; Internal function to compute the scale factor to be applied to
  ;; weighted sample statistics.
  (define (scale-factor w)
    (let ((n (vector-length w))
          (a 0.0)
          (b 0.0))
      (do ((i 0 (+ i 1)))
          ((= i n) (/ (* a a) (- (* a a) b)))
        (let ((wi (vector-ref w i)))
          (if (> wi 0.0)
              (begin
                (set! a (+ a wi))
                (set! b (+ b (* wi wi)))))))))
  
  ;; weighted-variance: vector x vector x real -> real
  ;; weighted-variance: vector x vector -> real
  (define weighted-variance
    (case-lambda
      ((w data wmu)
       (* (scale-factor w)
          (weighted-variance-with-fixed-mean w data wmu)))
      ((w data)
       (weighted-variance w data (weighted-mean w data)))))
  
  ;; weighted-standard-deviation: vector x vector x real -> real
  ;; weighted-standard-deviation: vector x vector -> real
  (define weighted-standard-deviation
    (case-lambda
      ((w data wmu)
       (sqrt (weighted-variance w data wmu)))
      ((w data)
       (sqrt (weighted-variance w data)))))
  
  ;; weighted-variance-with-fixed-mean: vector x vector x real -> real
  (define (weighted-variance-with-fixed-mean w data wmu)
    (let ((n (vector-length data))
          (wvar 0.0)
          (wsum 0.0))
      (do ((i 0 (+ i 1)))
          ((= i n) wvar)
        (let ((wi (vector-ref w i)))
          (if (> wi 0.0)
              (let ((delta (- (vector-ref data i) wmu)))
                (set! wsum (+ wsum wi))
                (set! wvar (+ wvar (* (- (* delta delta) wvar)
                                      (/ wi wsum))))))))))
  
  ;; weighted-standard-deviation-with-fixed-mean:
  ;;   vector x vector x real -> real
  (define (weighted-standard-deviation-with-fixed-mean w data wmu)
    (sqrt (weighted-variance-with-fixed-mean w data wmu)))
  
  ;; weighted-absolute-deviation: vector x vector x real -> real
  ;; weighted-absoulte-deviation: vector x vector -> real
  (define weighted-absolute-deviation
    (case-lambda
      ((w data wmu)
       (let ((n (vector-length data))
             (wabsdev 0.0)
             (wsum 0.0))
         (do ((i 0 (+ i 1)))
             ((= i n) wabsdev)
           (let ((wi (vector-ref w i)))
             (if (> wi 0.0)
                 (let ((delta (abs (- (vector-ref data i) wmu))))
                   (set! wsum (+ wsum wi))
                   (set! wabsdev (+ wabsdev (* (- delta wabsdev)
                                               (/ wi wsum))))))))))
      ((w data)
       (weighted-absolute-deviation w data (weighted-mean w data)))))
  
  ;; weighted-skew: vector x vector x real x real -> real
  ;; weighted-skew: vector x vector -> real
  (define weighted-skew
    (case-lambda
      ((w data wmu wsigma)
       (let ((n (vector-length data))
             (wskew 0.0)
             (wsum 0.0))
         (do ((i 0 (+ i 1)))
             ((= i n) wskew)
           (let ((wi (vector-ref w i)))
             (if (> wi 0.0)
                 (let ((x (/ (- (vector-ref data i) wmu) wsigma)))
                   (set! wsum (+ wsum wi))
                   (set! wskew (+ wskew (* (- (* x x x) wskew)
                                           (/ wi wsum))))))))))
      ((w data)
       (let* ((wmu (weighted-mean w data))
              (wsigma (weighted-standard-deviation w data wmu)))
         (weighted-skew w data wmu wsigma)))))
  
  ;; weighted-kurtosis: vector x vector x real x real -> real
  ;; weighted-kurtosis: vector x vector -> real
  (define weighted-kurtosis
    (case-lambda
      ((w data wmu wsigma)
       (let ((n (vector-length data))
             (wavg 0.0)
             (wsum 0.0))
         (do ((i 0 (+ i 1)))
             ((= i n) (- wavg 3.0))
           (let ((wi (vector-ref w i)))
             (if (> wi 0.0)
                 (let ((x (/ (- (vector-ref data i) wmu) wsigma)))
                   (set! wsum (+ wsum wi))
                   (set! wavg (+ wavg (* (- (* x x x x) wavg)
                                         (/ wi wsum))))))))))
      ((w data)
       (let* ((wmu (weighted-mean w data))
              (wsigma (weighted-standard-deviation w data wmu)))
         (weighted-kurtosis w data wmu wsigma)))))
  
  ;; Maximum and Minimum Values
  
  ;; minimum-maximum-and-indices:
  ;;   vector -> number x number x integer x integer
  (define (minimum-maximum-and-indices data)
    (let ((n (vector-length data))
          (dmin (vector-ref data 0))
          (dmax (vector-ref data 0))
          (dmin-ndx 0)
          (dmax-ndx 0))
      (do ((i 0 (+ i 1)))
          ((= i n) (values dmin dmax dmin-ndx dmax-ndx))
        (let ((di (vector-ref data i)))
          (if (< di dmin)
              (begin
                (set! dmin di)
                (set! dmin-ndx i)))
          (if (> di dmax)
              (begin
                (set! dmax di)
                (set! dmax-ndx i)))))))
  
  ;; maximum: vector -> number
  (define (maximum data)
    (let-values (((dmin dmax dmin-ndx dmax-ndx)
                  (minimum-maximum-and-indices data)))
      dmax))
  
  ;; minimum: vector -> number
  (define (minimum data)
    (let-values (((dmin dmax dmin-ndx dmax-ndx)
                  (minimum-maximum-and-indices data)))
      dmin))
  
  ;; minimum-maximum: vector -> number x number
  (define (minimum-maximum data)
    (let-values (((dmin dmax dmin-ndx dmax-ndx)
                  (minimum-maximum-and-indices data)))
      (values dmin dmax)))
  
  ;; maximum-index: vector -> integer
  (define (maximum-index data)
    (let-values (((dmin dmax dmin-ndx dmax-ndx)
                  (minimum-maximum-and-indices data)))
      dmax-ndx))
  
  ;; minimum-index: vector -> integer
  (define (minimum-index data)
    (let-values (((dmin dmax dmin-ndx dmax-ndx)
                  (minimum-maximum-and-indices data)))
      dmin-ndx))
  
  ;; minimum-maximum-index: vector -> integer x integer
  (define (minimum-maximum-index data)
    (let-values (((dmin dmax dmin-ndx dmax-ndx)
                  (minimum-maximum-and-indices data)))
      (values dmin-ndx dmax-ndx)))
  
  ;; Median and Percentiles
  
  ;; median-from-sorted-data: sorted-vector -> real
  (define (median-from-sorted-data sorted-data)
    (let* ((n (vector-length sorted-data))
           (lhs (quotient (- n 1) 2))
           (rhs (quotient n 2)))
      (if (= lhs rhs)
          (vector-ref sorted-data lhs)
          (/ (+ (vector-ref sorted-data lhs)
                (vector-ref sorted-data rhs))
             2.0))))
  
  ;; quantile: sorted-vector x real -> real
  (define (quantile-from-sorted-data sorted-data f)
    (let* ((n (vector-length sorted-data))
           (index (* f (- n 1)))
           (lhs (inexact->exact (truncate index)))
           (delta (- index lhs)))
      (if (= lhs (- n 1))
          (vector-ref sorted-data lhs)
          (+ (* (- 1.0 delta) (vector-ref sorted-data lhs))
             (* delta (vector-ref sorted-data (+ lhs 1)))))))
  
)