source: orange-bioinformatics/Orange/bioinformatics/stats.py @ 1632:9cf919d0f343

1# Copyright (c) 1999-2002 Gary Strangman; All Rights Reserved.
2#
3# This software is distributable under the terms of the GNU
4# General Public License (GPL) v2, the text of which can be found at
5# http://www.gnu.org/copyleft/gpl.html. Installing, importing or otherwise
6# using this module constitutes acceptance of the terms of this License.
7#
8# Disclaimer
9#
10# This software is provided "as-is".  There are no expressed or implied
11# warranties of any kind, including, but not limited to, the warranties
12# of merchantability and fitness for a given application.  In no event
13# shall Gary Strangman be liable for any direct, indirect, incidental,
14# special, exemplary or consequential damages (including, but not limited
15# to, loss of use, data or profits, or business interruption) however
16# caused and on any theory of liability, whether in contract, strict
17# liability or tort (including negligence or otherwise) arising in any way
18# out of the use of this software, even if advised of the possibility of
19# such damage.
20#
21# Comments and/or additions are welcome (send e-mail to:
22# strang@nmr.mgh.harvard.edu).
23#
24"""
25stats.py module
26
27(Requires pstat.py module.)
28
29#################################################
30#######  Written by:  Gary Strangman  ###########
31#######  Last modified:  May 10, 2002 ###########
32#################################################
33
34A collection of basic statistical functions for python.  The function
35names appear below.
36
37IMPORTANT:  There are really *3* sets of functions.  The first set has an 'l'
38prefix, which can be used with list or tuple arguments.  The second set has
39an 'a' prefix, which can accept NumPy array arguments.  These latter
40functions are defined only when NumPy is available on the system.  The third
41type has NO prefix (i.e., has the name that appears below).  Functions of
42this set are members of a "Dispatch" class, c/o David Ascher.  This class
43allows different functions to be called depending on the type of the passed
44arguments.  Thus, stats.mean is a member of the Dispatch class and
45stats.mean(range(20)) will call stats.lmean(range(20)) while
46stats.mean(Numeric.arange(20)) will call stats.amean(Numeric.arange(20)).
47This is a handy way to keep consistent function names when different
48argument types require different functions to be called.  Having
49implemented the Dispatch class, however, means that to get info on
50a given function, you must use the REAL function name ... that is
51"print stats.lmean.__doc__" or "print stats.amean.__doc__" work fine,
52while "print stats.mean.__doc__" will print the doc for the Dispatch
53class.  NUMPY FUNCTIONS ('a' prefix) generally have more argument options
54but should otherwise be consistent with the corresponding list functions.
55
56Disclaimers:  The function list is obviously incomplete and, worse, the
57functions are not optimized.  All functions have been tested (some more
58so than others), but they are far from bulletproof.  Thus, as with any
59free software, no warranty or guarantee is expressed or implied. :-)  A
60few extra functions that don't appear in the list below can be found by
61interested treasure-hunters.  These functions don't necessarily have
62both list and array versions but were deemed useful.
63
64CENTRAL TENDENCY:  geometricmean
65                   harmonicmean
66                   mean
67                   median
68                   medianscore
69                   mode
70
71MOMENTS:  moment
72          variation
73          skew
74          kurtosis
75          skewtest   (for Numpy arrays only)
76          kurtosistest (for Numpy arrays only)
77          normaltest (for Numpy arrays only)
78
79ALTERED VERSIONS:  tmean  (for Numpy arrays only)
80                   tvar   (for Numpy arrays only)
81                   tmin   (for Numpy arrays only)
82                   tmax   (for Numpy arrays only)
83                   tstdev (for Numpy arrays only)
84                   tsem   (for Numpy arrays only)
85                   describe
86
87FREQUENCY STATS:  itemfreq
88                  scoreatpercentile
89                  percentileofscore
90                  histogram
91                  cumfreq
92                  relfreq
93
94VARIABILITY:  obrientransform
95              samplevar
96              samplestdev
97              signaltonoise (for Numpy arrays only)
98              var
99              stdev
100              sterr
101              sem
102              z
103              zs
104              zmap (for Numpy arrays only)
105
106TRIMMING FCNS:  threshold (for Numpy arrays only)
107                trimboth
108                trim1
109                round (round all vals to 'n' decimals; Numpy only)
110
111CORRELATION FCNS:  covariance  (for Numpy arrays only)
112                   correlation (for Numpy arrays only)
113                   paired
114                   pearsonr
115                   spearmanr
116                   pointbiserialr
117                   kendalltau
118                   linregress
119
120INFERENTIAL STATS:  ttest_1samp
121                    ttest_ind
122                    ttest_rel
123                    chisquare
124                    ks_2samp
125                    mannwhitneyu
126                    ranksums
127                    wilcoxont
128                    kruskalwallish
129                    friedmanchisquare
130
131PROBABILITY CALCS:  chisqprob
132                    erfcc
133                    zprob
134                    ksprob
135                    fprob
136                    betacf
137                    gammln
138                    betai
139
140ANOVA FUNCTIONS:  F_oneway
141                  F_value
142
143SUPPORT FUNCTIONS:  writecc
144                    incr
145                    sign  (for Numpy arrays only)
146                    sum
147                    cumsum
148                    ss
149                    summult
150                    sumdiffsquared
151                    square_of_sums
152                    shellsort
153                    rankdata
154                    outputpairedstats
155                    findwithin
156"""
157## CHANGE LOG:
158## ===========
159## 02-11-19 ... fixed attest_ind and attest_rel for div-by-zero Overflows
160## 02-05-10 ... fixed lchisqprob indentation (failed when df=even)
161## 00-12-28 ... removed aanova() to separate module, fixed licensing to
162##                   match Python License, fixed doc string & imports
163## 00-04-13 ... pulled all "global" statements, except from aanova()
164##              added/fixed lots of documentation, removed io.py dependency
165##              changed to version 0.5
166## 99-11-13 ... added asign() function
167## 99-11-01 ... changed version to 0.4 ... enough incremental changes now
168## 99-10-25 ... added acovariance and acorrelation functions
169## 99-10-10 ... fixed askew/akurtosis to avoid divide-by-zero errors
170##              added aglm function (crude, but will be improved)
171## 99-10-04 ... upgraded acumsum, ass, asummult, asamplevar, avar, etc. to
172##                   all handle lists of 'dimension's and keepdims
173##              REMOVED ar0, ar2, ar3, ar4 and replaced them with around
174##              reinserted fixes for abetai to avoid math overflows
175## 99-09-05 ... rewrote achisqprob/aerfcc/aksprob/afprob/abetacf/abetai to
176##                   handle multi-dimensional arrays (whew!)
177## 99-08-30 ... fixed l/amoment, l/askew, l/akurtosis per D'Agostino (1990)
178##              added anormaltest per same reference
179##              re-wrote azprob to calc arrays of probs all at once
180## 99-08-22 ... edited attest_ind printing section so arrays could be rounded
181## 99-08-19 ... fixed amean and aharmonicmean for non-error(!) overflow on
182##                   short/byte arrays (mean of #s btw 100-300 = -150??)
183## 99-08-09 ... fixed asum so that the None case works for Byte arrays
184## 99-08-08 ... fixed 7/3 'improvement' to handle t-calcs on N-D arrays
185## 99-07-03 ... improved attest_ind, attest_rel (zero-division errortrap)
186## 99-06-24 ... fixed bug(?) in attest_ind (n1=a.shape[0])
187## 04/11/99 ... added asignaltonoise, athreshold functions, changed all
188##                   max/min in array section to N.maximum/N.minimum,
189##                   fixed square_of_sums to prevent integer overflow
190## 04/10/99 ... !!! Changed function name ... sumsquared ==> square_of_sums
191## 03/18/99 ... Added ar0, ar2, ar3 and ar4 rounding functions
192## 02/28/99 ... Fixed aobrientransform to return an array rather than a list
193## 01/15/99 ... Essentially ceased updating list-versions of functions (!!!)
194## 01/13/99 ... CHANGED TO VERSION 0.3
195##              fixed bug in a/lmannwhitneyu p-value calculation
196## 12/31/98 ... fixed variable-name bug in ldescribe
197## 12/19/98 ... fixed bug in findwithin (fcns needed pstat. prefix)
198## 12/16/98 ... changed amedianscore to return float (not array) for 1 score
199## 12/14/98 ... added atmin and atmax functions
200##              removed umath from import line (not needed)
201##              l/ageometricmean modified to reduce chance of overflows (take
202##                   nth root first, then multiply)
203## 12/07/98 ... added __version__variable (now 0.2)
204##              removed all 'stats.' from anova() fcn
205## 12/06/98 ... changed those functions (except shellsort) that altered
206##                   arguments in-place ... cumsum, ranksort, ...
207##              updated (and fixed some) doc-strings
208## 12/01/98 ... added anova() function (requires NumPy)
209##              incorporated Dispatch class
210## 11/12/98 ... added functionality to amean, aharmonicmean, ageometricmean
211##              added 'asum' function (added functionality to N.add.reduce)
212##              fixed both moment and amoment (two errors)
213##              changed name of skewness and askewness to skew and askew
214##              fixed (a)histogram (which sometimes counted points <lowerlimit)
215
216from __future__ import absolute_import
217
218import math, string, copy  # required python modules
219from types import *
220
221from . import pstat               # required 3rd party module
222
223__version__ = 0.5
224
225############# DISPATCH CODE ##############
226
227
228class Dispatch:
229    """
230The Dispatch class, care of David Ascher, allows different functions to
231be called depending on the argument types.  This way, there can be one
232function name regardless of the argument type.  To access function doc
233in stats.py module, prefix the function with an 'l' or 'a' for list or
234array arguments, respectively.  That is, print stats.lmean.__doc__ or
235print stats.amean.__doc__ or whatever.
236"""
237
238    def __init__(self, *tuples):
239        self._dispatch = {}
240        for func, types in tuples:
241            for t in types:
242                if t in self._dispatch.keys():
243                    raise ValueError, "can't have two dispatches on "+str(t)
244                self._dispatch[t] = func
245        self._types = self._dispatch.keys()
246
247    def __call__(self, arg1, *args, **kw):
248        if type(arg1) not in self._types:
249            raise TypeError, "don't know how to dispatch %s arguments" %  type(arg1)
250        return apply(self._dispatch[type(arg1)], (arg1,) + args, kw)
251
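# Illustrative sketch (not part of the original listing): further down, the
# module wires up dispatcher names roughly like this, so that one public name
# routes list/tuple arguments to the 'l' version.  Shown only to clarify the
# Dispatch mechanism described above.
#
#     mean = Dispatch( (lmean, (ListType, TupleType)), )
#     mean(range(20))     # dispatches to lmean(range(20)) -> 9.5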
252
253##########################################################################
254########################   LIST-BASED FUNCTIONS   ########################
255##########################################################################
256
257### Define these regardless
258
259####################################
260#######  CENTRAL TENDENCY  #########
261####################################
262
263def lgeometricmean (inlist):
264    """
265Calculates the geometric mean of the values in the passed list.
266That is:  n-th root of (x1 * x2 * ... * xn).  Assumes a '1D' list.
267
268Usage:   lgeometricmean(inlist)
269"""
270    mult = 1.0
271    one_over_n = 1.0/len(inlist)
272    for item in inlist:
273        mult = mult * pow(item,one_over_n)
274    return mult
275
276
277def lharmonicmean (inlist):
278    """
279Calculates the harmonic mean of the values in the passed list.
280That is:  n / (1/x1 + 1/x2 + ... + 1/xn).  Assumes a '1D' list.
281
282Usage:   lharmonicmean(inlist)
283"""
284    sum = 0
285    for item in inlist:
286        sum = sum + 1.0/item
287    return len(inlist) / sum
288
289
290def lmean (inlist):
291    """
292Returns the arithmetic mean of the values in the passed list.
293Assumes a '1D' list, but will function on the 1st dim of an array(!).
294
295Usage:   lmean(inlist)
296"""
297    sum = 0
298    for item in inlist:
299        sum = sum + item
300    return sum/float(len(inlist))
301
302
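# Quick illustrative checks (not from the original module) for the three means
# above; values are exact for these inputs, up to float rounding:
#
#     lgeometricmean([2.0, 8.0])        # sqrt(2*8)            -> ~4.0
#     lharmonicmean([1.0, 4.0, 4.0])    # 3/(1/1 + 1/4 + 1/4)  -> 2.0
#     lmean([1, 2, 3, 4])               # 10/4                 -> 2.5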
303def lmedian (inlist,numbins=1000):
304    """
305Returns the computed median value of a list of numbers, given the
306number of bins to use for the histogram (more bins brings the computed value
307closer to the median score, default number of bins = 1000).  See G.W.
308Heiman's Basic Stats (1st Edition), or CRC Probability & Statistics.
309
310Usage:   lmedian (inlist, numbins=1000)
311"""
312    (hist, smallest, binsize, extras) = histogram(inlist,numbins) # make histog
313    cumhist = cumsum(hist)              # make cumulative histogram
314    for i in range(len(cumhist)):        # get 1st(!) index holding 50%ile score
315        if cumhist[i]>=len(inlist)/2.0:
316            cfbin = i
317            break
318    LRL = smallest + binsize*cfbin        # get lower real limit of that bin
319    cfbelow = cumhist[cfbin-1]
320    freq = float(hist[cfbin])                # frequency IN the 50%ile bin
321    median = LRL + ((len(inlist)/2.0 - cfbelow)/float(freq))*binsize  # median formula
322    return median
323
324
325def lmedianscore (inlist):
326    """
327Returns the 'middle' score of the passed list.  If there is an even
328number of scores, the mean of the 2 middle scores is returned.
329
330Usage:   lmedianscore(inlist)
331"""
332
333    newlist = copy.deepcopy(inlist)
334    newlist.sort()
335    if len(newlist) % 2 == 0:   # if even number of scores, average middle 2
336        index = len(newlist)/2  # integer division correct
337        median = float(newlist[index] + newlist[index-1]) /2
338    else:
339        index = len(newlist)/2  # int division gives mid value when counting from 0
340        median = newlist[index]
341    return median
342
343
344def lmode(inlist):
345    """
346Returns a list of the modal (most common) score(s) in the passed
347list.  If there is more than one such score, all are returned.  The
348bin-count for the mode(s) is also returned.
349
350Usage:   lmode(inlist)
351Returns: bin-count for mode(s), a list of modal value(s)
352"""
353
354    scores = pstat.unique(inlist)
355    scores.sort()
356    freq = []
357    for item in scores:
358        freq.append(inlist.count(item))
359    maxfreq = max(freq)
360    mode = []
361    stillmore = 1
362    while stillmore:
363        try:
364            indx = freq.index(maxfreq)
365            mode.append(scores[indx])
366            del freq[indx]
367            del scores[indx]
368        except ValueError:
369            stillmore=0
370    return maxfreq, mode
371
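# Illustrative example (not part of the original source):
#
#     lmedianscore([7, 1, 3, 5])    # mean of the two middle scores -> 4.0
#     lmode([1, 2, 2, 3, 3])        # two modes tie at count 2      -> (2, [2, 3])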
372
373####################################
374############  MOMENTS  #############
375####################################
376
377def lmoment(inlist,moment=1):
378    """
379Calculates the nth moment about the mean for a sample (defaults to
380the 1st moment).  Used to calculate coefficients of skewness and kurtosis.
381
382Usage:   lmoment(inlist,moment=1)
383Returns: appropriate moment (r) from ... 1/n * SUM((inlist(i)-mean)**r)
384"""
385    if moment == 1:
386        return 0.0
387    else:
388        mn = mean(inlist)
389        n = len(inlist)
390        s = 0
391        for x in inlist:
392            s = s + (x-mn)**moment
393        return s/float(n)
394
395
396def lvariation(inlist):
397    """
398Returns the coefficient of variation, as defined in CRC Standard
399Probability and Statistics, p.6.
400
401Usage:   lvariation(inlist)
402"""
403    return 100.0*samplestdev(inlist)/float(mean(inlist))
404
405
406def lskew(inlist):
407    """
408Returns the skewness of a distribution, as defined in Numerical
409Recipes (alternate defn in CRC Standard Probability and Statistics, p.6.)
410
411Usage:   lskew(inlist)
412"""
413    return moment(inlist,3)/pow(moment(inlist,2),1.5)
414
415
416def lkurtosis(inlist):
417    """
418Returns the kurtosis of a distribution, as defined in Numerical
419Recipes (alternate defn in CRC Standard Probability and Statistics, p.6.)
420
421Usage:   lkurtosis(inlist)
422"""
423    return moment(inlist,4)/pow(moment(inlist,2),2.0)
424
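# Illustrative values (not part of the original module) for the symmetric list
# [1, 2, 3, 4, 5], whose mean is 3 (float rounding may show in the last case):
#
#     lmoment([1, 2, 3, 4, 5], 2)    # (4+1+0+1+4)/5        -> 2.0
#     lskew([1, 2, 3, 4, 5])         # symmetric data       -> 0.0
#     lkurtosis([1, 2, 3, 4, 5])     # moment4/moment2**2   -> ~1.7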
425
426def ldescribe(inlist):
427    """
428Returns some descriptive statistics of the passed list (assumed to be 1D).
429
430Usage:   ldescribe(inlist)
431Returns: n, (min,max), mean, standard deviation, skew, kurtosis
432"""
433    n = len(inlist)
434    mm = (min(inlist),max(inlist))
435    m = mean(inlist)
436    sd = stdev(inlist)
437    sk = skew(inlist)
438    kurt = kurtosis(inlist)
439    return n, mm, m, sd, sk, kurt
440
441
442####################################
443#######  FREQUENCY STATS  ##########
444####################################
445
446def litemfreq(inlist):
447    """
448Returns a list of pairs.  Each pair consists of one of the scores in inlist
449and its frequency count.  Assumes a 1D list is passed.
450
451Usage:   litemfreq(inlist)
452Returns: a 2D frequency table (col [0:n-1]=scores, col n=frequencies)
453"""
454    scores = pstat.unique(inlist)
455    scores.sort()
456    freq = []
457    for item in scores:
458        freq.append(inlist.count(item))
459    return pstat.abut(scores, freq)
460
461
462def lscoreatpercentile (inlist, percent):
463    """
464Returns the score at a given percentile relative to the distribution
465given by inlist.
466
467Usage:   lscoreatpercentile(inlist,percent)
468"""
469    if percent > 1:
470        print "\nDividing percent>1 by 100 in lscoreatpercentile().\n"
471        percent = percent / 100.0
472    targetcf = percent*len(inlist)
473    h, lrl, binsize, extras = histogram(inlist)
474    cumhist = cumsum(copy.deepcopy(h))
475    for i in range(len(cumhist)):
476        if cumhist[i] >= targetcf:
477            break
478    score = binsize * ((targetcf - cumhist[i-1]) / float(h[i])) + (lrl+binsize*i)
479    return score
480
481
482def lpercentileofscore (inlist, score,histbins=10,defaultlimits=None):
483    """
484Returns the percentile value of a score relative to the distribution
485given by inlist.  Formula depends on the values used to histogram the data(!).
486
487Usage:   lpercentileofscore(inlist,score,histbins=10,defaultlimits=None)
488"""
489
490    h, lrl, binsize, extras = histogram(inlist,histbins,defaultlimits)
491    cumhist = cumsum(copy.deepcopy(h))
492    i = int((score - lrl)/float(binsize))
493    pct = (cumhist[i-1]+((score-(lrl+binsize*i))/float(binsize))*h[i])/float(len(inlist)) * 100
494    return pct
495
496
497def lhistogram (inlist,numbins=10,defaultreallimits=None,printextras=0):
498    """
499Returns (i) a list of histogram bin counts, (ii) the smallest value
500of the histogram binning, and (iii) the bin width (the last 2 are not
501necessarily integers).  Default number of bins is 10.  If no sequence object
502is given for defaultreallimits, the routine picks (usually non-pretty) bins
503spanning all the numbers in the inlist.
504
505Usage:   lhistogram (inlist, numbins=10, defaultreallimits=None, printextras=0)
506Returns: list of bin values, lowerreallimit, binsize, extrapoints
507"""
508    if (defaultreallimits <> None):
509        if type(defaultreallimits) not in [ListType,TupleType] or len(defaultreallimits)==1: # only one limit given, assumed to be lower one & upper is calc'd
510            lowerreallimit = defaultreallimits
511            upperreallimit = 1.0001 * max(inlist)
512        else: # assume both limits given
513            lowerreallimit = defaultreallimits[0]
514            upperreallimit = defaultreallimits[1]
515        binsize = (upperreallimit-lowerreallimit)/float(numbins)
516    else:     # no limits given for histogram, both must be calc'd
517        estbinwidth=(max(inlist)-min(inlist))/float(numbins) + 1 # 1=>cover all
518        binsize = ((max(inlist)-min(inlist)+estbinwidth))/float(numbins)
519        lowerreallimit = min(inlist) - binsize/2 #lower real limit,1st bin
520    bins = [0]*(numbins)
521    extrapoints = 0
522    for num in inlist:
523        try:
524            if (num-lowerreallimit) < 0:
525                extrapoints = extrapoints + 1
526            else:
527                bintoincrement = int((num-lowerreallimit)/float(binsize))
528                bins[bintoincrement] = bins[bintoincrement] + 1
529        except:
530            extrapoints = extrapoints + 1
531    if (extrapoints > 0 and printextras == 1):
532        print '\nPoints outside given histogram range =',extrapoints
533    return (bins, lowerreallimit, binsize, extrapoints)
534
535
536def lcumfreq(inlist,numbins=10,defaultreallimits=None):
537    """
538Returns a cumulative frequency histogram, using the histogram function.
539
540Usage:   lcumfreq(inlist,numbins=10,defaultreallimits=None)
541Returns: list of cumfreq bin values, lowerreallimit, binsize, extrapoints
542"""
543    h,l,b,e = histogram(inlist,numbins,defaultreallimits)
544    cumhist = cumsum(copy.deepcopy(h))
545    return cumhist,l,b,e
546
547
548def lrelfreq(inlist,numbins=10,defaultreallimits=None):
549    """
550Returns a relative frequency histogram, using the histogram function.
551
552Usage:   lrelfreq(inlist,numbins=10,defaultreallimits=None)
553Returns: list of relative freq bin values, lowerreallimit, binsize, extrapoints
554"""
555    h,l,b,e = histogram(inlist,numbins,defaultreallimits)
556    for i in range(len(h)):
557        h[i] = h[i]/float(len(inlist))
558    return h,l,b,e
559
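# Illustrative example (not from the original file), with explicit limits so
# the bin edges come out 'pretty':
#
#     lhistogram(range(10), 5, (0, 10))   # -> ([2, 2, 2, 2, 2], 0, 2.0, 0)
#     lcumfreq(range(10), 5, (0, 10))     # bins -> [2, 4, 6, 8, 10]
#     lrelfreq(range(10), 5, (0, 10))     # bins -> [0.2, 0.2, 0.2, 0.2, 0.2]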
560
561####################################
562#####  VARIABILITY FUNCTIONS  ######
563####################################
564
565def lobrientransform(*args):
566    """
567Computes a transform on input data (any number of columns).  Used to
568test for homogeneity of variance prior to running one-way stats.  From
569Maxwell and Delaney, p.112.
570
571Usage:   lobrientransform(*args)
572Returns: transformed data for use in an ANOVA
573"""
574    TINY = 1e-10
575    k = len(args)
576    n = [0.0]*k
577    v = [0.0]*k
578    m = [0.0]*k
579    nargs = []
580    for i in range(k):
581        nargs.append(copy.deepcopy(args[i]))
582        n[i] = float(len(nargs[i]))
583        v[i] = var(nargs[i])
584        m[i] = mean(nargs[i])
585    for j in range(k):
586        for i in range(int(n[j])):   # n[j] is stored as a float above
587            t1 = (n[j]-1.5)*n[j]*(nargs[j][i]-m[j])**2
588            t2 = 0.5*v[j]*(n[j]-1.0)
589            t3 = (n[j]-1.0)*(n[j]-2.0)
590            nargs[j][i] = (t1-t2) / float(t3)
591    check = 1
592    for j in range(k):
593        if v[j] - mean(nargs[j]) > TINY:
594            check = 0
595    if check <> 1:
596        raise ValueError, 'Problem in obrientransform.'
597    else:
598        return nargs
599
600
601def lsamplevar (inlist):
602    """
603Returns the variance of the values in the passed list using
604N for the denominator (i.e., DESCRIBES the sample variance only).
605
606Usage:   lsamplevar(inlist)
607"""
608    n = len(inlist)
609    mn = mean(inlist)
610    deviations = []
611    for item in inlist:
612        deviations.append(item-mn)
613    return ss(deviations)/float(n)
614
615
616def lsamplestdev (inlist):
617    """
618Returns the standard deviation of the values in the passed list using
619N for the denominator (i.e., DESCRIBES the sample stdev only).
620
621Usage:   lsamplestdev(inlist)
622"""
623    return math.sqrt(samplevar(inlist))
624
625
626def lvar (inlist):
627    """
628Returns the variance of the values in the passed list using N-1
629for the denominator (i.e., for estimating population variance).
630
631Usage:   lvar(inlist)
632"""
633    n = len(inlist)
634    mn = mean(inlist)
635    deviations = [0]*len(inlist)
636    for i in range(len(inlist)):
637        deviations[i] = inlist[i] - mn
638    return ss(deviations)/float(n-1)
639
640
641def lstdev (inlist):
642    """
643Returns the standard deviation of the values in the passed list
644using N-1 in the denominator (i.e., to estimate population stdev).
645
646Usage:   lstdev(inlist)
647"""
648    return math.sqrt(var(inlist))
649
650
651def lsterr(inlist):
652    """
653Returns the standard error of the values in the passed list using N-1
654in the denominator (i.e., to estimate population standard error).
655
656Usage:   lsterr(inlist)
657"""
658    return stdev(inlist) / float(math.sqrt(len(inlist)))
659
660
661def lsem (inlist):
662    """
663Returns the estimated standard error of the mean (sx-bar) of the
664values in the passed list.  sem = stdev / sqrt(n)
665
666Usage:   lsem(inlist)
667"""
668    sd = stdev(inlist)
669    n = len(inlist)
670    return sd/math.sqrt(n)
671
672
673def lz (inlist, score):
674    """
675Returns the z-score of a given input score, relative to the list from
676which that score came.  Not appropriate for population calculations.
677
678Usage:   lz(inlist, score)
679"""
680    z = (score-mean(inlist))/samplestdev(inlist)
681    return z
682
683
684def lzs (inlist):
685    """
686Returns a list of z-scores, one for each score in the passed list.
687
688Usage:   lzs(inlist)
689"""
690    zscores = []
691    for item in inlist:
692        zscores.append(z(inlist,item))
693    return zscores
694
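# Illustrative values (not part of the original module) for
# data = [2, 4, 4, 4, 5, 5, 7, 9], whose mean is 5 and whose sum of squared
# deviations is 32:
#
#     lsamplevar(data)      # 32/8          -> 4.0
#     lsamplestdev(data)    #               -> 2.0
#     lvar(data)            # 32/7          -> ~4.5714
#     lstdev(data)          # sqrt(32/7)    -> ~2.1381
#     lz(data, 9)           # (9-5)/2       -> 2.0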
695
696####################################
697#######  TRIMMING FUNCTIONS  #######
698####################################
699
700def ltrimboth (l,proportiontocut):
701    """
702Slices off the passed proportion of items from BOTH ends of the passed
703list (i.e., with proportiontocut=0.1, slices 'leftmost' 10% AND 'rightmost'
70410% of scores).  Assumes list is sorted by magnitude.  Slices off LESS if
705proportion results in a non-integer slice index (i.e., conservatively
706slices off proportiontocut).
707
708Usage:   ltrimboth (l,proportiontocut)
709Returns: trimmed version of list l
710"""
711    lowercut = int(proportiontocut*len(l))
712    uppercut = len(l) - lowercut
713    return l[lowercut:uppercut]
714
715
716def ltrim1 (l,proportiontocut,tail='right'):
717    """
718Slices off the passed proportion of items from ONE end of the passed
719list (i.e., if proportiontocut=0.1, slices off 'leftmost' or 'rightmost'
72010% of scores).  Slices off LESS if proportion results in a non-integer
721slice index (i.e., conservatively slices off proportiontocut).
722
723Usage:   ltrim1 (l,proportiontocut,tail='right')  or set tail='left'
724Returns: trimmed version of list l
725"""
726    if tail == 'right':
727        lowercut = 0
728        uppercut = len(l) - int(proportiontocut*len(l))
729    elif tail == 'left':
730        lowercut = int(proportiontocut*len(l))
731        uppercut = len(l)
732    return l[lowercut:uppercut]
733
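# Illustrative example (not from the original file); remember the list must
# already be sorted by magnitude:
#
#     ltrimboth(range(10), 0.1)        # drops 1 score from each end  -> [1, ..., 8]
#     ltrim1(range(10), 0.2, 'left')   # drops 2 scores from the left -> [2, ..., 9]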
734
735####################################
736#####  CORRELATION FUNCTIONS  ######
737####################################
738
739def lpaired(x,y):
740    """
741Interactively determines the type of data and then runs the
742appropriate statistic for paired group data.
743
744Usage:   lpaired(x,y)
745Returns: appropriate statistic name, value, and probability
746"""
747    samples = ''
748    while samples not in ['i','r','I','R','c','C']:
749        print '\nIndependent or related samples, or correlation (i,r,c): ',
750        samples = raw_input()
751
752    if samples in ['i','I','r','R']:
753        print '\nComparing variances ...',
754# USE O'BRIEN'S TEST FOR HOMOGENEITY OF VARIANCE, Maxwell & Delaney, p.112
755        r = obrientransform(x,y)
756        f,p = F_oneway(pstat.colex(r,0),pstat.colex(r,1))
757        if p<0.05:
758            vartype='unequal, p='+str(round(p,4))
759        else:
760            vartype='equal'
761        print vartype
762        if samples in ['i','I']:
763            if vartype[0]=='e':
764                t,p = ttest_ind(x,y,0)
765                print '\nIndependent samples t-test:  ', round(t,4),round(p,4)
766            else:
767                if len(x)>20 or len(y)>20:
768                    z,p = ranksums(x,y)
769                    print '\nRank Sums test (NONparametric, n>20):  ', round(z,4),round(p,4)
770                else:
771                    u,p = mannwhitneyu(x,y)
772                    print '\nMann-Whitney U-test (NONparametric, ns<20):  ', round(u,4),round(p,4)
773
774        else:  # RELATED SAMPLES
775            if vartype[0]=='e':
776                t,p = ttest_rel(x,y,0)
777                print '\nRelated samples t-test:  ', round(t,4),round(p,4)
778            else:
779                t,p = wilcoxont(x,y)  # related samples: use the Wilcoxon test printed below
780                print '\nWilcoxon T-test (NONparametric):  ', round(t,4),round(p,4)
781    else:  # CORRELATION ANALYSIS
782        corrtype = ''
783        while corrtype not in ['c','C','r','R','d','D']:
784            print '\nIs the data Continuous, Ranked, or Dichotomous (c,r,d): ',
785            corrtype = raw_input()
786        if corrtype in ['c','C']:
787            m,b,r,p,see = linregress(x,y)
788            print '\nLinear regression for continuous variables ...'
789            lol = [['Slope','Intercept','r','Prob','SEestimate'],[round(m,4),round(b,4),round(r,4),round(p,4),round(see,4)]]
790            pstat.printcc(lol)
791        elif corrtype in ['r','R']:
792            r,p = spearmanr(x,y)
793            print '\nCorrelation for ranked variables ...'
794            print "Spearman's r: ",round(r,4),round(p,4)
795        else: # DICHOTOMOUS
796            r,p = pointbiserialr(x,y)
797            print '\nAssuming x contains a dichotomous variable ...'
798            print 'Point Biserial r: ',round(r,4),round(p,4)
799    print '\n\n'
800    return None
801
802
803def lpearsonr(x,y):
804    """
805Calculates a Pearson correlation coefficient and the associated
806probability value.  Taken from Heiman's Basic Statistics for the Behav.
807Sci (2nd), p.195.
808
809Usage:   lpearsonr(x,y)      where x and y are equal-length lists
810Returns: Pearson's r value, two-tailed p-value
811"""
812    TINY = 1.0e-30
813    if len(x) <> len(y):
814        raise ValueError, 'Input values not paired in pearsonr.  Aborting.'
815    n = len(x)
816    x = map(float,x)
817    y = map(float,y)
818    xmean = mean(x)
819    ymean = mean(y)
820    r_num = n*(summult(x,y)) - sum(x)*sum(y)
821    r_den = math.sqrt((n*ss(x) - square_of_sums(x))*(n*ss(y)-square_of_sums(y)))
822    r = (r_num / r_den)  # denominator already a float
823    df = n-2
824    t = r*math.sqrt(df/((1.0-r+TINY)*(1.0+r+TINY)))
825    prob = betai(0.5*df,0.5,df/float(df+t*t))
826    return r, prob
827
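# Illustrative example (not part of the original module): perfectly linear
# data gives r == 1.0 and a two-tailed probability that is essentially 0
# (TINY keeps the t computation from dividing by zero):
#
#     lpearsonr([1, 2, 3, 4], [2, 4, 6, 8])    # -> (1.0, ~0.0)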
828
829def lspearmanr(x,y):
830    """
831Calculates a Spearman rank-order correlation coefficient.  Taken
832from Heiman's Basic Statistics for the Behav. Sci (1st), p.192.
833
834Usage:   lspearmanr(x,y)      where x and y are equal-length lists
835Returns: Spearman's r, two-tailed p-value
836"""
837    TINY = 1e-30
838    if len(x) <> len(y):
839        raise ValueError, 'Input values not paired in spearmanr.  Aborting.'
840    n = len(x)
841    rankx = rankdata(x)
842    ranky = rankdata(y)
843    dsq = sumdiffsquared(rankx,ranky)
844    rs = 1 - 6*dsq / float(n*(n**2-1))
845    t = rs * math.sqrt((n-2) / ((rs+1.0)*(1.0-rs)))
846    df = n-2
847    probrs = betai(0.5*df,0.5,df/(df+t*t))  # t already a float
848# probability values for rs are from part 2 of the spearman function in
849# Numerical Recipes, p.510.  They are close to tables, but not exact. (?)
850    return rs, probrs
851
852
853def lpointbiserialr(x,y):
854    """
855Calculates a point-biserial correlation coefficient and the associated
856probability value.  Taken from Heiman's Basic Statistics for the Behav.
857Sci (1st), p.194.
858
859Usage:   lpointbiserialr(x,y)      where x,y are equal-length lists
860Returns: Point-biserial r, two-tailed p-value
861"""
862    TINY = 1e-30
863    if len(x) <> len(y):
864        raise ValueError, 'INPUT VALUES NOT PAIRED IN pointbiserialr.  ABORTING.'
865    data = pstat.abut(x,y)
866    categories = pstat.unique(x)
867    if len(categories) <> 2:
868        raise ValueError, "Exactly 2 categories required for pointbiserialr()."
869    else:   # there are 2 categories, continue
870        codemap = pstat.abut(categories,range(2))
871        recoded = pstat.recode(data,codemap,0)
872        x = pstat.linexand(data,0,categories[0])
873        y = pstat.linexand(data,0,categories[1])
874        xmean = mean(pstat.colex(x,1))
875        ymean = mean(pstat.colex(y,1))
876        n = len(data)
877        adjust = math.sqrt((len(x)/float(n))*(len(y)/float(n)))
878        rpb = (ymean - xmean)/samplestdev(pstat.colex(data,1))*adjust
879        df = n-2
880        t = rpb*math.sqrt(df/((1.0-rpb+TINY)*(1.0+rpb+TINY)))
881        prob = betai(0.5*df,0.5,df/(df+t*t))  # t already a float
882        return rpb, prob
883
884
885def lkendalltau(x,y):
886    """
887Calculates Kendall's tau ... correlation of ordinal data.  Adapted
888from function kendl1 in Numerical Recipes.  Needs good test-routine.@@@
889
890Usage:   lkendalltau(x,y)
891Returns: Kendall's tau, two-tailed p-value
892"""
893    n1 = 0
894    n2 = 0
895    iss = 0
896    for j in range(len(x)-1):
897        for k in range(j+1,len(y)):   # compare each pair once; skip k == j
898            a1 = x[j] - x[k]
899            a2 = y[j] - y[k]
900            aa = a1 * a2
901            if (aa):             # neither list has a tie
902                n1 = n1 + 1
903                n2 = n2 + 1
904                if aa > 0:
905                    iss = iss + 1
906                else:
907                    iss = iss -1
908            else:
909                if (a1):
910                    n1 = n1 + 1
911                else:
912                    n2 = n2 + 1
913    tau = iss / math.sqrt(n1*n2)
914    svar = (4.0*len(x)+10.0) / (9.0*len(x)*(len(x)-1))
915    z = tau / math.sqrt(svar)
916    prob = erfcc(abs(z)/1.4142136)
917    return tau, prob
918
919
920def llinregress(x,y):
921    """
922Calculates a regression line on x,y pairs. 
923
924Usage:   llinregress(x,y)      x,y are equal-length lists of x-y coordinates
925Returns: slope, intercept, r, two-tailed prob, sterr-of-estimate
926"""
927    TINY = 1.0e-20
928    if len(x) <> len(y):
929        raise ValueError, 'Input values not paired in linregress.  Aborting.'
930    n = len(x)
931    x = map(float,x)
932    y = map(float,y)
933    xmean = mean(x)
934    ymean = mean(y)
935    r_num = float(n*(summult(x,y)) - sum(x)*sum(y))
936    r_den = math.sqrt((n*ss(x) - square_of_sums(x))*(n*ss(y)-square_of_sums(y)))
937    r = r_num / r_den
938    z = 0.5*math.log((1.0+r+TINY)/(1.0-r+TINY))
939    df = n-2
940    t = r*math.sqrt(df/((1.0-r+TINY)*(1.0+r+TINY)))
941    prob = betai(0.5*df,0.5,df/(df+t*t))
942    slope = r_num / float(n*ss(x) - square_of_sums(x))
943    intercept = ymean - slope*xmean
944    sterrest = math.sqrt(1-r*r)*samplestdev(y)
945    return slope, intercept, r, prob, sterrest
946
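# Illustrative example (not from the original source): y = 2x + 1 recovered
# exactly from four points:
#
#     llinregress([1, 2, 3, 4], [3, 5, 7, 9])
#     # -> slope 2.0, intercept 1.0, r 1.0, prob ~0.0, sterr-of-estimate 0.0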
947
948####################################
949#####  INFERENTIAL STATISTICS  #####
950####################################
951
952def lttest_1samp(a,popmean,printit=0,name='Sample',writemode='a'):
953    """
954Calculates the t-obtained for the single-sample T-test on ONE group
955of scores a, given a population mean.  If printit=1, results are printed
956to the screen.  If printit='filename', the results are output to 'filename'
957using the given writemode (default=append).  Returns t-value, and prob.
958
959Usage:   lttest_1samp(a,popmean,printit=0,name='Sample',writemode='a')
960Returns: t-value, two-tailed prob
961"""
962    x = mean(a)
963    v = var(a)
964    n = len(a)
965    df = n-1
966    svar = ((n-1)*v)/float(df)
967    t = (x-popmean)/math.sqrt(svar*(1.0/n))
968    prob = betai(0.5*df,0.5,float(df)/(df+t*t))
969
970    if printit <> 0:
971        statname = 'Single-sample T-test.'
972        outputpairedstats(printit,writemode,
973                          'Population','--',popmean,0,0,0,
974                          name,n,x,v,min(a),max(a),
975                          statname,t,prob)
976    return t,prob
977
978
979def lttest_ind (a, b, printit=0, name1='Samp1', name2='Samp2', writemode='a'):
980    """
981Calculates the t-obtained T-test on TWO INDEPENDENT samples of
982scores a, and b.  From Numerical Recipes, p.483.  If printit=1, results
983are printed to the screen.  If printit='filename', the results are output
984to 'filename' using the given writemode (default=append).  Returns t-value,
985and prob.
986
987Usage:   lttest_ind(a,b,printit=0,name1='Samp1',name2='Samp2',writemode='a')
988Returns: t-value, two-tailed prob
989"""
990    x1 = mean(a)
991    x2 = mean(b)
992    v1 = stdev(a)**2
993    v2 = stdev(b)**2
994    n1 = len(a)
995    n2 = len(b)
996    df = n1+n2-2
997    svar = ((n1-1)*v1+(n2-1)*v2)/float(df)
998    t = (x1-x2)/math.sqrt(svar*(1.0/n1 + 1.0/n2))
999    prob = betai(0.5*df,0.5,df/(df+t*t))
1000
1001    if printit <> 0:
1002        statname = 'Independent samples T-test.'
1003        outputpairedstats(printit,writemode,
1004                          name1,n1,x1,v1,min(a),max(a),
1005                          name2,n2,x2,v2,min(b),max(b),
1006                          statname,t,prob)
1007    return t,prob
1008
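# Illustrative call (not part of the original module); with a=[1,2,3] and
# b=[2,3,4] the pooled variance is 1.0, df=4, and
# t = (2-3)/sqrt(2/3) = -sqrt(1.5):
#
#     lttest_ind([1, 2, 3], [2, 3, 4])    # -> (~-1.2247, two-tailed prob)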
1009
1010def lttest_rel (a,b,printit=0,name1='Sample1',name2='Sample2',writemode='a'):
1011    """
1012Calculates the t-obtained T-test on TWO RELATED samples of scores,
1013a and b.  From Numerical Recipes, p.483.  If printit=1, results are
1014printed to the screen.  If printit='filename', the results are output to
1015'filename' using the given writemode (default=append).  Returns t-value,
1016and prob.
1017
1018Usage:   lttest_rel(a,b,printit=0,name1='Sample1',name2='Sample2',writemode='a')
1019Returns: t-value, two-tailed prob
1020"""
1021    if len(a)<>len(b):
1022        raise ValueError, 'Unequal length lists in ttest_rel.'
1023    x1 = mean(a)
1024    x2 = mean(b)
1025    v1 = var(a)
1026    v2 = var(b)
1027    n = len(a)
1028    cov = 0
1029    for i in range(len(a)):
1030        cov = cov + (a[i]-x1) * (b[i]-x2)
1031    df = n-1
1032    cov = cov / float(df)
1033    sd = math.sqrt((v1+v2 - 2.0*cov)/float(n))
1034    t = (x1-x2)/sd
1035    prob = betai(0.5*df,0.5,df/(df+t*t))
1036
1037    if printit <> 0:
1038        statname = 'Related samples T-test.'
1039        outputpairedstats(printit,writemode,
1040                          name1,n,x1,v1,min(a),max(a),
1041                          name2,n,x2,v2,min(b),max(b),
1042                          statname,t,prob)
1043    return t, prob
1044
1045
1046def lchisquare(f_obs,f_exp=None):
1047    """
1048Calculates a one-way chi square for list of observed frequencies and returns
1049the result.  If no expected frequencies are given, the total N is assumed to
1050be equally distributed across all groups.
1051
1052Usage:   lchisquare(f_obs, f_exp=None)   f_obs = list of observed cell freq.
1053Returns: chisquare-statistic, associated p-value
1054"""
1055    k = len(f_obs)                 # number of groups
1056    if f_exp == None:
1057        f_exp = [sum(f_obs)/float(k)] * len(f_obs) # create k bins with = freq.
1058    chisq = 0
1059    for i in range(len(f_obs)):
1060        chisq = chisq + (f_obs[i]-f_exp[i])**2 / float(f_exp[i])
1061    return chisq, chisqprob(chisq, k-1)
1062
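# Illustrative example (not from the original file): with no expected
# frequencies given, the 60 observations are expected as 20 per cell, so
# chisq = (10-20)**2/20 + 0 + (30-20)**2/20 = 10.0 on df=2; for df=2 the
# chi-square tail probability reduces to exp(-chisq/2):
#
#     lchisquare([10, 20, 30])    # -> (10.0, exp(-5) ~ 0.0067)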
1063
1064def lks_2samp (data1,data2):
1065    """
1066Computes the Kolmogorov-Smirnov statistic on 2 samples.  From
1067Numerical Recipes in C, page 493.
1068
1069Usage:   lks_2samp(data1,data2)   data1&2 are lists of values for 2 conditions
1070Returns: KS D-value, associated p-value
1071"""
1072    j1 = 0
1073    j2 = 0
1074    fn1 = 0.0
1075    fn2 = 0.0
1076    n1 = len(data1)
1077    n2 = len(data2)
1078    en1 = n1
1079    en2 = n2
1080    d = 0.0
1081    data1.sort()
1082    data2.sort()
1083    while j1 < n1 and j2 < n2:
1084        d1=data1[j1]
1085        d2=data2[j2]
1086        if d1 <= d2:
1087            fn1 = (j1)/float(en1)
1088            j1 = j1 + 1
1089        if d2 <= d1:
1090            fn2 = (j2)/float(en2)
1091            j2 = j2 + 1
1092        dt = (fn2-fn1)
1093        if math.fabs(dt) > math.fabs(d):
1094            d = dt
1095    try:
1096        en = math.sqrt(en1*en2/float(en1+en2))
1097        prob = ksprob((en+0.12+0.11/en)*abs(d))
1098    except:
1099        prob = 1.0
1100    return d, prob
1101
1102
1103def lmannwhitneyu(x,y):
1104    """
1105Calculates a Mann-Whitney U statistic on the provided scores and
1106returns the result.  Use only when the n in each condition is < 20 and
1107you have 2 independent samples of ranks.  NOTE: Mann-Whitney U is
1108significant if the u-obtained is LESS THAN or equal to the critical
1109value of U found in the tables.  Equivalent to Kruskal-Wallis H with
1110just 2 groups.
1111
1112Usage:   lmannwhitneyu(x,y)
1113Returns: u-statistic, one-tailed p-value (i.e., p(z(U)))
1114"""
1115    n1 = len(x)
1116    n2 = len(y)
1117    ranked = rankdata(x+y)
1118    rankx = ranked[0:n1]       # get the x-ranks
1119    ranky = ranked[n1:]        # the rest are y-ranks
1120    u1 = n1*n2 + (n1*(n1+1))/2.0 - sum(rankx)  # calc U for x
1121    u2 = n1*n2 - u1                            # remainder is U for y
1122    bigu = max(u1,u2)
1123    smallu = min(u1,u2)
1124    T = math.sqrt(tiecorrect(ranked))  # correction factor for tied scores
1125    if T == 0:
1126        raise ValueError, 'All numbers are identical in lmannwhitneyu'
1127    sd = math.sqrt(T*n1*n2*(n1+n2+1)/12.0)
1128    z = abs((bigu-n1*n2/2.0) / sd)  # normal approximation for prob calc
1129    return smallu, 1.0 - zprob(z)
1130
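# Illustrative example (not part of the original module): with completely
# separated groups every x-rank precedes every y-rank, so the smaller U is 0
# and the normal approximation gives a small one-tailed p (roughly 0.025 here):
#
#     lmannwhitneyu([1, 2, 3], [4, 5, 6])    # -> (0.0, ~0.025)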
1131
1132def ltiecorrect(rankvals):
1133    """
1134Corrects for ties in Mann Whitney U and Kruskal Wallis H tests.  See
1135Siegel, S. (1956) Nonparametric Statistics for the Behavioral Sciences.
1136New York: McGraw-Hill.  Code adapted from |Stat rankind.c code.
1137
1138Usage:   ltiecorrect(rankvals)
1139Returns: T correction factor for U or H
1140"""
1141    sorted,posn = shellsort(rankvals)
1142    n = len(sorted)
1143    T = 0.0
1144    i = 0
1145    while (i<n-1):
1146        if sorted[i] == sorted[i+1]:
1147            nties = 1
1148            while (i<n-1) and (sorted[i] == sorted[i+1]):
1149                nties = nties +1
1150                i = i +1
1151            T = T + nties**3 - nties
1152        i = i+1
1153    T = T / float(n**3-n)
1154    return 1.0 - T
1155
1156
1157def lranksums(x,y):
1158    """
1159Calculates the rank sums statistic on the provided scores and
1160returns the result.  Use only when the n in each condition is > 20 and you
1161have 2 independent samples of ranks.
1162
1163Usage:   lranksums(x,y)
1164Returns: a z-statistic, two-tailed p-value
1165"""
1166    n1 = len(x)
1167    n2 = len(y)
1168    alldata = x+y
1169    ranked = rankdata(alldata)
1170    x = ranked[:n1]
1171    y = ranked[n1:]
1172    s = sum(x)
1173    expected = n1*(n1+n2+1) / 2.0
1174    z = (s - expected) / math.sqrt(n1*n2*(n1+n2+1)/12.0)
1175    prob = 2*(1.0 -zprob(abs(z)))
1176    return z, prob
1177
1178
1179def lwilcoxont(x,y):
1180    """
1181Calculates the Wilcoxon T-test for related samples and returns the
1182result.  A non-parametric T-test.
1183
1184Usage:   lwilcoxont(x,y)
1185Returns: a t-statistic, two-tail probability estimate
1186"""
1187    if len(x) <> len(y):
1188        raise ValueError, 'Unequal N in wilcoxont.  Aborting.'
1189    d=[]
1190    for i in range(len(x)):
1191        diff = x[i] - y[i]
1192        if diff <> 0:
1193            d.append(diff)
1194    count = len(d)
1195    absd = map(abs,d)
1196    absranked = rankdata(absd)
1197    r_plus = 0.0
1198    r_minus = 0.0
1199    for i in range(len(absd)):
1200        if d[i] < 0:
1201            r_minus = r_minus + absranked[i]
1202        else:
1203            r_plus = r_plus + absranked[i]
1204    wt = min(r_plus, r_minus)
1205    mn = count * (count+1) * 0.25
1206    se =  math.sqrt(count*(count+1)*(2.0*count+1.0)/24.0)
1207    z = math.fabs(wt-mn) / se
1208    prob = 2*(1.0 -zprob(abs(z)))
1209    return wt, prob
1210
1211
1212def lkruskalwallish(*args):
1213    """
1214The Kruskal-Wallis H-test is a non-parametric ANOVA for 3 or more
1215groups, requiring at least 5 subjects in each group.  This function
1216calculates the Kruskal-Wallis H-test for 3 or more independent samples
1217and returns the result. 
1218
1219Usage:   lkruskalwallish(*args)
1220Returns: H-statistic (corrected for ties), associated p-value
1221"""
1222    args = list(args)
1223    n = [0]*len(args)
1224    all = []
1225    n = map(len,args)
1226    for i in range(len(args)):
1227        all = all + args[i]
1228    ranked = rankdata(all)
1229    T = tiecorrect(ranked)
1230    for i in range(len(args)):
1231        args[i] = ranked[0:n[i]]
1232        del ranked[0:n[i]]
1233    rsums = []
1234    for i in range(len(args)):
1235        rsums.append(sum(args[i])**2)
1236        rsums[i] = rsums[i] / float(n[i])
1237    ssbn = sum(rsums)
1238    totaln = sum(n)
1239    h = 12.0 / (totaln*(totaln+1)) * ssbn - 3*(totaln+1)
1240    df = len(args) - 1
1241    if T == 0:
1242        raise ValueError, 'All numbers are identical in lkruskalwallish'
1243    h = h / float(T)
1244    return h, chisqprob(h,df)
1245
1246
1247def lfriedmanchisquare(*args):
1248    """
1249Friedman Chi-Square is a non-parametric, one-way within-subjects
1250ANOVA.  This function calculates the Friedman Chi-square test for repeated
1251measures and returns the result, along with the associated probability
1252value.  It assumes 3 or more repeated measures.  Only 3 levels requires a
1253minimum of 10 subjects in the study.  Four levels requires 5 subjects per
1254level(??).
1255
1256Usage:   lfriedmanchisquare(*args)
1257Returns: chi-square statistic, associated p-value
1258"""
1259    k = len(args)
1260    if k < 3:
1261        raise ValueError, 'Less than 3 levels.  Friedman test not appropriate.'
1262    n = len(args[0])
1263    data = apply(pstat.abut,tuple(args))
1264    for i in range(len(data)):
1265        data[i] = rankdata(data[i])
1266    ssbn = 0
1267    for i in range(k):
1268        ssbn = ssbn + sum(pstat.colex(data,i))**2  # sum the ranks in each condition (column)
1269    chisq = 12.0 / (k*n*(k+1)) * ssbn - 3*n*(k+1)
1270    return chisq, chisqprob(chisq,k-1)
1271
1272
1273####################################
1274####  PROBABILITY CALCULATIONS  ####
1275####################################
1276
1277def lchisqprob(chisq,df):
1278    """
1279Returns the (1-tailed) probability value associated with the provided
1280chi-square value and df.  Adapted from chisq.c in Gary Perlman's |Stat.
1281
1282Usage:   lchisqprob(chisq,df)
1283"""
1284    BIG = 20.0
1285    def ex(x):
1286        BIG = 20.0
1287        if x < -BIG:
1288            return 0.0
1289        else:
1290            return math.exp(x)
1291
1292    if chisq <=0 or df < 1:
1293        return 1.0
1294    a = 0.5 * chisq
1295    if df%2 == 0:
1296        even = 1
1297    else:
1298        even = 0
1299    if df > 1:
1300        y = ex(-a)
1301    if even:
1302        s = y
1303    else:
1304        s = 2.0 * zprob(-math.sqrt(chisq))
1305    if (df > 2):
1306        chisq = 0.5 * (df - 1.0)
1307        if even:
1308            z = 1.0
1309        else:
1310            z = 0.5
1311        if a > BIG:
1312            if even:
1313                e = 0.0
1314            else:
1315                e = math.log(math.sqrt(math.pi))
1316            c = math.log(a)
1317            while (z <= chisq):
1318                e = math.log(z) + e
1319                s = s + ex(c*z-a-e)
1320                z = z + 1.0
1321            return s
1322        else:
1323            if even:
1324                e = 1.0
1325            else:
1326                e = 1.0 / math.sqrt(math.pi) / math.sqrt(a)
1327            c = 0.0
1328            while (z <= chisq):
1329                e = e * (a/float(z))
1330                c = c + e
1331                z = z + 1.0
1332            return (c*y+s)
1333    else:
1334        return s
1335
1336
1337def lerfcc(x):
1338    """
1339Returns the complementary error function erfc(x) with fractional
1340error everywhere less than 1.2e-7.  Adapted from Numerical Recipes.
1341
1342Usage:   lerfcc(x)
1343"""
1344    z = abs(x)
1345    t = 1.0 / (1.0+0.5*z)
1346    ans = t * math.exp(-z*z-1.26551223 + t*(1.00002368+t*(0.37409196+t*(0.09678418+t*(-0.18628806+t*(0.27886807+t*(-1.13520398+t*(1.48851587+t*(-0.82215223+t*0.17087277)))))))))
1347    if x >= 0:
1348        return ans
1349    else:
1350        return 2.0 - ans
1351
1352
1353def lzprob(z):
1354    """
1355Returns the area under the normal curve 'to the left of' the given z value.
1356Thus,
1357    for z<0, zprob(z) = 1-tail probability
1358    for z>0, 1.0-zprob(z) = 1-tail probability
1359    for any z, 2.0*(1.0-zprob(abs(z))) = 2-tail probability
1360Adapted from z.c in Gary Perlman's |Stat.
1361
1362Usage:   lzprob(z)
1363"""
1364    Z_MAX = 6.0    # maximum meaningful z-value
1365    if z == 0.0:
1366        x = 0.0
1367    else:
1368        y = 0.5 * math.fabs(z)
1369        if y >= (Z_MAX*0.5):
1370            x = 1.0
1371        elif (y < 1.0):
1372            w = y*y
1373            x = ((((((((0.000124818987 * w
1374                        -0.001075204047) * w +0.005198775019) * w
1375                      -0.019198292004) * w +0.059054035642) * w
1376                    -0.151968751364) * w +0.319152932694) * w
1377                  -0.531923007300) * w +0.797884560593) * y * 2.0
1378        else:
1379            y = y - 2.0
1380            x = (((((((((((((-0.000045255659 * y
1381                             +0.000152529290) * y -0.000019538132) * y
1382                           -0.000676904986) * y +0.001390604284) * y
1383                         -0.000794620820) * y -0.002034254874) * y
1384                       +0.006549791214) * y -0.010557625006) * y
1385                     +0.011630447319) * y -0.009279453341) * y
1386                   +0.005353579108) * y -0.002141268741) * y
1387                 +0.000535310849) * y +0.999936657524
1388    if z > 0.0:
1389        prob = ((x+1.0)*0.5)
1390    else:
1391        prob = ((1.0-x)*0.5)
1392    return prob
1393
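# Familiar sanity checks (illustrative, not from the original source):
#
#     lzprob(0.0)                      # -> 0.5
#     lzprob(1.96)                     # -> ~0.975
#     2.0*(1.0 - lzprob(abs(1.96)))    # two-tailed -> ~0.05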
1394
1395def lksprob(alam):
1396    """
1397Computes a Kolmogorov-Smirnov test significance level.  Adapted from
1398Numerical Recipes.
1399
1400Usage:   lksprob(alam)
1401"""
1402    fac = 2.0
1403    sum = 0.0
1404    termbf = 0.0
1405    a2 = -2.0*alam*alam
1406    for j in range(1,201):
1407        term = fac*math.exp(a2*j*j)
1408        sum = sum + term
1409        if math.fabs(term) <= (0.001*termbf) or math.fabs(term) < (1.0e-8*sum):
1410            return sum
1411        fac = -fac
1412        termbf = math.fabs(term)
1413    return 1.0             # Get here only if fails to converge; was 0.0!!
1414
1415
1416def lfprob (dfnum, dfden, F):
1417    """
1418Returns the (1-tailed) significance level (p-value) of an F
1419statistic given the degrees of freedom for the numerator (dfR-dfF) and
1420the degrees of freedom for the denominator (dfF).
1421
1422Usage:   lfprob(dfnum, dfden, F)   where usually dfnum=dfbn, dfden=dfwn
1423"""
1424    p = betai(0.5*dfden, 0.5*dfnum, dfden/float(dfden+dfnum*F))
1425    return p
1426
1427
1428def lbetacf(a,b,x):
1429    """
1430This function evaluates the continued fraction form of the incomplete
1431Beta function, betai.  (Adapted from: Numerical Recipes in C.)
1432
1433Usage:   lbetacf(a,b,x)
1434"""
1435    ITMAX = 200
1436    EPS = 3.0e-7
1437
1438    bm = az = am = 1.0
1439    qab = a+b
1440    qap = a+1.0
1441    qam = a-1.0
1442    bz = 1.0-qab*x/qap
1443    for i in range(ITMAX+1):
1444        em = float(i+1)
1445        tem = em + em
1446        d = em*(b-em)*x/((qam+tem)*(a+tem))
1447        ap = az + d*am
1448        bp = bz+d*bm
1449        d = -(a+em)*(qab+em)*x/((qap+tem)*(a+tem))
1450        app = ap+d*az
1451        bpp = bp+d*bz
1452        aold = az
1453        am = ap/bpp
1454        bm = bp/bpp
1455        az = app/bpp
1456        bz = 1.0
1457        if (abs(az-aold)<(EPS*abs(az))):
1458            return az
1459    print 'a or b too big, or ITMAX too small in Betacf.'
1460
1461
1462def lgammln(xx):
1463    """
1464Returns the natural log of the gamma function of xx, where
1465    Gamma(z) = Integral(0,infinity) of t^(z-1)exp(-t) dt.
1466(Adapted from: Numerical Recipes in C.)
1467
1468Usage:   lgammln(xx)
1469"""
1470
1471    coeff = [76.18009173, -86.50532033, 24.01409822, -1.231739516,
1472             0.120858003e-2, -0.536382e-5]
1473    x = xx - 1.0
1474    tmp = x + 5.5
1475    tmp = tmp - (x+0.5)*math.log(tmp)
1476    ser = 1.0
1477    for j in range(len(coeff)):
1478        x = x + 1
1479        ser = ser + coeff[j]/x
1480    return -tmp + math.log(2.50662827465*ser)
1481
1482
1483def lbetai(a,b,x):
1484    """
1485Returns the incomplete beta function:
1486
1487    I-sub-x(a,b) = 1/B(a,b)*(Integral(0,x) of t^(a-1)(1-t)^(b-1) dt)
1488
1489where a,b>0 and B(a,b) = G(a)*G(b)/(G(a+b)) where G(a) is the gamma
1490function of a.  The continued fraction formulation is implemented here,
1491using the betacf function.  (Adapted from: Numerical Recipes in C.)
1492
1493Usage:   lbetai(a,b,x)
1494"""
1495    if (x<0.0 or x>1.0):
1496        raise ValueError, 'Bad x in lbetai'
1497    if (x==0.0 or x==1.0):
1498        bt = 0.0
1499    else:
1500        bt = math.exp(gammln(a+b)-gammln(a)-gammln(b)+a*math.log(x)+b*
1501                      math.log(1.0-x))
1502    if (x<(a+1.0)/(a+b+2.0)):
1503        return bt*betacf(a,b,x)/float(a)
1504    else:
1505        return 1.0-bt*betacf(b,a,1.0-x)/float(b)
1506
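# Illustrative checks (not part of the original module): I_x(1,1) is the
# identity and I_0.5(a,a) = 0.5 by symmetry, so up to the approximation error
# of gammln/betacf:
#
#     lbetai(1.0, 1.0, 0.75)    # -> ~0.75
#     lbetai(0.5, 0.5, 0.5)     # -> ~0.5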
1507
1508####################################
1509#######  ANOVA CALCULATIONS  #######
1510####################################
1511
1512def lF_oneway(*lists):
1513    """
1514Performs a 1-way ANOVA, returning an F-value and probability given
1515any number of groups.  From Heiman, pp.394-7.
1516
1517Usage:   F_oneway(*lists)    where *lists is any number of lists, one per
1518                                  treatment group
1519Returns: F value, one-tailed p-value
1520"""
1521    a = len(lists)           # ANOVA on 'a' groups, each in its own list
1522    means = [0]*a
1523    vars = [0]*a
1524    ns = [0]*a
1525    alldata = []
1526    # list-based helpers; the original body called N.array/amean/avar/ass here,
1527    # which requires NumPy even for this list ('l') version of the function
1528    means = map(mean,lists)
1529    vars = map(var,lists)
1530    ns = map(len,lists)
1531    for i in range(len(lists)):
1532        alldata = alldata + lists[i]
1533    bign = len(alldata)
1534    sstot = ss(alldata)-(square_of_sums(alldata)/float(bign))
1535    ssbn = 0
1536    for list in lists:
1537        ssbn = ssbn + square_of_sums(list)/float(len(list))
1538    ssbn = ssbn - (square_of_sums(alldata)/float(bign))
1539    sswn = sstot-ssbn
1540    dfbn = a-1
1541    dfwn = bign - a
1542    msb = ssbn/float(dfbn)
1543    msw = sswn/float(dfwn)
1544    f = msb/msw
1545    prob = fprob(dfbn,dfwn,f)
1546    return f, prob
1547
1548
1549def lF_value (ER,EF,dfnum,dfden):
1550    """
1551Returns an F-statistic given the following:
1552        ER  = error associated with the null hypothesis (the Restricted model)
1553        EF  = error associated with the alternate hypothesis (the Full model)
1554        dfR-dfF = degrees of freedom of the numerator
1555        dfF = degrees of freedom associated with the denominator/Full model
1556
1557Usage:   lF_value(ER,EF,dfnum,dfden)
1558"""
1559    return ((ER-EF)/float(dfnum) / (EF/float(dfden)))
1560
1561
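# Illustrative arithmetic (not from the original file):
#
#     lF_value(20.0, 10.0, 2, 10)    # ((20-10)/2) / (10/10) -> 5.0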
1562####################################
1563########  SUPPORT FUNCTIONS  #######
1564####################################
1565
1566def writecc (listoflists,file,writetype='w',extra=2):
1567    """
1568Writes a list of lists to the specified file in columns, each column sized
1569to the max size of the items in that column plus 'extra' characters
1570(default 2).  File-overwrite is the default writetype.
1571
1572Usage:   writecc (listoflists,file,writetype='w',extra=2)
1573Returns: None
1574"""
1575    if type(listoflists[0]) not in [ListType,TupleType]:
1576        listoflists = [listoflists]
1577    outfile = open(file,writetype)
1578    rowstokill = []
1579    list2print = copy.deepcopy(listoflists)
1580    for i in range(len(listoflists)):
1581        if listoflists[i] == ['\n'] or listoflists[i]=='\n' or listoflists[i]=='dashes':
1582            rowstokill = rowstokill + [i]
1583    rowstokill.reverse()
1584    for row in rowstokill:
1585        del list2print[row]
1586    maxsize = [0]*len(list2print[0])
1587    for col in range(len(list2print[0])):
1588        items = pstat.colex(list2print,col)
1589        items = map(pstat.makestr,items)
1590        maxsize[col] = max(map(len,items)) + extra
1591    for row in listoflists:
1592        if row == ['\n'] or row == '\n':
1593            outfile.write('\n')
1594        elif row == ['dashes'] or row == 'dashes':
1595            dashes = [0]*len(maxsize)
1596            for j in range(len(maxsize)):
1597                dashes[j] = '-'*(maxsize[j]-2)
1598            outfile.write(pstat.lineincustcols(dashes,maxsize))
1599        else:
1600            outfile.write(pstat.lineincustcols(row,maxsize))
1601        outfile.write('\n')
1602    outfile.close()
1603    return None
1604
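# Example sketch for writecc (the filename is hypothetical): each item is
# padded to the widest entry in its column, and rows equal to 'dashes' are
# written as separator lines.
#
#     >>> table = [['Name','N','Mean'], 'dashes', ['groupA','10','2.5']]
#     >>> writecc(table, 'summary.txt')          # overwrites summary.txt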
1605
1606def lincr(l,cap):        # to increment a list up to a max-list of 'cap'
1607    """
1608Simulate a counting system from an n-dimensional list.
1609
1610Usage:   lincr(l,cap)   l=list to increment, cap=max values for each list pos'n
1611Returns: next set of values for list l, OR -1 (if overflow)
1612"""
1613    l[0] = l[0] + 1     # e.g., [0,0,0] --> [2,4,3] (=cap)
1614    for i in range(len(l)):
1615        if l[i] > cap[i] and i < len(l)-1: # if carryover AND not done
1616            l[i] = 0
1617            l[i+1] = l[i+1] + 1
1618        elif l[i] > cap[i] and i == len(l)-1: # overflow past last column, must be finished
1619            l = -1
1620    return l
1621
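# Example sketch for lincr (sample values only): the list behaves like an
# odometer whose per-position maxima are given by cap, e.g.
# [0,0,0] -> [1,0,0] -> [2,0,0] -> [0,1,0] -> ... and finally -1 on overflow.
#
#     >>> lincr([2,0,0], [2,4,3])      # -> [0, 1, 0]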
1622
1623def lsum (inlist):
1624    """
1625Returns the sum of the items in the passed list.
1626
1627Usage:   lsum(inlist)
1628"""
1629    s = 0
1630    for item in inlist:
1631        s = s + item
1632    return s
1633
1634
1635def lcumsum (inlist):
1636    """
1637Returns a list consisting of the cumulative sum of the items in the
1638passed list.
1639
1640Usage:   lcumsum(inlist)
1641"""
1642    newlist = copy.deepcopy(inlist)
1643    for i in range(1,len(newlist)):
1644        newlist[i] = newlist[i] + newlist[i-1]
1645    return newlist
1646
1647
1648def lss(inlist):
1649    """
1650Squares each value in the passed list, adds up these squares and
1651returns the result.
1652
1653Usage:   lss(inlist)
1654"""
1655    ss = 0
1656    for item in inlist:
1657        ss = ss + item*item
1658    return ss
1659
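# Example sketch for the simple list helpers above (sample values only):
#
#     >>> lsum([1,2,3,4])              # -> 10
#     >>> lcumsum([1,2,3,4])           # -> [1, 3, 6, 10]
#     >>> lss([1,2,3])                 # 1+4+9 -> 14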
1660
1661def lsummult (list1,list2):
1662    """
1663Multiplies elements in list1 and list2, element by element, and
1664returns the sum of all resulting multiplications.  Must provide equal
1665length lists.
1666
1667Usage:   lsummult(list1,list2)
1668"""
1669    if len(list1) <> len(list2):
1670        raise ValueError, "Lists not equal length in summult."
1671    s = 0
1672    for item1,item2 in pstat.abut(list1,list2):
1673        s = s + item1*item2
1674    return s
1675
1676
1677def lsumdiffsquared(x,y):
1678    """
1679Takes pairwise differences of the values in lists x and y, squares
1680these differences, and returns the sum of these squares.
1681
1682Usage:   lsumdiffsquared(x,y)
1683Returns: sum[(x[i]-y[i])**2]
1684"""
1685    sds = 0
1686    for i in range(len(x)):
1687        sds = sds + (x[i]-y[i])**2
1688    return sds
1689
1690
1691def lsquare_of_sums(inlist):
1692    """
1693Adds the values in the passed list, squares the sum, and returns
1694the result.
1695
1696Usage:   lsquare_of_sums(inlist)
1697Returns: sum(inlist[i])**2
1698"""
1699    s = sum(inlist)
1700    return float(s)*s
1701
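# Example sketch for the pairwise/sum helpers above (sample values only):
#
#     >>> lsummult([1,2,3], [4,5,6])        # 4+10+18 -> 32
#     >>> lsumdiffsquared([1,2,3], [3,2,1]) # 4+0+4 -> 8
#     >>> lsquare_of_sums([1,2,3])          # (1+2+3)**2 -> 36.0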
1702
1703def lshellsort(inlist):
1704    """
1705Shellsort algorithm.  Sorts a 1D-list.
1706
1707Usage:   lshellsort(inlist)
1708Returns: sorted-inlist, sorting-index-vector (for original list)
1709"""
1710    n = len(inlist)
1711    svec = copy.deepcopy(inlist)
1712    ivec = range(n)
1713    gap = n/2   # integer division needed
1714    while gap >0:
1715        for i in range(gap,n):
1716            for j in range(i-gap,-1,-gap):
1717                while j>=0 and svec[j]>svec[j+gap]:
1718                    temp        = svec[j]
1719                    svec[j]     = svec[j+gap]
1720                    svec[j+gap] = temp
1721                    itemp       = ivec[j]
1722                    ivec[j]     = ivec[j+gap]
1723                    ivec[j+gap] = itemp
1724        gap = gap / 2  # integer division needed
1725# svec is now sorted inlist, and ivec has the order svec[i] = vec[ivec[i]]
1726    return svec, ivec
1727
1728
1729def lrankdata(inlist):
1730    """
1731Ranks the data in inlist, dealing with ties appropriately.  Assumes
1732a 1D inlist.  Adapted from Gary Perlman's |Stat ranksort.
1733
1734Usage:   lrankdata(inlist)
1735Returns: a list of length equal to inlist, containing rank scores
1736"""
1737    n = len(inlist)
1738    svec, ivec = shellsort(inlist)
1739    sumranks = 0
1740    dupcount = 0
1741    newlist = [0]*n
1742    for i in range(n):
1743        sumranks = sumranks + i
1744        dupcount = dupcount + 1
1745        if i==n-1 or svec[i] <> svec[i+1]:
1746            averank = sumranks / float(dupcount) + 1
1747            for j in range(i-dupcount+1,i+1):
1748                newlist[ivec[j]] = averank
1749            sumranks = 0
1750            dupcount = 0
1751    return newlist
1752
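# Example sketch for lshellsort/lrankdata (sample values only): the index
# vector satisfies svec[i] == inlist[ivec[i]], and tied values receive the
# average of the ranks they span.
#
#     >>> lshellsort([3,1,2])          # -> ([1, 2, 3], [1, 2, 0])
#     >>> lrankdata([3,1,4,1])         # -> [3.0, 1.5, 4.0, 1.5]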
1753
1754def outputpairedstats(fname,writemode,name1,n1,m1,se1,min1,max1,name2,n2,m2,se2,min2,max2,statname,stat,prob):
1755    """
1756Prints or writes to a file stats for two groups, using the name, n,
1757mean, sterr, min and max for each group, as well as the statistic name,
1758its value, and the associated p-value.
1759
1760Usage:   outputpairedstats(fname,writemode,
1761                           name1,n1,mean1,stderr1,min1,max1,
1762                           name2,n2,mean2,stderr2,min2,max2,
1763                           statname,stat,prob)
1764Returns: None
1765"""
1766    suffix = ''                       # for *s after the p-value
1767    try:
1768        x = prob.shape
1769        prob = prob[0]
1770    except:
1771        pass
1772    if  prob < 0.001:  suffix = '  ***'
1773    elif prob < 0.01:  suffix = '  **'
1774    elif prob < 0.05:  suffix = '  *'
1775    title = [['Name','N','Mean','SD','Min','Max']]
1776    lofl = title+[[name1,n1,round(m1,3),round(math.sqrt(se1),3),min1,max1],
1777                  [name2,n2,round(m2,3),round(math.sqrt(se2),3),min2,max2]]
1778    if type(fname)<>StringType or len(fname)==0:
1779        print
1780        print statname
1781        print
1782        pstat.printcc(lofl)
1783        print
1784        try:
1785            if stat.shape == ():
1786                stat = stat[0]
1787            if prob.shape == ():
1788                prob = prob[0]
1789        except:
1790            pass
1791        print 'Test statistic = ',round(stat,3),'   p = ',round(prob,3),suffix
1792        print
1793    else:
1794        file = open(fname,writemode)
1795        file.write('\n'+statname+'\n\n')
1796        file.close()
1797        writecc(lofl,fname,'a')
1798        file = open(fname,'a')
1799        try:
1800            if stat.shape == ():
1801                stat = stat[0]
1802            if prob.shape == ():
1803                prob = prob[0]
1804        except:
1805            pass
1806        file.write(pstat.list2string(['\nTest statistic = ',round(stat,4),'   p = ',round(prob,4),suffix,'\n\n']))
1807        file.close()
1808    return None
1809
1810
1811def lfindwithin (data):
1812    """
1813Returns an integer representing a binary vector, where 1=within-
1814subject factor, 0=between.  Input equals the entire data 2D list (i.e.,
1815column 0=random factor, column -1=measured values; those two columns are skipped).
1816Note: input data is in |Stat format ... a list of lists ("2D list") with
1817one row per measured value, first column=subject identifier, last column=
1818score, one in-between column per factor (these columns contain level
1819designations on each factor).  See also stats.anova.__doc__.
1820
1821Usage:   lfindwithin(data)     data in |Stat format
1822"""
1823
1824    numfact = len(data[0])-1
1825    withinvec = 0
1826    for col in range(1,numfact):
1827        examplelevel = pstat.unique(pstat.colex(data,col))[0]
1828        rows = pstat.linexand(data,col,examplelevel)  # get 1 level of this factor
1829        factsubjs = pstat.unique(pstat.colex(rows,0))
1830        allsubjs = pstat.unique(pstat.colex(data,0))
1831        if len(factsubjs) == len(allsubjs):  # every subject has a score at this level => within-subjects factor
1832            withinvec = withinvec + (1 << col)
1833    return withinvec
1834
1835
1836#########################################################
1837#########################################################
1838####### DISPATCH LISTS AND TUPLES TO ABOVE FCNS #########
1839#########################################################
1840#########################################################
1841
1842## CENTRAL TENDENCY:
1843geometricmean = Dispatch ( (lgeometricmean, (ListType, TupleType)), )
1844harmonicmean = Dispatch ( (lharmonicmean, (ListType, TupleType)), )
1845mean = Dispatch ( (lmean, (ListType, TupleType)), )
1846median = Dispatch ( (lmedian, (ListType, TupleType)), )
1847medianscore = Dispatch ( (lmedianscore, (ListType, TupleType)), )
1848mode = Dispatch ( (lmode, (ListType, TupleType)), )
1849
1850## MOMENTS:
1851moment = Dispatch ( (lmoment, (ListType, TupleType)), )
1852variation = Dispatch ( (lvariation, (ListType, TupleType)), )
1853skew = Dispatch ( (lskew, (ListType, TupleType)), )
1854kurtosis = Dispatch ( (lkurtosis, (ListType, TupleType)), )
1855describe = Dispatch ( (ldescribe, (ListType, TupleType)), )
1856
1857## FREQUENCY STATISTICS:
1858itemfreq = Dispatch ( (litemfreq, (ListType, TupleType)), )
1859scoreatpercentile = Dispatch ( (lscoreatpercentile, (ListType, TupleType)), )
1860percentileofscore = Dispatch ( (lpercentileofscore, (ListType, TupleType)), )
1861histogram = Dispatch ( (lhistogram, (ListType, TupleType)), )
1862cumfreq = Dispatch ( (lcumfreq, (ListType, TupleType)), )
1863relfreq = Dispatch ( (lrelfreq, (ListType, TupleType)), )
1864
1865## VARIABILITY:
1866obrientransform = Dispatch ( (lobrientransform, (ListType, TupleType)), )
1867samplevar = Dispatch ( (lsamplevar, (ListType, TupleType)), )
1868samplestdev = Dispatch ( (lsamplestdev, (ListType, TupleType)), )
1869var = Dispatch ( (lvar, (ListType, TupleType)), )
1870stdev = Dispatch ( (lstdev, (ListType, TupleType)), )
1871sterr = Dispatch ( (lsterr, (ListType, TupleType)), )
1872sem = Dispatch ( (lsem, (ListType, TupleType)), )
1873z = Dispatch ( (lz, (ListType, TupleType)), )
1874zs = Dispatch ( (lzs, (ListType, TupleType)), )
1875
1876## TRIMMING FCNS:
1877trimboth = Dispatch ( (ltrimboth, (ListType, TupleType)), )
1878trim1 = Dispatch ( (ltrim1, (ListType, TupleType)), )
1879
1880## CORRELATION FCNS:
1881paired = Dispatch ( (lpaired, (ListType, TupleType)), )
1882pearsonr = Dispatch ( (lpearsonr, (ListType, TupleType)), )
1883spearmanr = Dispatch ( (lspearmanr, (ListType, TupleType)), )
1884pointbiserialr = Dispatch ( (lpointbiserialr, (ListType, TupleType)), )
1885kendalltau = Dispatch ( (lkendalltau, (ListType, TupleType)), )
1886linregress = Dispatch ( (llinregress, (ListType, TupleType)), )
1887
1888## INFERENTIAL STATS:
1889ttest_1samp = Dispatch ( (lttest_1samp, (ListType, TupleType)), )
1890ttest_ind = Dispatch ( (lttest_ind, (ListType, TupleType)), )
1891ttest_rel = Dispatch ( (lttest_rel, (ListType, TupleType)), )
1892chisquare = Dispatch ( (lchisquare, (ListType, TupleType)), )
1893ks_2samp = Dispatch ( (lks_2samp, (ListType, TupleType)), )
1894mannwhitneyu = Dispatch ( (lmannwhitneyu, (ListType, TupleType)), )
1895ranksums = Dispatch ( (lranksums, (ListType, TupleType)), )
1896tiecorrect = Dispatch ( (ltiecorrect, (ListType, TupleType)), )
1897wilcoxont = Dispatch ( (lwilcoxont, (ListType, TupleType)), )
1898kruskalwallish = Dispatch ( (lkruskalwallish, (ListType, TupleType)), )
1899friedmanchisquare = Dispatch ( (lfriedmanchisquare, (ListType, TupleType)), )
1900
1901## PROBABILITY CALCS:
1902chisqprob = Dispatch ( (lchisqprob, (IntType, FloatType)), )
1903zprob = Dispatch ( (lzprob, (IntType, FloatType)), )
1904ksprob = Dispatch ( (lksprob, (IntType, FloatType)), )
1905fprob = Dispatch ( (lfprob, (IntType, FloatType)), )
1906betacf = Dispatch ( (lbetacf, (IntType, FloatType)), )
1907betai = Dispatch ( (lbetai, (IntType, FloatType)), )
1908erfcc = Dispatch ( (lerfcc, (IntType, FloatType)), )
1909gammln = Dispatch ( (lgammln, (IntType, FloatType)), )
1910
1911## ANOVA FUNCTIONS:
1912F_oneway = Dispatch ( (lF_oneway, (ListType, TupleType)), )
1913F_value = Dispatch ( (lF_value, (ListType, TupleType)), )
1914
1915## SUPPORT FUNCTIONS:
1916incr = Dispatch ( (lincr, (ListType, TupleType)), )
1917sum = Dispatch ( (lsum, (ListType, TupleType)), )
1918cumsum = Dispatch ( (lcumsum, (ListType, TupleType)), )
1919ss = Dispatch ( (lss, (ListType, TupleType)), )
1920summult = Dispatch ( (lsummult, (ListType, TupleType)), )
1921square_of_sums = Dispatch ( (lsquare_of_sums, (ListType, TupleType)), )
1922sumdiffsquared = Dispatch ( (lsumdiffsquared, (ListType, TupleType)), )
1923shellsort = Dispatch ( (lshellsort, (ListType, TupleType)), )
1924rankdata = Dispatch ( (lrankdata, (ListType, TupleType)), )
1925findwithin = Dispatch ( (lfindwithin, (ListType, TupleType)), )
1926
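# Example sketch of the dispatchers defined above: calling an unprefixed name
# with a list or tuple routes to the 'l'-prefixed implementation; the
# 'a'-prefixed array implementations defined below are added to these same
# dispatchers near the end of the module, once Numeric imports successfully.
#
#     >>> mean([1, 2, 3, 4])           # dispatches to lmean -> 2.5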
1927
1928#=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
1947
1948try:                         # DEFINE THESE *ONLY* IF NUMERIC IS AVAILABLE
1949 import Numeric
1950 N = Numeric
1951 import LinearAlgebra
1952 LA = LinearAlgebra
1953
1954
1955#####################################
1956########  ACENTRAL TENDENCY  ########
1957#####################################
1958
1959 def ageometricmean (inarray,dimension=None,keepdims=0):
1960    """
1961Calculates the geometric mean of the values in the passed array.
1962That is:  n-th root of (x1 * x2 * ... * xn).  Defaults to ALL values in
1963the passed array.  Use dimension=None to flatten array first.  REMEMBER: if
1964dimension=0, it collapses over dimension 0 ('rows' in a 2D array) only, and
1965if dimension is a sequence, it collapses over all specified dimensions.  If
1966keepdims is set to 1, the resulting array will have as many dimensions as
1967inarray, with only 1 'level' per dim that was collapsed over.
1968
1969Usage:   ageometricmean(inarray,dimension=None,keepdims=0)
1970Returns: geometric mean computed over dim(s) listed in dimension
1971"""
1972    inarray = N.array(inarray,N.Float)
1973    if dimension == None:
1974        inarray = N.ravel(inarray)
1975        size = len(inarray)
1976        mult = N.power(inarray,1.0/size)
1977        mult = N.multiply.reduce(mult)
1978    elif type(dimension) in [IntType,FloatType]:
1979        size = inarray.shape[dimension]
1980        mult = N.power(inarray,1.0/size)
1981        mult = N.multiply.reduce(mult,dimension)
1982        if keepdims == 1:
1983            shp = list(inarray.shape)
1984            shp[dimension] = 1
1985            mult = N.reshape(mult,shp)
1986    else: # must be a SEQUENCE of dims to average over
1987        dims = list(dimension)
1988        dims.sort()
1989        dims.reverse()
1990        size = N.array(N.multiply.reduce(N.take(inarray.shape,dims)),N.Float)
1991        mult = N.power(inarray,1.0/size)
1992        for dim in dims:
1993            mult = N.multiply.reduce(mult,dim)
1994        if keepdims == 1:
1995            shp = list(inarray.shape)
1996            for dim in dims:
1997                shp[dim] = 1
1998            mult = N.reshape(mult,shp)
1999    return mult
2000
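# Example sketch for ageometricmean (assumes Numeric is importable as N, as in
# the surrounding code; sample values only): the geometric mean of 2 and 8 is
# sqrt(2*8) = 4.
#
#     >>> ageometricmean(N.array([2.0, 8.0]))    # ~4.0, up to rounding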
2001
2002 def aharmonicmean (inarray,dimension=None,keepdims=0):
2003    """
2004Calculates the harmonic mean of the values in the passed array.
2005That is:  n / (1/x1 + 1/x2 + ... + 1/xn).  Defaults to ALL values in
2006the passed array.  Use dimension=None to flatten array first.  REMEMBER: if
2007dimension=0, it collapses over dimension 0 ('rows' in a 2D array) only, and
2008if dimension is a sequence, it collapses over all specified dimensions.  If
2009keepdims is set to 1, the resulting array will have as many dimensions as
2010inarray, with only 1 'level' per dim that was collapsed over.
2011
2012Usage:   aharmonicmean(inarray,dimension=None,keepdims=0)
2013Returns: harmonic mean computed over dim(s) in dimension
2014"""
2015    inarray = inarray.astype(N.Float)
2016    if dimension == None:
2017        inarray = N.ravel(inarray)
2018        size = len(inarray)
2019        s = N.add.reduce(1.0 / inarray)
2020    elif type(dimension) in [IntType,FloatType]:
2021        size = float(inarray.shape[dimension])
2022        s = N.add.reduce(1.0/inarray, dimension)
2023        if keepdims == 1:
2024            shp = list(inarray.shape)
2025            shp[dimension] = 1
2026            s = N.reshape(s,shp)
2027    else: # must be a SEQUENCE of dims to average over
2028        dims = list(dimension)
2029        dims.sort()
2030        nondims = []
2031        for i in range(len(inarray.shape)):
2032            if i not in dims:
2033                nondims.append(i)
2034        tinarray = N.transpose(inarray,nondims+dims) # put keep-dims first
2035        idx = [0] *len(nondims)
2036        if idx == []:
2037            size = len(N.ravel(inarray))
2038            s = asum(1.0 / inarray)
2039            if keepdims == 1:
2040                s = N.reshape([s],N.ones(len(inarray.shape)))
2041        else:
2042            idx[0] = -1
2043            loopcap = N.array(tinarray.shape[0:len(nondims)]) -1
2044            s = N.zeros(loopcap+1,N.Float)
2045            while incr(idx,loopcap) <> -1:
2046                s[idx] = asum(1.0/tinarray[idx])
2047            size = N.multiply.reduce(N.take(inarray.shape,dims))
2048            if keepdims == 1:
2049                shp = list(inarray.shape)
2050                for dim in dims:
2051                    shp[dim] = 1
2052                s = N.reshape(s,shp)
2053    return size / s
2054
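# Example sketch for aharmonicmean (sample values only): for 1, 2 and 4 the
# harmonic mean is 3 / (1/1 + 1/2 + 1/4) = 12/7.
#
#     >>> aharmonicmean(N.array([1.0, 2.0, 4.0]))   # ~1.7143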
2055
2056 def amean (inarray,dimension=None,keepdims=0):
2057    """
2058Calculates the arithmetic mean of the values in the passed array.
2059That is:  1/n * (x1 + x2 + ... + xn).  Defaults to ALL values in the
2060passed array.  Use dimension=None to flatten array first.  REMEMBER: if
2061dimension=0, it collapses over dimension 0 ('rows' in a 2D array) only, and
2062if dimension is a sequence, it collapses over all specified dimensions.  If
2063keepdims is set to 1, the resulting array will have as many dimensions as
2064inarray, with only 1 'level' per dim that was collapsed over.
2065
2066Usage:   amean(inarray,dimension=None,keepdims=0)
2067Returns: arithmetic mean calculated over dim(s) in dimension
2068"""
2069    if inarray.typecode() in ['l','s','b']:
2070        inarray = inarray.astype(N.Float)
2071    if dimension == None:
2072        inarray = N.ravel(inarray)
2073        sum = N.add.reduce(inarray)
2074        denom = float(len(inarray))
2075    elif type(dimension) in [IntType,FloatType]:
2076        sum = asum(inarray,dimension)
2077        denom = float(inarray.shape[dimension])
2078        if keepdims == 1:
2079            shp = list(inarray.shape)
2080            shp[dimension] = 1
2081            sum = N.reshape(sum,shp)
2082    else: # must be a TUPLE of dims to average over
2083        dims = list(dimension)
2084        dims.sort()
2085        dims.reverse()
2086        sum = inarray *1.0
2087        for dim in dims:
2088            sum = N.add.reduce(sum,dim)
2089        denom = N.array(N.multiply.reduce(N.take(inarray.shape,dims)),N.Float)
2090        if keepdims == 1:
2091            shp = list(inarray.shape)
2092            for dim in dims:
2093                shp[dim] = 1
2094            sum = N.reshape(sum,shp)
2095    return sum/denom
2096
2097
2098 def amedian (inarray,numbins=1000):
2099    """
2100Calculates the COMPUTED median value of an array of numbers, given the
2101number of bins to use for the histogram (more bins approaches finding the
2102precise median value of the array; default number of bins = 1000).  From
2103G.W. Heiman's Basic Stats, or CRC Probability & Statistics.
2104NOTE:  THIS ROUTINE ALWAYS uses the entire passed array (flattens it first).
2105
2106Usage:   amedian(inarray,numbins=1000)
2107Returns: median calculated over ALL values in inarray
2108"""
2109    inarray = N.ravel(inarray)
2110    (hist, smallest, binsize, extras) = ahistogram(inarray,numbins)
2111    cumhist = N.cumsum(hist)            # make cumulative histogram
2112    otherbins = N.greater_equal(cumhist,len(inarray)/2.0)
2113    otherbins = list(otherbins)         # list of 0/1s, 1s start at median bin
2114    cfbin = otherbins.index(1)                # get 1st(!) index holding 50%ile score
2115    LRL = smallest + binsize*cfbin        # get lower read limit of that bin
2116    cfbelow = N.add.reduce(hist[0:cfbin])        # cum. freq. below bin
2117    freq = hist[cfbin]                        # frequency IN the 50%ile bin
2118    median = LRL + ((len(inarray)/2.0-cfbelow)/float(freq))*binsize # MEDIAN
2119    return median
2120
2121
2122 def amedianscore (inarray,dimension=None):
2123    """
2124Returns the 'middle' score of the passed array.  If there is an even
2125number of scores, the mean of the 2 middle scores is returned.  Can function
2126with 1D arrays, or on the FIRST dimension of 2D arrays (i.e., dimension can
2127be None, to pre-flatten the array, or else dimension must equal 0).
2128
2129Usage:   amedianscore(inarray,dimension=None)
2130Returns: 'middle' score of the array, or the mean of the 2 middle scores
2131"""
2132    if dimension == None:
2133        inarray = N.ravel(inarray)
2134        dimension = 0
2135    inarray = N.sort(inarray,dimension)
2136    if inarray.shape[dimension] % 2 == 0:   # if even number of elements
2137        indx = inarray.shape[dimension]/2   # integer division correct
2138        median = N.asarray(inarray[indx]+inarray[indx-1]) / 2.0
2139    else:
2140        indx = inarray.shape[dimension] / 2 # integer division correct
2141        median = N.take(inarray,[indx],dimension)
2142        if median.shape == (1,):
2143            median = median[0]
2144    return median
2145
2146
2147 def amode(a, dimension=None):
2148    """
2149Returns an array of the modal (most common) score in the passed array.
2150If there is more than one such score, ONLY THE FIRST is returned.
2151The bin-count for the modal values is also returned.  Operates on whole
2152array (dimension=None), or on a given dimension.
2153
2154Usage:   amode(a, dimension=None)
2155Returns: array of bin-counts for mode(s), array of corresponding modal values
2156"""
2157
2158    if dimension == None:
2159        a = N.ravel(a)
2160        dimension = 0
2161    scores = pstat.aunique(N.ravel(a))       # get ALL unique values
2162    testshape = list(a.shape)
2163    testshape[dimension] = 1
2164    oldmostfreq = N.zeros(testshape)
2165    oldcounts = N.zeros(testshape)
2166    for score in scores:
2167        template = N.equal(a,score)
2168        counts = asum(template,dimension,1)
2169        mostfrequent = N.where(N.greater(counts,oldcounts),score,oldmostfreq)
2170        oldcounts = N.where(N.greater(counts,oldcounts),counts,oldcounts)
2171        oldmostfreq = mostfrequent
2172    return oldcounts, mostfrequent
2173
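# Example sketch for amode (sample values only): 2 occurs most often (twice),
# so the returned count and modal-value arrays each hold a single element.
#
#     >>> counts, modes = amode(N.array([1, 2, 2, 3]))   # counts ~[2], modes ~[2]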
2174
2175 def atmean(a,limits=None,inclusive=(1,1)):
2176     """
2177Returns the arithmetic mean of all values in an array, ignoring values
2178strictly outside the sequence passed to 'limits'.   Note: either limit
2179in the sequence, or the value of limits itself, can be set to None.  The
2180inclusive list/tuple determines whether the lower and upper limiting bounds
2181(respectively) are open/exclusive (0) or closed/inclusive (1).
2182
2183Usage:   atmean(a,limits=None,inclusive=(1,1))
2184"""
2185     if a.typecode() in ['l','s','b']:
2186         a = a.astype(N.Float)
2187     if limits == None:
2188         return mean(a)
2189     assert type(limits) in [ListType,TupleType,N.ArrayType], "Wrong type for limits in atmean"
2190     if inclusive[0]:         lowerfcn = N.greater_equal
2191     else:               lowerfcn = N.greater
2192     if inclusive[1]:         upperfcn = N.less_equal
2193     else:               upperfcn = N.less
2194     if limits[0] > N.maximum.reduce(N.ravel(a)) or limits[1] < N.minimum.reduce(N.ravel(a)):
2195         raise ValueError, "No array values within given limits (atmean)."
2196     elif limits[0]==None and limits[1]<>None:
2197         mask = upperfcn(a,limits[1])
2198     elif limits[0]<>None and limits[1]==None:
2199         mask = lowerfcn(a,limits[0])
2200     elif limits[0]<>None and limits[1]<>None:
2201         mask = lowerfcn(a,limits[0])*upperfcn(a,limits[1])
2202     s = float(N.add.reduce(N.ravel(a*mask)))
2203     n = float(N.add.reduce(N.ravel(mask)))
2204     return s/n
2205
2206
2207 def atvar(a,limits=None,inclusive=(1,1)):
2208     """
2209Returns the sample variance of values in an array, (i.e., using N-1),
2210ignoring values strictly outside the sequence passed to 'limits'. 
2211Note: either limit in the sequence, or the value of limits itself,
2212can be set to None.  The inclusive list/tuple determines whether the lower
2213and upper limiting bounds (respectively) are open/exclusive (0) or
2214closed/inclusive (1).
2215
2216Usage:   atvar(a,limits=None,inclusive=(1,1))
2217"""
2218     a = a.astype(N.Float)
2219     if limits == None or limits == [None,None]:
2220         term1 = N.add.reduce(N.ravel(a*a))
2221         n = float(len(N.ravel(a)))
2222         term2 = N.add.reduce(N.ravel(a))**2 / n
2224         return (term1 - term2) / (n - 1)
2225     assert type(limits) in [ListType,TupleType,N.ArrayType], "Wrong type for limits in atvar"
2226     if inclusive[0]:         lowerfcn = N.greater_equal
2227     else:               lowerfcn = N.greater
2228     if inclusive[1]:         upperfcn = N.less_equal
2229     else:               upperfcn = N.less
2230     if limits[0] > N.maximum.reduce(N.ravel(a)) or limits[1] < N.minimum.reduce(N.ravel(a)):
2231         raise ValueError, "No array values within given limits (atvar)."
2232     elif limits[0]==None and limits[1]<>None:
2233         mask = upperfcn(a,limits[1])
2234     elif limits[0]<>None and limits[1]==None:
2235         mask = lowerfcn(a,limits[0])
2236     elif limits[0]<>None and limits[1]<>None:
2237         mask = lowerfcn(a,limits[0])*upperfcn(a,limits[1])
2238     term1 = N.add.reduce(N.ravel(a*a*mask))
2239     n = float(N.add.reduce(N.ravel(mask)))
2240     term2 = N.add.reduce(N.ravel(a*mask))**2 / n
2242     return (term1 - term2) / (n - 1)
2243
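# Example sketch for the truncated statistics above (sample values only): with
# limits=(None, 10) the value 100 is ignored, leaving 1, 2 and 3, whose mean is
# 2.0 and whose N-1 variance is 1.0.
#
#     >>> a = N.array([1.0, 2.0, 3.0, 100.0])
#     >>> atmean(a, limits=(None, 10))     # ~2.0
#     >>> atvar(a, limits=(None, 10))      # ~1.0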
2244
2245 def atmin(a,lowerlimit=None,dimension=None,inclusive=1):
2246     """
2247Returns the minimum value of a, along dimension, including only values greater
2248than (or equal to, if inclusive=1) lowerlimit.  If the limit is set to None,
2249all values in the array are used.
2250
2251Usage:   atmin(a,lowerlimit=None,dimension=None,inclusive=1)
2252"""
2253     if inclusive:         lowerfcn = N.greater_equal
2254     else:               lowerfcn = N.greater
2255     if dimension == None:
2256         a = N.ravel(a)
2257         dimension = 0
2258     if lowerlimit == None:
2259         lowerlimit = N.minimum.reduce(N.ravel(a))-11
2260     biggest = N.maximum.reduce(N.ravel(a))
2261     ta = N.where(lowerfcn(a,lowerlimit),a,biggest)
2262     return N.minimum.reduce(ta,dimension)
2263
2264
2265 def atmax(a,upperlimit,dimension=None,inclusive=1):
2266     """
2267Returns the maximum value of a, along dimension, including only values less
2268than (or equal to, if inclusive=1) upperlimit.  If the limit is set to None,
2269a limit larger than the max value in the array is used.
2270
2271Usage:   atmax(a,upperlimit,dimension=None,inclusive=1)
2272"""
2273     if inclusive:         upperfcn = N.less_equal
2274     else:               upperfcn = N.less
2275     if dimension == None:
2276         a = N.ravel(a)
2277         dimension = 0
2278     if upperlimit == None:
2279         upperlimit = N.maximum.reduce(N.ravel(a))+1
2280     smallest = N.minimum.reduce(N.ravel(a))
2281     ta = N.where(upperfcn(a,upperlimit),a,smallest)
2282     return N.maximum.reduce(ta,dimension)
2283
2284
2285 def atstdev(a,limits=None,inclusive=(1,1)):
2286     """
2287Returns the standard deviation of all values in an array, ignoring values
2288strictly outside the sequence passed to 'limits'.   Note: either limit
2289in the sequence, or the value of limits itself, can be set to None.  The
2290inclusive list/tuple determines whether the lower and upper limiting bounds
2291(respectively) are open/exclusive (0) or closed/inclusive (1).
2292
2293Usage:   atstdev(a,limits=None,inclusive=(1,1))
2294"""
2295     return N.sqrt(tvar(a,limits,inclusive))
2296
2297
2298 def atsem(a,limits=None,inclusive=(1,1)):
2299     """
2300Returns the standard error of the mean for the values in an array,
2301(i.e., using N for the denominator), ignoring values strictly outside
2302the sequence passed to 'limits'.   Note: either limit in the sequence,
2303or the value of limits itself, can be set to None.  The inclusive list/tuple
2304determines whether the lower and upper limiting bounds (respectively) are
2305open/exclusive (0) or closed/inclusive (1).
2306
2307Usage:   atsem(a,limits=None,inclusive=(1,1))
2308"""
2309     sd = tstdev(a,limits,inclusive)
2310     if limits == None or limits == [None,None]:
2311         n = float(len(N.ravel(a)))
             return sd/math.sqrt(n)
2312     assert type(limits) in [ListType,TupleType,N.ArrayType], "Wrong type for limits in atsem"
2313     if inclusive[0]:         lowerfcn = N.greater_equal
2314     else:               lowerfcn = N.greater
2315     if inclusive[1]:         upperfcn = N.less_equal
2316     else:               upperfcn = N.less
2317     if limits[0] > N.maximum.reduce(N.ravel(a)) or limits[1] < N.minimum.reduce(N.ravel(a)):
2318         raise ValueError, "No array values within given limits (atsem)."
2319     elif limits[0]==None and limits[1]<>None:
2320         mask = upperfcn(a,limits[1])
2321     elif limits[0]<>None and limits[1]==None:
2322         mask = lowerfcn(a,limits[0])
2323     elif limits[0]<>None and limits[1]<>None:
2324         mask = lowerfcn(a,limits[0])*upperfcn(a,limits[1])
2325     term1 = N.add.reduce(N.ravel(a*a*mask))
2326     n = float(N.add.reduce(N.ravel(mask)))
2327     return sd/math.sqrt(n)
2328
2329
2330#####################################
2331############  AMOMENTS  #############
2332#####################################
2333
2334 def amoment(a,moment=1,dimension=None):
2335    """
2336Calculates the nth moment about the mean for a sample (defaults to the
23371st moment).  Generally used to calculate coefficients of skewness and
2338kurtosis.  Dimension can equal None (ravel array first), an integer
2339(the dimension over which to operate), or a sequence (operate over
2340multiple dimensions).
2341
2342Usage:   amoment(a,moment=1,dimension=None)
2343Returns: appropriate moment along given dimension
2344"""
2345    if dimension == None:
2346        a = N.ravel(a)
2347        dimension = 0
2348    if moment == 1:
2349        return 0.0
2350    else:
2351        mn = amean(a,dimension,1)  # 1=keepdims
2352        s = N.power((a-mn),moment)
2353        return amean(s,dimension)
2354
2355
2356 def avariation(a,dimension=None):
2357    """
2358Returns the coefficient of variation, as defined in CRC Standard
2359Probability and Statistics, p.6. Dimension can equal None (ravel array
2360first), an integer (the dimension over which to operate), or a
2361sequence (operate over multiple dimensions).
2362
2363Usage:   avariation(a,dimension=None)
2364"""
2365    return 100.0*asamplestdev(a,dimension)/amean(a,dimension)
2366
2367
2368 def askew(a,dimension=None): 
2369    """
2370Returns the skewness of a distribution (normal ==> 0.0; >0 means extra
2371weight in the right tail).  Use askewtest() to see if it's close enough.
2372Dimension can equal None (ravel array first), an integer (the
2373dimension over which to operate), or a sequence (operate over multiple
2374dimensions).
2375
2376Usage:   askew(a, dimension=None)
2377Returns: skew of vals in a along dimension, returning ZERO where all vals equal
2378"""
2379    denom = N.power(amoment(a,2,dimension),1.5)
2380    zero = N.equal(denom,0)
2381    if type(denom) == N.ArrayType and asum(zero) <> 0:
2382        print "Number of zeros in askew: ",asum(zero)
2383    denom = denom + zero  # prevent divide-by-zero
2384    return N.where(zero, 0, amoment(a,3,dimension)/denom)
2385
2386
2387 def akurtosis(a,dimension=None):
2388    """
2389Returns the kurtosis of a distribution (normal ==> 3.0; >3 means
2390heavier in the tails, and usually more peaked).  Use akurtosistest()
2391to see if it's close enough.  Dimension can equal None (ravel array
2392first), an integer (the dimension over which to operate), or a
2393sequence (operate over multiple dimensions).
2394
2395Usage:   akurtosis(a,dimension=None)
2396Returns: kurtosis of values in a along dimension, and ZERO where all vals equal
2397"""
2398    denom = N.power(amoment(a,2,dimension),2)
2399    zero = N.equal(denom,0)
2400    if type(denom) == N.ArrayType and asum(zero) <> 0:
2401        print "Number of zeros in akurtosis: ",asum(zero)
2402    denom = denom + zero  # prevent divide-by-zero
2403    return N.where(zero,0,amoment(a,4,dimension)/denom)
2404
2405
2406 def adescribe(inarray,dimension=None):
2407     """
2408Returns several descriptive statistics of the passed array.  Dimension
2409can equal None (ravel array first), an integer (the dimension over
2410which to operate), or a sequence (operate over multiple dimensions).
2411
2412Usage:   adescribe(inarray,dimension=None)
2413Returns: n, (min,max), mean, standard deviation, skew, kurtosis
2414"""
2415     if dimension == None:
2416         inarray = N.ravel(inarray)
2417         dimension = 0
2418     n = inarray.shape[dimension]
2419     mm = (N.minimum.reduce(inarray),N.maximum.reduce(inarray))
2420     m = amean(inarray,dimension)
2421     sd = astdev(inarray,dimension)
2422     skew = askew(inarray,dimension)
2423     kurt = akurtosis(inarray,dimension)
2424     return n, mm, m, sd, skew, kurt
2425
2426
2427#####################################
2428########  NORMALITY TESTS  ##########
2429#####################################
2430
2431 def askewtest(a,dimension=None):
2432    """
2433Tests whether the skew is significantly different from a normal
2434distribution.  Dimension can equal None (ravel array first), an
2435integer (the dimension over which to operate), or a sequence (operate
2436over multiple dimensions).
2437
2438Usage:   askewtest(a,dimension=None)
2439Returns: z-score and 2-tail z-probability
2440"""
2441    if dimension == None:
2442        a = N.ravel(a)
2443        dimension = 0
2444    b2 = askew(a,dimension)
2445    n = float(a.shape[dimension])
2446    y = b2 * N.sqrt(((n+1)*(n+3)) / (6.0*(n-2)) )
2447    beta2 = ( 3.0*(n*n+27*n-70)*(n+1)*(n+3) ) / ( (n-2.0)*(n+5)*(n+7)*(n+9) )
2448    W2 = -1 + N.sqrt(2*(beta2-1))
2449    delta = 1/N.sqrt(N.log(N.sqrt(W2)))
2450    alpha = N.sqrt(2/(W2-1))
2451    y = N.where(N.equal(y,0),1,y)
2452    Z = delta*N.log(y/alpha + N.sqrt((y/alpha)**2+1))
2453    return Z, (1.0-zprob(Z))*2
2454
2455
2456 def akurtosistest(a,dimension=None):
2457    """
2458Tests whether a dataset has normal kurtosis (i.e.,
2459kurtosis=3(n-1)/(n+1)) Valid only for n>20.  Dimension can equal None
2460(ravel array first), an integer (the dimension over which to operate),
2461or a sequence (operate over multiple dimensions).
2462
2463Usage:   akurtosistest(a,dimension=None)
2464Returns: z-score and 2-tail z-probability, returns 0 for bad pixels
2465"""
2466    if dimension == None:
2467        a = N.ravel(a)
2468        dimension = 0
2469    n = float(a.shape[dimension])
2470    if n<20:
2471        print "akurtosistest only valid for n>=20 ... continuing anyway, n=",n
2472    b2 = akurtosis(a,dimension)
2473    E = 3.0*(n-1) /(n+1)
2474    varb2 = 24.0*n*(n-2)*(n-3) / ((n+1)*(n+1)*(n+3)*(n+5))
2475    x = (b2-E)/N.sqrt(varb2)
2476    sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * N.sqrt((6.0*(n+3)*(n+5))/
2477                                                       (n*(n-2)*(n-3)))
2478    A = 6.0 + 8.0/sqrtbeta1 *(2.0/sqrtbeta1 + N.sqrt(1+4.0/(sqrtbeta1**2)))
2479    term1 = 1 -2/(9.0*A)
2480    denom = 1 +x*N.sqrt(2/(A-4.0))
2481    denom = N.where(N.less(denom,0), 99, denom)
2482    term2 = N.where(N.equal(denom,0), term1, N.power((1-2.0/A)/denom,1/3.0))
2483    Z = ( term1 - term2 ) / N.sqrt(2/(9.0*A))
2484    Z = N.where(N.equal(denom,99), 0, Z)
2485    return Z, (1.0-zprob(Z))*2
2486
2487
2488 def anormaltest(a,dimension=None):
2489    """
2490Tests whether skew and/OR kurtosis of dataset differs from normal
2491curve.  Can operate over multiple dimensions.  Dimension can equal
2492None (ravel array first), an integer (the dimension over which to
2493operate), or a sequence (operate over multiple dimensions).
2494
2495Usage:   anormaltest(a,dimension=None)
2496Returns: chi-square statistic (sum of squared skew and kurtosis z-scores) and 2-tail probability
2497"""
2498    if dimension == None:
2499        a = N.ravel(a)
2500        dimension = 0
2501    s,p = askewtest(a,dimension)
2502    k,p = akurtosistest(a,dimension)
2503    k2 = N.power(s,2) + N.power(k,2)
2504    return k2, achisqprob(k2,2)
2505
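# Example sketch for the normality tests above (the data array is made up):
# each test returns a statistic and a two-tailed probability; a small p-value
# suggests the sample departs from normality.
#
#     >>> data = N.array([2.1, 2.4, 1.9, 2.2, 2.0, 2.3, 5.0, 2.1, 2.2, 1.8,
#     ...                 2.0, 2.5, 2.2, 1.9, 2.1, 2.3, 2.4, 2.0, 2.2, 2.1])
#     >>> z, p = askewtest(data)
#     >>> k2, p = anormaltest(data)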
2506
2507#####################################
2508######  AFREQUENCY FUNCTIONS  #######
2509#####################################
2510
2511 def aitemfreq(a):
2512    """
2513Returns a 2D array of item frequencies.  Column 1 contains item values,
2514column 2 contains their respective counts.  Assumes a 1D array is passed.
2515
2516Usage:   aitemfreq(a)
2517Returns: a 2D frequency table (col [0:n-1]=scores, col n=frequencies)
2518"""
2519    scores = pstat.aunique(a)
2520    scores = N.sort(scores)
2521    freq = N.zeros(len(scores))
2522    for i in range(len(scores)):
2523        freq[i] = N.add.reduce(N.equal(a,scores[i]))
2524    return N.array(pstat.aabut(scores, freq))
2525
2526
2527 def ascoreatpercentile (inarray, percent):
2528    """
2529Usage:   ascoreatpercentile(inarray,percent)   0<percent<100
2530Returns: score at given percentile, relative to inarray distribution
2531"""
2532    percent = percent / 100.0
2533    targetcf = percent*len(inarray)
2534    h, lrl, binsize, extras = histogram(inarray)
2535    cumhist = cumsum(h*1)
2536    for i in range(len(cumhist)):
2537        if cumhist[i] >= targetcf:
2538            break
2539    score = binsize * ((targetcf - cumhist[i-1]) / float(h[i])) + (lrl+binsize*i)
2540    return score
2541
2542
2543 def apercentileofscore (inarray,score,histbins=10,defaultlimits=None):
2544    """
2545Note: result of this function depends on the values used to histogram
2546the data(!).
2547
2548Usage:   apercentileofscore(inarray,score,histbins=10,defaultlimits=None)
2549Returns: percentile-position of score (0-100) relative to inarray
2550"""
2551    h, lrl, binsize, extras = histogram(inarray,histbins,defaultlimits)
2552    cumhist = cumsum(h*1)
2553    i = int((score - lrl)/float(binsize))
2554    pct = (cumhist[i-1]+((score-(lrl+binsize*i))/float(binsize))*h[i])/float(len(inarray)) * 100
2555    return pct
2556
2557
2558 def ahistogram (inarray,numbins=10,defaultlimits=None,printextras=1):
2559    """
2560Returns (i) an array of histogram bin counts, (ii) the smallest value
2561of the histogram binning, and (iii) the bin width (the last 2 are not
2562necessarily integers).  Default number of bins is 10.  Defaultlimits
2563can be None (the routine picks bins spanning all the numbers in the
2564inarray) or a 2-sequence (lowerlimit, upperlimit).  Returns all of the
2565following: array of bin values, lowerreallimit, binsize, extrapoints.
2566
2567Usage:   ahistogram(inarray,numbins=10,defaultlimits=None,printextras=1)
2568Returns: (array of bin counts, bin-minimum, min-width, #-points-outside-range)
2569"""
2570    inarray = N.ravel(inarray)               # flatten any >1D arrays
2571    if (defaultlimits <> None):
2572        lowerreallimit = defaultlimits[0]
2573        upperreallimit = defaultlimits[1]
2574        binsize = (upperreallimit-lowerreallimit) / float(numbins)
2575    else:
2576        Min = N.minimum.reduce(inarray)
2577        Max = N.maximum.reduce(inarray)
2578        estbinwidth = float(Max - Min)/float(numbins) + 1
2579        binsize = (Max-Min+estbinwidth)/float(numbins)
2580        lowerreallimit = Min - binsize/2.0  #lower real limit,1st bin
2581    bins = N.zeros(numbins)
2582    extrapoints = 0
2583    for num in inarray:
2584        try:
2585            if (num-lowerreallimit) < 0:
2586                extrapoints = extrapoints + 1
2587            else:
2588                bintoincrement = int((num-lowerreallimit) / float(binsize))
2589                bins[bintoincrement] = bins[bintoincrement] + 1
2590        except:                           # point outside lower/upper limits
2591            extrapoints = extrapoints + 1
2592    if (extrapoints > 0 and printextras == 1):
2593        print '\nPoints outside given histogram range =',extrapoints
2594    return (bins, lowerreallimit, binsize, extrapoints)
2595
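# Example sketch for ahistogram (sample values only): with explicit limits
# (0.5, 4.5) and 4 bins the bin width is 1.0, so the counts come out as
# [1, 2, 1, 1] with no points outside the range.
#
#     >>> ahistogram(N.array([1.0, 2.0, 2.0, 3.0, 4.0]), numbins=4,
#     ...            defaultlimits=(0.5, 4.5))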
2596
2597 def acumfreq(a,numbins=10,defaultreallimits=None):
2598    """
2599Returns a cumulative frequency histogram, using the histogram function.
2600Defaultreallimits can be None (use all data), or a 2-sequence containing
2601lower and upper limits on values to include.
2602
2603Usage:   acumfreq(a,numbins=10,defaultreallimits=None)
2604Returns: array of cumfreq bin values, lowerreallimit, binsize, extrapoints
2605"""
2606    h,l,b,e = histogram(a,numbins,defaultreallimits)
2607    cumhist = cumsum(h*1)
2608    return cumhist,l,b,e
2609
2610
2611 def arelfreq(a,numbins=10,defaultreallimits=None):
2612    """
2613Returns a relative frequency histogram, using the histogram function.
2614Defaultreallimits can be None (use all data), or a 2-sequence containing
2615lower and upper limits on values to include.
2616
2617Usage:   arelfreq(a,numbins=10,defaultreallimits=None)
2618Returns: array of relative-frequency bin values, lowerreallimit, binsize, extrapoints
2619"""
2620    h,l,b,e = histogram(a,numbins,defaultreallimits)
2621    h = N.array(h/float(a.shape[0]))
2622    return h,l,b,e
2623
2624
2625#####################################
2626######  AVARIABILITY FUNCTIONS  #####
2627#####################################
2628
2629 def aobrientransform(*args):
2630    """
2631Computes a transform on input data (any number of columns).  Used to
2632test for homogeneity of variance prior to running one-way stats.  Each
2633array in *args is one level of a factor.  If an F_oneway() run on the
2634transformed data comes out significant, the variances are unequal.   From
2635Maxwell and Delaney, p.112.
2636
2637Usage:   aobrientransform(*args)    *args = 1D arrays, one per level of factor
2638Returns: transformed data for use in an ANOVA
2639"""
2640    TINY = 1e-10
2641    k = len(args)
2642    n = N.zeros(k,N.Float)
2643    v = N.zeros(k,N.Float)
2644    m = N.zeros(k,N.Float)
2645    nargs = []
2646    for i in range(k):
2647        nargs.append(args[i].astype(N.Float))
2648        n[i] = float(len(nargs[i]))
2649        v[i] = var(nargs[i])
2650        m[i] = mean(nargs[i])
2651    for j in range(k):
2652        for i in range(n[j]):
2653            t1 = (n[j]-1.5)*n[j]*(nargs[j][i]-m[j])**2
2654            t2 = 0.5*v[j]*(n[j]-1.0)
2655            t3 = (n[j]-1.0)*(n[j]-2.0)
2656            nargs[j][i] = (t1-t2) / float(t3)
2657    check = 1
2658    for j in range(k):
2659        if v[j] - mean(nargs[j]) > TINY:
2660            check = 0
2661    if check <> 1:
2662        raise ValueError, 'Lack of convergence in obrientransform.'
2663    else:
2664        return N.array(nargs)
2665
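# Example sketch for aobrientransform (sample arrays only): transform each
# group, then run F_oneway on the transformed groups; a significant result
# suggests the original variances are unequal, per the docstring above.
#
#     >>> g1 = N.array([1.0, 2.0, 3.0, 4.0, 5.0])
#     >>> g2 = N.array([2.0, 4.0, 6.0, 8.0, 10.0])
#     >>> r = aobrientransform(g1, g2)           # rows are the transformed groups
#     >>> f, p = F_oneway(list(r[0]), list(r[1]))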
2666
2667 def asamplevar (inarray,dimension=None,keepdims=0):
2668    """
2669Returns the sample variance of the values in the passed
2670array (i.e., using N).  Dimension can equal None (ravel array first),
2671an integer (the dimension over which to operate), or a sequence
2672(operate over multiple dimensions).  Set keepdims=1 to return an array
2673with the same number of dimensions as inarray.
2674
2675Usage:   asamplevar(inarray,dimension=None,keepdims=0)
2676"""
2677    if dimension == None:
2678        inarray = N.ravel(inarray)
2679        dimension = 0
2680    if dimension == 1:
2681        mn = amean(inarray,dimension)[:,N.NewAxis]
2682    else:
2683        mn = amean(inarray,dimension,keepdims=1)
2684    deviations = inarray - mn
2685    if type(dimension) == ListType:
2686        n = 1
2687        for d in dimension:
2688            n = n*inarray.shape[d]
2689    else:
2690        n = inarray.shape[dimension]
2691    svar = ass(deviations,dimension,keepdims) / float(n)
2692    return svar
2693
2694
2695 def asamplestdev (inarray, dimension=None, keepdims=0):
2696    """
2697Returns the sample standard deviation of the values in the passed
2698array (i.e., using N).  Dimension can equal None (ravel array first),
2699an integer (the dimension over which to operate), or a sequence
2700(operate over multiple dimensions).  Set keepdims=1 to return an array
2701with the same number of dimensions as inarray.
2702
2703Usage:   asamplestdev(inarray,dimension=None,keepdims=0)
2704"""
2705    return N.sqrt(asamplevar(inarray,dimension,keepdims))
2706
2707
2708 def asignaltonoise(instack,dimension=0):
2709    """
2710Calculates signal-to-noise.  Dimension can equal None (ravel array
2711first), an integer (the dimension over which to operate), or a
2712sequence (operate over multiple dimensions).
2713
2714Usage:   asignaltonoise(instack,dimension=0)
2715Returns: array containing the value of (mean/stdev) along dimension,
2716         or 0 when stdev=0
2717"""
2718    m = mean(instack,dimension)
2719    sd = stdev(instack,dimension)
2720    return N.where(N.equal(sd,0),0,m/sd)
2721
2722
2723 def avar (inarray, dimension=None,keepdims=0):
2724    """
2725Returns the estimated population variance of the values in the passed
2726array (i.e., N-1).  Dimension can equal None (ravel array first), an
2727integer (the dimension over which to operate), or a sequence (operate
2728over multiple dimensions).  Set keepdims=1 to return an array with the
2729same number of dimensions as inarray.
2730
2731Usage:   avar(inarray,dimension=None,keepdims=0)
2732"""
2733    if dimension == None:
2734        inarray = N.ravel(inarray)
2735        dimension = 0
2736    mn = amean(inarray,dimension,1)
2737    deviations = inarray - mn
2738    if type(dimension) == ListType:
2739        n = 1
2740        for d in dimension:
2741            n = n*inarray.shape[d]
2742    else:
2743        n = inarray.shape[dimension]
2744    var = ass(deviations,dimension,keepdims)/float(n-1)
2745    return var
2746
2747
2748 def astdev (inarray, dimension=None, keepdims=0):
2749    """
2750Returns the estimated population standard deviation of the values in
2751the passed array (i.e., N-1).  Dimension can equal None (ravel array
2752first), an integer (the dimension over which to operate), or a
2753sequence (operate over multiple dimensions).  Set keepdims=1 to return
2754an array with the same number of dimensions as inarray.
2755
2756Usage:   astdev(inarray,dimension=None,keepdims=0)
2757"""
2758    return N.sqrt(avar(inarray,dimension,keepdims))
2759
2760
2761 def asterr (inarray, dimension=None, keepdims=0):
2762    """
2763Returns the estimated population standard error of the values in the
2764passed array (i.e., N-1).  Dimension can equal None (ravel array
2765first), an integer (the dimension over which to operate), or a
2766sequence (operate over multiple dimensions).  Set keepdims=1 to return
2767an array with the same number of dimensions as inarray.
2768
2769Usage:   asterr(inarray,dimension=None,keepdims=0)
2770"""
2771    if dimension == None:
2772        inarray = N.ravel(inarray)
2773        dimension = 0
2774    return astdev(inarray,dimension,keepdims) / float(N.sqrt(inarray.shape[dimension]))
2775
2776
2777 def asem (inarray, dimension=None, keepdims=0):
2778    """
2779Returns the standard error of the mean (i.e., using N) of the values
2780in the passed array.  Dimension can equal None (ravel array first), an
2781integer (the dimension over which to operate), or a sequence (operate
2782over multiple dimensions).  Set keepdims=1 to return an array with the
2783same number of dimensions as inarray.
2784
2785Usage:   asem(inarray,dimension=None, keepdims=0)
2786"""
2787    if dimension == None:
2788        inarray = N.ravel(inarray)
2789        dimension = 0
2790    if type(dimension) == ListType:
2791        n = 1
2792        for d in dimension:
2793            n = n*inarray.shape[d]
2794    else:
2795        n = inarray.shape[dimension]
2796    s = asamplestdev(inarray,dimension,keepdims) / N.sqrt(n-1)
2797    return s
2798
2799
2800 def az (a, score):
2801    """
2802Returns the z-score of a given input score, given the array from which
2803that score came.  Not appropriate for population calculations, nor for
2804arrays > 1D.
2805
2806Usage:   az(a, score)
2807"""
2808    z = (score-amean(a)) / asamplestdev(a)
2809    return z
2810
2811
2812 def azs (a):
2813    """
2814Returns a 1D array of z-scores, one for each score in the passed array,
2815computed relative to the passed array.
2816
2817Usage:   azs(a)
2818"""
2819    zscores = []
2820    for item in a:
2821        zscores.append(z(a,item))
2822    return N.array(zscores)
2823
2824
2825 def azmap (scores, compare, dimension=0):
2826    """
2827Returns an array of z-scores the shape of scores (e.g., [x,y]), compared to
2828array passed to compare (e.g., [time,x,y]).  Assumes collapsing over dim 0
2829of the compare array.
2830
2831Usage:   azmap(scores, compare, dimension=0)
2832"""
2833    mns = amean(compare,dimension)
2834    sstd = asamplestdev(compare,0)
2835    return (scores - mns) / sstd
2836
2837
2838#####################################
2839#######  ATRIMMING FUNCTIONS  #######
2840#####################################
2841
2842 def around(a,digits=1):
2843     """
2844Rounds all values in array a to 'digits' decimal places.
2845
2846Usage:   around(a,digits)
2847Returns: a, where each value is rounded to 'digits' decimals
2848"""
2849     def ar(x,d=digits):
2850         return round(x,d)
2851
2852     if type(a) <> N.ArrayType:
2853         try:
2854             a = N.array(a)
2855         except:
2856             a = N.array(a,'O')
2857     shp = a.shape
2858     if a.typecode() in ['f','F','d','D']:
2859         b = N.ravel(a)
2860         b = N.array(map(ar,b))
2861         b.shape = shp
2862     elif a.typecode() in ['o','O']:
2863         b = N.ravel(a)*1
2864         for i in range(len(b)):
2865             if type(b[i]) == FloatType:
2866                 b[i] = round(b[i],digits)
2867         b.shape = shp
2868     else:  # not a float, double or Object array
2869         b = a*1
2870     return b
2871
2872
2873 def athreshold(a,threshmin=None,threshmax=None,newval=0):
2874    """
2875Like Numeric.clip() except that values <threshmin or >threshmax are replaced
2876by newval instead of by threshmin/threshmax (respectively).
2877
2878Usage:   athreshold(a,threshmin=None,threshmax=None,newval=0)
2879Returns: a, with values <threshmin or >threshmax replaced with newval
2880"""
2881    mask = N.zeros(a.shape)
2882    if threshmin <> None:
2883        mask = mask + N.where(N.less(a,threshmin),1,0)
2884    if threshmax <> None:
2885        mask = mask + N.where(N.greater(a,threshmax),1,0)
2886    mask = N.clip(mask,0,1)
2887    return N.where(mask,newval,a)
2888
2889
2890 def atrimboth (a,proportiontocut):
2891    """
2892Slices off the passed proportion of items from BOTH ends of the passed
2893array (i.e., with proportiontocut=0.1, slices 'leftmost' 10% AND
2894'rightmost' 10% of scores.  You must pre-sort the array if you want
2895"proper" trimming.  Slices off LESS if proportion results in a
2896non-integer slice index (i.e., conservatively slices off
2897proportiontocut).
2898
2899Usage:   atrimboth (a,proportiontocut)
2900Returns: trimmed version of array a
2901"""
2902    lowercut = int(proportiontocut*len(a))
2903    uppercut = len(a) - lowercut
2904    return a[lowercut:uppercut]
2905
2906
2907 def atrim1 (a,proportiontocut,tail='right'):
2908    """
2909Slices off the passed proportion of items from ONE end of the passed
2910array (i.e., if proportiontocut=0.1, slices off 'leftmost' or 'rightmost'
291110% of scores).  Slices off LESS if proportion results in a non-integer
2912slice index (i.e., conservatively slices off proportiontocut).
2913
2914Usage:   atrim1(a,proportiontocut,tail='right')  or set tail='left'
2915Returns: trimmed version of array a
2916"""
2917    if string.lower(tail) == 'right':
2918        lowercut = 0
2919        uppercut = len(a) - int(proportiontocut*len(a))
2920    elif string.lower(tail) == 'left':
2921        lowercut = int(proportiontocut*len(a))
2922        uppercut = len(a)
2923    return a[lowercut:uppercut]
2924
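# Example sketch for the trimming functions above (sample values only): with a
# sorted 10-element array and proportiontocut=0.2, atrimboth drops 2 values
# from each end, while atrim1 drops 2 values from the chosen end only.
#
#     >>> a = N.sort(N.array([9.0, 1.0, 3.0, 7.0, 5.0, 2.0, 8.0, 4.0, 6.0, 0.0]))
#     >>> atrimboth(a, 0.2)            # 6 central values remain
#     >>> atrim1(a, 0.2, 'left')       # 8 values, the 2 smallest removed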
2925
2926#####################################
2927#####  ACORRELATION FUNCTIONS  ######
2928#####################################
2929
2930 def acovariance(X):
2931    """
2932Computes the covariance matrix of a matrix X.  Requires a 2D matrix input.
2933
2934Usage:   acovariance(X)
2935Returns: covariance matrix of X
2936"""
2937    if len(X.shape) <> 2:
2938        raise TypeError, "acovariance requires 2D matrices"
2939    n = X.shape[0]
2940    mX = amean(X,0)
2941    return N.dot(N.transpose(X),X) / float(n) - N.multiply.outer(mX,mX)
2942
2943
2944 def acorrelation(X):
2945    """
2946Computes the correlation matrix of a matrix X.  Requires a 2D matrix input.
2947
2948Usage:   acorrelation(X)
2949Returns: correlation matrix of X
2950"""
2951    C = acovariance(X)
2952    V = N.diagonal(C)
2953    return C / N.sqrt(N.multiply.outer(V,V))
2954
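# Example sketch for acovariance/acorrelation (sample matrix only): rows are
# observations, columns are variables.  The second column is exactly twice the
# first, so every entry of the correlation matrix is ~1.0.
#
#     >>> X = N.array([[1.0, 2.0], [2.0, 4.0], [3.0, 6.0]])
#     >>> acovariance(X)               # ~[[2/3, 4/3], [4/3, 8/3]]
#     >>> acorrelation(X)              # ~[[1, 1], [1, 1]]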
2955
2956 def apaired(x,y):
2957    """
2958Interactively determines the type of data in x and y, and then runs the
2959appropriate statistic for paired group data.
2960
2961Usage:   apaired(x,y)     x,y = the two arrays of values to be compared
2962Returns: appropriate statistic name, value, and probability
2963"""
2964    samples = ''
2965    while samples not in ['i','r','I','R','c','C']:
2966        print '\nIndependent or related samples, or correlation (i,r,c): ',
2967        samples = raw_input()
2968
2969    if samples in ['i','I','r','R']:
2970        print '\nComparing variances ...',
2971# USE O'BRIEN'S TEST FOR HOMOGENEITY OF VARIANCE, Maxwell & delaney, p.112
2972        r = obrientransform(x,y)
2973        f,p = F_oneway(pstat.colex(r,0),pstat.colex(r,1))
2974        if p<0.05:
2975            vartype='unequal, p='+str(round(p,4))
2976        else:
2977            vartype='equal'
2978        print vartype
2979        if samples in ['i','I']:
2980            if vartype[0]=='e':
2981                t,p = ttest_ind(x,y,None,0)
2982                print '\nIndependent samples t-test:  ', round(t,4),round(p,4)
2983            else:
2984                if len(x)>20 or len(y)>20:
2985                    z,p = ranksums(x,y)
2986                    print '\nRank Sums test (NONparametric, n>20):  ', round(z,4),round(p,4)
2987                else:
2988                    u,p = mannwhitneyu(x,y)
2989                    print '\nMann-Whitney U-test (NONparametric, ns<20):  ', round(u,4),round(p,4)
2990
2991        else:  # RELATED SAMPLES
2992            if vartype[0]=='e':
2993                t,p = ttest_rel(x,y,0)
2994                print '\nRelated samples t-test:  ', round(t,4),round(p,4)
2995            else:
2996                t,p = ranksums(x,y)
2997                print '\nWilcoxon T-test (NONparametric):  ', round(t,4),round(p,4)
2998    else:  # CORRELATION ANALYSIS
2999        corrtype = ''
3000        while corrtype not in ['c','C','r','R','d','D']:
3001            print '\nIs the data Continuous, Ranked, or Dichotomous (c,r,d): ',
3002            corrtype = raw_input()
3003        if corrtype in ['c','C']:
3004            m,b,r,p,see = linregress(x,y)
3005            print '\nLinear regression for continuous variables ...'
3006            lol = [['Slope','Intercept','r','Prob','SEestimate'],[round(m,4),round(b,4),round(r,4),round(p,4),round(see,4)]]
3007            pstat.printcc(lol)
3008        elif corrtype in ['r','R']:
3009            r,p = spearmanr(x,y)
3010            print '\nCorrelation for ranked variables ...'
3011            print "Spearman's r: ",round(r,4),round(p,4)
3012        else: # DICHOTOMOUS
3013            r,p = pointbiserialr(x,y)
3014            print '\nAssuming x contains a dichotomous variable ...'
3015            print 'Point Biserial r: ',round(r,4),round(p,4)
3016    print '\n\n'
3017    return None
3018
3019
3020 def apearsonr(x,y,verbose=1):
3021    """
3022Calculates a Pearson correlation coefficient and returns p.  Taken
3023from Heiman's Basic Statistics for the Behav. Sci (2nd), p.195.
3024
3025Usage:   apearsonr(x,y,verbose=1)      where x,y are equal length arrays
3026Returns: Pearson's r, two-tailed p-value
3027"""
3028    TINY = 1.0e-20
3029    n = len(x)
3030    xmean = amean(x)
3031    ymean = amean(y)
3032    r_num = n*(N.add.reduce(x*y)) - N.add.reduce(x)*N.add.reduce(y)
3033    r_den = math.sqrt((n*ass(x) - asquare_of_sums(x))*(n*ass(y)-asquare_of_sums(y)))
3034    r = (r_num / r_den)
3035    df = n-2
3036    t = r*math.sqrt(df/((1.0-r+TINY)*(1.0+r+TINY)))
3037    prob = abetai(0.5*df,0.5,df/(df+t*t),verbose)
3038    return r,prob
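# Usage sketch (data assumed for illustration): apearsonr() converts r to a
# t statistic with df = n-2, t = r*sqrt(df/((1-r)*(1+r))), and looks the
# two-tailed probability up via the incomplete beta function.
#
#     x = N.array([1.,2.,3.,4.,5.])
#     apearsonr(x, 2*x)      # exactly linear -> r = 1.0, p effectively 0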
3039
3040
3041 def aspearmanr(x,y):
3042    """
3043Calculates a Spearman rank-order correlation coefficient.  Taken
3044from Heiman's Basic Statistics for the Behav. Sci (1st), p.192.
3045
3046Usage:   aspearmanr(x,y)      where x,y are equal-length arrays
3047Returns: Spearman's r, two-tailed p-value
3048"""
3049    TINY = 1e-30
3050    n = len(x)
3051    rankx = rankdata(x)
3052    ranky = rankdata(y)
3053    dsq = N.add.reduce((rankx-ranky)**2)
3054    rs = 1 - 6*dsq / float(n*(n**2-1))
3055    t = rs * math.sqrt((n-2) / ((rs+1.0)*(1.0-rs)))
3056    df = n-2
3057    probrs = abetai(0.5*df,0.5,df/(df+t*t))
3058# probability values for rs are from part 2 of the spearman function in
# Numerical Recipes, p.510.  They are close to tabled values, but not exact. (?)
3060    return rs, probrs
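# Usage sketch (ranks assumed for illustration): with one adjacent pair of
# ranks swapped, d-squared sums to 2, so rs = 1 - 6*2/(5*(25-1)) = 0.9.
#
#     x = N.array([1.,2.,3.,4.,5.])
#     y = N.array([1.,2.,3.,5.,4.])
#     aspearmanr(x, y)       # -> (0.9, p roughly 0.04 via the t approximation)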
3061
3062
3063 def apointbiserialr(x,y):
3064    """
3065Calculates a point-biserial correlation coefficient and the associated
3066probability value.  Taken from Heiman's Basic Statistics for the Behav.
3067Sci (1st), p.194.
3068
3069Usage:   apointbiserialr(x,y)      where x,y are equal length arrays
3070Returns: Point-biserial r, two-tailed p-value
3071"""
3072    TINY = 1e-30
3073    categories = pstat.aunique(x)
3074    data = pstat.aabut(x,y)
3075    if len(categories) <> 2:
3076        raise ValueError, "Exactly 2 categories required (in x) for pointbiserialr()."
3077    else:   # there are 2 categories, continue
3078        codemap = pstat.aabut(categories,N.arange(2))
3079        recoded = pstat.arecode(data,codemap,0)
3080        x = pstat.alinexand(data,0,categories[0])
3081        y = pstat.alinexand(data,0,categories[1])
3082        xmean = amean(pstat.acolex(x,1))
3083        ymean = amean(pstat.acolex(y,1))
3084        n = len(data)
3085        adjust = math.sqrt((len(x)/float(n))*(len(y)/float(n)))
3086        rpb = (ymean - xmean)/asamplestdev(pstat.acolex(data,1))*adjust
3087        df = n-2
3088        t = rpb*math.sqrt(df/((1.0-rpb+TINY)*(1.0+rpb+TINY)))
3089        prob = abetai(0.5*df,0.5,df/(df+t*t))
3090        return rpb, prob
3091
3092
3093 def akendalltau(x,y):
3094    """
3095Calculates Kendall's tau ... correlation of ordinal data.  Adapted
from function kendl1 in Numerical Recipes.  Needs good test-cases.@@@
3097
3098Usage:   akendalltau(x,y)
3099Returns: Kendall's tau, two-tailed p-value
3100"""
3101    n1 = 0
3102    n2 = 0
3103    iss = 0
3104    for j in range(len(x)-1):
        for k in range(j+1,len(y)):
3106            a1 = x[j] - x[k]
3107            a2 = y[j] - y[k]
3108            aa = a1 * a2
3109            if (aa):             # neither array has a tie
3110                n1 = n1 + 1
3111                n2 = n2 + 1
3112                if aa > 0:
3113                    iss = iss + 1
3114                else:
3115                    iss = iss -1
3116            else:
3117                if (a1):
3118                    n1 = n1 + 1
3119                else:
3120                    n2 = n2 + 1
3121    tau = iss / math.sqrt(n1*n2)
3122    svar = (4.0*len(x)+10.0) / (9.0*len(x)*(len(x)-1))
3123    z = tau / math.sqrt(svar)
3124    prob = erfcc(abs(z)/1.4142136)
3125    return tau, prob
3126
3127
3128 def alinregress(*args):
3129    """
3130Calculates a regression line on two arrays, x and y, corresponding to x,y
3131pairs.  If a single 2D array is passed, alinregress finds dim with 2 levels
3132and splits data into x,y pairs along that dim.
3133
3134Usage:   alinregress(*args)    args=2 equal-length arrays, or one 2D array
3135Returns: slope, intercept, r, two-tailed prob, sterr-of-the-estimate
3136"""
3137    TINY = 1.0e-20
3138    if len(args) == 1:  # more than 1D array?
3139        args = args[0]
3140        if len(args) == 2:
3141            x = args[0]
3142            y = args[1]
3143        else:
3144            x = args[:,0]
3145            y = args[:,1]
3146    else:
3147        x = args[0]
3148        y = args[1]
3149    n = len(x)
3150    xmean = amean(x)
3151    ymean = amean(y)
3152    r_num = n*(N.add.reduce(x*y)) - N.add.reduce(x)*N.add.reduce(y)
3153    r_den = math.sqrt((n*ass(x) - asquare_of_sums(x))*(n*ass(y)-asquare_of_sums(y)))
3154    r = r_num / r_den
3155    z = 0.5*math.log((1.0+r+TINY)/(1.0-r+TINY))
3156    df = n-2
3157    t = r*math.sqrt(df/((1.0-r+TINY)*(1.0+r+TINY)))
3158    prob = abetai(0.5*df,0.5,df/(df+t*t))
3159    slope = r_num / (float(n)*ass(x) - asquare_of_sums(x))
3160    intercept = ymean - slope*xmean
3161    sterrest = math.sqrt(1-r*r)*asamplestdev(y)
3162    return slope, intercept, r, prob, sterrest
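# Usage sketch (points assumed for illustration): for exactly collinear data
# the least-squares line is recovered exactly and the standard error of the
# estimate is 0.
#
#     x = N.array([0.,1.,2.,3.,4.])
#     alinregress(x, 3.0*x + 1.0)   # -> slope 3.0, intercept 1.0, r = 1.0, tiny p, see = 0.0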
3163
3164
3165#####################################
3166#####  AINFERENTIAL STATISTICS  #####
3167#####################################
3168
3169 def attest_1samp(a,popmean,printit=0,name='Sample',writemode='a'):
3170    """
Calculates the t-obtained for the single-sample T-test on ONE group
3172of scores a, given a population mean.  If printit=1, results are printed
3173to the screen.  If printit='filename', the results are output to 'filename'
3174using the given writemode (default=append).  Returns t-value, and prob.
3175
3176Usage:   attest_1samp(a,popmean,Name='Sample',printit=0,writemode='a')
3177Returns: t-value, two-tailed prob
3178"""
3179    if type(a) != N.ArrayType:
3180        a = N.array(a)
3181    x = amean(a)
3182    v = avar(a)
3183    n = len(a)
3184    df = n-1
3185    svar = ((n-1)*v) / float(df)
3186    t = (x-popmean)/math.sqrt(svar*(1.0/n))
3187    prob = abetai(0.5*df,0.5,df/(df+t*t))
3188
3189    if printit <> 0:
3190        statname = 'Single-sample T-test.'
3191        outputpairedstats(printit,writemode,
3192                          'Population','--',popmean,0,0,0,
3193                          name,n,x,v,N.minimum.reduce(N.ravel(a)),
3194                          N.maximum.reduce(N.ravel(a)),
3195                          statname,t,prob)
3196    return t,prob
3197
3198
3199 def attest_ind (a, b, dimension=None, printit=0, name1='Samp1', name2='Samp2',writemode='a'):
3200    """
3201Calculates the t-obtained T-test on TWO INDEPENDENT samples of scores
a and b.  From Numerical Recipes, p.483.  If printit=1, results are
3203printed to the screen.  If printit='filename', the results are output
3204to 'filename' using the given writemode (default=append).  Dimension
3205can equal None (ravel array first), or an integer (the dimension over
3206which to operate on a and b).
3207
3208Usage:   attest_ind (a,b,dimension=None,printit=0,
3209                     Name1='Samp1',Name2='Samp2',writemode='a')
3210Returns: t-value, two-tailed p-value
3211"""
3212    if dimension == None:
3213        a = N.ravel(a)
3214        b = N.ravel(b)
3215        dimension = 0
3216    x1 = amean(a,dimension)
3217    x2 = amean(b,dimension)
3218    v1 = avar(a,dimension)
3219    v2 = avar(b,dimension)
3220    n1 = a.shape[dimension]
3221    n2 = b.shape[dimension]
3222    df = n1+n2-2
3223    svar = ((n1-1)*v1+(n2-1)*v2) / float(df)
3224    zerodivproblem = N.equal(svar,0)
3225    svar = N.where(zerodivproblem,1,svar)  # avoid zero-division in 1st place
3226    t = (x1-x2)/N.sqrt(svar*(1.0/n1 + 1.0/n2))  # N-D COMPUTATION HERE!!!!!!
3227    t = N.where(zerodivproblem,1.0,t)     # replace NaN/wrong t-values with 1.0
3228    probs = abetai(0.5*df,0.5,float(df)/(df+t*t))
3229
3230    if type(t) == N.ArrayType:
3231        probs = N.reshape(probs,t.shape)
3232    if len(probs) == 1:
3233        probs = probs[0]
3234       
3235    if printit <> 0:
3236        if type(t) == N.ArrayType:
3237            t = t[0]
3238        if type(probs) == N.ArrayType:
3239            probs = probs[0]
3240        statname = 'Independent samples T-test.'
3241        outputpairedstats(printit,writemode,
3242                          name1,n1,x1,v1,N.minimum.reduce(N.ravel(a)),
3243                          N.maximum.reduce(N.ravel(a)),
3244                          name2,n2,x2,v2,N.minimum.reduce(N.ravel(b)),
3245                          N.maximum.reduce(N.ravel(b)),
3246                          statname,t,probs)
3247        return
3248    return t, probs
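# Worked sketch (group scores assumed for illustration): two groups of five
# with means 3 and 5 and a pooled variance of 2.5 give
# t = (3-5)/sqrt(2.5*(1/5+1/5)) = -2.0 on 8 df, a two-tailed p of roughly 0.08.
#
#     a = N.array([1.,2.,3.,4.,5.])
#     b = N.array([3.,4.,5.,6.,7.])
#     attest_ind(a, b)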
3249
3250
3251 def attest_rel (a,b,dimension=None,printit=0,name1='Samp1',name2='Samp2',writemode='a'):
3252    """
3253Calculates the t-obtained T-test on TWO RELATED samples of scores, a
and b.  From Numerical Recipes, p.483.  If printit=1, results are
3255printed to the screen.  If printit='filename', the results are output
3256to 'filename' using the given writemode (default=append).  Dimension
3257can equal None (ravel array first), or an integer (the dimension over
3258which to operate on a and b).
3259
3260Usage:   attest_rel(a,b,dimension=None,printit=0,
3261                    name1='Samp1',name2='Samp2',writemode='a')
3262Returns: t-value, two-tailed p-value
3263"""
3264    if dimension == None:
3265        a = N.ravel(a)
3266        b = N.ravel(b)
3267        dimension = 0
3268    if len(a)<>len(b):
3269        raise ValueError, 'Unequal length arrays.'
3270    x1 = amean(a,dimension)
3271    x2 = amean(b,dimension)
3272    v1 = avar(a,dimension)
3273    v2 = avar(b,dimension)
3274    n = a.shape[dimension]
3275    df = float(n-1)
3276    d = (a-b).astype('d')
3277
3278    denom = N.sqrt((n*N.add.reduce(d*d,dimension) - N.add.reduce(d,dimension)**2) /df)
3279    zerodivproblem = N.equal(denom,0)
3280    denom = N.where(zerodivproblem,1,denom)  # avoid zero-division in 1st place
3281    t = N.add.reduce(d,dimension) / denom      # N-D COMPUTATION HERE!!!!!!
3282    t = N.where(zerodivproblem,1.0,t)     # replace NaN/wrong t-values with 1.0
3283    probs = abetai(0.5*df,0.5,float(df)/(df+t*t))
3284    if type(t) == N.ArrayType:
3285        probs = N.reshape(probs,t.shape)
3286    if len(probs) == 1:
3287        probs = probs[0]
3288
3289    if printit <> 0:
3290        statname = 'Related samples T-test.'
3291        outputpairedstats(printit,writemode,
3292                          name1,n,x1,v1,N.minimum.reduce(N.ravel(a)),
3293                          N.maximum.reduce(N.ravel(a)),
3294                          name2,n,x2,v2,N.minimum.reduce(N.ravel(b)),
3295                          N.maximum.reduce(N.ravel(b)),
3296                          statname,t,probs)
3297        return
3298    return t, probs
3299
3300
3301 def achisquare(f_obs,f_exp=None):
3302    """
3303Calculates a one-way chi square for array of observed frequencies and returns
3304the result.  If no expected frequencies are given, the total N is assumed to
3305be equally distributed across all groups.
3306
3307Usage:   achisquare(f_obs, f_exp=None)   f_obs = array of observed cell freq.
3308Returns: chisquare-statistic, associated p-value
3309"""
3310
3311    k = len(f_obs)
3312    if f_exp == None:
3313        f_exp = N.array([sum(f_obs)/float(k)] * len(f_obs),N.Float)
3314    f_exp = f_exp.astype(N.Float)
3315    chisq = N.add.reduce((f_obs-f_exp)**2 / f_exp)
3316    return chisq, chisqprob(chisq, k-1)
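# Usage sketch (observed counts assumed for illustration): with no f_exp the 40
# observations are spread evenly, 40/3 per cell, giving
# chisq = (20-40/3)**2/(40/3) + 2*(10-40/3)**2/(40/3) = 5.0 on 2 df (p ~ 0.082).
#
#     achisquare(N.array([20., 10., 10.]))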
3317
3318
3319 def aks_2samp (data1,data2):
3320    """
Computes the Kolmogorov-Smirnov statistic on 2 samples.  Modified from
Numerical Recipes in C, page 493.  Returns KS D-value, prob.  Not ufunc-
3323like.
3324
3325Usage:   aks_2samp(data1,data2)  where data1 and data2 are 1D arrays
3326Returns: KS D-value, p-value
3327"""
3328    j1 = 0    # N.zeros(data1.shape[1:]) TRIED TO MAKE THIS UFUNC-LIKE
3329    j2 = 0    # N.zeros(data2.shape[1:])
3330    fn1 = 0.0 # N.zeros(data1.shape[1:],N.Float)
3331    fn2 = 0.0 # N.zeros(data2.shape[1:],N.Float)
3332    n1 = data1.shape[0]
3333    n2 = data2.shape[0]
3334    en1 = n1*1
3335    en2 = n2*1
3336    d = N.zeros(data1.shape[1:],N.Float)
3337    data1 = N.sort(data1,0)
3338    data2 = N.sort(data2,0)
3339    while j1 < n1 and j2 < n2:
3340        d1=data1[j1]
3341        d2=data2[j2]
3342        if d1 <= d2:
3343            fn1 = (j1)/float(en1)
3344            j1 = j1 + 1
3345        if d2 <= d1:
3346            fn2 = (j2)/float(en2)
3347            j2 = j2 + 1
3348        dt = (fn2-fn1)
3349        if abs(dt) > abs(d):
3350            d = dt
3351    try:
3352        en = math.sqrt(en1*en2/float(en1+en2))
3353        prob = aksprob((en+0.12+0.11/en)*N.fabs(d))
3354    except:
3355        prob = 1.0
3356    return d, prob
3357
3358
3359 def amannwhitneyu(x,y):
3360    """
3361Calculates a Mann-Whitney U statistic on the provided scores and
3362returns the result.  Use only when the n in each condition is < 20 and
3363you have 2 independent samples of ranks.  REMEMBER: Mann-Whitney U is
3364significant if the u-obtained is LESS THAN or equal to the critical
3365value of U.
3366
3367Usage:   amannwhitneyu(x,y)     where x,y are arrays of values for 2 conditions
3368Returns: u-statistic, one-tailed p-value (i.e., p(z(U)))
3369"""
3370    n1 = len(x)
3371    n2 = len(y)
3372    ranked = rankdata(N.concatenate((x,y)))
3373    rankx = ranked[0:n1]       # get the x-ranks
3374    ranky = ranked[n1:]        # the rest are y-ranks
3375    u1 = n1*n2 + (n1*(n1+1))/2.0 - sum(rankx)  # calc U for x
3376    u2 = n1*n2 - u1                            # remainder is U for y
3377    bigu = max(u1,u2)
3378    smallu = min(u1,u2)
3379    T = math.sqrt(tiecorrect(ranked))  # correction factor for tied scores
3380    if T == 0:
3381        raise ValueError, 'All numbers are identical in amannwhitneyu'
3382    sd = math.sqrt(T*n1*n2*(n1+n2+1)/12.0)
3383    z = abs((bigu-n1*n2/2.0) / sd)  # normal approximation for prob calc
3384    return smallu, 1.0 - zprob(z)
3385
3386
3387 def atiecorrect(rankvals):
3388    """
3389Tie-corrector for ties in Mann Whitney U and Kruskal Wallis H tests.
3390See Siegel, S. (1956) Nonparametric Statistics for the Behavioral
3391Sciences.  New York: McGraw-Hill.  Code adapted from |Stat rankind.c
3392code.
3393
3394Usage:   atiecorrect(rankvals)
3395Returns: T correction factor for U or H
3396"""
3397    sorted,posn = ashellsort(N.array(rankvals))
3398    n = len(sorted)
3399    T = 0.0
3400    i = 0
3401    while (i<n-1):
3402        if sorted[i] == sorted[i+1]:
3403            nties = 1
3404            while (i<n-1) and (sorted[i] == sorted[i+1]):
3405                nties = nties +1
3406                i = i +1
3407            T = T + nties**3 - nties
3408        i = i+1
3409    T = T / float(n**3-n)
3410    return 1.0 - T
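# Worked sketch (scores assumed for illustration): a single group of g tied
# ranks contributes g**3 - g, so one triple tie among n = 10 values gives
# T = 1 - (27-3)/(1000-10) ~ 0.9758.
#
#     atiecorrect(arankdata(N.array([1.,2.,2.,2.,5.,6.,7.,8.,9.,10.])))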
3411
3412
3413 def aranksums(x,y):
3414    """
3415Calculates the rank sums statistic on the provided scores and returns
3416the result.
3417
3418Usage:   aranksums(x,y)     where x,y are arrays of values for 2 conditions
3419Returns: z-statistic, two-tailed p-value
3420"""
3421    n1 = len(x)
3422    n2 = len(y)
3423    alldata = N.concatenate((x,y))
3424    ranked = arankdata(alldata)
3425    x = ranked[:n1]
3426    y = ranked[n1:]
3427    s = sum(x)
3428    expected = n1*(n1+n2+1) / 2.0
3429    z = (s - expected) / math.sqrt(n1*n2*(n1+n2+1)/12.0)
3430    prob = 2*(1.0 -zprob(abs(z)))
3431    return z, prob
3432
3433
3434 def awilcoxont(x,y):
3435    """
3436Calculates the Wilcoxon T-test for related samples and returns the
3437result.  A non-parametric T-test.
3438
3439Usage:   awilcoxont(x,y)     where x,y are equal-length arrays for 2 conditions
3440Returns: t-statistic, two-tailed p-value
3441"""
3442    if len(x) <> len(y):
3443        raise ValueError, 'Unequal N in awilcoxont.  Aborting.'
3444    d = x-y
3445    d = N.compress(N.not_equal(d,0),d) # Keep all non-zero differences
3446    count = len(d)
3447    absd = abs(d)
3448    absranked = arankdata(absd)
3449    r_plus = 0.0
3450    r_minus = 0.0
3451    for i in range(len(absd)):
3452        if d[i] < 0:
3453            r_minus = r_minus + absranked[i]
3454        else:
3455            r_plus = r_plus + absranked[i]
3456    wt = min(r_plus, r_minus)
3457    mn = count * (count+1) * 0.25
3458    se =  math.sqrt(count*(count+1)*(2.0*count+1.0)/24.0)
3459    z = math.fabs(wt-mn) / se
3461    prob = 2*(1.0 -zprob(abs(z)))
3462    return wt, prob
3463
3464
3465 def akruskalwallish(*args):
3466    """
3467The Kruskal-Wallis H-test is a non-parametric ANOVA for 3 or more
3468groups, requiring at least 5 subjects in each group.  This function
3469calculates the Kruskal-Wallis H and associated p-value for 3 or more
3470independent samples.
3471
3472Usage:   akruskalwallish(*args)     args are separate arrays for 3+ conditions
3473Returns: H-statistic (corrected for ties), associated p-value
3474"""
    assert len(args) >= 3, "Need at least 3 groups in stats.akruskalwallish()"
3476    args = list(args)
3477    n = [0]*len(args)
3478    n = map(len,args)
3479    all = []
3480    for i in range(len(args)):
3481        all = all + args[i].tolist()
3482    ranked = rankdata(all)
3483    T = tiecorrect(ranked)
3484    for i in range(len(args)):
3485        args[i] = ranked[0:n[i]]
3486        del ranked[0:n[i]]
3487    rsums = []
3488    for i in range(len(args)):
3489        rsums.append(sum(args[i])**2)
3490        rsums[i] = rsums[i] / float(n[i])
3491    ssbn = sum(rsums)
3492    totaln = sum(n)
3493    h = 12.0 / (totaln*(totaln+1)) * ssbn - 3*(totaln+1)
3494    df = len(args) - 1
3495    if T == 0:
3496        raise ValueError, 'All numbers are identical in akruskalwallish'
3497    h = h / float(T)
3498    return h, chisqprob(h,df)
3499
3500
3501 def afriedmanchisquare(*args):
3502    """
3503Friedman Chi-Square is a non-parametric, one-way within-subjects
3504ANOVA.  This function calculates the Friedman Chi-square test for
3505repeated measures and returns the result, along with the associated
3506probability value.  It assumes 3 or more repeated measures.  Only 3
3507levels requires a minimum of 10 subjects in the study.  Four levels
require 5 subjects per level (??).
3509
Usage:   afriedmanchisquare(*args)   args are separate arrays for 3+ conditions
3511Returns: chi-square statistic, associated p-value
3512"""
3513    k = len(args)
3514    if k < 3:
3515        raise ValueError, '\nLess than 3 levels.  Friedman test not appropriate.\n'
3516    n = len(args[0])
3517    data = apply(pstat.aabut,args)
3518    data = data.astype(N.Float)
3519    for i in range(len(data)):
3520        data[i] = arankdata(data[i])
3521    ssbn = asum(asum(args,1)**2)
3522    chisq = 12.0 / (k*n*(k+1)) * ssbn - 3*n*(k+1)
3523    return chisq, chisqprob(chisq,k-1)
3524
3525
3526#####################################
3527####  APROBABILITY CALCULATIONS  ####
3528#####################################
3529
3530 def achisqprob(chisq,df):
3531    """
3532Returns the (1-tail) probability value associated with the provided chi-square
3533value and df.  Heavily modified from chisq.c in Gary Perlman's |Stat.  Can
3534handle multiple dimensions.
3535
3536Usage:   achisqprob(chisq,df)    chisq=chisquare stat., df=degrees of freedom
3537"""
3538    BIG = 200.0
3539    def ex(x):
3540        BIG = 200.0
3541        exponents = N.where(N.less(x,-BIG),-BIG,x)
3542        return N.exp(exponents)
3543
3544    if type(chisq) == N.ArrayType:
3545        arrayflag = 1
3546    else:
3547        arrayflag = 0
3548        chisq = N.array([chisq])
3549    if df < 1:
        return N.ones(chisq.shape,N.Float)
3551    probs = N.zeros(chisq.shape,N.Float)
3552    probs = N.where(N.less_equal(chisq,0),1.0,probs)  # set prob=1 for chisq<0
3553    a = 0.5 * chisq
3554    if df > 1:
3555        y = ex(-a)
3556    if df%2 == 0:
3557        even = 1
3558        s = y*1
3559        s2 = s*1
3560    else:
3561        even = 0
3562        s = 2.0 * azprob(-N.sqrt(chisq))
3563        s2 = s*1
3564    if (df > 2):
3565        chisq = 0.5 * (df - 1.0)
3566        if even:
3567            z = N.ones(probs.shape,N.Float)
3568        else:
3569            z = 0.5 *N.ones(probs.shape,N.Float)
3570        if even:
3571            e = N.zeros(probs.shape,N.Float)
3572        else:
3573            e = N.log(N.sqrt(N.pi)) *N.ones(probs.shape,N.Float)
3574        c = N.log(a)
3575        mask = N.zeros(probs.shape)
3576        a_big = N.greater(a,BIG)
3577        a_big_frozen = -1 *N.ones(probs.shape,N.Float)
3578        totalelements = N.multiply.reduce(N.array(probs.shape))
3579        while asum(mask)<>totalelements:
3580            e = N.log(z) + e
3581            s = s + ex(c*z-a-e)
3582            z = z + 1.0
3583#            print z, e, s
3584            newmask = N.greater(z,chisq)
3585            a_big_frozen = N.where(newmask*N.equal(mask,0)*a_big, s, a_big_frozen)
3586            mask = N.clip(newmask+mask,0,1)
3587        if even:
3588            z = N.ones(probs.shape,N.Float)
3589            e = N.ones(probs.shape,N.Float)
3590        else:
3591            z = 0.5 *N.ones(probs.shape,N.Float)
3592            e = 1.0 / N.sqrt(N.pi) / N.sqrt(a) * N.ones(probs.shape,N.Float)
3593        c = 0.0
3594        mask = N.zeros(probs.shape)
3595        a_notbig_frozen = -1 *N.ones(probs.shape,N.Float)
3596        while asum(mask)<>totalelements:
3597            e = e * (a/z.astype(N.Float))
3598            c = c + e
3599            z = z + 1.0
3600#            print '#2', z, e, c, s, c*y+s2
3601            newmask = N.greater(z,chisq)
3602            a_notbig_frozen = N.where(newmask*N.equal(mask,0)*(1-a_big),
3603                                      c*y+s2, a_notbig_frozen)
3604            mask = N.clip(newmask+mask,0,1)
3605        probs = N.where(N.equal(probs,1),1,
3606                        N.where(N.greater(a,BIG),a_big_frozen,a_notbig_frozen))
3607        return probs
3608    else:
3609        return s
3610
3611
3612 def aerfcc(x):
3613    """
3614Returns the complementary error function erfc(x) with fractional error
everywhere less than 1.2e-7.  Adapted from Numerical Recipes.  Can
3616handle multiple dimensions.
3617
3618Usage:   aerfcc(x)
3619"""
3620    z = abs(x)
3621    t = 1.0 / (1.0+0.5*z)
3622    ans = t * N.exp(-z*z-1.26551223 + t*(1.00002368+t*(0.37409196+t*(0.09678418+t*(-0.18628806+t*(0.27886807+t*(-1.13520398+t*(1.48851587+t*(-0.82215223+t*0.17087277)))))))))
3623    return N.where(N.greater_equal(x,0), ans, 2.0-ans)
3624
3625
3626 def azprob(z):
3627    """
3628Returns the area under the normal curve 'to the left of' the given z value.
3629Thus,
3630    for z<0, zprob(z) = 1-tail probability
3631    for z>0, 1.0-zprob(z) = 1-tail probability
3632    for any z, 2.0*(1.0-zprob(abs(z))) = 2-tail probability
3633Adapted from z.c in Gary Perlman's |Stat.  Can handle multiple dimensions.
3634
3635Usage:   azprob(z)    where z is a z-value
3636"""
3637    def yfunc(y):
3638        x = (((((((((((((-0.000045255659 * y
3639                         +0.000152529290) * y -0.000019538132) * y
3640                       -0.000676904986) * y +0.001390604284) * y
3641                     -0.000794620820) * y -0.002034254874) * y
3642                   +0.006549791214) * y -0.010557625006) * y
3643                 +0.011630447319) * y -0.009279453341) * y
3644               +0.005353579108) * y -0.002141268741) * y
3645             +0.000535310849) * y +0.999936657524
3646        return x
3647
3648    def wfunc(w):
3649        x = ((((((((0.000124818987 * w
3650                    -0.001075204047) * w +0.005198775019) * w
3651                  -0.019198292004) * w +0.059054035642) * w
3652                -0.151968751364) * w +0.319152932694) * w
3653              -0.531923007300) * w +0.797884560593) * N.sqrt(w) * 2.0
3654        return x
3655
3656    Z_MAX = 6.0    # maximum meaningful z-value
3657    x = N.zeros(z.shape,N.Float) # initialize
3658    y = 0.5 * N.fabs(z)
3659    x = N.where(N.less(y,1.0),wfunc(y*y),yfunc(y-2.0)) # get x's
3660    x = N.where(N.greater(y,Z_MAX*0.5),1.0,x)          # kill those with big Z
3661    prob = N.where(N.greater(z,0),(x+1)*0.5,(1-x)*0.5)
3662    return prob
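# Usage sketch (z values assumed for illustration): azprob() returns the area
# to the LEFT of each z, so the familiar two-tailed 5% cutoffs come out as
# approximately 0.025 and 0.975.
#
#     azprob(N.array([-1.96, 0.0, 1.96]))    # ~ [0.025, 0.5, 0.975]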
3663
3664
3665 def aksprob(alam):
3666     """
3667Returns the probability value for a K-S statistic computed via ks_2samp.
Adapted from Numerical Recipes.  Can handle multiple dimensions.
3669
3670Usage:   aksprob(alam)
3671"""
3672     if type(alam) == N.ArrayType:
3673         frozen = -1 *N.ones(alam.shape,N.Float64)
3674         alam = alam.astype(N.Float64)
3675         arrayflag = 1
     else:
         frozen = N.array(-1.)
         alam = N.array(alam,N.Float64)
         arrayflag = 0
3679     mask = N.zeros(alam.shape)
3680     fac = 2.0 *N.ones(alam.shape,N.Float)
3681     sum = N.zeros(alam.shape,N.Float)
3682     termbf = N.zeros(alam.shape,N.Float)
3683     a2 = N.array(-2.0*alam*alam,N.Float64)
3684     totalelements = N.multiply.reduce(N.array(mask.shape))
3685     for j in range(1,201):
3686         if asum(mask) == totalelements:
3687             break
3688         exponents = (a2*j*j)
3689         overflowmask = N.less(exponents,-746)
3690         frozen = N.where(overflowmask,0,frozen)
3691         mask = mask+overflowmask
3692         term = fac*N.exp(exponents)
3693         sum = sum + term
3694         newmask = N.where(N.less_equal(abs(term),(0.001*termbf)) +
3695                           N.less(abs(term),1.0e-8*sum), 1, 0)
3696         frozen = N.where(newmask*N.equal(mask,0), sum, frozen)
3697         mask = N.clip(mask+newmask,0,1)
3698         fac = -fac
3699         termbf = abs(term)
3700     if arrayflag:
3701         return N.where(N.equal(frozen,-1), 1.0, frozen)  # 1.0 if doesn't converge
3702     else:
3703         return N.where(N.equal(frozen,-1), 1.0, frozen)[0]  # 1.0 if doesn't converge
3704
3705
3706 def afprob (dfnum, dfden, F):
3707    """
3708Returns the 1-tailed significance level (p-value) of an F statistic
3709given the degrees of freedom for the numerator (dfR-dfF) and the degrees
3710of freedom for the denominator (dfF).  Can handle multiple dims for F.
3711
3712Usage:   afprob(dfnum, dfden, F)   where usually dfnum=dfbn, dfden=dfwn
3713"""
3714    if type(F) == N.ArrayType:
3715        return abetai(0.5*dfden, 0.5*dfnum, dfden/(1.0*dfden+dfnum*F))
3716    else:
3717        return abetai(0.5*dfden, 0.5*dfnum, dfden/float(dfden+dfnum*F))
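# Consistency sketch (values assumed for illustration): an F with 1 numerator
# df is a squared t, so afprob(1, 8, 4.0) evaluates the same incomplete-beta
# ratio, abetai(4.0, 0.5, 8/(8+4)), as the two-tailed p of t = 2.0 on 8 df
# (roughly 0.08).
#
#     afprob(1, 8, 4.0)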
3718
3719
3720 def abetacf(a,b,x,verbose=1):
3721    """
3722Evaluates the continued fraction form of the incomplete Beta function,
betai.  (Adapted from: Numerical Recipes in C.)  Can handle multiple
3724dimensions for x.
3725
3726Usage:   abetacf(a,b,x,verbose=1)
3727"""
3728    ITMAX = 200
3729    EPS = 3.0e-7
3730
3731    arrayflag = 1
3732    if type(x) == N.ArrayType:
3733        frozen = N.ones(x.shape,N.Float) *-1  #start out w/ -1s, should replace all
3734    else:
3735        arrayflag = 0
3736        frozen = N.array([-1])
3737        x = N.array([x])
3738    mask = N.zeros(x.shape)
3739    bm = az = am = 1.0
3740    qab = a+b
3741    qap = a+1.0
3742    qam = a-1.0
3743    bz = 1.0-qab*x/qap
3744    for i in range(ITMAX+1):
3745        if N.sum(N.ravel(N.equal(frozen,-1)))==0:
3746            break
3747        em = float(i+1)
3748        tem = em + em
3749        d = em*(b-em)*x/((qam+tem)*(a+tem))
3750        ap = az + d*am
3751        bp = bz+d*bm
3752        d = -(a+em)*(qab+em)*x/((qap+tem)*(a+tem))
3753        app = ap+d*az
3754        bpp = bp+d*bz
3755        aold = az*1
3756        am = ap/bpp
3757        bm = bp/bpp
3758        az = app/bpp
3759        bz = 1.0
3760        newmask = N.less(abs(az-aold),EPS*abs(az))
3761        frozen = N.where(newmask*N.equal(mask,0), az, frozen)
3762        mask = N.clip(mask+newmask,0,1)
3763    noconverge = asum(N.equal(frozen,-1))
3764    if noconverge <> 0 and verbose:
3765        print 'a or b too big, or ITMAX too small in Betacf for ',noconverge,' elements'
3766    if arrayflag:
3767        return frozen
3768    else:
3769        return frozen[0]
3770
3771
3772 def agammln(xx):
3773    """
Returns the natural log of the gamma function of xx.
3775    Gamma(z) = Integral(0,infinity) of t^(z-1)exp(-t) dt.
Adapted from: Numerical Recipes in C.  Can handle multiple dims ... but
3777probably doesn't normally have to.
3778
3779Usage:   agammln(xx)
3780"""
3781    coeff = [76.18009173, -86.50532033, 24.01409822, -1.231739516,
3782             0.120858003e-2, -0.536382e-5]
3783    x = xx - 1.0
3784    tmp = x + 5.5
3785    tmp = tmp - (x+0.5)*N.log(tmp)
3786    ser = 1.0
3787    for j in range(len(coeff)):
3788        x = x + 1
3789        ser = ser + coeff[j]/x
3790    return -tmp + N.log(2.50662827465*ser)
3791
3792
3793 def abetai(a,b,x,verbose=1):
3794    """
3795Returns the incomplete beta function:
3796
3797    I-sub-x(a,b) = 1/B(a,b)*(Integral(0,x) of t^(a-1)(1-t)^(b-1) dt)
3798
3799where a,b>0 and B(a,b) = G(a)*G(b)/(G(a+b)) where G(a) is the gamma
3800function of a.  The continued fraction formulation is implemented
here, using the betacf function.  (Adapted from: Numerical Recipes in
3802C.)  Can handle multiple dimensions.
3803
3804Usage:   abetai(a,b,x,verbose=1)
3805"""
3806    TINY = 1e-15
3807    if type(a) == N.ArrayType:
3808        if asum(N.less(x,0)+N.greater(x,1)) <> 0:
3809            raise ValueError, 'Bad x in abetai'
3810    x = N.where(N.equal(x,0),TINY,x)
3811    x = N.where(N.equal(x,1.0),1-TINY,x)
3812
3813    bt = N.where(N.equal(x,0)+N.equal(x,1), 0, -1)
3814    exponents = ( gammln(a+b)-gammln(a)-gammln(b)+a*N.log(x)+b*
3815                  N.log(1.0-x) )
3816    # 746 (below) is the MAX POSSIBLE BEFORE OVERFLOW
3817    exponents = N.where(N.less(exponents,-740),-740,exponents)
3818    bt = N.exp(exponents)
3819    if type(x) == N.ArrayType:
3820        ans = N.where(N.less(x,(a+1)/(a+b+2.0)),
3821                      bt*abetacf(a,b,x,verbose)/float(a),
3822                      1.0-bt*abetacf(b,a,1.0-x,verbose)/float(b))
3823    else:
3824        if x<(a+1)/(a+b+2.0):
3825            ans = bt*abetacf(a,b,x,verbose)/float(a)
3826        else:
3827            ans = 1.0-bt*abetacf(b,a,1.0-x,verbose)/float(b)
3828    return ans
3829
3830
3831#####################################
3832#######  AANOVA CALCULATIONS  #######
3833#####################################
3834
3835 import LinearAlgebra, operator
3836 LA = LinearAlgebra
3837
3838 def aglm(data,para):
3839    """
3840Calculates a linear model fit ... anova/ancova/lin-regress/t-test/etc. Taken
3841from:
3842    Peterson et al. Statistical limitations in functional neuroimaging
3843    I. Non-inferential methods and statistical models.  Phil Trans Royal Soc
3844    Lond B 354: 1239-1260.
3845
3846Usage:   aglm(data,para)
3847Returns: statistic, p-value ???
3848"""
3849    if len(para) <> len(data):
3850        print "data and para must be same length in aglm"
3851        return
3852    n = len(para)
3853    p = pstat.aunique(para)
3854    x = N.zeros((n,len(p)))  # design matrix
3855    for l in range(len(p)):
3856        x[:,l] = N.equal(para,p[l])
3857    b = N.dot(N.dot(LA.inverse(N.dot(N.transpose(x),x)),  # i.e., b=inv(X'X)X'Y
3858                    N.transpose(x)),
3859              data)
3860    diffs = (data - N.dot(x,b))
3861    s_sq = 1./(n-len(p)) * N.dot(N.transpose(diffs), diffs)
3862
3863    if len(p) == 2:  # ttest_ind
3864        c = N.array([1,-1])
3865        df = n-2
3866        fact = asum(1.0/asum(x,0))  # i.e., 1/n1 + 1/n2 + 1/n3 ...
3867        t = N.dot(c,b) / N.sqrt(s_sq*fact)
3868        probs = abetai(0.5*df,0.5,float(df)/(df+t*t))
3869        return t, probs
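# Sketch (data assumed for illustration): with a two-level 'para' the contrast
# c = [1,-1] on the cell means reduces the GLM to an independent-samples
# t-test, here |t| ~ 2.45 on 4 df, matching attest_ind on the two halves of
# 'data'.
#
#     data = N.array([1.,2.,3.,3.,4.,5.])
#     para = N.array([1,1,1,2,2,2])
#     aglm(data, para)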
3870
3871
3872 def aF_oneway(*args):
3873    """
3874Performs a 1-way ANOVA, returning an F-value and probability given
3875any number of groups.  From Heiman, pp.394-7.
3876
3877Usage:   aF_oneway (*args)    where *args is 2 or more arrays, one per
3878                                  treatment group
3879Returns: f-value, probability
3880"""
3881    na = len(args)            # ANOVA on 'na' groups, each in it's own array
3882    means = [0]*na
3883    vars = [0]*na
3884    ns = [0]*na
3885    alldata = []
3886    tmp = map(N.array,args)
3887    means = map(amean,tmp)
3888    vars = map(avar,tmp)
3889    ns = map(len,args)
3890    alldata = N.concatenate(args)
3891    bign = len(alldata)
3892    sstot = ass(alldata)-(asquare_of_sums(alldata)/float(bign))
3893    ssbn = 0
3894    for a in args:
3895        ssbn = ssbn + asquare_of_sums(N.array(a))/float(len(a))
3896    ssbn = ssbn - (asquare_of_sums(alldata)/float(bign))
3897    sswn = sstot-ssbn
3898    dfbn = na-1
3899    dfwn = bign - na
3900    msb = ssbn/float(dfbn)
3901    msw = sswn/float(dfwn)
3902    f = msb/msw
3903    prob = fprob(dfbn,dfwn,f)
3904    return f, prob
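# Worked sketch (groups assumed for illustration): three groups [1,2,3],
# [2,3,4], [3,4,5] give SSbetween = 6 on 2 df and SSwithin = 6 on 6 df, so
# F = 3.0/1.0 = 3.0 and p = abetai(3, 1, 6/12) = 0.5**3 = 0.125.
#
#     aF_oneway(N.array([1.,2.,3.]), N.array([2.,3.,4.]), N.array([3.,4.,5.]))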
3905
3906
3907 def aF_value (ER,EF,dfR,dfF):
3908    """
3909Returns an F-statistic given the following:
3910        ER  = error associated with the null hypothesis (the Restricted model)
3911        EF  = error associated with the alternate hypothesis (the Full model)
        dfR = degrees of freedom of the Restricted model
        dfF = degrees of freedom associated with the Full model
3914"""
3915    return ((ER-EF)/float(dfR-dfF) / (EF/float(dfF)))
3916
3917
3918 def outputfstats(Enum, Eden, dfnum, dfden, f, prob):
3919     Enum = round(Enum,3)
3920     Eden = round(Eden,3)
     dfnum = round(dfnum,3)
3922     dfden = round(dfden,3)
3923     f = round(f,3)
3924     prob = round(prob,3)
3925     suffix = ''                       # for *s after the p-value
3926     if  prob < 0.001:  suffix = '  ***'
3927     elif prob < 0.01:  suffix = '  **'
3928     elif prob < 0.05:  suffix = '  *'
3929     title = [['EF/ER','DF','Mean Square','F-value','prob','']]
3930     lofl = title+[[Enum, dfnum, round(Enum/float(dfnum),3), f, prob, suffix],
3931                   [Eden, dfden, round(Eden/float(dfden),3),'','','']]
3932     pstat.printcc(lofl)
3933     return
3934
3935
3936 def F_value_multivariate(ER, EF, dfnum, dfden):
3937     """
3938Returns an F-statistic given the following:
3939        ER  = error associated with the null hypothesis (the Restricted model)
3940        EF  = error associated with the alternate hypothesis (the Full model)
        dfnum = numerator degrees of freedom (dfR - dfF)
        dfden = denominator degrees of freedom (those of the Full model)
3943where ER and EF are matrices from a multivariate F calculation.
3944"""
3945     if type(ER) in [IntType, FloatType]:
3946         ER = N.array([[ER]])
3947     if type(EF) in [IntType, FloatType]:
3948         EF = N.array([[EF]])
3949     n_um = (LA.determinant(ER) - LA.determinant(EF)) / float(dfnum)
3950     d_en = LA.determinant(EF) / float(dfden)
3951     return n_um / d_en
3952
3953
3954#####################################
3955#######  ASUPPORT FUNCTIONS  ########
3956#####################################
3957
3958 def asign(a):
3959    """
3960Usage:   asign(a)
3961Returns: array shape of a, with -1 where a<0 and +1 where a>=0
3962"""
3963    a = N.asarray(a)
3964    if ((type(a) == type(1.4)) or (type(a) == type(1))):
3965        return a-a-N.less(a,0)+N.greater(a,0)
3966    else:
3967        return N.zeros(N.shape(a))-N.less(a,0)+N.greater(a,0)
3968
3969
3970 def asum (a, dimension=None,keepdims=0):
3971     """
3972An alternative to the Numeric.add.reduce function, which allows one to
3973(1) collapse over multiple dimensions at once, and/or (2) to retain
all dimensions in the original array (squashing the collapsed ones down to size 1).
3975Dimension can equal None (ravel array first), an integer (the
3976dimension over which to operate), or a sequence (operate over multiple
3977dimensions).  If keepdims=1, the resulting array will have as many
3978dimensions as the input array.
3979
3980Usage:   asum(a, dimension=None, keepdims=0)
3981Returns: array summed along 'dimension'(s), same _number_ of dims if keepdims=1
3982"""
3983     if type(a) == N.ArrayType and a.typecode() in ['l','s','b']:
3984         a = a.astype(N.Float)
3985     if dimension == None:
3986         s = N.sum(N.ravel(a))
3987     elif type(dimension) in [IntType,FloatType]:
3988         s = N.add.reduce(a, dimension)
3989         if keepdims == 1:
3990             shp = list(a.shape)
3991             shp[dimension] = 1
3992             s = N.reshape(s,shp)
3993     else: # must be a SEQUENCE of dims to sum over
3994        dims = list(dimension)
3995        dims.sort()
3996        dims.reverse()
3997        s = a *1.0
3998        for dim in dims:
3999            s = N.add.reduce(s,dim)
4000        if keepdims == 1:
4001            shp = list(a.shape)
4002            for dim in dims:
4003                shp[dim] = 1
4004            s = N.reshape(s,shp)
4005     return s
4006
4007
4008 def acumsum (a,dimension=None):
4009    """
4010Returns an array consisting of the cumulative sum of the items in the
4011passed array.  Dimension can equal None (ravel array first), an
4012integer (the dimension over which to operate), or a sequence (operate
4013over multiple dimensions, but this last one just barely makes sense).
4014
4015Usage:   acumsum(a,dimension=None)
4016"""
4017    if dimension == None:
4018        a = N.ravel(a)
4019        dimension = 0
4020    if type(dimension) in [ListType, TupleType, N.ArrayType]:
4021        dimension = list(dimension)
4022        dimension.sort()
4023        dimension.reverse()
4024        for d in dimension:
4025            a = N.add.accumulate(a,d)
4026        return a
4027    else:
4028        return N.add.accumulate(a,dimension)
4029
4030
4031 def ass(inarray, dimension=None, keepdims=0):
4032    """
4033Squares each value in the passed array, adds these squares & returns
4034the result.  Unfortunate function name. :-) Defaults to ALL values in
4035the array.  Dimension can equal None (ravel array first), an integer
4036(the dimension over which to operate), or a sequence (operate over
4037multiple dimensions).  Set keepdims=1 to maintain the original number
4038of dimensions.
4039
4040Usage:   ass(inarray, dimension=None, keepdims=0)
4041Returns: sum-along-'dimension' for (inarray*inarray)
4042"""
4043    if dimension == None:
4044        inarray = N.ravel(inarray)
4045        dimension = 0
4046    return asum(inarray*inarray,dimension,keepdims)
4047
4048
4049 def asummult (array1,array2,dimension=None,keepdims=0):
4050    """
4051Multiplies elements in array1 and array2, element by element, and
4052returns the sum (along 'dimension') of all resulting multiplications.
4053Dimension can equal None (ravel array first), an integer (the
4054dimension over which to operate), or a sequence (operate over multiple
4055dimensions).  A trivial function, but included for completeness.
4056
4057Usage:   asummult(array1,array2,dimension=None,keepdims=0)
4058"""
4059    if dimension == None:
4060        array1 = N.ravel(array1)
4061        array2 = N.ravel(array2)
4062        dimension = 0
4063    return asum(array1*array2,dimension,keepdims)
4064
4065
4066 def asquare_of_sums(inarray, dimension=None, keepdims=0):
4067    """
4068Adds the values in the passed array, squares that sum, and returns the
4069result.  Dimension can equal None (ravel array first), an integer (the
4070dimension over which to operate), or a sequence (operate over multiple
4071dimensions).  If keepdims=1, the returned array will have the same
4072NUMBER of dimensions as the original.
4073
4074Usage:   asquare_of_sums(inarray, dimension=None, keepdims=0)
4075Returns: the square of the sum over dim(s) in dimension
4076"""
4077    if dimension == None:
4078        inarray = N.ravel(inarray)
4079        dimension = 0
4080    s = asum(inarray,dimension,keepdims)
4081    if type(s) == N.ArrayType:
4082        return s.astype(N.Float)*s
4083    else:
4084        return float(s)*s
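# Disambiguation sketch (values assumed for illustration): ass() returns the
# sum of squares while asquare_of_sums() returns the square of the sum; the
# names are easy to confuse.
#
#     a = N.array([1., 2., 3.])
#     ass(a)                 # 1 + 4 + 9      -> 14.0
#     asquare_of_sums(a)     # (1 + 2 + 3)**2 -> 36.0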
4085
4086
4087 def asumdiffsquared(a,b, dimension=None, keepdims=0):
4088    """
4089Takes pairwise differences of the values in arrays a and b, squares
4090these differences, and returns the sum of these squares.  Dimension
4091can equal None (ravel array first), an integer (the dimension over
4092which to operate), or a sequence (operate over multiple dimensions).
4093keepdims=1 means the return shape = len(a.shape) = len(b.shape)
4094
4095Usage:   asumdiffsquared(a,b)
4096Returns: sum[ravel(a-b)**2]
4097"""
4098    if dimension == None:
4099        inarray = N.ravel(a)
4100        dimension = 0
4101    return asum((a-b)**2,dimension,keepdims)
4102
4103
4104 def ashellsort(inarray):
4105    """
4106Shellsort algorithm.  Sorts a 1D-array.
4107
4108Usage:   ashellsort(inarray)
4109Returns: sorted-inarray, sorting-index-vector (for original array)
4110"""
4111    n = len(inarray)
4112    svec = inarray *1.0
4113    ivec = range(n)
4114    gap = n/2   # integer division needed
4115    while gap >0:
4116        for i in range(gap,n):
4117            for j in range(i-gap,-1,-gap):
4118                while j>=0 and svec[j]>svec[j+gap]:
4119                    temp        = svec[j]
4120                    svec[j]     = svec[j+gap]
4121                    svec[j+gap] = temp
4122                    itemp       = ivec[j]
4123                    ivec[j]     = ivec[j+gap]
4124                    ivec[j+gap] = itemp
4125        gap = gap / 2  # integer division needed
4126#    svec is now sorted input vector, ivec has the order svec[i] = vec[ivec[i]]
4127    return svec, ivec
4128
4129
4130 def arankdata(inarray):
4131    """
Ranks the data in inarray, dealing with ties appropriately.  Assumes
4133a 1D inarray.  Adapted from Gary Perlman's |Stat ranksort.
4134
4135Usage:   arankdata(inarray)
4136Returns: array of length equal to inarray, containing rank scores
4137"""
4138    n = len(inarray)
4139    svec, ivec = ashellsort(inarray)
4140    sumranks = 0
4141    dupcount = 0
4142    newarray = N.zeros(n,N.Float)
4143    for i in range(n):
4144        sumranks = sumranks + i
4145        dupcount = dupcount + 1
4146        if i==n-1 or svec[i] <> svec[i+1]:
4147            averank = sumranks / float(dupcount) + 1
4148            for j in range(i-dupcount+1,i+1):
4149                newarray[ivec[j]] = averank
4150            sumranks = 0
4151            dupcount = 0
4152    return newarray
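# Usage sketch (scores assumed for illustration): tied scores receive the mean
# of the ranks they occupy.
#
#     arankdata(N.array([40., 10., 30., 10., 50.]))   # -> [ 4.   1.5  3.   1.5  5. ]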
4153
4154
4155 def afindwithin(data):
4156    """
4157Returns a binary vector, 1=within-subject factor, 0=between.  Input
4158equals the entire data array (i.e., column 1=random factor, last
column = measured values).
4160
4161Usage:   afindwithin(data)     data in |Stat format
4162"""
4163    numfact = len(data[0])-2
4164    withinvec = [0]*numfact
4165    for col in range(1,numfact+1):
4166        rows = pstat.linexand(data,col,pstat.unique(pstat.colex(data,1))[0])  # get 1 level of this factor
4167        if len(pstat.unique(pstat.colex(rows,0))) < len(rows):   # if fewer subjects than scores on this factor
4168            withinvec[col-1] = 1
4169    return withinvec
4170
4171
4172 #########################################################
4173 #########################################################
4174 ######  RE-DEFINE DISPATCHES TO INCLUDE ARRAYS  #########
4175 #########################################################
4176 #########################################################
4177
4178## CENTRAL TENDENCY:
4179 geometricmean = Dispatch ( (lgeometricmean, (ListType, TupleType)),
4180                            (ageometricmean, (N.ArrayType,)) )
4181 harmonicmean = Dispatch ( (lharmonicmean, (ListType, TupleType)),
4182                           (aharmonicmean, (N.ArrayType,)) )
4183 mean = Dispatch ( (lmean, (ListType, TupleType)),
4184                   (amean, (N.ArrayType,)) )
4185 median = Dispatch ( (lmedian, (ListType, TupleType)),
4186                     (amedian, (N.ArrayType,)) )
4187 medianscore = Dispatch ( (lmedianscore, (ListType, TupleType)),
4188                          (amedianscore, (N.ArrayType,)) )
4189 mode = Dispatch ( (lmode, (ListType, TupleType)),
4190                   (amode, (N.ArrayType,)) )
4191 tmean = Dispatch ( (atmean, (N.ArrayType,)) )
4192 tvar = Dispatch ( (atvar, (N.ArrayType,)) )
4193 tstdev = Dispatch ( (atstdev, (N.ArrayType,)) )
4194 tsem = Dispatch ( (atsem, (N.ArrayType,)) )
4195
4196## VARIATION:
4197 moment = Dispatch ( (lmoment, (ListType, TupleType)),
4198                     (amoment, (N.ArrayType,)) )
4199 variation = Dispatch ( (lvariation, (ListType, TupleType)),
4200                        (avariation, (N.ArrayType,)) )
4201 skew = Dispatch ( (lskew, (ListType, TupleType)),
4202                   (askew, (N.ArrayType,)) )
4203 kurtosis = Dispatch ( (lkurtosis, (ListType, TupleType)),
4204                       (akurtosis, (N.ArrayType,)) )
4205 describe = Dispatch ( (ldescribe, (ListType, TupleType)),
4206                       (adescribe, (N.ArrayType,)) )
4207
4208## DISTRIBUTION TESTS
4209
4210 skewtest = Dispatch ( (askewtest, (ListType, TupleType)),
4211                       (askewtest, (N.ArrayType,)) )
4212 kurtosistest = Dispatch ( (akurtosistest, (ListType, TupleType)),
4213                           (akurtosistest, (N.ArrayType,)) )
4214 normaltest = Dispatch ( (anormaltest, (ListType, TupleType)),
4215                         (anormaltest, (N.ArrayType,)) )
4216
4217## FREQUENCY STATS:
4218 itemfreq = Dispatch ( (litemfreq, (ListType, TupleType)),
4219                       (aitemfreq, (N.ArrayType,)) )
4220 scoreatpercentile = Dispatch ( (lscoreatpercentile, (ListType, TupleType)),
4221                                (ascoreatpercentile, (N.ArrayType,)) )
4222 percentileofscore = Dispatch ( (lpercentileofscore, (ListType, TupleType)),
4223                                 (apercentileofscore, (N.ArrayType,)) )
4224 histogram = Dispatch ( (lhistogram, (ListType, TupleType)),
4225                        (ahistogram, (N.ArrayType,)) )
4226 cumfreq = Dispatch ( (lcumfreq, (ListType, TupleType)),
4227                      (acumfreq, (N.ArrayType,)) )
4228 relfreq = Dispatch ( (lrelfreq, (ListType, TupleType)),
4229                      (arelfreq, (N.ArrayType,)) )
4230 
4231## VARIABILITY:
4232 obrientransform = Dispatch ( (lobrientransform, (ListType, TupleType)),
4233                              (aobrientransform, (N.ArrayType,)) )
4234 samplevar = Dispatch ( (lsamplevar, (ListType, TupleType)),
4235                        (asamplevar, (N.ArrayType,)) )
4236 samplestdev = Dispatch ( (lsamplestdev, (ListType, TupleType)),
4237                          (asamplestdev, (N.ArrayType,)) )
4238 signaltonoise = Dispatch( (asignaltonoise, (N.ArrayType,)),)
4239 var = Dispatch ( (lvar, (ListType, TupleType)),
4240                  (avar, (N.ArrayType,)) )
4241 stdev = Dispatch ( (lstdev, (ListType, TupleType)),
4242                    (astdev, (N.ArrayType,)) )
4243 sterr = Dispatch ( (lsterr, (ListType, TupleType)),
4244                    (asterr, (N.ArrayType,)) )
4245 sem = Dispatch ( (lsem, (ListType, TupleType)),
4246                  (asem, (N.ArrayType,)) )
4247 z = Dispatch ( (lz, (ListType, TupleType)),
4248                (az, (N.ArrayType,)) )
4249 zs = Dispatch ( (lzs, (ListType, TupleType)),
4250                 (azs, (N.ArrayType,)) )
4251 
4252## TRIMMING FCNS:
4253 threshold = Dispatch( (athreshold, (N.ArrayType,)),)
4254 trimboth = Dispatch ( (ltrimboth, (ListType, TupleType)),
4255                       (atrimboth, (N.ArrayType,)) )
4256 trim1 = Dispatch ( (ltrim1, (ListType, TupleType)),
4257                    (atrim1, (N.ArrayType,)) )
4258 
4259## CORRELATION FCNS:
4260 paired = Dispatch ( (lpaired, (ListType, TupleType)),
4261                     (apaired, (N.ArrayType,)) )
4262 pearsonr = Dispatch ( (lpearsonr, (ListType, TupleType)),
4263                       (apearsonr, (N.ArrayType,)) )
4264 spearmanr = Dispatch ( (lspearmanr, (ListType, TupleType)),
4265                        (aspearmanr, (N.ArrayType,)) )
4266 pointbiserialr = Dispatch ( (lpointbiserialr, (ListType, TupleType)),
4267                             (apointbiserialr, (N.ArrayType,)) )
4268 kendalltau = Dispatch ( (lkendalltau, (ListType, TupleType)),
4269                         (akendalltau, (N.ArrayType,)) )
4270 linregress = Dispatch ( (llinregress, (ListType, TupleType)),
4271                         (alinregress, (N.ArrayType,)) )
4272 
4273## INFERENTIAL STATS:
4274 ttest_1samp = Dispatch ( (lttest_1samp, (ListType, TupleType)),
4275                          (attest_1samp, (N.ArrayType,)) )
4276 ttest_ind = Dispatch ( (lttest_ind, (ListType, TupleType)),
4277                        (attest_ind, (N.ArrayType,)) )
4278 ttest_rel = Dispatch ( (lttest_rel, (ListType, TupleType)),
4279                        (attest_rel, (N.ArrayType,)) )
4280 chisquare = Dispatch ( (lchisquare, (ListType, TupleType)),
4281                        (achisquare, (N.ArrayType,)) )
4282 ks_2samp = Dispatch ( (lks_2samp, (ListType, TupleType)),
4283                       (aks_2samp, (N.ArrayType,)) )
4284 mannwhitneyu = Dispatch ( (lmannwhitneyu, (ListType, TupleType)),
4285                           (amannwhitneyu, (N.ArrayType,)) )
4286 tiecorrect = Dispatch ( (ltiecorrect, (ListType, TupleType)),
4287                         (atiecorrect, (N.ArrayType,)) )
4288 ranksums = Dispatch ( (lranksums, (ListType, TupleType)),
4289                       (aranksums, (N.ArrayType,)) )
4290 wilcoxont = Dispatch ( (lwilcoxont, (ListType, TupleType)),
4291                        (awilcoxont, (N.ArrayType,)) )
4292 kruskalwallish = Dispatch ( (lkruskalwallish, (ListType, TupleType)),
4293                             (akruskalwallish, (N.ArrayType,)) )
4294 friedmanchisquare = Dispatch ( (lfriedmanchisquare, (ListType, TupleType)),
4295                                (afriedmanchisquare, (N.ArrayType,)) )
4296 
4297## PROBABILITY CALCS:
4298 chisqprob = Dispatch ( (lchisqprob, (IntType, FloatType)),
4299                        (achisqprob, (N.ArrayType,)) )
4300 zprob = Dispatch ( (lzprob, (IntType, FloatType)),
4301                    (azprob, (N.ArrayType,)) )
4302 ksprob = Dispatch ( (lksprob, (IntType, FloatType)),
4303                     (aksprob, (N.ArrayType,)) )
4304 fprob = Dispatch ( (lfprob, (IntType, FloatType)),
4305                    (afprob, (N.ArrayType,)) )
4306 betacf = Dispatch ( (lbetacf, (IntType, FloatType)),
4307                     (abetacf, (N.ArrayType,)) )
4308 betai = Dispatch ( (lbetai, (IntType, FloatType)),
4309                    (abetai, (N.ArrayType,)) )
4310 erfcc = Dispatch ( (lerfcc, (IntType, FloatType)),
4311                    (aerfcc, (N.ArrayType,)) )
4312 gammln = Dispatch ( (lgammln, (IntType, FloatType)),
4313                     (agammln, (N.ArrayType,)) )
4314 
4315## ANOVA FUNCTIONS:
4316 F_oneway = Dispatch ( (lF_oneway, (ListType, TupleType)),
4317                       (aF_oneway, (N.ArrayType,)) )
4318 F_value = Dispatch ( (lF_value, (ListType, TupleType)),
4319                      (aF_value, (N.ArrayType,)) )
4320
4321## SUPPORT FUNCTIONS:
4322 incr = Dispatch ( (lincr, (ListType, TupleType, N.ArrayType)), )
4323 sum = Dispatch ( (lsum, (ListType, TupleType)),
4324                  (asum, (N.ArrayType,)) )
4325 cumsum = Dispatch ( (lcumsum, (ListType, TupleType)),
4326                     (acumsum, (N.ArrayType,)) )
4327 ss = Dispatch ( (lss, (ListType, TupleType)),
4328                 (ass, (N.ArrayType,)) )
4329 summult = Dispatch ( (lsummult, (ListType, TupleType)),
4330                      (asummult, (N.ArrayType,)) )
4331 square_of_sums = Dispatch ( (lsquare_of_sums, (ListType, TupleType)),
4332                             (asquare_of_sums, (N.ArrayType,)) )
4333 sumdiffsquared = Dispatch ( (lsumdiffsquared, (ListType, TupleType)),
4334                             (asumdiffsquared, (N.ArrayType,)) )
4335 shellsort = Dispatch ( (lshellsort, (ListType, TupleType)),
4336                        (ashellsort, (N.ArrayType,)) )
4337 rankdata = Dispatch ( (lrankdata, (ListType, TupleType)),
4338                       (arankdata, (N.ArrayType,)) )
4339 findwithin = Dispatch ( (lfindwithin, (ListType, TupleType)),
4340                         (afindwithin, (N.ArrayType,)) )
4341
4342######################  END OF NUMERIC FUNCTION BLOCK  #####################
4343
4344######################  END OF STATISTICAL FUNCTIONS  ######################
4345
4346except ImportError:
4347 pass