source: orange-bioinformatics/stats.py @ 62:0e8684f83fec

Revision 62:0e8684f83fec, 147.9 KB checked in by blaz <blaz.zupan@…>, 10 years ago (diff)

no message

Line 
1# Copyright (c) 1999-2002 Gary Strangman; All Rights Reserved.
2#
3# This software is distributable under the terms of the GNU
4# General Public License (GPL) v2, the text of which can be found at
5# http://www.gnu.org/copyleft/gpl.html. Installing, importing or otherwise
6# using this module constitutes acceptance of the terms of this License.
7#
8# Disclaimer
9#
10# This software is provided "as-is".  There are no expressed or implied
11# warranties of any kind, including, but not limited to, the warranties
12# of merchantability and fittness for a given application.  In no event
13# shall Gary Strangman be liable for any direct, indirect, incidental,
14# special, exemplary or consequential damages (including, but not limited
15# to, loss of use, data or profits, or business interruption) however
16# caused and on any theory of liability, whether in contract, strict
17# liability or tort (including negligence or otherwise) arising in any way
18# out of the use of this software, even if advised of the possibility of
19# such damage.
20#
21# Comments and/or additions are welcome (send e-mail to:
22# strang@nmr.mgh.harvard.edu).
23#
24"""
25stats.py module
26
27(Requires pstat.py module.)
28
29#################################################
30#######  Written by:  Gary Strangman  ###########
31#######  Last modified:  May 10, 2002 ###########
32#################################################
33
34A collection of basic statistical functions for python.  The function
35names appear below.
36
37IMPORTANT:  There are really *3* sets of functions.  The first set has an 'l'
38prefix, which can be used with list or tuple arguments.  The second set has
39an 'a' prefix, which can accept NumPy array arguments.  These latter
40functions are defined only when NumPy is available on the system.  The third
41type has NO prefix (i.e., has the name that appears below).  Functions of
42this set are members of a "Dispatch" class, c/o David Ascher.  This class
43allows different functions to be called depending on the type of the passed
44arguments.  Thus, stats.mean is a member of the Dispatch class and
45stats.mean(range(20)) will call stats.lmean(range(20)) while
46stats.mean(Numeric.arange(20)) will call stats.amean(Numeric.arange(20)).
47This is a handy way to keep consistent function names when different
48argument types require different functions to be called.  Having
49implementated the Dispatch class, however, means that to get info on
50a given function, you must use the REAL function name ... that is
51"print stats.lmean.__doc__" or "print stats.amean.__doc__" work fine,
52while "print stats.mean.__doc__" will print the doc for the Dispatch
53class.  NUMPY FUNCTIONS ('a' prefix) generally have more argument options
54but should otherwise be consistent with the corresponding list functions.
55
56Disclaimers:  The function list is obviously incomplete and, worse, the
57functions are not optimized.  All functions have been tested (some more
58so than others), but they are far from bulletproof.  Thus, as with any
59free software, no warranty or guarantee is expressed or implied. :-)  A
60few extra functions that don't appear in the list below can be found by
61interested treasure-hunters.  These functions don't necessarily have
62both list and array versions but were deemed useful
63
64CENTRAL TENDENCY:  geometricmean
65                   harmonicmean
66                   mean
67                   median
68                   medianscore
69                   mode
70
71MOMENTS:  moment
72          variation
73          skew
74          kurtosis
75          skewtest   (for Numpy arrays only)
76          kurtosistest (for Numpy arrays only)
77          normaltest (for Numpy arrays only)
78
79ALTERED VERSIONS:  tmean  (for Numpy arrays only)
80                   tvar   (for Numpy arrays only)
81                   tmin   (for Numpy arrays only)
82                   tmax   (for Numpy arrays only)
83                   tstdev (for Numpy arrays only)
84                   tsem   (for Numpy arrays only)
85                   describe
86
87FREQUENCY STATS:  itemfreq
88                  scoreatpercentile
89                  percentileofscore
90                  histogram
91                  cumfreq
92                  relfreq
93
94VARIABILITY:  obrientransform
95              samplevar
96              samplestdev
97              signaltonoise (for Numpy arrays only)
98              var
99              stdev
100              sterr
101              sem
102              z
103              zs
104              zmap (for Numpy arrays only)
105
106TRIMMING FCNS:  threshold (for Numpy arrays only)
107                trimboth
108                trim1
109                round (round all vals to 'n' decimals; Numpy only)
110
111CORRELATION FCNS:  covariance  (for Numpy arrays only)
112                   correlation (for Numpy arrays only)
113                   paired
114                   pearsonr
115                   spearmanr
116                   pointbiserialr
117                   kendalltau
118                   linregress
119
120INFERENTIAL STATS:  ttest_1samp
121                    ttest_ind
122                    ttest_rel
123                    chisquare
124                    ks_2samp
125                    mannwhitneyu
126                    ranksums
127                    wilcoxont
128                    kruskalwallish
129                    friedmanchisquare
130
131PROBABILITY CALCS:  chisqprob
132                    erfcc
133                    zprob
134                    ksprob
135                    fprob
136                    betacf
137                    gammln
138                    betai
139
140ANOVA FUNCTIONS:  F_oneway
141                  F_value
142
143SUPPORT FUNCTIONS:  writecc
144                    incr
145                    sign  (for Numpy arrays only)
146                    sum
147                    cumsum
148                    ss
149                    summult
150                    sumdiffsquared
151                    square_of_sums
152                    shellsort
153                    rankdata
154                    outputpairedstats
155                    findwithin
156"""
157## CHANGE LOG:
158## ===========
159## 02-11-19 ... fixed attest_ind and attest_rel for div-by-zero Overflows
160## 02-05-10 ... fixed lchisqprob indentation (failed when df=even)
161## 00-12-28 ... removed aanova() to separate module, fixed licensing to
162##                   match Python License, fixed doc string & imports
163## 00-04-13 ... pulled all "global" statements, except from aanova()
164##              added/fixed lots of documentation, removed io.py dependency
165##              changed to version 0.5
166## 99-11-13 ... added asign() function
167## 99-11-01 ... changed version to 0.4 ... enough incremental changes now
168## 99-10-25 ... added acovariance and acorrelation functions
169## 99-10-10 ... fixed askew/akurtosis to avoid divide-by-zero errors
170##              added aglm function (crude, but will be improved)
171## 99-10-04 ... upgraded acumsum, ass, asummult, asamplevar, avar, etc. to
172##                   all handle lists of 'dimension's and keepdims
173##              REMOVED ar0, ar2, ar3, ar4 and replaced them with around
174##              reinserted fixes for abetai to avoid math overflows
175## 99-09-05 ... rewrote achisqprob/aerfcc/aksprob/afprob/abetacf/abetai to
176##                   handle multi-dimensional arrays (whew!)
177## 99-08-30 ... fixed l/amoment, l/askew, l/akurtosis per D'Agostino (1990)
178##              added anormaltest per same reference
179##              re-wrote azprob to calc arrays of probs all at once
180## 99-08-22 ... edited attest_ind printing section so arrays could be rounded
181## 99-08-19 ... fixed amean and aharmonicmean for non-error(!) overflow on
182##                   short/byte arrays (mean of #s btw 100-300 = -150??)
183## 99-08-09 ... fixed asum so that the None case works for Byte arrays
184## 99-08-08 ... fixed 7/3 'improvement' to handle t-calcs on N-D arrays
185## 99-07-03 ... improved attest_ind, attest_rel (zero-division errortrap)
186## 99-06-24 ... fixed bug(?) in attest_ind (n1=a.shape[0])
187## 04/11/99 ... added asignaltonoise, athreshold functions, changed all
188##                   max/min in array section to N.maximum/N.minimum,
189##                   fixed square_of_sums to prevent integer overflow
190## 04/10/99 ... !!! Changed function name ... sumsquared ==> square_of_sums
191## 03/18/99 ... Added ar0, ar2, ar3 and ar4 rounding functions
192## 02/28/99 ... Fixed aobrientransform to return an array rather than a list
193## 01/15/99 ... Essentially ceased updating list-versions of functions (!!!)
194## 01/13/99 ... CHANGED TO VERSION 0.3
195##              fixed bug in a/lmannwhitneyu p-value calculation
196## 12/31/98 ... fixed variable-name bug in ldescribe
197## 12/19/98 ... fixed bug in findwithin (fcns needed pstat. prefix)
198## 12/16/98 ... changed amedianscore to return float (not array) for 1 score
199## 12/14/98 ... added atmin and atmax functions
200##              removed umath from import line (not needed)
201##              l/ageometricmean modified to reduce chance of overflows (take
202##                   nth root first, then multiply)
203## 12/07/98 ... added __version__variable (now 0.2)
204##              removed all 'stats.' from anova() fcn
205## 12/06/98 ... changed those functions (except shellsort) that altered
206##                   arguments in-place ... cumsum, ranksort, ...
207##              updated (and fixed some) doc-strings
208## 12/01/98 ... added anova() function (requires NumPy)
209##              incorporated Dispatch class
210## 11/12/98 ... added functionality to amean, aharmonicmean, ageometricmean
211##              added 'asum' function (added functionality to N.add.reduce)
212##              fixed both moment and amoment (two errors)
213##              changed name of skewness and askewness to skew and askew
214##              fixed (a)histogram (which sometimes counted points <lowerlimit)
215
216import pstat               # required 3rd party module
217import math, string, copy  # required python modules
218from types import *
219
220__version__ = 0.5
221
222############# DISPATCH CODE ##############
223
224
225class Dispatch:
226    """
227The Dispatch class, care of David Ascher, allows different functions to
228be called depending on the argument types.  This way, there can be one
229function name regardless of the argument type.  To access function doc
230in stats.py module, prefix the function with an 'l' or 'a' for list or
231array arguments, respectively.  That is, print stats.lmean.__doc__ or
232print stats.amean.__doc__ or whatever.
233"""
234
235    def __init__(self, *tuples):
236        self._dispatch = {}
237        for func, types in tuples:
238            for t in types:
239                if t in self._dispatch.keys():
240                    raise ValueError, "can't have two dispatches on "+str(t)
241                self._dispatch[t] = func
242        self._types = self._dispatch.keys()
243
244    def __call__(self, arg1, *args, **kw):
245        if type(arg1) not in self._types:
246            raise TypeError, "don't know how to dispatch %s arguments" %  type(arg1)
247        return apply(self._dispatch[type(arg1)], (arg1,) + args, kw)
248
249
250##########################################################################
251########################   LIST-BASED FUNCTIONS   ########################
252##########################################################################
253
254### Define these regardless
255
256####################################
257#######  CENTRAL TENDENCY  #########
258####################################
259
260def lgeometricmean (inlist):
261    """
262Calculates the geometric mean of the values in the passed list.
263That is:  n-th root of (x1 * x2 * ... * xn).  Assumes a '1D' list.
264
265Usage:   lgeometricmean(inlist)
266"""
267    mult = 1.0
268    one_over_n = 1.0/len(inlist)
269    for item in inlist:
270        mult = mult * pow(item,one_over_n)
271    return mult
272
273
274def lharmonicmean (inlist):
275    """
276Calculates the harmonic mean of the values in the passed list.
277That is:  n / (1/x1 + 1/x2 + ... + 1/xn).  Assumes a '1D' list.
278
279Usage:   lharmonicmean(inlist)
280"""
281    sum = 0
282    for item in inlist:
283        sum = sum + 1.0/item
284    return len(inlist) / sum
285
286
287def lmean (inlist):
288    """
289Returns the arithematic mean of the values in the passed list.
290Assumes a '1D' list, but will function on the 1st dim of an array(!).
291
292Usage:   lmean(inlist)
293"""
294    sum = 0
295    for item in inlist:
296        sum = sum + item
297    return sum/float(len(inlist))
298
299
300def lmedian (inlist,numbins=1000):
301    """
302Returns the computed median value of a list of numbers, given the
303number of bins to use for the histogram (more bins brings the computed value
304closer to the median score, default number of bins = 1000).  See G.W.
305Heiman's Basic Stats (1st Edition), or CRC Probability & Statistics.
306
307Usage:   lmedian (inlist, numbins=1000)
308"""
309    (hist, smallest, binsize, extras) = histogram(inlist,numbins) # make histog
310    cumhist = cumsum(hist)              # make cumulative histogram
311    for i in range(len(cumhist)):        # get 1st(!) index holding 50%ile score
312        if cumhist[i]>=len(inlist)/2.0:
313            cfbin = i
314            break
315    LRL = smallest + binsize*cfbin        # get lower read limit of that bin
316    cfbelow = cumhist[cfbin-1]
317    freq = float(hist[cfbin])                # frequency IN the 50%ile bin
318    median = LRL + ((len(inlist)/2.0 - cfbelow)/float(freq))*binsize  # median formula
319    return median
320
321
322def lmedianscore (inlist):
323    """
324Returns the 'middle' score of the passed list.  If there is an even
325number of scores, the mean of the 2 middle scores is returned.
326
327Usage:   lmedianscore(inlist)
328"""
329
330    newlist = copy.deepcopy(inlist)
331    newlist.sort()
332    if len(newlist) % 2 == 0:   # if even number of scores, average middle 2
333        index = len(newlist)/2  # integer division correct
334        median = float(newlist[index] + newlist[index-1]) /2
335    else:
336        index = len(newlist)/2  # int divsion gives mid value when count from 0
337        median = newlist[index]
338    return median
339
340
341def lmode(inlist):
342    """
343Returns a list of the modal (most common) score(s) in the passed
344list.  If there is more than one such score, all are returned.  The
345bin-count for the mode(s) is also returned.
346
347Usage:   lmode(inlist)
348Returns: bin-count for mode(s), a list of modal value(s)
349"""
350
351    scores = pstat.unique(inlist)
352    scores.sort()
353    freq = []
354    for item in scores:
355        freq.append(inlist.count(item))
356    maxfreq = max(freq)
357    mode = []
358    stillmore = 1
359    while stillmore:
360        try:
361            indx = freq.index(maxfreq)
362            mode.append(scores[indx])
363            del freq[indx]
364            del scores[indx]
365        except ValueError:
366            stillmore=0
367    return maxfreq, mode
368
369
370####################################
371############  MOMENTS  #############
372####################################
373
374def lmoment(inlist,moment=1):
375    """
376Calculates the nth moment about the mean for a sample (defaults to
377the 1st moment).  Used to calculate coefficients of skewness and kurtosis.
378
379Usage:   lmoment(inlist,moment=1)
380Returns: appropriate moment (r) from ... 1/n * SUM((inlist(i)-mean)**r)
381"""
382    if moment == 1:
383        return 0.0
384    else:
385        mn = mean(inlist)
386        n = len(inlist)
387        s = 0
388        for x in inlist:
389            s = s + (x-mn)**moment
390        return s/float(n)
391
392
393def lvariation(inlist):
394    """
395Returns the coefficient of variation, as defined in CRC Standard
396Probability and Statistics, p.6.
397
398Usage:   lvariation(inlist)
399"""
400    return 100.0*samplestdev(inlist)/float(mean(inlist))
401
402
403def lskew(inlist):
404    """
405Returns the skewness of a distribution, as defined in Numerical
406Recipies (alternate defn in CRC Standard Probability and Statistics, p.6.)
407
408Usage:   lskew(inlist)
409"""
410    return moment(inlist,3)/pow(moment(inlist,2),1.5)
411
412
413def lkurtosis(inlist):
414    """
415Returns the kurtosis of a distribution, as defined in Numerical
416Recipies (alternate defn in CRC Standard Probability and Statistics, p.6.)
417
418Usage:   lkurtosis(inlist)
419"""
420    return moment(inlist,4)/pow(moment(inlist,2),2.0)
421
422
423def ldescribe(inlist):
424    """
425Returns some descriptive statistics of the passed list (assumed to be 1D).
426
427Usage:   ldescribe(inlist)
428Returns: n, mean, standard deviation, skew, kurtosis
429"""
430    n = len(inlist)
431    mm = (min(inlist),max(inlist))
432    m = mean(inlist)
433    sd = stdev(inlist)
434    sk = skew(inlist)
435    kurt = kurtosis(inlist)
436    return n, mm, m, sd, sk, kurt
437
438
439####################################
440#######  FREQUENCY STATS  ##########
441####################################
442
443def litemfreq(inlist):
444    """
445Returns a list of pairs.  Each pair consists of one of the scores in inlist
446and it's frequency count.  Assumes a 1D list is passed.
447
448Usage:   litemfreq(inlist)
449Returns: a 2D frequency table (col [0:n-1]=scores, col n=frequencies)
450"""
451    scores = pstat.unique(inlist)
452    scores.sort()
453    freq = []
454    for item in scores:
455        freq.append(inlist.count(item))
456    return pstat.abut(scores, freq)
457
458
459def lscoreatpercentile (inlist, percent):
460    """
461Returns the score at a given percentile relative to the distribution
462given by inlist.
463
464Usage:   lscoreatpercentile(inlist,percent)
465"""
466    if percent > 1:
467        print "\nDividing percent>1 by 100 in lscoreatpercentile().\n"
468        percent = percent / 100.0
469    targetcf = percent*len(inlist)
470    h, lrl, binsize, extras = histogram(inlist)
471    cumhist = cumsum(copy.deepcopy(h))
472    for i in range(len(cumhist)):
473        if cumhist[i] >= targetcf:
474            break
475    score = binsize * ((targetcf - cumhist[i-1]) / float(h[i])) + (lrl+binsize*i)
476    return score
477
478
479def lpercentileofscore (inlist, score,histbins=10,defaultlimits=None):
480    """
481Returns the percentile value of a score relative to the distribution
482given by inlist.  Formula depends on the values used to histogram the data(!).
483
484Usage:   lpercentileofscore(inlist,score,histbins=10,defaultlimits=None)
485"""
486
487    h, lrl, binsize, extras = histogram(inlist,histbins,defaultlimits)
488    cumhist = cumsum(copy.deepcopy(h))
489    i = int((score - lrl)/float(binsize))
490    pct = (cumhist[i-1]+((score-(lrl+binsize*i))/float(binsize))*h[i])/float(len(inlist)) * 100
491    return pct
492
493
494def lhistogram (inlist,numbins=10,defaultreallimits=None,printextras=0):
495    """
496Returns (i) a list of histogram bin counts, (ii) the smallest value
497of the histogram binning, and (iii) the bin width (the last 2 are not
498necessarily integers).  Default number of bins is 10.  If no sequence object
499is given for defaultreallimits, the routine picks (usually non-pretty) bins
500spanning all the numbers in the inlist.
501
502Usage:   lhistogram (inlist, numbins=10, defaultreallimits=None,suppressoutput=0)
503Returns: list of bin values, lowerreallimit, binsize, extrapoints
504"""
505    if (defaultreallimits <> None):
506        if type(defaultreallimits) not in [ListType,TupleType] or len(defaultreallimits)==1: # only one limit given, assumed to be lower one & upper is calc'd
507            lowerreallimit = defaultreallimits
508            upperreallimit = 1.0001 * max(inlist)
509        else: # assume both limits given
510            lowerreallimit = defaultreallimits[0]
511            upperreallimit = defaultreallimits[1]
512        binsize = (upperreallimit-lowerreallimit)/float(numbins)
513    else:     # no limits given for histogram, both must be calc'd
514        estbinwidth=(max(inlist)-min(inlist))/float(numbins) + 1 # 1=>cover all
515        binsize = ((max(inlist)-min(inlist)+estbinwidth))/float(numbins)
516        lowerreallimit = min(inlist) - binsize/2 #lower real limit,1st bin
517    bins = [0]*(numbins)
518    extrapoints = 0
519    for num in inlist:
520        try:
521            if (num-lowerreallimit) < 0:
522                extrapoints = extrapoints + 1
523            else:
524                bintoincrement = int((num-lowerreallimit)/float(binsize))
525                bins[bintoincrement] = bins[bintoincrement] + 1
526        except:
527            extrapoints = extrapoints + 1
528    if (extrapoints > 0 and printextras == 1):
529        print '\nPoints outside given histogram range =',extrapoints
530    return (bins, lowerreallimit, binsize, extrapoints)
531
532
533def lcumfreq(inlist,numbins=10,defaultreallimits=None):
534    """
535Returns a cumulative frequency histogram, using the histogram function.
536
537Usage:   lcumfreq(inlist,numbins=10,defaultreallimits=None)
538Returns: list of cumfreq bin values, lowerreallimit, binsize, extrapoints
539"""
540    h,l,b,e = histogram(inlist,numbins,defaultreallimits)
541    cumhist = cumsum(copy.deepcopy(h))
542    return cumhist,l,b,e
543
544
545def lrelfreq(inlist,numbins=10,defaultreallimits=None):
546    """
547Returns a relative frequency histogram, using the histogram function.
548
549Usage:   lrelfreq(inlist,numbins=10,defaultreallimits=None)
550Returns: list of cumfreq bin values, lowerreallimit, binsize, extrapoints
551"""
552    h,l,b,e = histogram(inlist,numbins,defaultreallimits)
553    for i in range(len(h)):
554        h[i] = h[i]/float(len(inlist))
555    return h,l,b,e
556
557
558####################################
559#####  VARIABILITY FUNCTIONS  ######
560####################################
561
562def lobrientransform(*args):
563    """
564Computes a transform on input data (any number of columns).  Used to
565test for homogeneity of variance prior to running one-way stats.  From
566Maxwell and Delaney, p.112.
567
568Usage:   lobrientransform(*args)
569Returns: transformed data for use in an ANOVA
570"""
571    TINY = 1e-10
572    k = len(args)
573    n = [0.0]*k
574    v = [0.0]*k
575    m = [0.0]*k
576    nargs = []
577    for i in range(k):
578        nargs.append(copy.deepcopy(args[i]))
579        n[i] = float(len(nargs[i]))
580        v[i] = var(nargs[i])
581        m[i] = mean(nargs[i])
582    for j in range(k):
583        for i in range(n[j]):
584            t1 = (n[j]-1.5)*n[j]*(nargs[j][i]-m[j])**2
585            t2 = 0.5*v[j]*(n[j]-1.0)
586            t3 = (n[j]-1.0)*(n[j]-2.0)
587            nargs[j][i] = (t1-t2) / float(t3)
588    check = 1
589    for j in range(k):
590        if v[j] - mean(nargs[j]) > TINY:
591            check = 0
592    if check <> 1:
593        raise ValueError, 'Problem in obrientransform.'
594    else:
595        return nargs
596
597
598def lsamplevar (inlist):
599    """
600Returns the variance of the values in the passed list using
601N for the denominator (i.e., DESCRIBES the sample variance only).
602
603Usage:   lsamplevar(inlist)
604"""
605    n = len(inlist)
606    mn = mean(inlist)
607    deviations = []
608    for item in inlist:
609        deviations.append(item-mn)
610    return ss(deviations)/float(n)
611
612
613def lsamplestdev (inlist):
614    """
615Returns the standard deviation of the values in the passed list using
616N for the denominator (i.e., DESCRIBES the sample stdev only).
617
618Usage:   lsamplestdev(inlist)
619"""
620    return math.sqrt(samplevar(inlist))
621
622
623def lvar (inlist):
624    """
625Returns the variance of the values in the passed list using N-1
626for the denominator (i.e., for estimating population variance).
627
628Usage:   lvar(inlist)
629"""
630    n = len(inlist)
631    mn = mean(inlist)
632    deviations = [0]*len(inlist)
633    for i in range(len(inlist)):
634        deviations[i] = inlist[i] - mn
635    return ss(deviations)/float(n-1)
636
637
638def lstdev (inlist):
639    """
640Returns the standard deviation of the values in the passed list
641using N-1 in the denominator (i.e., to estimate population stdev).
642
643Usage:   lstdev(inlist)
644"""
645    return math.sqrt(var(inlist))
646
647
648def lsterr(inlist):
649    """
650Returns the standard error of the values in the passed list using N-1
651in the denominator (i.e., to estimate population standard error).
652
653Usage:   lsterr(inlist)
654"""
655    return stdev(inlist) / float(math.sqrt(len(inlist)))
656
657
658def lsem (inlist):
659    """
660Returns the estimated standard error of the mean (sx-bar) of the
661values in the passed list.  sem = stdev / sqrt(n)
662
663Usage:   lsem(inlist)
664"""
665    sd = stdev(inlist)
666    n = len(inlist)
667    return sd/math.sqrt(n)
668
669
670def lz (inlist, score):
671    """
672Returns the z-score for a given input score, given that score and the
673list from which that score came.  Not appropriate for population calculations.
674
675Usage:   lz(inlist, score)
676"""
677    z = (score-mean(inlist))/samplestdev(inlist)
678    return z
679
680
681def lzs (inlist):
682    """
683Returns a list of z-scores, one for each score in the passed list.
684
685Usage:   lzs(inlist)
686"""
687    zscores = []
688    for item in inlist:
689        zscores.append(z(inlist,item))
690    return zscores
691
692
693####################################
694#######  TRIMMING FUNCTIONS  #######
695####################################
696
697def ltrimboth (l,proportiontocut):
698    """
699Slices off the passed proportion of items from BOTH ends of the passed
700list (i.e., with proportiontocut=0.1, slices 'leftmost' 10% AND 'rightmost'
70110% of scores.  Assumes list is sorted by magnitude.  Slices off LESS if
702proportion results in a non-integer slice index (i.e., conservatively
703slices off proportiontocut).
704
705Usage:   ltrimboth (l,proportiontocut)
706Returns: trimmed version of list l
707"""
708    lowercut = int(proportiontocut*len(l))
709    uppercut = len(l) - lowercut
710    return l[lowercut:uppercut]
711
712
713def ltrim1 (l,proportiontocut,tail='right'):
714    """
715Slices off the passed proportion of items from ONE end of the passed
716list (i.e., if proportiontocut=0.1, slices off 'leftmost' or 'rightmost'
71710% of scores).  Slices off LESS if proportion results in a non-integer
718slice index (i.e., conservatively slices off proportiontocut).
719
720Usage:   ltrim1 (l,proportiontocut,tail='right')  or set tail='left'
721Returns: trimmed version of list l
722"""
723    if tail == 'right':
724        lowercut = 0
725        uppercut = len(l) - int(proportiontocut*len(l))
726    elif tail == 'left':
727        lowercut = int(proportiontocut*len(l))
728        uppercut = len(l)
729    return l[lowercut:uppercut]
730
731
732####################################
733#####  CORRELATION FUNCTIONS  ######
734####################################
735
736def lpaired(x,y):
737    """
738Interactively determines the type of data and then runs the
739appropriated statistic for paired group data.
740
741Usage:   lpaired(x,y)
742Returns: appropriate statistic name, value, and probability
743"""
744    samples = ''
745    while samples not in ['i','r','I','R','c','C']:
746        print '\nIndependent or related samples, or correlation (i,r,c): ',
747        samples = raw_input()
748
749    if samples in ['i','I','r','R']:
750        print '\nComparing variances ...',
751# USE O'BRIEN'S TEST FOR HOMOGENEITY OF VARIANCE, Maxwell & delaney, p.112
752        r = obrientransform(x,y)
753        f,p = F_oneway(pstat.colex(r,0),pstat.colex(r,1))
754        if p<0.05:
755            vartype='unequal, p='+str(round(p,4))
756        else:
757            vartype='equal'
758        print vartype
759        if samples in ['i','I']:
760            if vartype[0]=='e':
761                t,p = ttest_ind(x,y,0)
762                print '\nIndependent samples t-test:  ', round(t,4),round(p,4)
763            else:
764                if len(x)>20 or len(y)>20:
765                    z,p = ranksums(x,y)
766                    print '\nRank Sums test (NONparametric, n>20):  ', round(z,4),round(p,4)
767                else:
768                    u,p = mannwhitneyu(x,y)
769                    print '\nMann-Whitney U-test (NONparametric, ns<20):  ', round(u,4),round(p,4)
770
771        else:  # RELATED SAMPLES
772            if vartype[0]=='e':
773                t,p = ttest_rel(x,y,0)
774                print '\nRelated samples t-test:  ', round(t,4),round(p,4)
775            else:
776                t,p = ranksums(x,y)
777                print '\nWilcoxon T-test (NONparametric):  ', round(t,4),round(p,4)
778    else:  # CORRELATION ANALYSIS
779        corrtype = ''
780        while corrtype not in ['c','C','r','R','d','D']:
781            print '\nIs the data Continuous, Ranked, or Dichotomous (c,r,d): ',
782            corrtype = raw_input()
783        if corrtype in ['c','C']:
784            m,b,r,p,see = linregress(x,y)
785            print '\nLinear regression for continuous variables ...'
786            lol = [['Slope','Intercept','r','Prob','SEestimate'],[round(m,4),round(b,4),round(r,4),round(p,4),round(see,4)]]
787            pstat.printcc(lol)
788        elif corrtype in ['r','R']:
789            r,p = spearmanr(x,y)
790            print '\nCorrelation for ranked variables ...'
791            print "Spearman's r: ",round(r,4),round(p,4)
792        else: # DICHOTOMOUS
793            r,p = pointbiserialr(x,y)
794            print '\nAssuming x contains a dichotomous variable ...'
795            print 'Point Biserial r: ',round(r,4),round(p,4)
796    print '\n\n'
797    return None
798
799
800def lpearsonr(x,y):
801    """
802Calculates a Pearson correlation coefficient and the associated
803probability value.  Taken from Heiman's Basic Statistics for the Behav.
804Sci (2nd), p.195.
805
806Usage:   lpearsonr(x,y)      where x and y are equal-length lists
807Returns: Pearson's r value, two-tailed p-value
808"""
809    TINY = 1.0e-30
810    if len(x) <> len(y):
811        raise ValueError, 'Input values not paired in pearsonr.  Aborting.'
812    n = len(x)
813    x = map(float,x)
814    y = map(float,y)
815    xmean = mean(x)
816    ymean = mean(y)
817    r_num = n*(summult(x,y)) - sum(x)*sum(y)
818    r_den = math.sqrt((n*ss(x) - square_of_sums(x))*(n*ss(y)-square_of_sums(y)))
819    r = (r_num / r_den)  # denominator already a float
820    df = n-2
821    t = r*math.sqrt(df/((1.0-r+TINY)*(1.0+r+TINY)))
822    prob = betai(0.5*df,0.5,df/float(df+t*t))
823    return r, prob
824
825
826def lspearmanr(x,y):
827    """
828Calculates a Spearman rank-order correlation coefficient.  Taken
829from Heiman's Basic Statistics for the Behav. Sci (1st), p.192.
830
831Usage:   lspearmanr(x,y)      where x and y are equal-length lists
832Returns: Spearman's r, two-tailed p-value
833"""
834    TINY = 1e-30
835    if len(x) <> len(y):
836        raise ValueError, 'Input values not paired in spearmanr.  Aborting.'
837    n = len(x)
838    rankx = rankdata(x)
839    ranky = rankdata(y)
840    dsq = sumdiffsquared(rankx,ranky)
841    rs = 1 - 6*dsq / float(n*(n**2-1))
842    t = rs * math.sqrt((n-2) / ((rs+1.0)*(1.0-rs)))
843    df = n-2
844    probrs = betai(0.5*df,0.5,df/(df+t*t))  # t already a float
845# probability values for rs are from part 2 of the spearman function in
846# Numerical Recipies, p.510.  They are close to tables, but not exact. (?)
847    return rs, probrs
848
849
850def lpointbiserialr(x,y):
851    """
852Calculates a point-biserial correlation coefficient and the associated
853probability value.  Taken from Heiman's Basic Statistics for the Behav.
854Sci (1st), p.194.
855
856Usage:   lpointbiserialr(x,y)      where x,y are equal-length lists
857Returns: Point-biserial r, two-tailed p-value
858"""
859    TINY = 1e-30
860    if len(x) <> len(y):
861        raise ValueError, 'INPUT VALUES NOT PAIRED IN pointbiserialr.  ABORTING.'
862    data = pstat.abut(x,y)
863    categories = pstat.unique(x)
864    if len(categories) <> 2:
865        raise ValueError, "Exactly 2 categories required for pointbiserialr()."
866    else:   # there are 2 categories, continue
867        codemap = pstat.abut(categories,range(2))
868        recoded = pstat.recode(data,codemap,0)
869        x = pstat.linexand(data,0,categories[0])
870        y = pstat.linexand(data,0,categories[1])
871        xmean = mean(pstat.colex(x,1))
872        ymean = mean(pstat.colex(y,1))
873        n = len(data)
874        adjust = math.sqrt((len(x)/float(n))*(len(y)/float(n)))
875        rpb = (ymean - xmean)/samplestdev(pstat.colex(data,1))*adjust
876        df = n-2
877        t = rpb*math.sqrt(df/((1.0-rpb+TINY)*(1.0+rpb+TINY)))
878        prob = betai(0.5*df,0.5,df/(df+t*t))  # t already a float
879        return rpb, prob
880
881
882def lkendalltau(x,y):
883    """
884Calculates Kendall's tau ... correlation of ordinal data.  Adapted
885from function kendl1 in Numerical Recipies.  Needs good test-routine.@@@
886
887Usage:   lkendalltau(x,y)
888Returns: Kendall's tau, two-tailed p-value
889"""
890    n1 = 0
891    n2 = 0
892    iss = 0
893    for j in range(len(x)-1):
894        for k in range(j,len(y)):
895            a1 = x[j] - x[k]
896            a2 = y[j] - y[k]
897            aa = a1 * a2
898            if (aa):             # neither list has a tie
899                n1 = n1 + 1
900                n2 = n2 + 1
901                if aa > 0:
902                    iss = iss + 1
903                else:
904                    iss = iss -1
905            else:
906                if (a1):
907                    n1 = n1 + 1
908                else:
909                    n2 = n2 + 1
910    tau = iss / math.sqrt(n1*n2)
911    svar = (4.0*len(x)+10.0) / (9.0*len(x)*(len(x)-1))
912    z = tau / math.sqrt(svar)
913    prob = erfcc(abs(z)/1.4142136)
914    return tau, prob
915
916
917def llinregress(x,y):
918    """
919Calculates a regression line on x,y pairs. 
920
921Usage:   llinregress(x,y)      x,y are equal-length lists of x-y coordinates
922Returns: slope, intercept, r, two-tailed prob, sterr-of-estimate
923"""
924    TINY = 1.0e-20
925    if len(x) <> len(y):
926        raise ValueError, 'Input values not paired in linregress.  Aborting.'
927    n = len(x)
928    x = map(float,x)
929    y = map(float,y)
930    xmean = mean(x)
931    ymean = mean(y)
932    r_num = float(n*(summult(x,y)) - sum(x)*sum(y))
933    r_den = math.sqrt((n*ss(x) - square_of_sums(x))*(n*ss(y)-square_of_sums(y)))
934    r = r_num / r_den
935    z = 0.5*math.log((1.0+r+TINY)/(1.0-r+TINY))
936    df = n-2
937    t = r*math.sqrt(df/((1.0-r+TINY)*(1.0+r+TINY)))
938    prob = betai(0.5*df,0.5,df/(df+t*t))
939    slope = r_num / float(n*ss(x) - square_of_sums(x))
940    intercept = ymean - slope*xmean
941    sterrest = math.sqrt(1-r*r)*samplestdev(y)
942    return slope, intercept, r, prob, sterrest
943
944
945####################################
946#####  INFERENTIAL STATISTICS  #####
947####################################
948
949def lttest_1samp(a,popmean,printit=0,name='Sample',writemode='a'):
950    """
951Calculates the t-obtained for the independent samples T-test on ONE group
952of scores a, given a population mean.  If printit=1, results are printed
953to the screen.  If printit='filename', the results are output to 'filename'
954using the given writemode (default=append).  Returns t-value, and prob.
955
956Usage:   lttest_1samp(a,popmean,Name='Sample',printit=0,writemode='a')
957Returns: t-value, two-tailed prob
958"""
959    x = mean(a)
960    v = var(a)
961    n = len(a)
962    df = n-1
963    svar = ((n-1)*v)/float(df)
964    t = (x-popmean)/math.sqrt(svar*(1.0/n))
965    prob = betai(0.5*df,0.5,float(df)/(df+t*t))
966
967    if printit <> 0:
968        statname = 'Single-sample T-test.'
969        outputpairedstats(printit,writemode,
970                          'Population','--',popmean,0,0,0,
971                          name,n,x,v,min(a),max(a),
972                          statname,t,prob)
973    return t,prob
974
975
976def lttest_ind (a, b, printit=0, name1='Samp1', name2='Samp2', writemode='a'):
977    """
978Calculates the t-obtained T-test on TWO INDEPENDENT samples of
979scores a, and b.  From Numerical Recipies, p.483.  If printit=1, results
980are printed to the screen.  If printit='filename', the results are output
981to 'filename' using the given writemode (default=append).  Returns t-value,
982and prob.
983
984Usage:   lttest_ind(a,b,printit=0,name1='Samp1',name2='Samp2',writemode='a')
985Returns: t-value, two-tailed prob
986"""
987    x1 = mean(a)
988    x2 = mean(b)
989    v1 = stdev(a)**2
990    v2 = stdev(b)**2
991    n1 = len(a)
992    n2 = len(b)
993    df = n1+n2-2
994    svar = ((n1-1)*v1+(n2-1)*v2)/float(df)
995    t = (x1-x2)/math.sqrt(svar*(1.0/n1 + 1.0/n2))
996    prob = betai(0.5*df,0.5,df/(df+t*t))
997
998    if printit <> 0:
999        statname = 'Independent samples T-test.'
1000        outputpairedstats(printit,writemode,
1001                          name1,n1,x1,v1,min(a),max(a),
1002                          name2,n2,x2,v2,min(b),max(b),
1003                          statname,t,prob)
1004    return t,prob
1005
1006
1007def lttest_rel (a,b,printit=0,name1='Sample1',name2='Sample2',writemode='a'):
1008    """
1009Calculates the t-obtained T-test on TWO RELATED samples of scores,
1010a and b.  From Numerical Recipies, p.483.  If printit=1, results are
1011printed to the screen.  If printit='filename', the results are output to
1012'filename' using the given writemode (default=append).  Returns t-value,
1013and prob.
1014
1015Usage:   lttest_rel(a,b,printit=0,name1='Sample1',name2='Sample2',writemode='a')
1016Returns: t-value, two-tailed prob
1017"""
1018    if len(a)<>len(b):
1019        raise ValueError, 'Unequal length lists in ttest_rel.'
1020    x1 = mean(a)
1021    x2 = mean(b)
1022    v1 = var(a)
1023    v2 = var(b)
1024    n = len(a)
1025    cov = 0
1026    for i in range(len(a)):
1027        cov = cov + (a[i]-x1) * (b[i]-x2)
1028    df = n-1
1029    cov = cov / float(df)
1030    sd = math.sqrt((v1+v2 - 2.0*cov)/float(n))
1031    t = (x1-x2)/sd
1032    prob = betai(0.5*df,0.5,df/(df+t*t))
1033
1034    if printit <> 0:
1035        statname = 'Related samples T-test.'
1036        outputpairedstats(printit,writemode,
1037                          name1,n,x1,v1,min(a),max(a),
1038                          name2,n,x2,v2,min(b),max(b),
1039                          statname,t,prob)
1040    return t, prob
1041
1042
1043def lchisquare(f_obs,f_exp=None):
1044    """
1045Calculates a one-way chi square for list of observed frequencies and returns
1046the result.  If no expected frequencies are given, the total N is assumed to
1047be equally distributed across all groups.
1048
1049Usage:   lchisquare(f_obs, f_exp=None)   f_obs = list of observed cell freq.
1050Returns: chisquare-statistic, associated p-value
1051"""
1052    k = len(f_obs)                 # number of groups
1053    if f_exp == None:
1054        f_exp = [sum(f_obs)/float(k)] * len(f_obs) # create k bins with = freq.
1055    chisq = 0
1056    for i in range(len(f_obs)):
1057        chisq = chisq + (f_obs[i]-f_exp[i])**2 / float(f_exp[i])
1058    return chisq, chisqprob(chisq, k-1)
1059
1060
1061def lks_2samp (data1,data2):
1062    """
1063Computes the Kolmogorov-Smirnof statistic on 2 samples.  From
1064Numerical Recipies in C, page 493.
1065
1066Usage:   lks_2samp(data1,data2)   data1&2 are lists of values for 2 conditions
1067Returns: KS D-value, associated p-value
1068"""
1069    j1 = 0
1070    j2 = 0
1071    fn1 = 0.0
1072    fn2 = 0.0
1073    n1 = len(data1)
1074    n2 = len(data2)
1075    en1 = n1
1076    en2 = n2
1077    d = 0.0
1078    data1.sort()
1079    data2.sort()
1080    while j1 < n1 and j2 < n2:
1081        d1=data1[j1]
1082        d2=data2[j2]
1083        if d1 <= d2:
1084            fn1 = (j1)/float(en1)
1085            j1 = j1 + 1
1086        if d2 <= d1:
1087            fn2 = (j2)/float(en2)
1088            j2 = j2 + 1
1089        dt = (fn2-fn1)
1090        if math.fabs(dt) > math.fabs(d):
1091            d = dt
1092    try:
1093        en = math.sqrt(en1*en2/float(en1+en2))
1094        prob = ksprob((en+0.12+0.11/en)*abs(d))
1095    except:
1096        prob = 1.0
1097    return d, prob
1098
1099
1100def lmannwhitneyu(x,y):
1101    """
1102Calculates a Mann-Whitney U statistic on the provided scores and
1103returns the result.  Use only when the n in each condition is < 20 and
1104you have 2 independent samples of ranks.  NOTE: Mann-Whitney U is
1105significant if the u-obtained is LESS THAN or equal to the critical
1106value of U found in the tables.  Equivalent to Kruskal-Wallis H with
1107just 2 groups.
1108
1109Usage:   lmannwhitneyu(data)
1110Returns: u-statistic, one-tailed p-value (i.e., p(z(U)))
1111"""
1112    n1 = len(x)
1113    n2 = len(y)
1114    ranked = rankdata(x+y)
1115    rankx = ranked[0:n1]       # get the x-ranks
1116    ranky = ranked[n1:]        # the rest are y-ranks
1117    u1 = n1*n2 + (n1*(n1+1))/2.0 - sum(rankx)  # calc U for x
1118    u2 = n1*n2 - u1                            # remainder is U for y
1119    bigu = max(u1,u2)
1120    smallu = min(u1,u2)
1121    T = math.sqrt(tiecorrect(ranked))  # correction factor for tied scores
1122    if T == 0:
1123        raise ValueError, 'All numbers are identical in lmannwhitneyu'
1124    sd = math.sqrt(T*n1*n2*(n1+n2+1)/12.0)
1125    z = abs((bigu-n1*n2/2.0) / sd)  # normal approximation for prob calc
1126    return smallu, 1.0 - zprob(z)
1127
1128
1129def ltiecorrect(rankvals):
1130    """
1131Corrects for ties in Mann Whitney U and Kruskal Wallis H tests.  See
1132Siegel, S. (1956) Nonparametric Statistics for the Behavioral Sciences.
1133New York: McGraw-Hill.  Code adapted from |Stat rankind.c code.
1134
1135Usage:   ltiecorrect(rankvals)
1136Returns: T correction factor for U or H
1137"""
1138    sorted,posn = shellsort(rankvals)
1139    n = len(sorted)
1140    T = 0.0
1141    i = 0
1142    while (i<n-1):
1143        if sorted[i] == sorted[i+1]:
1144            nties = 1
1145            while (i<n-1) and (sorted[i] == sorted[i+1]):
1146                nties = nties +1
1147                i = i +1
1148            T = T + nties**3 - nties
1149        i = i+1
1150    T = T / float(n**3-n)
1151    return 1.0 - T
1152
1153
1154def lranksums(x,y):
1155    """
1156Calculates the rank sums statistic on the provided scores and
1157returns the result.  Use only when the n in each condition is > 20 and you
1158have 2 independent samples of ranks.
1159
1160Usage:   lranksums(x,y)
1161Returns: a z-statistic, two-tailed p-value
1162"""
1163    n1 = len(x)
1164    n2 = len(y)
1165    alldata = x+y
1166    ranked = rankdata(alldata)
1167    x = ranked[:n1]
1168    y = ranked[n1:]
1169    s = sum(x)
1170    expected = n1*(n1+n2+1) / 2.0
1171    z = (s - expected) / math.sqrt(n1*n2*(n1+n2+1)/12.0)
1172    prob = 2*(1.0 -zprob(abs(z)))
1173    return z, prob
1174
1175
1176def lwilcoxont(x,y):
1177    """
1178Calculates the Wilcoxon T-test for related samples and returns the
1179result.  A non-parametric T-test.
1180
1181Usage:   lwilcoxont(x,y)
1182Returns: a t-statistic, two-tail probability estimate
1183"""
1184    if len(x) <> len(y):
1185        raise ValueError, 'Unequal N in wilcoxont.  Aborting.'
1186    d=[]
1187    for i in range(len(x)):
1188        diff = x[i] - y[i]
1189        if diff <> 0:
1190            d.append(diff)
1191    count = len(d)
1192    absd = map(abs,d)
1193    absranked = rankdata(absd)
1194    r_plus = 0.0
1195    r_minus = 0.0
1196    for i in range(len(absd)):
1197        if d[i] < 0:
1198            r_minus = r_minus + absranked[i]
1199        else:
1200            r_plus = r_plus + absranked[i]
1201    wt = min(r_plus, r_minus)
1202    mn = count * (count+1) * 0.25
1203    se =  math.sqrt(count*(count+1)*(2.0*count+1.0)/24.0)
1204    z = math.fabs(wt-mn) / se
1205    prob = 2*(1.0 -zprob(abs(z)))
1206    return wt, prob
1207
1208
1209def lkruskalwallish(*args):
1210    """
1211The Kruskal-Wallis H-test is a non-parametric ANOVA for 3 or more
1212groups, requiring at least 5 subjects in each group.  This function
1213calculates the Kruskal-Wallis H-test for 3 or more independent samples
1214and returns the result. 
1215
1216Usage:   lkruskalwallish(*args)
1217Returns: H-statistic (corrected for ties), associated p-value
1218"""
1219    args = list(args)
1220    n = [0]*len(args)
1221    all = []
1222    n = map(len,args)
1223    for i in range(len(args)):
1224        all = all + args[i]
1225    ranked = rankdata(all)
1226    T = tiecorrect(ranked)
1227    for i in range(len(args)):
1228        args[i] = ranked[0:n[i]]
1229        del ranked[0:n[i]]
1230    rsums = []
1231    for i in range(len(args)):
1232        rsums.append(sum(args[i])**2)
1233        rsums[i] = rsums[i] / float(n[i])
1234    ssbn = sum(rsums)
1235    totaln = sum(n)
1236    h = 12.0 / (totaln*(totaln+1)) * ssbn - 3*(totaln+1)
1237    df = len(args) - 1
1238    if T == 0:
1239        raise ValueError, 'All numbers are identical in lkruskalwallish'
1240    h = h / float(T)
1241    return h, chisqprob(h,df)
1242
1243
1244def lfriedmanchisquare(*args):
1245    """
1246Friedman Chi-Square is a non-parametric, one-way within-subjects
1247ANOVA.  This function calculates the Friedman Chi-square test for repeated
1248measures and returns the result, along with the associated probability
1249value.  It assumes 3 or more repeated measures.  Only 3 levels requires a
1250minimum of 10 subjects in the study.  Four levels requires 5 subjects per
1251level(??).
1252
1253Usage:   lfriedmanchisquare(*args)
1254Returns: chi-square statistic, associated p-value
1255"""
1256    k = len(args)
1257    if k < 3:
1258        raise ValueError, 'Less than 3 levels.  Friedman test not appropriate.'
1259    n = len(args[0])
1260    data = apply(pstat.abut,tuple(args))
1261    for i in range(len(data)):
1262        data[i] = rankdata(data[i])
1263    ssbn = 0
1264    for i in range(k):
1265        ssbn = ssbn + sum(args[i])**2
1266    chisq = 12.0 / (k*n*(k+1)) * ssbn - 3*n*(k+1)
1267    return chisq, chisqprob(chisq,k-1)
1268
1269
1270####################################
1271####  PROBABILITY CALCULATIONS  ####
1272####################################
1273
1274def lchisqprob(chisq,df):
1275    """
1276Returns the (1-tailed) probability value associated with the provided
1277chi-square value and df.  Adapted from chisq.c in Gary Perlman's |Stat.
1278
1279Usage:   lchisqprob(chisq,df)
1280"""
1281    BIG = 20.0
1282    def ex(x):
1283        BIG = 20.0
1284        if x < -BIG:
1285            return 0.0
1286        else:
1287            return math.exp(x)
1288
1289    if chisq <=0 or df < 1:
1290        return 1.0
1291    a = 0.5 * chisq
1292    if df%2 == 0:
1293        even = 1
1294    else:
1295        even = 0
1296    if df > 1:
1297        y = ex(-a)
1298    if even:
1299        s = y
1300    else:
1301        s = 2.0 * zprob(-math.sqrt(chisq))
1302    if (df > 2):
1303        chisq = 0.5 * (df - 1.0)
1304        if even:
1305            z = 1.0
1306        else:
1307            z = 0.5
1308        if a > BIG:
1309            if even:
1310                e = 0.0
1311            else:
1312                e = math.log(math.sqrt(math.pi))
1313            c = math.log(a)
1314            while (z <= chisq):
1315                e = math.log(z) + e
1316                s = s + ex(c*z-a-e)
1317                z = z + 1.0
1318            return s
1319        else:
1320            if even:
1321                e = 1.0
1322            else:
1323                e = 1.0 / math.sqrt(math.pi) / math.sqrt(a)
1324            c = 0.0
1325            while (z <= chisq):
1326                e = e * (a/float(z))
1327                c = c + e
1328                z = z + 1.0
1329            return (c*y+s)
1330    else:
1331        return s
1332
1333
1334def lerfcc(x):
1335    """
1336Returns the complementary error function erfc(x) with fractional
1337error everywhere less than 1.2e-7.  Adapted from Numerical Recipies.
1338
1339Usage:   lerfcc(x)
1340"""
1341    z = abs(x)
1342    t = 1.0 / (1.0+0.5*z)
1343    ans = t * math.exp(-z*z-1.26551223 + t*(1.00002368+t*(0.37409196+t*(0.09678418+t*(-0.18628806+t*(0.27886807+t*(-1.13520398+t*(1.48851587+t*(-0.82215223+t*0.17087277)))))))))
1344    if x >= 0:
1345        return ans
1346    else:
1347        return 2.0 - ans
1348
1349
1350def lzprob(z):
1351    """
1352Returns the area under the normal curve 'to the left of' the given z value.
1353Thus,
1354    for z<0, zprob(z) = 1-tail probability
1355    for z>0, 1.0-zprob(z) = 1-tail probability
1356    for any z, 2.0*(1.0-zprob(abs(z))) = 2-tail probability
1357Adapted from z.c in Gary Perlman's |Stat.
1358
1359Usage:   lzprob(z)
1360"""
1361    Z_MAX = 6.0    # maximum meaningful z-value
1362    if z == 0.0:
1363        x = 0.0
1364    else:
1365        y = 0.5 * math.fabs(z)
1366        if y >= (Z_MAX*0.5):
1367            x = 1.0
1368        elif (y < 1.0):
1369            w = y*y
1370            x = ((((((((0.000124818987 * w
1371                        -0.001075204047) * w +0.005198775019) * w
1372                      -0.019198292004) * w +0.059054035642) * w
1373                    -0.151968751364) * w +0.319152932694) * w
1374                  -0.531923007300) * w +0.797884560593) * y * 2.0
1375        else:
1376            y = y - 2.0
1377            x = (((((((((((((-0.000045255659 * y
1378                             +0.000152529290) * y -0.000019538132) * y
1379                           -0.000676904986) * y +0.001390604284) * y
1380                         -0.000794620820) * y -0.002034254874) * y
1381                       +0.006549791214) * y -0.010557625006) * y
1382                     +0.011630447319) * y -0.009279453341) * y
1383                   +0.005353579108) * y -0.002141268741) * y
1384                 +0.000535310849) * y +0.999936657524
1385    if z > 0.0:
1386        prob = ((x+1.0)*0.5)
1387    else:
1388        prob = ((1.0-x)*0.5)
1389    return prob
1390
1391
1392def lksprob(alam):
1393    """
1394Computes a Kolmolgorov-Smirnov t-test significance level.  Adapted from
1395Numerical Recipies.
1396
1397Usage:   lksprob(alam)
1398"""
1399    fac = 2.0
1400    sum = 0.0
1401    termbf = 0.0
1402    a2 = -2.0*alam*alam
1403    for j in range(1,201):
1404        term = fac*math.exp(a2*j*j)
1405        sum = sum + term
1406        if math.fabs(term) <= (0.001*termbf) or math.fabs(term) < (1.0e-8*sum):
1407            return sum
1408        fac = -fac
1409        termbf = math.fabs(term)
1410    return 1.0             # Get here only if fails to converge; was 0.0!!
1411
1412
1413def lfprob (dfnum, dfden, F):
1414    """
1415Returns the (1-tailed) significance level (p-value) of an F
1416statistic given the degrees of freedom for the numerator (dfR-dfF) and
1417the degrees of freedom for the denominator (dfF).
1418
1419Usage:   lfprob(dfnum, dfden, F)   where usually dfnum=dfbn, dfden=dfwn
1420"""
1421    p = betai(0.5*dfden, 0.5*dfnum, dfden/float(dfden+dfnum*F))
1422    return p
1423
1424
1425def lbetacf(a,b,x):
1426    """
1427This function evaluates the continued fraction form of the incomplete
1428Beta function, betai.  (Adapted from: Numerical Recipies in C.)
1429
1430Usage:   lbetacf(a,b,x)
1431"""
1432    ITMAX = 200
1433    EPS = 3.0e-7
1434
1435    bm = az = am = 1.0
1436    qab = a+b
1437    qap = a+1.0
1438    qam = a-1.0
1439    bz = 1.0-qab*x/qap
1440    for i in range(ITMAX+1):
1441        em = float(i+1)
1442        tem = em + em
1443        d = em*(b-em)*x/((qam+tem)*(a+tem))
1444        ap = az + d*am
1445        bp = bz+d*bm
1446        d = -(a+em)*(qab+em)*x/((qap+tem)*(a+tem))
1447        app = ap+d*az
1448        bpp = bp+d*bz
1449        aold = az
1450        am = ap/bpp
1451        bm = bp/bpp
1452        az = app/bpp
1453        bz = 1.0
1454        if (abs(az-aold)<(EPS*abs(az))):
1455            return az
1456    print 'a or b too big, or ITMAX too small in Betacf.'
1457
1458
1459def lgammln(xx):
1460    """
1461Returns the gamma function of xx.
1462    Gamma(z) = Integral(0,infinity) of t^(z-1)exp(-t) dt.
1463(Adapted from: Numerical Recipies in C.)
1464
1465Usage:   lgammln(xx)
1466"""
1467
1468    coeff = [76.18009173, -86.50532033, 24.01409822, -1.231739516,
1469             0.120858003e-2, -0.536382e-5]
1470    x = xx - 1.0
1471    tmp = x + 5.5
1472    tmp = tmp - (x+0.5)*math.log(tmp)
1473    ser = 1.0
1474    for j in range(len(coeff)):
1475        x = x + 1
1476        ser = ser + coeff[j]/x
1477    return -tmp + math.log(2.50662827465*ser)
1478
1479
1480def lbetai(a,b,x):
1481    """
1482Returns the incomplete beta function:
1483
1484    I-sub-x(a,b) = 1/B(a,b)*(Integral(0,x) of t^(a-1)(1-t)^(b-1) dt)
1485
1486where a,b>0 and B(a,b) = G(a)*G(b)/(G(a+b)) where G(a) is the gamma
1487function of a.  The continued fraction formulation is implemented here,
1488using the betacf function.  (Adapted from: Numerical Recipies in C.)
1489
1490Usage:   lbetai(a,b,x)
1491"""
1492    if (x<0.0 or x>1.0):
1493        raise ValueError, 'Bad x in lbetai'
1494    if (x==0.0 or x==1.0):
1495        bt = 0.0
1496    else:
1497        bt = math.exp(gammln(a+b)-gammln(a)-gammln(b)+a*math.log(x)+b*
1498                      math.log(1.0-x))
1499    if (x<(a+1.0)/(a+b+2.0)):
1500        return bt*betacf(a,b,x)/float(a)
1501    else:
1502        return 1.0-bt*betacf(b,a,1.0-x)/float(b)
1503
1504
1505####################################
1506#######  ANOVA CALCULATIONS  #######
1507####################################
1508
1509def lF_oneway(*lists):
1510    """
1511Performs a 1-way ANOVA, returning an F-value and probability given
1512any number of groups.  From Heiman, pp.394-7.
1513
1514Usage:   F_oneway(*lists)    where *lists is any number of lists, one per
1515                                  treatment group
1516Returns: F value, one-tailed p-value
1517"""
1518    a = len(lists)           # ANOVA on 'a' groups, each in it's own list
1519    means = [0]*a
1520    vars = [0]*a
1521    ns = [0]*a
1522    alldata = []
1523    tmp = map(N.array,lists)
1524    means = map(amean,tmp)
1525    vars = map(avar,tmp)
1526    ns = map(len,lists)
1527    for i in range(len(lists)):
1528        alldata = alldata + lists[i]
1529    alldata = N.array(alldata)
1530    bign = len(alldata)
1531    sstot = ass(alldata)-(asquare_of_sums(alldata)/float(bign))
1532    ssbn = 0
1533    for list in lists:
1534        ssbn = ssbn + asquare_of_sums(N.array(list))/float(len(list))
1535    ssbn = ssbn - (asquare_of_sums(alldata)/float(bign))
1536    sswn = sstot-ssbn
1537    dfbn = a-1
1538    dfwn = bign - a
1539    msb = ssbn/float(dfbn)
1540    msw = sswn/float(dfwn)
1541    f = msb/msw
1542    prob = fprob(dfbn,dfwn,f)
1543    return f, prob
1544
1545
1546def lF_value (ER,EF,dfnum,dfden):
1547    """
1548Returns an F-statistic given the following:
1549        ER  = error associated with the null hypothesis (the Restricted model)
1550        EF  = error associated with the alternate hypothesis (the Full model)
1551        dfR-dfF = degrees of freedom of the numerator
1552        dfF = degrees of freedom associated with the denominator/Full model
1553
1554Usage:   lF_value(ER,EF,dfnum,dfden)
1555"""
1556    return ((ER-EF)/float(dfnum) / (EF/float(dfden)))
1557
1558
1559####################################
1560########  SUPPORT FUNCTIONS  #######
1561####################################
1562
1563def writecc (listoflists,file,writetype='w',extra=2):
1564    """
1565Writes a list of lists to a file in columns, customized by the max
1566size of items within the columns (max size of items in col, +2 characters)
1567to specified file.  File-overwrite is the default.
1568
1569Usage:   writecc (listoflists,file,writetype='w',extra=2)
1570Returns: None
1571"""
1572    if type(listoflists[0]) not in [ListType,TupleType]:
1573        listoflists = [listoflists]
1574    outfile = open(file,writetype)
1575    rowstokill = []
1576    list2print = copy.deepcopy(listoflists)
1577    for i in range(len(listoflists)):
1578        if listoflists[i] == ['\n'] or listoflists[i]=='\n' or listoflists[i]=='dashes':
1579            rowstokill = rowstokill + [i]
1580    rowstokill.reverse()
1581    for row in rowstokill:
1582        del list2print[row]
1583    maxsize = [0]*len(list2print[0])
1584    for col in range(len(list2print[0])):
1585        items = pstat.colex(list2print,col)
1586        items = map(pstat.makestr,items)
1587        maxsize[col] = max(map(len,items)) + extra
1588    for row in listoflists:
1589        if row == ['\n'] or row == '\n':
1590            outfile.write('\n')
1591        elif row == ['dashes'] or row == 'dashes':
1592            dashes = [0]*len(maxsize)
1593            for j in range(len(maxsize)):
1594                dashes[j] = '-'*(maxsize[j]-2)
1595            outfile.write(pstat.lineincustcols(dashes,maxsize))
1596        else:
1597            outfile.write(pstat.lineincustcols(row,maxsize))
1598        outfile.write('\n')
1599    outfile.close()
1600    return None
1601
1602
1603def lincr(l,cap):        # to increment a list up to a max-list of 'cap'
1604    """
1605Simulate a counting system from an n-dimensional list.
1606
1607Usage:   lincr(l,cap)   l=list to increment, cap=max values for each list pos'n
1608Returns: next set of values for list l, OR -1 (if overflow)
1609"""
1610    l[0] = l[0] + 1     # e.g., [0,0,0] --> [2,4,3] (=cap)
1611    for i in range(len(l)):
1612        if l[i] > cap[i] and i < len(l)-1: # if carryover AND not done
1613            l[i] = 0
1614            l[i+1] = l[i+1] + 1
1615        elif l[i] > cap[i] and i == len(l)-1: # overflow past last column, must be finished
1616            l = -1
1617    return l
1618
1619
1620def lsum (inlist):
1621    """
1622Returns the sum of the items in the passed list.
1623
1624Usage:   lsum(inlist)
1625"""
1626    s = 0
1627    for item in inlist:
1628        s = s + item
1629    return s
1630
1631
1632def lcumsum (inlist):
1633    """
1634Returns a list consisting of the cumulative sum of the items in the
1635passed list.
1636
1637Usage:   lcumsum(inlist)
1638"""
1639    newlist = copy.deepcopy(inlist)
1640    for i in range(1,len(newlist)):
1641        newlist[i] = newlist[i] + newlist[i-1]
1642    return newlist
1643
1644
1645def lss(inlist):
1646    """
1647Squares each value in the passed list, adds up these squares and
1648returns the result.
1649
1650Usage:   lss(inlist)
1651"""
1652    ss = 0
1653    for item in inlist:
1654        ss = ss + item*item
1655    return ss
1656
1657
1658def lsummult (list1,list2):
1659    """
1660Multiplies elements in list1 and list2, element by element, and
1661returns the sum of all resulting multiplications.  Must provide equal
1662length lists.
1663
1664Usage:   lsummult(list1,list2)
1665"""
1666    if len(list1) <> len(list2):
1667        raise ValueError, "Lists not equal length in summult."
1668    s = 0
1669    for item1,item2 in pstat.abut(list1,list2):
1670        s = s + item1*item2
1671    return s
1672
1673
1674def lsumdiffsquared(x,y):
1675    """
1676Takes pairwise differences of the values in lists x and y, squares
1677these differences, and returns the sum of these squares.
1678
1679Usage:   lsumdiffsquared(x,y)
1680Returns: sum[(x[i]-y[i])**2]
1681"""
1682    sds = 0
1683    for i in range(len(x)):
1684        sds = sds + (x[i]-y[i])**2
1685    return sds
1686
1687
1688def lsquare_of_sums(inlist):
1689    """
1690Adds the values in the passed list, squares the sum, and returns
1691the result.
1692
1693Usage:   lsquare_of_sums(inlist)
1694Returns: sum(inlist[i])**2
1695"""
1696    s = sum(inlist)
1697    return float(s)*s
1698
1699
1700def lshellsort(inlist):
1701    """
1702Shellsort algorithm.  Sorts a 1D-list.
1703
1704Usage:   lshellsort(inlist)
1705Returns: sorted-inlist, sorting-index-vector (for original list)
1706"""
1707    n = len(inlist)
1708    svec = copy.deepcopy(inlist)
1709    ivec = range(n)
1710    gap = n/2   # integer division needed
1711    while gap >0:
1712        for i in range(gap,n):
1713            for j in range(i-gap,-1,-gap):
1714                while j>=0 and svec[j]>svec[j+gap]:
1715                    temp        = svec[j]
1716                    svec[j]     = svec[j+gap]
1717                    svec[j+gap] = temp
1718                    itemp       = ivec[j]
1719                    ivec[j]     = ivec[j+gap]
1720                    ivec[j+gap] = itemp
1721        gap = gap / 2  # integer division needed
1722# svec is now sorted inlist, and ivec has the order svec[i] = vec[ivec[i]]
1723    return svec, ivec
1724
1725
1726def lrankdata(inlist):
1727    """
1728Ranks the data in inlist, dealing with ties appropritely.  Assumes
1729a 1D inlist.  Adapted from Gary Perlman's |Stat ranksort.
1730
1731Usage:   lrankdata(inlist)
1732Returns: a list of length equal to inlist, containing rank scores
1733"""
1734    n = len(inlist)
1735    svec, ivec = shellsort(inlist)
1736    sumranks = 0
1737    dupcount = 0
1738    newlist = [0]*n
1739    for i in range(n):
1740        sumranks = sumranks + i
1741        dupcount = dupcount + 1
1742        if i==n-1 or svec[i] <> svec[i+1]:
1743            averank = sumranks / float(dupcount) + 1
1744            for j in range(i-dupcount+1,i+1):
1745                newlist[ivec[j]] = averank
1746            sumranks = 0
1747            dupcount = 0
1748    return newlist
1749
1750
1751def outputpairedstats(fname,writemode,name1,n1,m1,se1,min1,max1,name2,n2,m2,se2,min2,max2,statname,stat,prob):
1752    """
1753Prints or write to a file stats for two groups, using the name, n,
1754mean, sterr, min and max for each group, as well as the statistic name,
1755its value, and the associated p-value.
1756
1757Usage:   outputpairedstats(fname,writemode,
1758                           name1,n1,mean1,stderr1,min1,max1,
1759                           name2,n2,mean2,stderr2,min2,max2,
1760                           statname,stat,prob)
1761Returns: None
1762"""
1763    suffix = ''                       # for *s after the p-value
1764    try:
1765        x = prob.shape
1766        prob = prob[0]
1767    except:
1768        pass
1769    if  prob < 0.001:  suffix = '  ***'
1770    elif prob < 0.01:  suffix = '  **'
1771    elif prob < 0.05:  suffix = '  *'
1772    title = [['Name','N','Mean','SD','Min','Max']]
1773    lofl = title+[[name1,n1,round(m1,3),round(math.sqrt(se1),3),min1,max1],
1774                  [name2,n2,round(m2,3),round(math.sqrt(se2),3),min2,max2]]
1775    if type(fname)<>StringType or len(fname)==0:
1776        print
1777        print statname
1778        print
1779        pstat.printcc(lofl)
1780        print
1781        try:
1782            if stat.shape == ():
1783                stat = stat[0]
1784            if prob.shape == ():
1785                prob = prob[0]
1786        except:
1787            pass
1788        print 'Test statistic = ',round(stat,3),'   p = ',round(prob,3),suffix
1789        print
1790    else:
1791        file = open(fname,writemode)
1792        file.write('\n'+statname+'\n\n')
1793        file.close()
1794        writecc(lofl,fname,'a')
1795        file = open(fname,'a')
1796        try:
1797            if stat.shape == ():
1798                stat = stat[0]
1799            if prob.shape == ():
1800                prob = prob[0]
1801        except:
1802            pass
1803        file.write(pstat.list2string(['\nTest statistic = ',round(stat,4),'   p = ',round(prob,4),suffix,'\n\n']))
1804        file.close()
1805    return None
1806
1807
1808def lfindwithin (data):
1809    """
1810Returns an integer representing a binary vector, where 1=within-
1811subject factor, 0=between.  Input equals the entire data 2D list (i.e.,
1812column 0=random factor, column -1=measured values (those two are skipped).
1813Note: input data is in |Stat format ... a list of lists ("2D list") with
1814one row per measured value, first column=subject identifier, last column=
1815score, one in-between column per factor (these columns contain level
1816designations on each factor).  See also stats.anova.__doc__.
1817
1818Usage:   lfindwithin(data)     data in |Stat format
1819"""
1820
1821    numfact = len(data[0])-1
1822    withinvec = 0
1823    for col in range(1,numfact):
1824        examplelevel = pstat.unique(pstat.colex(data,col))[0]
1825        rows = pstat.linexand(data,col,examplelevel)  # get 1 level of this factor
1826        factsubjs = pstat.unique(pstat.colex(rows,0))
1827        allsubjs = pstat.unique(pstat.colex(data,0))
1828        if len(factsubjs) == len(allsubjs):  # fewer Ss than scores on this factor?
1829            withinvec = withinvec + (1 << col)
1830    return withinvec
1831
1832
1833#########################################################
1834#########################################################
1835####### DISPATCH LISTS AND TUPLES TO ABOVE FCNS #########
1836#########################################################
1837#########################################################
1838
1839## CENTRAL TENDENCY:
1840geometricmean = Dispatch ( (lgeometricmean, (ListType, TupleType)), )
1841harmonicmean = Dispatch ( (lharmonicmean, (ListType, TupleType)), )
1842mean = Dispatch ( (lmean, (ListType, TupleType)), )
1843median = Dispatch ( (lmedian, (ListType, TupleType)), )
1844medianscore = Dispatch ( (lmedianscore, (ListType, TupleType)), )
1845mode = Dispatch ( (lmode, (ListType, TupleType)), )
1846
1847## MOMENTS:
1848moment = Dispatch ( (lmoment, (ListType, TupleType)), )
1849variation = Dispatch ( (lvariation, (ListType, TupleType)), )
1850skew = Dispatch ( (lskew, (ListType, TupleType)), )
1851kurtosis = Dispatch ( (lkurtosis, (ListType, TupleType)), )
1852describe = Dispatch ( (ldescribe, (ListType, TupleType)), )
1853
1854## FREQUENCY STATISTICS:
1855itemfreq = Dispatch ( (litemfreq, (ListType, TupleType)), )
1856scoreatpercentile = Dispatch ( (lscoreatpercentile, (ListType, TupleType)), )
1857percentileofscore = Dispatch ( (lpercentileofscore, (ListType, TupleType)), )
1858histogram = Dispatch ( (lhistogram, (ListType, TupleType)), )
1859cumfreq = Dispatch ( (lcumfreq, (ListType, TupleType)), )
1860relfreq = Dispatch ( (lrelfreq, (ListType, TupleType)), )
1861
1862## VARIABILITY:
1863obrientransform = Dispatch ( (lobrientransform, (ListType, TupleType)), )
1864samplevar = Dispatch ( (lsamplevar, (ListType, TupleType)), )
1865samplestdev = Dispatch ( (lsamplestdev, (ListType, TupleType)), )
1866var = Dispatch ( (lvar, (ListType, TupleType)), )
1867stdev = Dispatch ( (lstdev, (ListType, TupleType)), )
1868sterr = Dispatch ( (lsterr, (ListType, TupleType)), )
1869sem = Dispatch ( (lsem, (ListType, TupleType)), )
1870z = Dispatch ( (lz, (ListType, TupleType)), )
1871zs = Dispatch ( (lzs, (ListType, TupleType)), )
1872
1873## TRIMMING FCNS:
1874trimboth = Dispatch ( (ltrimboth, (ListType, TupleType)), )
1875trim1 = Dispatch ( (ltrim1, (ListType, TupleType)), )
1876
1877## CORRELATION FCNS:
1878paired = Dispatch ( (lpaired, (ListType, TupleType)), )
1879pearsonr = Dispatch ( (lpearsonr, (ListType, TupleType)), )
1880spearmanr = Dispatch ( (lspearmanr, (ListType, TupleType)), )
1881pointbiserialr = Dispatch ( (lpointbiserialr, (ListType, TupleType)), )
1882kendalltau = Dispatch ( (lkendalltau, (ListType, TupleType)), )
1883linregress = Dispatch ( (llinregress, (ListType, TupleType)), )
1884
1885## INFERENTIAL STATS:
1886ttest_1samp = Dispatch ( (lttest_1samp, (ListType, TupleType)), )
1887ttest_ind = Dispatch ( (lttest_ind, (ListType, TupleType)), )
1888ttest_rel = Dispatch ( (lttest_rel, (ListType, TupleType)), )
1889chisquare = Dispatch ( (lchisquare, (ListType, TupleType)), )
1890ks_2samp = Dispatch ( (lks_2samp, (ListType, TupleType)), )
1891mannwhitneyu = Dispatch ( (lmannwhitneyu, (ListType, TupleType)), )
1892ranksums = Dispatch ( (lranksums, (ListType, TupleType)), )
1893tiecorrect = Dispatch ( (ltiecorrect, (ListType, TupleType)), )
1894wilcoxont = Dispatch ( (lwilcoxont, (ListType, TupleType)), )
1895kruskalwallish = Dispatch ( (lkruskalwallish, (ListType, TupleType)), )
1896friedmanchisquare = Dispatch ( (lfriedmanchisquare, (ListType, TupleType)), )
1897
1898## PROBABILITY CALCS:
1899chisqprob = Dispatch ( (lchisqprob, (IntType, FloatType)), )
1900zprob = Dispatch ( (lzprob, (IntType, FloatType)), )
1901ksprob = Dispatch ( (lksprob, (IntType, FloatType)), )
1902fprob = Dispatch ( (lfprob, (IntType, FloatType)), )
1903betacf = Dispatch ( (lbetacf, (IntType, FloatType)), )
1904betai = Dispatch ( (lbetai, (IntType, FloatType)), )
1905erfcc = Dispatch ( (lerfcc, (IntType, FloatType)), )
1906gammln = Dispatch ( (lgammln, (IntType, FloatType)), )
1907
1908## ANOVA FUNCTIONS:
1909F_oneway = Dispatch ( (lF_oneway, (ListType, TupleType)), )
1910F_value = Dispatch ( (lF_value, (ListType, TupleType)), )
1911
1912## SUPPORT FUNCTIONS:
1913incr = Dispatch ( (lincr, (ListType, TupleType)), )
1914sum = Dispatch ( (lsum, (ListType, TupleType)), )
1915cumsum = Dispatch ( (lcumsum, (ListType, TupleType)), )
1916ss = Dispatch ( (lss, (ListType, TupleType)), )
1917summult = Dispatch ( (lsummult, (ListType, TupleType)), )
1918square_of_sums = Dispatch ( (lsquare_of_sums, (ListType, TupleType)), )
1919sumdiffsquared = Dispatch ( (lsumdiffsquared, (ListType, TupleType)), )
1920shellsort = Dispatch ( (lshellsort, (ListType, TupleType)), )
1921rankdata = Dispatch ( (lrankdata, (ListType, TupleType)), )
1922findwithin = Dispatch ( (lfindwithin, (ListType, TupleType)), )
1923
1924
1925#=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
1926#=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
1927#=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
1928#=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
1929#=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
1930#=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
1931#=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
1932#=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
1933#=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
1934#=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
1935#=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
1936#=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
1937#=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
1938#=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
1939#=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
1940#=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
1941#=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
1942#=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
1943#=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
1944
1945try:                         # DEFINE THESE *ONLY* IF NUMERIC IS AVAILABLE
1946 import Numeric
1947 N = Numeric
1948 import LinearAlgebra
1949 LA = LinearAlgebra
1950
1951
1952#####################################
1953########  ACENTRAL TENDENCY  ########
1954#####################################
1955
1956 def ageometricmean (inarray,dimension=None,keepdims=0):
1957    """
1958Calculates the geometric mean of the values in the passed array.
1959That is:  n-th root of (x1 * x2 * ... * xn).  Defaults to ALL values in
1960the passed array.  Use dimension=None to flatten array first.  REMEMBER: if
1961dimension=0, it collapses over dimension 0 ('rows' in a 2D array) only, and
1962if dimension is a sequence, it collapses over all specified dimensions.  If
1963keepdims is set to 1, the resulting array will have as many dimensions as
1964inarray, with only 1 'level' per dim that was collapsed over.
1965
1966Usage:   ageometricmean(inarray,dimension=None,keepdims=0)
1967Returns: geometric mean computed over dim(s) listed in dimension
1968"""
1969    inarray = N.array(inarray,N.Float)
1970    if dimension == None:
1971        inarray = N.ravel(inarray)
1972        size = len(inarray)
1973        mult = N.power(inarray,1.0/size)
1974        mult = N.multiply.reduce(mult)
1975    elif type(dimension) in [IntType,FloatType]:
1976        size = inarray.shape[dimension]
1977        mult = N.power(inarray,1.0/size)
1978        mult = N.multiply.reduce(mult,dimension)
1979        if keepdims == 1:
1980            shp = list(inarray.shape)
1981            shp[dimension] = 1
1982            sum = N.reshape(sum,shp)
1983    else: # must be a SEQUENCE of dims to average over
1984        dims = list(dimension)
1985        dims.sort()
1986        dims.reverse()
1987        size = N.array(N.multiply.reduce(N.take(inarray.shape,dims)),N.Float)
1988        mult = N.power(inarray,1.0/size)
1989        for dim in dims:
1990            mult = N.multiply.reduce(mult,dim)
1991        if keepdims == 1:
1992            shp = list(inarray.shape)
1993            for dim in dims:
1994                shp[dim] = 1
1995            mult = N.reshape(mult,shp)
1996    return mult
1997
1998
1999 def aharmonicmean (inarray,dimension=None,keepdims=0):
2000    """
2001Calculates the harmonic mean of the values in the passed array.
2002That is:  n / (1/x1 + 1/x2 + ... + 1/xn).  Defaults to ALL values in
2003the passed array.  Use dimension=None to flatten array first.  REMEMBER: if
2004dimension=0, it collapses over dimension 0 ('rows' in a 2D array) only, and
2005if dimension is a sequence, it collapses over all specified dimensions.  If
2006keepdims is set to 1, the resulting array will have as many dimensions as
2007inarray, with only 1 'level' per dim that was collapsed over.
2008
2009Usage:   aharmonicmean(inarray,dimension=None,keepdims=0)
2010Returns: harmonic mean computed over dim(s) in dimension
2011"""
2012    inarray = inarray.astype(N.Float)
2013    if dimension == None:
2014        inarray = N.ravel(inarray)
2015        size = len(inarray)
2016        s = N.add.reduce(1.0 / inarray)
2017    elif type(dimension) in [IntType,FloatType]:
2018        size = float(inarray.shape[dimension])
2019        s = N.add.reduce(1.0/inarray, dimension)
2020        if keepdims == 1:
2021            shp = list(inarray.shape)
2022            shp[dimension] = 1
2023            s = N.reshape(s,shp)
2024    else: # must be a SEQUENCE of dims to average over
2025        dims = list(dimension)
2026        dims.sort()
2027        nondims = []
2028        for i in range(len(inarray.shape)):
2029            if i not in dims:
2030                nondims.append(i)
2031        tinarray = N.transpose(inarray,nondims+dims) # put keep-dims first
2032        idx = [0] *len(nondims)
2033        if idx == []:
2034            size = len(N.ravel(inarray))
2035            s = asum(1.0 / inarray)
2036            if keepdims == 1:
2037                s = N.reshape([s],N.ones(len(inarray.shape)))
2038        else:
2039            idx[0] = -1
2040            loopcap = N.array(tinarray.shape[0:len(nondims)]) -1
2041            s = N.zeros(loopcap+1,N.Float)
2042            while incr(idx,loopcap) <> -1:
2043                s[idx] = asum(1.0/tinarray[idx])
2044            size = N.multiply.reduce(N.take(inarray.shape,dims))
2045            if keepdims == 1:
2046                shp = list(inarray.shape)
2047                for dim in dims:
2048                    shp[dim] = 1
2049                s = N.reshape(s,shp)
2050    return size / s
2051
2052
2053 def amean (inarray,dimension=None,keepdims=0):
2054    """
2055Calculates the arithmatic mean of the values in the passed array.
2056That is:  1/n * (x1 + x2 + ... + xn).  Defaults to ALL values in the
2057passed array.  Use dimension=None to flatten array first.  REMEMBER: if
2058dimension=0, it collapses over dimension 0 ('rows' in a 2D array) only, and
2059if dimension is a sequence, it collapses over all specified dimensions.  If
2060keepdims is set to 1, the resulting array will have as many dimensions as
2061inarray, with only 1 'level' per dim that was collapsed over.
2062
2063Usage:   amean(inarray,dimension=None,keepdims=0)
2064Returns: arithematic mean calculated over dim(s) in dimension
2065"""
2066    if inarray.typecode() in ['l','s','b']:
2067        inarray = inarray.astype(N.Float)
2068    if dimension == None:
2069        inarray = N.ravel(inarray)
2070        sum = N.add.reduce(inarray)
2071        denom = float(len(inarray))
2072    elif type(dimension) in [IntType,FloatType]:
2073        sum = asum(inarray,dimension)
2074        denom = float(inarray.shape[dimension])
2075        if keepdims == 1:
2076            shp = list(inarray.shape)
2077            shp[dimension] = 1
2078            sum = N.reshape(sum,shp)
2079    else: # must be a TUPLE of dims to average over
2080        dims = list(dimension)
2081        dims.sort()
2082        dims.reverse()
2083        sum = inarray *1.0
2084        for dim in dims:
2085            sum = N.add.reduce(sum,dim)
2086        denom = N.array(N.multiply.reduce(N.take(inarray.shape,dims)),N.Float)
2087        if keepdims == 1:
2088            shp = list(inarray.shape)
2089            for dim in dims:
2090                shp[dim] = 1
2091            sum = N.reshape(sum,shp)
2092    return sum/denom
2093
2094
2095 def amedian (inarray,numbins=1000):
2096    """
2097Calculates the COMPUTED median value of an array of numbers, given the
2098number of bins to use for the histogram (more bins approaches finding the
2099precise median value of the array; default number of bins = 1000).  From
2100G.W. Heiman's Basic Stats, or CRC Probability & Statistics.
2101NOTE:  THIS ROUTINE ALWAYS uses the entire passed array (flattens it first).
2102
2103Usage:   amedian(inarray,numbins=1000)
2104Returns: median calculated over ALL values in inarray
2105"""
2106    inarray = N.ravel(inarray)
2107    (hist, smallest, binsize, extras) = ahistogram(inarray,numbins)
2108    cumhist = N.cumsum(hist)            # make cumulative histogram
2109    otherbins = N.greater_equal(cumhist,len(inarray)/2.0)
2110    otherbins = list(otherbins)         # list of 0/1s, 1s start at median bin
2111    cfbin = otherbins.index(1)                # get 1st(!) index holding 50%ile score
2112    LRL = smallest + binsize*cfbin        # get lower read limit of that bin
2113    cfbelow = N.add.reduce(hist[0:cfbin])        # cum. freq. below bin
2114    freq = hist[cfbin]                        # frequency IN the 50%ile bin
2115    median = LRL + ((len(inarray)/2.0-cfbelow)/float(freq))*binsize # MEDIAN
2116    return median
2117
2118
2119 def amedianscore (inarray,dimension=None):
2120    """
2121Returns the 'middle' score of the passed array.  If there is an even
2122number of scores, the mean of the 2 middle scores is returned.  Can function
2123with 1D arrays, or on the FIRST dimension of 2D arrays (i.e., dimension can
2124be None, to pre-flatten the array, or else dimension must equal 0).
2125
2126Usage:   amedianscore(inarray,dimension=None)
2127Returns: 'middle' score of the array, or the mean of the 2 middle scores
2128"""
2129    if dimension == None:
2130        inarray = N.ravel(inarray)
2131        dimension = 0
2132    inarray = N.sort(inarray,dimension)
2133    if inarray.shape[dimension] % 2 == 0:   # if even number of elements
2134        indx = inarray.shape[dimension]/2   # integer division correct
2135        median = N.asarray(inarray[indx]+inarray[indx-1]) / 2.0
2136    else:
2137        indx = inarray.shape[dimension] / 2 # integer division correct
2138        median = N.take(inarray,[indx],dimension)
2139        if median.shape == (1,):
2140            median = median[0]
2141    return median
2142
2143
2144 def amode(a, dimension=None):
2145    """
2146Returns an array of the modal (most common) score in the passed array.
2147If there is more than one such score, ONLY THE FIRST is returned.
2148The bin-count for the modal values is also returned.  Operates on whole
2149array (dimension=None), or on a given dimension.
2150
2151Usage:   amode(a, dimension=None)
2152Returns: array of bin-counts for mode(s), array of corresponding modal values
2153"""
2154
2155    if dimension == None:
2156        a = N.ravel(a)
2157        dimension = 0
2158    scores = pstat.aunique(N.ravel(a))       # get ALL unique values
2159    testshape = list(a.shape)
2160    testshape[dimension] = 1
2161    oldmostfreq = N.zeros(testshape)
2162    oldcounts = N.zeros(testshape)
2163    for score in scores:
2164        template = N.equal(a,score)
2165        counts = asum(template,dimension,1)
2166        mostfrequent = N.where(N.greater(counts,oldcounts),score,oldmostfreq)
2167        oldcounts = N.where(N.greater(counts,oldcounts),counts,oldcounts)
2168        oldmostfreq = mostfrequent
2169    return oldcounts, mostfrequent
2170
2171
2172 def atmean(a,limits=None,inclusive=(1,1)):
2173     """
2174Returns the arithmetic mean of all values in an array, ignoring values
2175strictly outside the sequence passed to 'limits'.   Note: either limit
2176in the sequence, or the value of limits itself, can be set to None.  The
2177inclusive list/tuple determines whether the lower and upper limiting bounds
2178(respectively) are open/exclusive (0) or closed/inclusive (1).
2179
2180Usage:   atmean(a,limits=None,inclusive=(1,1))
2181"""
2182     if a.typecode() in ['l','s','b']:
2183         a = a.astype(N.Float)
2184     if limits == None:
2185         return mean(a)
2186     assert type(limits) in [ListType,TupleType,N.ArrayType], "Wrong type for limits in atmean"
2187     if inclusive[0]:         lowerfcn = N.greater_equal
2188     else:               lowerfcn = N.greater
2189     if inclusive[1]:         upperfcn = N.less_equal
2190     else:               upperfcn = N.less
2191     if limits[0] > N.maximum.reduce(N.ravel(a)) or limits[1] < N.minimum.reduce(N.ravel(a)):
2192         raise ValueError, "No array values within given limits (atmean)."
2193     elif limits[0]==None and limits[1]<>None:
2194         mask = upperfcn(a,limits[1])
2195     elif limits[0]<>None and limits[1]==None:
2196         mask = lowerfcn(a,limits[0])
2197     elif limits[0]<>None and limits[1]<>None:
2198         mask = lowerfcn(a,limits[0])*upperfcn(a,limits[1])
2199     s = float(N.add.reduce(N.ravel(a*mask)))
2200     n = float(N.add.reduce(N.ravel(mask)))
2201     return s/n
2202
2203
2204 def atvar(a,limits=None,inclusive=(1,1)):
2205     """
2206Returns the sample variance of values in an array, (i.e., using N-1),
2207ignoring values strictly outside the sequence passed to 'limits'. 
2208Note: either limit in the sequence, or the value of limits itself,
2209can be set to None.  The inclusive list/tuple determines whether the lower
2210and upper limiting bounds (respectively) are open/exclusive (0) or
2211closed/inclusive (1).
2212
2213Usage:   atvar(a,limits=None,inclusive=(1,1))
2214"""
2215     a = a.astype(N.Float)
2216     if limits == None or limits == [None,None]:
2217         term1 = N.add.reduce(N.ravel(a*a))
2218         n = float(len(N.ravel(a))) - 1
2219         term2 = N.add.reduce(N.ravel(a))**2 / n
2220         print term1, term2, n
2221         return (term1 - term2) / n
2222     assert type(limits) in [ListType,TupleType,N.ArrayType], "Wrong type for limits in atvar"
2223     if inclusive[0]:         lowerfcn = N.greater_equal
2224     else:               lowerfcn = N.greater
2225     if inclusive[1]:         upperfcn = N.less_equal
2226     else:               upperfcn = N.less
2227     if limits[0] > N.maximum.reduce(N.ravel(a)) or limits[1] < N.minimum.reduce(N.ravel(a)):
2228         raise ValueError, "No array values within given limits (atvar)."
2229     elif limits[0]==None and limits[1]<>None:
2230         mask = upperfcn(a,limits[1])
2231     elif limits[0]<>None and limits[1]==None:
2232         mask = lowerfcn(a,limits[0])
2233     elif limits[0]<>None and limits[1]<>None:
2234         mask = lowerfcn(a,limits[0])*upperfcn(a,limits[1])
2235     term1 = N.add.reduce(N.ravel(a*a*mask))
2236     n = float(N.add.reduce(N.ravel(mask))) - 1
2237     term2 = N.add.reduce(N.ravel(a*mask))**2 / n
2238     print term1, term2, n
2239     return (term1 - term2) / n
2240
2241
2242 def atmin(a,lowerlimit=None,dimension=None,inclusive=1):
2243     """
2244Returns the minimum value of a, along dimension, including only values less
2245than (or equal to, if inclusive=1) lowerlimit.  If the limit is set to None,
2246all values in the array are used.
2247
2248Usage:   atmin(a,lowerlimit=None,dimension=None,inclusive=1)
2249"""
2250     if inclusive:         lowerfcn = N.greater
2251     else:               lowerfcn = N.greater_equal
2252     if dimension == None:
2253         a = N.ravel(a)
2254         dimension = 0
2255     if lowerlimit == None:
2256         lowerlimit = N.minimum.reduce(N.ravel(a))-11
2257     biggest = N.maximum.reduce(N.ravel(a))
2258     ta = N.where(lowerfcn(a,lowerlimit),a,biggest)
2259     return N.minimum.reduce(ta,dimension)
2260
2261
2262 def atmax(a,upperlimit,dimension=None,inclusive=1):
2263     """
2264Returns the maximum value of a, along dimension, including only values greater
2265than (or equal to, if inclusive=1) upperlimit.  If the limit is set to None,
2266a limit larger than the max value in the array is used.
2267
2268Usage:   atmax(a,upperlimit,dimension=None,inclusive=1)
2269"""
2270     if inclusive:         upperfcn = N.less
2271     else:               upperfcn = N.less_equal
2272     if dimension == None:
2273         a = N.ravel(a)
2274         dimension = 0
2275     if upperlimit == None:
2276         upperlimit = N.maximum.reduce(N.ravel(a))+1
2277     smallest = N.minimum.reduce(N.ravel(a))
2278     ta = N.where(upperfcn(a,upperlimit),a,smallest)
2279     return N.maximum.reduce(ta,dimension)
2280
2281
2282 def atstdev(a,limits=None,inclusive=(1,1)):
2283     """
2284Returns the standard deviation of all values in an array, ignoring values
2285strictly outside the sequence passed to 'limits'.   Note: either limit
2286in the sequence, or the value of limits itself, can be set to None.  The
2287inclusive list/tuple determines whether the lower and upper limiting bounds
2288(respectively) are open/exclusive (0) or closed/inclusive (1).
2289
2290Usage:   atstdev(a,limits=None,inclusive=(1,1))
2291"""
2292     return N.sqrt(tvar(a,limits,inclusive))
2293
2294
2295 def atsem(a,limits=None,inclusive=(1,1)):
2296     """
2297Returns the standard error of the mean for the values in an array,
2298(i.e., using N for the denominator), ignoring values strictly outside
2299the sequence passed to 'limits'.   Note: either limit in the sequence,
2300or the value of limits itself, can be set to None.  The inclusive list/tuple
2301determines whether the lower and upper limiting bounds (respectively) are
2302open/exclusive (0) or closed/inclusive (1).
2303
2304Usage:   atsem(a,limits=None,inclusive=(1,1))
2305"""
2306     sd = tstdev(a,limits,inclusive)
2307     if limits == None or limits == [None,None]:
2308         n = float(len(N.ravel(a)))
2309     assert type(limits) in [ListType,TupleType,N.ArrayType], "Wrong type for limits in atsem"
2310     if inclusive[0]:         lowerfcn = N.greater_equal
2311     else:               lowerfcn = N.greater
2312     if inclusive[1]:         upperfcn = N.less_equal
2313     else:               upperfcn = N.less
2314     if limits[0] > N.maximum.reduce(N.ravel(a)) or limits[1] < N.minimum.reduce(N.ravel(a)):
2315         raise ValueError, "No array values within given limits (atsem)."
2316     elif limits[0]==None and limits[1]<>None:
2317         mask = upperfcn(a,limits[1])
2318     elif limits[0]<>None and limits[1]==None:
2319         mask = lowerfcn(a,limits[0])
2320     elif limits[0]<>None and limits[1]<>None:
2321         mask = lowerfcn(a,limits[0])*upperfcn(a,limits[1])
2322     term1 = N.add.reduce(N.ravel(a*a*mask))
2323     n = float(N.add.reduce(N.ravel(mask)))
2324     return sd/math.sqrt(n)
2325
2326
2327#####################################
2328############  AMOMENTS  #############
2329#####################################
2330
2331 def amoment(a,moment=1,dimension=None):
2332    """
2333Calculates the nth moment about the mean for a sample (defaults to the
23341st moment).  Generally used to calculate coefficients of skewness and
2335kurtosis.  Dimension can equal None (ravel array first), an integer
2336(the dimension over which to operate), or a sequence (operate over
2337multiple dimensions).
2338
2339Usage:   amoment(a,moment=1,dimension=None)
2340Returns: appropriate moment along given dimension
2341"""
2342    if dimension == None:
2343        a = N.ravel(a)
2344        dimension = 0
2345    if moment == 1:
2346        return 0.0
2347    else:
2348        mn = amean(a,dimension,1)  # 1=keepdims
2349        s = N.power((a-mn),moment)
2350        return amean(s,dimension)
2351
2352
2353 def avariation(a,dimension=None):
2354    """
2355Returns the coefficient of variation, as defined in CRC Standard
2356Probability and Statistics, p.6. Dimension can equal None (ravel array
2357first), an integer (the dimension over which to operate), or a
2358sequence (operate over multiple dimensions).
2359
2360Usage:   avariation(a,dimension=None)
2361"""
2362    return 100.0*asamplestdev(a,dimension)/amean(a,dimension)
2363
2364
2365 def askew(a,dimension=None): 
2366    """
2367Returns the skewness of a distribution (normal ==> 0.0; >0 means extra
2368weight in left tail).  Use askewtest() to see if it's close enough.
2369Dimension can equal None (ravel array first), an integer (the
2370dimension over which to operate), or a sequence (operate over multiple
2371dimensions).
2372
2373Usage:   askew(a, dimension=None)
2374Returns: skew of vals in a along dimension, returning ZERO where all vals equal
2375"""
2376    denom = N.power(amoment(a,2,dimension),1.5)
2377    zero = N.equal(denom,0)
2378    if type(denom) == N.ArrayType and asum(zero) <> 0:
2379        print "Number of zeros in askew: ",asum(zero)
2380    denom = denom + zero  # prevent divide-by-zero
2381    return N.where(zero, 0, amoment(a,3,dimension)/denom)
2382
2383
2384 def akurtosis(a,dimension=None):
2385    """
2386Returns the kurtosis of a distribution (normal ==> 3.0; >3 means
2387heavier in the tails, and usually more peaked).  Use akurtosistest()
2388to see if it's close enough.  Dimension can equal None (ravel array
2389first), an integer (the dimension over which to operate), or a
2390sequence (operate over multiple dimensions).
2391
2392Usage:   akurtosis(a,dimension=None)
2393Returns: kurtosis of values in a along dimension, and ZERO where all vals equal
2394"""
2395    denom = N.power(amoment(a,2,dimension),2)
2396    zero = N.equal(denom,0)
2397    if type(denom) == N.ArrayType and asum(zero) <> 0:
2398        print "Number of zeros in akurtosis: ",asum(zero)
2399    denom = denom + zero  # prevent divide-by-zero
2400    return N.where(zero,0,amoment(a,4,dimension)/denom)
2401
2402
2403 def adescribe(inarray,dimension=None):
2404     """
2405Returns several descriptive statistics of the passed array.  Dimension
2406can equal None (ravel array first), an integer (the dimension over
2407which to operate), or a sequence (operate over multiple dimensions).
2408
2409Usage:   adescribe(inarray,dimension=None)
2410Returns: n, (min,max), mean, standard deviation, skew, kurtosis
2411"""
2412     if dimension == None:
2413         inarray = N.ravel(inarray)
2414         dimension = 0
2415     n = inarray.shape[dimension]
2416     mm = (N.minimum.reduce(inarray),N.maximum.reduce(inarray))
2417     m = amean(inarray,dimension)
2418     sd = astdev(inarray,dimension)
2419     skew = askew(inarray,dimension)
2420     kurt = akurtosis(inarray,dimension)
2421     return n, mm, m, sd, skew, kurt
2422
2423
2424#####################################
2425########  NORMALITY TESTS  ##########
2426#####################################
2427
2428 def askewtest(a,dimension=None):
2429    """
2430Tests whether the skew is significantly different from a normal
2431distribution.  Dimension can equal None (ravel array first), an
2432integer (the dimension over which to operate), or a sequence (operate
2433over multiple dimensions).
2434
2435Usage:   askewtest(a,dimension=None)
2436Returns: z-score and 2-tail z-probability
2437"""
2438    if dimension == None:
2439        a = N.ravel(a)
2440        dimension = 0
2441    b2 = askew(a,dimension)
2442    n = float(a.shape[dimension])
2443    y = b2 * N.sqrt(((n+1)*(n+3)) / (6.0*(n-2)) )
2444    beta2 = ( 3.0*(n*n+27*n-70)*(n+1)*(n+3) ) / ( (n-2.0)*(n+5)*(n+7)*(n+9) )
2445    W2 = -1 + N.sqrt(2*(beta2-1))
2446    delta = 1/N.sqrt(N.log(N.sqrt(W2)))
2447    alpha = N.sqrt(2/(W2-1))
2448    y = N.where(N.equal(y,0),1,y)
2449    Z = delta*N.log(y/alpha + N.sqrt((y/alpha)**2+1))
2450    return Z, (1.0-zprob(Z))*2
2451
2452
2453 def akurtosistest(a,dimension=None):
2454    """
2455Tests whether a dataset has normal kurtosis (i.e.,
2456kurtosis=3(n-1)/(n+1)) Valid only for n>20.  Dimension can equal None
2457(ravel array first), an integer (the dimension over which to operate),
2458or a sequence (operate over multiple dimensions).
2459
2460Usage:   akurtosistest(a,dimension=None)
2461Returns: z-score and 2-tail z-probability, returns 0 for bad pixels
2462"""
2463    if dimension == None:
2464        a = N.ravel(a)
2465        dimension = 0
2466    n = float(a.shape[dimension])
2467    if n<20:
2468        print "akurtosistest only valid for n>=20 ... continuing anyway, n=",n
2469    b2 = akurtosis(a,dimension)
2470    E = 3.0*(n-1) /(n+1)
2471    varb2 = 24.0*n*(n-2)*(n-3) / ((n+1)*(n+1)*(n+3)*(n+5))
2472    x = (b2-E)/N.sqrt(varb2)
2473    sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * N.sqrt((6.0*(n+3)*(n+5))/
2474                                                       (n*(n-2)*(n-3)))
2475    A = 6.0 + 8.0/sqrtbeta1 *(2.0/sqrtbeta1 + N.sqrt(1+4.0/(sqrtbeta1**2)))
2476    term1 = 1 -2/(9.0*A)
2477    denom = 1 +x*N.sqrt(2/(A-4.0))
2478    denom = N.where(N.less(denom,0), 99, denom)
2479    term2 = N.where(N.equal(denom,0), term1, N.power((1-2.0/A)/denom,1/3.0))
2480    Z = ( term1 - term2 ) / N.sqrt(2/(9.0*A))
2481    Z = N.where(N.equal(denom,99), 0, Z)
2482    return Z, (1.0-zprob(Z))*2
2483
2484
2485 def anormaltest(a,dimension=None):
2486    """
2487Tests whether skew and/OR kurtosis of dataset differs from normal
2488curve.  Can operate over multiple dimensions.  Dimension can equal
2489None (ravel array first), an integer (the dimension over which to
2490operate), or a sequence (operate over multiple dimensions).
2491
2492Usage:   anormaltest(a,dimension=None)
2493Returns: z-score and 2-tail probability
2494"""
2495    if dimension == None:
2496        a = N.ravel(a)
2497        dimension = 0
2498    s,p = askewtest(a,dimension)
2499    k,p = akurtosistest(a,dimension)
2500    k2 = N.power(s,2) + N.power(k,2)
2501    return k2, achisqprob(k2,2)
2502
2503
2504#####################################
2505######  AFREQUENCY FUNCTIONS  #######
2506#####################################
2507
2508 def aitemfreq(a):
2509    """
2510Returns a 2D array of item frequencies.  Column 1 contains item values,
2511column 2 contains their respective counts.  Assumes a 1D array is passed.
2512
2513Usage:   aitemfreq(a)
2514Returns: a 2D frequency table (col [0:n-1]=scores, col n=frequencies)
2515"""
2516    scores = pstat.aunique(a)
2517    scores = N.sort(scores)
2518    freq = N.zeros(len(scores))
2519    for i in range(len(scores)):
2520        freq[i] = N.add.reduce(N.equal(a,scores[i]))
2521    return N.array(pstat.aabut(scores, freq))
2522
2523
2524 def ascoreatpercentile (inarray, percent):
2525    """
2526Usage:   ascoreatpercentile(inarray,percent)   0<percent<100
2527Returns: score at given percentile, relative to inarray distribution
2528"""
2529    percent = percent / 100.0
2530    targetcf = percent*len(inarray)
2531    h, lrl, binsize, extras = histogram(inarray)
2532    cumhist = cumsum(h*1)
2533    for i in range(len(cumhist)):
2534        if cumhist[i] >= targetcf:
2535            break
2536    score = binsize * ((targetcf - cumhist[i-1]) / float(h[i])) + (lrl+binsize*i)
2537    return score
2538
2539
2540 def apercentileofscore (inarray,score,histbins=10,defaultlimits=None):
2541    """
2542Note: result of this function depends on the values used to histogram
2543the data(!).
2544
2545Usage:   apercentileofscore(inarray,score,histbins=10,defaultlimits=None)
2546Returns: percentile-position of score (0-100) relative to inarray
2547"""
2548    h, lrl, binsize, extras = histogram(inarray,histbins,defaultlimits)
2549    cumhist = cumsum(h*1)
2550    i = int((score - lrl)/float(binsize))
2551    pct = (cumhist[i-1]+((score-(lrl+binsize*i))/float(binsize))*h[i])/float(len(inarray)) * 100
2552    return pct
2553
2554
2555 def ahistogram (inarray,numbins=10,defaultlimits=None,printextras=1):
2556    """
2557Returns (i) an array of histogram bin counts, (ii) the smallest value
2558of the histogram binning, and (iii) the bin width (the last 2 are not
2559necessarily integers).  Default number of bins is 10.  Defaultlimits
2560can be None (the routine picks bins spanning all the numbers in the
2561inarray) or a 2-sequence (lowerlimit, upperlimit).  Returns all of the
2562following: array of bin values, lowerreallimit, binsize, extrapoints.
2563
2564Usage:   ahistogram(inarray,numbins=10,defaultlimits=None,printextras=1)
2565Returns: (array of bin counts, bin-minimum, min-width, #-points-outside-range)
2566"""
2567    inarray = N.ravel(inarray)               # flatten any >1D arrays
2568    if (defaultlimits <> None):
2569        lowerreallimit = defaultlimits[0]
2570        upperreallimit = defaultlimits[1]
2571        binsize = (upperreallimit-lowerreallimit) / float(numbins)
2572    else:
2573        Min = N.minimum.reduce(inarray)
2574        Max = N.maximum.reduce(inarray)
2575        estbinwidth = float(Max - Min)/float(numbins) + 1
2576        binsize = (Max-Min+estbinwidth)/float(numbins)
2577        lowerreallimit = Min - binsize/2.0  #lower real limit,1st bin
2578    bins = N.zeros(numbins)
2579    extrapoints = 0
2580    for num in inarray:
2581        try:
2582            if (num-lowerreallimit) < 0:
2583                extrapoints = extrapoints + 1
2584            else:
2585                bintoincrement = int((num-lowerreallimit) / float(binsize))
2586                bins[bintoincrement] = bins[bintoincrement] + 1
2587        except:                           # point outside lower/upper limits
2588            extrapoints = extrapoints + 1
2589    if (extrapoints > 0 and printextras == 1):
2590        print '\nPoints outside given histogram range =',extrapoints
2591    return (bins, lowerreallimit, binsize, extrapoints)
2592
2593
2594 def acumfreq(a,numbins=10,defaultreallimits=None):
2595    """
2596Returns a cumulative frequency histogram, using the histogram function.
2597Defaultreallimits can be None (use all data), or a 2-sequence containing
2598lower and upper limits on values to include.
2599
2600Usage:   acumfreq(a,numbins=10,defaultreallimits=None)
2601Returns: array of cumfreq bin values, lowerreallimit, binsize, extrapoints
2602"""
2603    h,l,b,e = histogram(a,numbins,defaultreallimits)
2604    cumhist = cumsum(h*1)
2605    return cumhist,l,b,e
2606
2607
2608 def arelfreq(a,numbins=10,defaultreallimits=None):
2609    """
2610Returns a relative frequency histogram, using the histogram function.
2611Defaultreallimits can be None (use all data), or a 2-sequence containing
2612lower and upper limits on values to include.
2613
2614Usage:   arelfreq(a,numbins=10,defaultreallimits=None)
2615Returns: array of cumfreq bin values, lowerreallimit, binsize, extrapoints
2616"""
2617    h,l,b,e = histogram(a,numbins,defaultreallimits)
2618    h = N.array(h/float(a.shape[0]))
2619    return h,l,b,e
2620
2621
2622#####################################
2623######  AVARIABILITY FUNCTIONS  #####
2624#####################################
2625
2626 def aobrientransform(*args):
2627    """
2628Computes a transform on input data (any number of columns).  Used to
2629test for homogeneity of variance prior to running one-way stats.  Each
2630array in *args is one level of a factor.  If an F_oneway() run on the
2631transformed data and found significant, variances are unequal.   From
2632Maxwell and Delaney, p.112.
2633
2634Usage:   aobrientransform(*args)    *args = 1D arrays, one per level of factor
2635Returns: transformed data for use in an ANOVA
2636"""
2637    TINY = 1e-10
2638    k = len(args)
2639    n = N.zeros(k,N.Float)
2640    v = N.zeros(k,N.Float)
2641    m = N.zeros(k,N.Float)
2642    nargs = []
2643    for i in range(k):
2644        nargs.append(args[i].astype(N.Float))
2645        n[i] = float(len(nargs[i]))
2646        v[i] = var(nargs[i])
2647        m[i] = mean(nargs[i])
2648    for j in range(k):
2649        for i in range(n[j]):
2650            t1 = (n[j]-1.5)*n[j]*(nargs[j][i]-m[j])**2
2651            t2 = 0.5*v[j]*(n[j]-1.0)
2652            t3 = (n[j]-1.0)*(n[j]-2.0)
2653            nargs[j][i] = (t1-t2) / float(t3)
2654    check = 1
2655    for j in range(k):
2656        if v[j] - mean(nargs[j]) > TINY:
2657            check = 0
2658    if check <> 1:
2659        raise ValueError, 'Lack of convergence in obrientransform.'
2660    else:
2661        return N.array(nargs)
2662
2663
2664 def asamplevar (inarray,dimension=None,keepdims=0):
2665    """
2666Returns the sample standard deviation of the values in the passed
2667array (i.e., using N).  Dimension can equal None (ravel array first),
2668an integer (the dimension over which to operate), or a sequence
2669(operate over multiple dimensions).  Set keepdims=1 to return an array
2670with the same number of dimensions as inarray.
2671
2672Usage:   asamplevar(inarray,dimension=None,keepdims=0)
2673"""
2674    if dimension == None:
2675        inarray = N.ravel(inarray)
2676        dimension = 0
2677    if dimension == 1:
2678        mn = amean(inarray,dimension)[:,N.NewAxis]
2679    else:
2680        mn = amean(inarray,dimension,keepdims=1)
2681    deviations = inarray - mn
2682    if type(dimension) == ListType:
2683        n = 1
2684        for d in dimension:
2685            n = n*inarray.shape[d]
2686    else:
2687        n = inarray.shape[dimension]
2688    svar = ass(deviations,dimension,keepdims) / float(n)
2689    return svar
2690
2691
2692 def asamplestdev (inarray, dimension=None, keepdims=0):
2693    """
2694Returns the sample standard deviation of the values in the passed
2695array (i.e., using N).  Dimension can equal None (ravel array first),
2696an integer (the dimension over which to operate), or a sequence
2697(operate over multiple dimensions).  Set keepdims=1 to return an array
2698with the same number of dimensions as inarray.
2699
2700Usage:   asamplestdev(inarray,dimension=None,keepdims=0)
2701"""
2702    return N.sqrt(asamplevar(inarray,dimension,keepdims))
2703
2704
2705 def asignaltonoise(instack,dimension=0):
2706    """
2707Calculates signal-to-noise.  Dimension can equal None (ravel array
2708first), an integer (the dimension over which to operate), or a
2709sequence (operate over multiple dimensions).
2710
2711Usage:   asignaltonoise(instack,dimension=0):
2712Returns: array containing the value of (mean/stdev) along dimension,
2713         or 0 when stdev=0
2714"""
2715    m = mean(instack,dimension)
2716    sd = stdev(instack,dimension)
2717    return N.where(N.equal(sd,0),0,m/sd)
2718
2719
2720 def avar (inarray, dimension=None,keepdims=0):
2721    """
2722Returns the estimated population variance of the values in the passed
2723array (i.e., N-1).  Dimension can equal None (ravel array first), an
2724integer (the dimension over which to operate), or a sequence (operate
2725over multiple dimensions).  Set keepdims=1 to return an array with the
2726same number of dimensions as inarray.
2727
2728Usage:   avar(inarray,dimension=None,keepdims=0)
2729"""
2730    if dimension == None:
2731        inarray = N.ravel(inarray)
2732        dimension = 0
2733    mn = amean(inarray,dimension,1)
2734    deviations = inarray - mn
2735    if type(dimension) == ListType:
2736        n = 1
2737        for d in dimension:
2738            n = n*inarray.shape[d]
2739    else:
2740        n = inarray.shape[dimension]
2741    var = ass(deviations,dimension,keepdims)/float(n-1)
2742    return var
2743
2744
2745 def astdev (inarray, dimension=None, keepdims=0):
2746    """
2747Returns the estimated population standard deviation of the values in
2748the passed array (i.e., N-1).  Dimension can equal None (ravel array
2749first), an integer (the dimension over which to operate), or a
2750sequence (operate over multiple dimensions).  Set keepdims=1 to return
2751an array with the same number of dimensions as inarray.
2752
2753Usage:   astdev(inarray,dimension=None,keepdims=0)
2754"""
2755    return N.sqrt(avar(inarray,dimension,keepdims))
2756
2757
2758 def asterr (inarray, dimension=None, keepdims=0):
2759    """
2760Returns the estimated population standard error of the values in the
2761passed array (i.e., N-1).  Dimension can equal None (ravel array
2762first), an integer (the dimension over which to operate), or a
2763sequence (operate over multiple dimensions).  Set keepdims=1 to return
2764an array with the same number of dimensions as inarray.
2765
2766Usage:   asterr(inarray,dimension=None,keepdims=0)
2767"""
2768    if dimension == None:
2769        inarray = N.ravel(inarray)
2770        dimension = 0
2771    return astdev(inarray,dimension,keepdims) / float(N.sqrt(inarray.shape[dimension]))
2772
2773
2774 def asem (inarray, dimension=None, keepdims=0):
2775    """
2776Returns the standard error of the mean (i.e., using N) of the values
2777in the passed array.  Dimension can equal None (ravel array first), an
2778integer (the dimension over which to operate), or a sequence (operate
2779over multiple dimensions).  Set keepdims=1 to return an array with the
2780same number of dimensions as inarray.
2781
2782Usage:   asem(inarray,dimension=None, keepdims=0)
2783"""
2784    if dimension == None:
2785        inarray = N.ravel(inarray)
2786        dimension = 0
2787    if type(dimension) == ListType:
2788        n = 1
2789        for d in dimension:
2790            n = n*inarray.shape[d]
2791    else:
2792        n = inarray.shape[dimension]
2793    s = asamplestdev(inarray,dimension,keepdims) / N.sqrt(n-1)
2794    return s
2795
2796
2797 def az (a, score):
2798    """
2799Returns the z-score of a given input score, given thearray from which
2800that score came.  Not appropriate for population calculations, nor for
2801arrays > 1D.
2802
2803Usage:   az(a, score)
2804"""
2805    z = (score-amean(a)) / asamplestdev(a)
2806    return z
2807
2808
2809 def azs (a):
2810    """
2811Returns a 1D array of z-scores, one for each score in the passed array,
2812computed relative to the passed array.
2813
2814Usage:   azs(a)
2815"""
2816    zscores = []
2817    for item in a:
2818        zscores.append(z(a,item))
2819    return N.array(zscores)
2820
2821
2822 def azmap (scores, compare, dimension=0):
2823    """
2824Returns an array of z-scores the shape of scores (e.g., [x,y]), compared to
2825array passed to compare (e.g., [time,x,y]).  Assumes collapsing over dim 0
2826of the compare array.
2827
2828Usage:   azs(scores, compare, dimension=0)
2829"""
2830    mns = amean(compare,dimension)
2831    sstd = asamplestdev(compare,0)
2832    return (scores - mns) / sstd
2833
2834
2835#####################################
2836#######  ATRIMMING FUNCTIONS  #######
2837#####################################
2838
2839 def around(a,digits=1):
2840     """
2841Rounds all values in array a to 'digits' decimal places.
2842
2843Usage:   around(a,digits)
2844Returns: a, where each value is rounded to 'digits' decimals
2845"""
2846     def ar(x,d=digits):
2847         return round(x,d)
2848
2849     if type(a) <> N.ArrayType:
2850         try:
2851             a = N.array(a)
2852         except:
2853             a = N.array(a,'O')
2854     shp = a.shape
2855     if a.typecode() in ['f','F','d','D']:
2856         b = N.ravel(a)
2857         b = N.array(map(ar,b))
2858         b.shape = shp
2859     elif a.typecode() in ['o','O']:
2860         b = N.ravel(a)*1
2861         for i in range(len(b)):
2862             if type(b[i]) == FloatType:
2863                 b[i] = round(b[i],digits)
2864         b.shape = shp
2865     else:  # not a float, double or Object array
2866         b = a*1
2867     return b
2868
2869
2870 def athreshold(a,threshmin=None,threshmax=None,newval=0):
2871    """
2872Like Numeric.clip() except that values <threshmid or >threshmax are replaced
2873by newval instead of by threshmin/threshmax (respectively).
2874
2875Usage:   athreshold(a,threshmin=None,threshmax=None,newval=0)
2876Returns: a, with values <threshmin or >threshmax replaced with newval
2877"""
2878    mask = N.zeros(a.shape)
2879    if threshmin <> None:
2880        mask = mask + N.where(N.less(a,threshmin),1,0)
2881    if threshmax <> None:
2882        mask = mask + N.where(N.greater(a,threshmax),1,0)
2883    mask = N.clip(mask,0,1)
2884    return N.where(mask,newval,a)
2885
2886
2887 def atrimboth (a,proportiontocut):
2888    """
2889Slices off the passed proportion of items from BOTH ends of the passed
2890array (i.e., with proportiontocut=0.1, slices 'leftmost' 10% AND
2891'rightmost' 10% of scores.  You must pre-sort the array if you want
2892"proper" trimming.  Slices off LESS if proportion results in a
2893non-integer slice index (i.e., conservatively slices off
2894proportiontocut).
2895
2896Usage:   atrimboth (a,proportiontocut)
2897Returns: trimmed version of array a
2898"""
2899    lowercut = int(proportiontocut*len(a))
2900    uppercut = len(a) - lowercut
2901    return a[lowercut:uppercut]
2902
2903
2904 def atrim1 (a,proportiontocut,tail='right'):
2905    """
2906Slices off the passed proportion of items from ONE end of the passed
2907array (i.e., if proportiontocut=0.1, slices off 'leftmost' or 'rightmost'
290810% of scores).  Slices off LESS if proportion results in a non-integer
2909slice index (i.e., conservatively slices off proportiontocut).
2910
2911Usage:   atrim1(a,proportiontocut,tail='right')  or set tail='left'
2912Returns: trimmed version of array a
2913"""
2914    if string.lower(tail) == 'right':
2915        lowercut = 0
2916        uppercut = len(a) - int(proportiontocut*len(a))
2917    elif string.lower(tail) == 'left':
2918        lowercut = int(proportiontocut*len(a))
2919        uppercut = len(a)
2920    return a[lowercut:uppercut]
2921
2922
2923#####################################
2924#####  ACORRELATION FUNCTIONS  ######
2925#####################################
2926
2927 def acovariance(X):
2928    """
2929Computes the covariance matrix of a matrix X.  Requires a 2D matrix input.
2930
2931Usage:   acovariance(X)
2932Returns: covariance matrix of X
2933"""
2934    if len(X.shape) <> 2:
2935        raise TypeError, "acovariance requires 2D matrices"
2936    n = X.shape[0]
2937    mX = amean(X,0)
2938    return N.dot(N.transpose(X),X) / float(n) - N.multiply.outer(mX,mX)
2939
2940
2941 def acorrelation(X):
2942    """
2943Computes the correlation matrix of a matrix X.  Requires a 2D matrix input.
2944
2945Usage:   acorrelation(X)
2946Returns: correlation matrix of X
2947"""
2948    C = acovariance(X)
2949    V = N.diagonal(C)
2950    return C / N.sqrt(N.multiply.outer(V,V))
2951
2952
2953 def apaired(x,y):
2954    """
2955Interactively determines the type of data in x and y, and then runs the
2956appropriated statistic for paired group data.
2957
2958Usage:   apaired(x,y)     x,y = the two arrays of values to be compared
2959Returns: appropriate statistic name, value, and probability
2960"""
2961    samples = ''
2962    while samples not in ['i','r','I','R','c','C']:
2963        print '\nIndependent or related samples, or correlation (i,r,c): ',
2964        samples = raw_input()
2965
2966    if samples in ['i','I','r','R']:
2967        print '\nComparing variances ...',
2968# USE O'BRIEN'S TEST FOR HOMOGENEITY OF VARIANCE, Maxwell & delaney, p.112
2969        r = obrientransform(x,y)
2970        f,p = F_oneway(pstat.colex(r,0),pstat.colex(r,1))
2971        if p<0.05:
2972            vartype='unequal, p='+str(round(p,4))
2973        else:
2974            vartype='equal'
2975        print vartype
2976        if samples in ['i','I']:
2977            if vartype[0]=='e':
2978                t,p = ttest_ind(x,y,None,0)
2979                print '\nIndependent samples t-test:  ', round(t,4),round(p,4)
2980            else:
2981                if len(x)>20 or len(y)>20:
2982                    z,p = ranksums(x,y)
2983                    print '\nRank Sums test (NONparametric, n>20):  ', round(z,4),round(p,4)
2984                else:
2985                    u,p = mannwhitneyu(x,y)
2986                    print '\nMann-Whitney U-test (NONparametric, ns<20):  ', round(u,4),round(p,4)
2987
2988        else:  # RELATED SAMPLES
2989            if vartype[0]=='e':
2990                t,p = ttest_rel(x,y,0)
2991                print '\nRelated samples t-test:  ', round(t,4),round(p,4)
2992            else:
2993                t,p = ranksums(x,y)
2994                print '\nWilcoxon T-test (NONparametric):  ', round(t,4),round(p,4)
2995    else:  # CORRELATION ANALYSIS
2996        corrtype = ''
2997        while corrtype not in ['c','C','r','R','d','D']:
2998            print '\nIs the data Continuous, Ranked, or Dichotomous (c,r,d): ',
2999            corrtype = raw_input()
3000        if corrtype in ['c','C']:
3001            m,b,r,p,see = linregress(x,y)
3002            print '\nLinear regression for continuous variables ...'
3003            lol = [['Slope','Intercept','r','Prob','SEestimate'],[round(m,4),round(b,4),round(r,4),round(p,4),round(see,4)]]
3004            pstat.printcc(lol)
3005        elif corrtype in ['r','R']:
3006            r,p = spearmanr(x,y)
3007            print '\nCorrelation for ranked variables ...'
3008            print "Spearman's r: ",round(r,4),round(p,4)
3009        else: # DICHOTOMOUS
3010            r,p = pointbiserialr(x,y)
3011            print '\nAssuming x contains a dichotomous variable ...'
3012            print 'Point Biserial r: ',round(r,4),round(p,4)
3013    print '\n\n'
3014    return None
3015
3016
3017 def apearsonr(x,y,verbose=1):
3018    """
3019Calculates a Pearson correlation coefficient and returns p.  Taken
3020from Heiman's Basic Statistics for the Behav. Sci (2nd), p.195.
3021
3022Usage:   apearsonr(x,y,verbose=1)      where x,y are equal length arrays
3023Returns: Pearson's r, two-tailed p-value
3024"""
3025    TINY = 1.0e-20
3026    n = len(x)
3027    xmean = amean(x)
3028    ymean = amean(y)
3029    r_num = n*(N.add.reduce(x*y)) - N.add.reduce(x)*N.add.reduce(y)
3030    r_den = math.sqrt((n*ass(x) - asquare_of_sums(x))*(n*ass(y)-asquare_of_sums(y)))
3031    r = (r_num / r_den)
3032    df = n-2
3033    t = r*math.sqrt(df/((1.0-r+TINY)*(1.0+r+TINY)))
3034    prob = abetai(0.5*df,0.5,df/(df+t*t),verbose)
3035    return r,prob
3036
3037
3038 def aspearmanr(x,y):
3039    """
3040Calculates a Spearman rank-order correlation coefficient.  Taken
3041from Heiman's Basic Statistics for the Behav. Sci (1st), p.192.
3042
3043Usage:   aspearmanr(x,y)      where x,y are equal-length arrays
3044Returns: Spearman's r, two-tailed p-value
3045"""
3046    TINY = 1e-30
3047    n = len(x)
3048    rankx = rankdata(x)
3049    ranky = rankdata(y)
3050    dsq = N.add.reduce((rankx-ranky)**2)
3051    rs = 1 - 6*dsq / float(n*(n**2-1))
3052    t = rs * math.sqrt((n-2) / ((rs+1.0)*(1.0-rs)))
3053    df = n-2
3054    probrs = abetai(0.5*df,0.5,df/(df+t*t))
3055# probability values for rs are from part 2 of the spearman function in
3056# Numerical Recipies, p.510.  They close to tables, but not exact.(?)
3057    return rs, probrs
3058
3059
3060 def apointbiserialr(x,y):
3061    """
3062Calculates a point-biserial correlation coefficient and the associated
3063probability value.  Taken from Heiman's Basic Statistics for the Behav.
3064Sci (1st), p.194.
3065
3066Usage:   apointbiserialr(x,y)      where x,y are equal length arrays
3067Returns: Point-biserial r, two-tailed p-value
3068"""
3069    TINY = 1e-30
3070    categories = pstat.aunique(x)
3071    data = pstat.aabut(x,y)
3072    if len(categories) <> 2:
3073        raise ValueError, "Exactly 2 categories required (in x) for pointbiserialr()."
3074    else:   # there are 2 categories, continue
3075        codemap = pstat.aabut(categories,N.arange(2))
3076        recoded = pstat.arecode(data,codemap,0)
3077        x = pstat.alinexand(data,0,categories[0])
3078        y = pstat.alinexand(data,0,categories[1])
3079        xmean = amean(pstat.acolex(x,1))
3080        ymean = amean(pstat.acolex(y,1))
3081        n = len(data)
3082        adjust = math.sqrt((len(x)/float(n))*(len(y)/float(n)))
3083        rpb = (ymean - xmean)/asamplestdev(pstat.acolex(data,1))*adjust
3084        df = n-2
3085        t = rpb*math.sqrt(df/((1.0-rpb+TINY)*(1.0+rpb+TINY)))
3086        prob = abetai(0.5*df,0.5,df/(df+t*t))
3087        return rpb, prob
3088
3089
3090 def akendalltau(x,y):
3091    """
3092Calculates Kendall's tau ... correlation of ordinal data.  Adapted
3093from function kendl1 in Numerical Recipies.  Needs good test-cases.@@@
3094
3095Usage:   akendalltau(x,y)
3096Returns: Kendall's tau, two-tailed p-value
3097"""
3098    n1 = 0
3099    n2 = 0
3100    iss = 0
3101    for j in range(len(x)-1):
3102        for k in range(j,len(y)):
3103            a1 = x[j] - x[k]
3104            a2 = y[j] - y[k]
3105            aa = a1 * a2
3106            if (aa):             # neither array has a tie
3107                n1 = n1 + 1
3108                n2 = n2 + 1
3109                if aa > 0:
3110                    iss = iss + 1
3111                else:
3112                    iss = iss -1
3113            else:
3114                if (a1):
3115                    n1 = n1 + 1
3116                else:
3117                    n2 = n2 + 1
3118    tau = iss / math.sqrt(n1*n2)
3119    svar = (4.0*len(x)+10.0) / (9.0*len(x)*(len(x)-1))
3120    z = tau / math.sqrt(svar)
3121    prob = erfcc(abs(z)/1.4142136)
3122    return tau, prob
3123
3124
3125 def alinregress(*args):
3126    """
3127Calculates a regression line on two arrays, x and y, corresponding to x,y
3128pairs.  If a single 2D array is passed, alinregress finds dim with 2 levels
3129and splits data into x,y pairs along that dim.
3130
3131Usage:   alinregress(*args)    args=2 equal-length arrays, or one 2D array
3132Returns: slope, intercept, r, two-tailed prob, sterr-of-the-estimate
3133"""
3134    TINY = 1.0e-20
3135    if len(args) == 1:  # more than 1D array?
3136        args = args[0]
3137        if len(args) == 2:
3138            x = args[0]
3139            y = args[1]
3140        else:
3141            x = args[:,0]
3142            y = args[:,1]
3143    else:
3144        x = args[0]
3145        y = args[1]
3146    n = len(x)
3147    xmean = amean(x)
3148    ymean = amean(y)
3149    r_num = n*(N.add.reduce(x*y)) - N.add.reduce(x)*N.add.reduce(y)
3150    r_den = math.sqrt((n*ass(x) - asquare_of_sums(x))*(n*ass(y)-asquare_of_sums(y)))
3151    r = r_num / r_den
3152    z = 0.5*math.log((1.0+r+TINY)/(1.0-r+TINY))
3153    df = n-2
3154    t = r*math.sqrt(df/((1.0-r+TINY)*(1.0+r+TINY)))
3155    prob = abetai(0.5*df,0.5,df/(df+t*t))
3156    slope = r_num / (float(n)*ass(x) - asquare_of_sums(x))
3157    intercept = ymean - slope*xmean
3158    sterrest = math.sqrt(1-r*r)*asamplestdev(y)
3159    return slope, intercept, r, prob, sterrest
3160
3161
3162#####################################
3163#####  AINFERENTIAL STATISTICS  #####
3164#####################################
3165
3166 def attest_1samp(a,popmean,printit=0,name='Sample',writemode='a'):
3167    """
3168Calculates the t-obtained for the independent samples T-test on ONE group
3169of scores a, given a population mean.  If printit=1, results are printed
3170to the screen.  If printit='filename', the results are output to 'filename'
3171using the given writemode (default=append).  Returns t-value, and prob.
3172
3173Usage:   attest_1samp(a,popmean,Name='Sample',printit=0,writemode='a')
3174Returns: t-value, two-tailed prob
3175"""
3176    if type(a) != N.ArrayType:
3177        a = N.array(a)
3178    x = amean(a)
3179    v = avar(a)
3180    n = len(a)
3181    df = n-1
3182    svar = ((n-1)*v) / float(df)
3183    t = (x-popmean)/math.sqrt(svar*(1.0/n))
3184    prob = abetai(0.5*df,0.5,df/(df+t*t))
3185
3186    if printit <> 0:
3187        statname = 'Single-sample T-test.'
3188        outputpairedstats(printit,writemode,
3189                          'Population','--',popmean,0,0,0,
3190                          name,n,x,v,N.minimum.reduce(N.ravel(a)),
3191                          N.maximum.reduce(N.ravel(a)),
3192                          statname,t,prob)
3193    return t,prob
3194
3195
3196 def attest_ind (a, b, dimension=None, printit=0, name1='Samp1', name2='Samp2',writemode='a'):
3197    """
3198Calculates the t-obtained T-test on TWO INDEPENDENT samples of scores
3199a, and b.  From Numerical Recipies, p.483.  If printit=1, results are
3200printed to the screen.  If printit='filename', the results are output
3201to 'filename' using the given writemode (default=append).  Dimension
3202can equal None (ravel array first), or an integer (the dimension over
3203which to operate on a and b).
3204
3205Usage:   attest_ind (a,b,dimension=None,printit=0,
3206                     Name1='Samp1',Name2='Samp2',writemode='a')
3207Returns: t-value, two-tailed p-value
3208"""
3209    if dimension == None:
3210        a = N.ravel(a)
3211        b = N.ravel(b)
3212        dimension = 0
3213    x1 = amean(a,dimension)
3214    x2 = amean(b,dimension)
3215    v1 = avar(a,dimension)
3216    v2 = avar(b,dimension)
3217    n1 = a.shape[dimension]
3218    n2 = b.shape[dimension]
3219    df = n1+n2-2
3220    svar = ((n1-1)*v1+(n2-1)*v2) / float(df)
3221    zerodivproblem = N.equal(svar,0)
3222    svar = N.where(zerodivproblem,1,svar)  # avoid zero-division in 1st place
3223    t = (x1-x2)/N.sqrt(svar*(1.0/n1 + 1.0/n2))  # N-D COMPUTATION HERE!!!!!!
3224    t = N.where(zerodivproblem,1.0,t)     # replace NaN/wrong t-values with 1.0
3225    probs = abetai(0.5*df,0.5,float(df)/(df+t*t))
3226
3227    if type(t) == N.ArrayType:
3228        probs = N.reshape(probs,t.shape)
3229    if len(probs) == 1:
3230        probs = probs[0]
3231       
3232    if printit <> 0:
3233        if type(t) == N.ArrayType:
3234            t = t[0]
3235        if type(probs) == N.ArrayType:
3236            probs = probs[0]
3237        statname = 'Independent samples T-test.'
3238        outputpairedstats(printit,writemode,
3239                          name1,n1,x1,v1,N.minimum.reduce(N.ravel(a)),
3240                          N.maximum.reduce(N.ravel(a)),
3241                          name2,n2,x2,v2,N.minimum.reduce(N.ravel(b)),
3242                          N.maximum.reduce(N.ravel(b)),
3243                          statname,t,probs)
3244        return
3245    return t, probs
3246
3247
3248 def attest_rel (a,b,dimension=None,printit=0,name1='Samp1',name2='Samp2',writemode='a'):
3249    """
3250Calculates the t-obtained T-test on TWO RELATED samples of scores, a
3251and b.  From Numerical Recipies, p.483.  If printit=1, results are
3252printed to the screen.  If printit='filename', the results are output
3253to 'filename' using the given writemode (default=append).  Dimension
3254can equal None (ravel array first), or an integer (the dimension over
3255which to operate on a and b).
3256
3257Usage:   attest_rel(a,b,dimension=None,printit=0,
3258                    name1='Samp1',name2='Samp2',writemode='a')
3259Returns: t-value, two-tailed p-value
3260"""
3261    if dimension == None:
3262        a = N.ravel(a)
3263        b = N.ravel(b)
3264        dimension = 0
3265    if len(a)<>len(b):
3266        raise ValueError, 'Unequal length arrays.'
3267    x1 = amean(a,dimension)
3268    x2 = amean(b,dimension)
3269    v1 = avar(a,dimension)
3270    v2 = avar(b,dimension)
3271    n = a.shape[dimension]
3272    df = float(n-1)
3273    d = (a-b).astype('d')
3274
3275    denom = N.sqrt((n*N.add.reduce(d*d,dimension) - N.add.reduce(d,dimension)**2) /df)
3276    zerodivproblem = N.equal(denom,0)
3277    denom = N.where(zerodivproblem,1,denom)  # avoid zero-division in 1st place
3278    t = N.add.reduce(d,dimension) / denom      # N-D COMPUTATION HERE!!!!!!
3279    t = N.where(zerodivproblem,1.0,t)     # replace NaN/wrong t-values with 1.0
3280    probs = abetai(0.5*df,0.5,float(df)/(df+t*t))
3281    if type(t) == N.ArrayType:
3282        probs = N.reshape(probs,t.shape)
3283    if len(probs) == 1:
3284        probs = probs[0]
3285
3286    if printit <> 0:
3287        statname = 'Related samples T-test.'
3288        outputpairedstats(printit,writemode,
3289                          name1,n,x1,v1,N.minimum.reduce(N.ravel(a)),
3290                          N.maximum.reduce(N.ravel(a)),
3291                          name2,n,x2,v2,N.minimum.reduce(N.ravel(b)),
3292                          N.maximum.reduce(N.ravel(b)),
3293                          statname,t,probs)
3294        return
3295    return t, probs
3296
3297
3298 def achisquare(f_obs,f_exp=None):
3299    """
3300Calculates a one-way chi square for array of observed frequencies and returns
3301the result.  If no expected frequencies are given, the total N is assumed to
3302be equally distributed across all groups.
3303
3304Usage:   achisquare(f_obs, f_exp=None)   f_obs = array of observed cell freq.
3305Returns: chisquare-statistic, associated p-value
3306"""
3307
3308    k = len(f_obs)
3309    if f_exp == None:
3310        f_exp = N.array([sum(f_obs)/float(k)] * len(f_obs),N.Float)
3311    f_exp = f_exp.astype(N.Float)
3312    chisq = N.add.reduce((f_obs-f_exp)**2 / f_exp)
3313    return chisq, chisqprob(chisq, k-1)
3314
3315
3316 def aks_2samp (data1,data2):
3317    """
3318Computes the Kolmogorov-Smirnof statistic on 2 samples.  Modified from
3319Numerical Recipies in C, page 493.  Returns KS D-value, prob.  Not ufunc-
3320like.
3321
3322Usage:   aks_2samp(data1,data2)  where data1 and data2 are 1D arrays
3323Returns: KS D-value, p-value
3324"""
3325    j1 = 0    # N.zeros(data1.shape[1:]) TRIED TO MAKE THIS UFUNC-LIKE
3326    j2 = 0    # N.zeros(data2.shape[1:])
3327    fn1 = 0.0 # N.zeros(data1.shape[1:],N.Float)
3328    fn2 = 0.0 # N.zeros(data2.shape[1:],N.Float)
3329    n1 = data1.shape[0]
3330    n2 = data2.shape[0]
3331    en1 = n1*1
3332    en2 = n2*1
3333    d = N.zeros(data1.shape[1:],N.Float)
3334    data1 = N.sort(data1,0)
3335    data2 = N.sort(data2,0)
3336    while j1 < n1 and j2 < n2:
3337        d1=data1[j1]
3338        d2=data2[j2]
3339        if d1 <= d2:
3340            fn1 = (j1)/float(en1)
3341            j1 = j1 + 1
3342        if d2 <= d1:
3343            fn2 = (j2)/float(en2)
3344            j2 = j2 + 1
3345        dt = (fn2-fn1)
3346        if abs(dt) > abs(d):
3347            d = dt
3348    try:
3349        en = math.sqrt(en1*en2/float(en1+en2))
3350        prob = aksprob((en+0.12+0.11/en)*N.fabs(d))
3351    except:
3352        prob = 1.0
3353    return d, prob
3354
3355
3356 def amannwhitneyu(x,y):
3357    """
3358Calculates a Mann-Whitney U statistic on the provided scores and
3359returns the result.  Use only when the n in each condition is < 20 and
3360you have 2 independent samples of ranks.  REMEMBER: Mann-Whitney U is
3361significant if the u-obtained is LESS THAN or equal to the critical
3362value of U.
3363
3364Usage:   amannwhitneyu(x,y)     where x,y are arrays of values for 2 conditions
3365Returns: u-statistic, one-tailed p-value (i.e., p(z(U)))
3366"""
3367    n1 = len(x)
3368    n2 = len(y)
3369    ranked = rankdata(N.concatenate((x,y)))
3370    rankx = ranked[0:n1]       # get the x-ranks
3371    ranky = ranked[n1:]        # the rest are y-ranks
3372    u1 = n1*n2 + (n1*(n1+1))/2.0 - sum(rankx)  # calc U for x
3373    u2 = n1*n2 - u1                            # remainder is U for y
3374    bigu = max(u1,u2)
3375    smallu = min(u1,u2)
3376    T = math.sqrt(tiecorrect(ranked))  # correction factor for tied scores
3377    if T == 0:
3378        raise ValueError, 'All numbers are identical in amannwhitneyu'
3379    sd = math.sqrt(T*n1*n2*(n1+n2+1)/12.0)
3380    z = abs((bigu-n1*n2/2.0) / sd)  # normal approximation for prob calc
3381    return smallu, 1.0 - zprob(z)
3382
3383
3384 def atiecorrect(rankvals):
3385    """
3386Tie-corrector for ties in Mann Whitney U and Kruskal Wallis H tests.
3387See Siegel, S. (1956) Nonparametric Statistics for the Behavioral
3388Sciences.  New York: McGraw-Hill.  Code adapted from |Stat rankind.c
3389code.
3390
3391Usage:   atiecorrect(rankvals)
3392Returns: T correction factor for U or H
3393"""
3394    sorted,posn = ashellsort(N.array(rankvals))
3395    n = len(sorted)
3396    T = 0.0
3397    i = 0
3398    while (i<n-1):
3399        if sorted[i] == sorted[i+1]:
3400            nties = 1
3401            while (i<n-1) and (sorted[i] == sorted[i+1]):
3402                nties = nties +1
3403                i = i +1
3404            T = T + nties**3 - nties
3405        i = i+1
3406    T = T / float(n**3-n)
3407    return 1.0 - T
3408
3409
3410 def aranksums(x,y):
3411    """
3412Calculates the rank sums statistic on the provided scores and returns
3413the result.
3414
3415Usage:   aranksums(x,y)     where x,y are arrays of values for 2 conditions
3416Returns: z-statistic, two-tailed p-value
3417"""
3418    n1 = len(x)
3419    n2 = len(y)
3420    alldata = N.concatenate((x,y))
3421    ranked = arankdata(alldata)
3422    x = ranked[:n1]
3423    y = ranked[n1:]
3424    s = sum(x)
3425    expected = n1*(n1+n2+1) / 2.0
3426    z = (s - expected) / math.sqrt(n1*n2*(n1+n2+1)/12.0)
3427    prob = 2*(1.0 -zprob(abs(z)))
3428    return z, prob
3429
3430
3431 def awilcoxont(x,y):
3432    """
3433Calculates the Wilcoxon T-test for related samples and returns the
3434result.  A non-parametric T-test.
3435
3436Usage:   awilcoxont(x,y)     where x,y are equal-length arrays for 2 conditions
3437Returns: t-statistic, two-tailed p-value
3438"""
3439    if len(x) <> len(y):
3440        raise ValueError, 'Unequal N in awilcoxont.  Aborting.'
3441    d = x-y
3442    d = N.compress(N.not_equal(d,0),d) # Keep all non-zero differences
3443    count = len(d)
3444    absd = abs(d)
3445    absranked = arankdata(absd)
3446    r_plus = 0.0
3447    r_minus = 0.0
3448    for i in range(len(absd)):
3449        if d[i] < 0:
3450            r_minus = r_minus + absranked[i]
3451        else:
3452            r_plus = r_plus + absranked[i]
3453    wt = min(r_plus, r_minus)
3454    mn = count * (count+1) * 0.25
3455    se =  math.sqrt(count*(count+1)*(2.0*count+1.0)/24.0)
3456    z = math.fabs(wt-mn) / se
3457    z = math.fabs(wt-mn) / se
3458    prob = 2*(1.0 -zprob(abs(z)))
3459    return wt, prob
3460
3461
3462 def akruskalwallish(*args):
3463    """
3464The Kruskal-Wallis H-test is a non-parametric ANOVA for 3 or more
3465groups, requiring at least 5 subjects in each group.  This function
3466calculates the Kruskal-Wallis H and associated p-value for 3 or more
3467independent samples.
3468
3469Usage:   akruskalwallish(*args)     args are separate arrays for 3+ conditions
3470Returns: H-statistic (corrected for ties), associated p-value
3471"""
3472    assert len(args) == 3, "Need at least 3 groups in stats.akruskalwallish()"
3473    args = list(args)
3474    n = [0]*len(args)
3475    n = map(len,args)
3476    all = []
3477    for i in range(len(args)):
3478        all = all + args[i].tolist()
3479    ranked = rankdata(all)
3480    T = tiecorrect(ranked)
3481    for i in range(len(args)):
3482        args[i] = ranked[0:n[i]]
3483        del ranked[0:n[i]]
3484    rsums = []
3485    for i in range(len(args)):
3486        rsums.append(sum(args[i])**2)
3487        rsums[i] = rsums[i] / float(n[i])
3488    ssbn = sum(rsums)
3489    totaln = sum(n)
3490    h = 12.0 / (totaln*(totaln+1)) * ssbn - 3*(totaln+1)
3491    df = len(args) - 1
3492    if T == 0:
3493        raise ValueError, 'All numbers are identical in akruskalwallish'
3494    h = h / float(T)
3495    return h, chisqprob(h,df)
3496
3497
3498 def afriedmanchisquare(*args):
3499    """
3500Friedman Chi-Square is a non-parametric, one-way within-subjects
3501ANOVA.  This function calculates the Friedman Chi-square test for
3502repeated measures and returns the result, along with the associated
3503probability value.  It assumes 3 or more repeated measures.  Only 3
3504levels requires a minimum of 10 subjects in the study.  Four levels
3505requires 5 subjects per level(??).
3506
3507Usage:   afriedmanchisquare(*args)   args are separate arrays for 2+ conditions
3508Returns: chi-square statistic, associated p-value
3509"""
3510    k = len(args)
3511    if k < 3:
3512        raise ValueError, '\nLess than 3 levels.  Friedman test not appropriate.\n'
3513    n = len(args[0])
3514    data = apply(pstat.aabut,args)
3515    data = data.astype(N.Float)
3516    for i in range(len(data)):
3517        data[i] = arankdata(data[i])
3518    ssbn = asum(asum(args,1)**2)
3519    chisq = 12.0 / (k*n*(k+1)) * ssbn - 3*n*(k+1)
3520    return chisq, chisqprob(chisq,k-1)
3521
3522
3523#####################################
3524####  APROBABILITY CALCULATIONS  ####
3525#####################################
3526
3527 def achisqprob(chisq,df):
3528    """
3529Returns the (1-tail) probability value associated with the provided chi-square
3530value and df.  Heavily modified from chisq.c in Gary Perlman's |Stat.  Can
3531handle multiple dimensions.
3532
3533Usage:   achisqprob(chisq,df)    chisq=chisquare stat., df=degrees of freedom
3534"""
3535    BIG = 200.0
3536    def ex(x):
3537        BIG = 200.0
3538        exponents = N.where(N.less(x,-BIG),-BIG,x)
3539        return N.exp(exponents)
3540
3541    if type(chisq) == N.ArrayType:
3542        arrayflag = 1
3543    else:
3544        arrayflag = 0
3545        chisq = N.array([chisq])
3546    if df < 1:
3547        return N.ones(chisq.shape,N.float)
3548    probs = N.zeros(chisq.shape,N.Float)
3549    probs = N.where(N.less_equal(chisq,0),1.0,probs)  # set prob=1 for chisq<0
3550    a = 0.5 * chisq
3551    if df > 1:
3552        y = ex(-a)
3553    if df%2 == 0:
3554        even = 1
3555        s = y*1
3556        s2 = s*1
3557    else:
3558        even = 0
3559        s = 2.0 * azprob(-N.sqrt(chisq))
3560        s2 = s*1
3561    if (df > 2):
3562        chisq = 0.5 * (df - 1.0)
3563        if even:
3564            z = N.ones(probs.shape,N.Float)
3565        else:
3566            z = 0.5 *N.ones(probs.shape,N.Float)
3567        if even:
3568            e = N.zeros(probs.shape,N.Float)
3569        else:
3570            e = N.log(N.sqrt(N.pi)) *N.ones(probs.shape,N.Float)
3571        c = N.log(a)
3572        mask = N.zeros(probs.shape)
3573        a_big = N.greater(a,BIG)
3574        a_big_frozen = -1 *N.ones(probs.shape,N.Float)
3575        totalelements = N.multiply.reduce(N.array(probs.shape))
3576        while asum(mask)<>totalelements:
3577            e = N.log(z) + e
3578            s = s + ex(c*z-a-e)
3579            z = z + 1.0
3580#            print z, e, s
3581            newmask = N.greater(z,chisq)
3582            a_big_frozen = N.where(newmask*N.equal(mask,0)*a_big, s, a_big_frozen)
3583            mask = N.clip(newmask+mask,0,1)
3584        if even:
3585            z = N.ones(probs.shape,N.Float)
3586            e = N.ones(probs.shape,N.Float)
3587        else:
3588            z = 0.5 *N.ones(probs.shape,N.Float)
3589            e = 1.0 / N.sqrt(N.pi) / N.sqrt(a) * N.ones(probs.shape,N.Float)
3590        c = 0.0
3591        mask = N.zeros(probs.shape)
3592        a_notbig_frozen = -1 *N.ones(probs.shape,N.Float)
3593        while asum(mask)<>totalelements:
3594            e = e * (a/z.astype(N.Float))
3595            c = c + e
3596            z = z + 1.0
3597#            print '#2', z, e, c, s, c*y+s2
3598            newmask = N.greater(z,chisq)
3599            a_notbig_frozen = N.where(newmask*N.equal(mask,0)*(1-a_big),
3600                                      c*y+s2, a_notbig_frozen)
3601            mask = N.clip(newmask+mask,0,1)
3602        probs = N.where(N.equal(probs,1),1,
3603                        N.where(N.greater(a,BIG),a_big_frozen,a_notbig_frozen))
3604        return probs
3605    else:
3606        return s
3607
3608
3609 def aerfcc(x):
3610    """
3611Returns the complementary error function erfc(x) with fractional error
3612everywhere less than 1.2e-7.  Adapted from Numerical Recipies.  Can
3613handle multiple dimensions.
3614
3615Usage:   aerfcc(x)
3616"""
3617    z = abs(x)
3618    t = 1.0 / (1.0+0.5*z)
3619    ans = t * N.exp(-z*z-1.26551223 + t*(1.00002368+t*(0.37409196+t*(0.09678418+t*(-0.18628806+t*(0.27886807+t*(-1.13520398+t*(1.48851587+t*(-0.82215223+t*0.17087277)))))))))
3620    return N.where(N.greater_equal(x,0), ans, 2.0-ans)
3621
3622
3623 def azprob(z):
3624    """
3625Returns the area under the normal curve 'to the left of' the given z value.
3626Thus,
3627    for z<0, zprob(z) = 1-tail probability
3628    for z>0, 1.0-zprob(z) = 1-tail probability
3629    for any z, 2.0*(1.0-zprob(abs(z))) = 2-tail probability
3630Adapted from z.c in Gary Perlman's |Stat.  Can handle multiple dimensions.
3631
3632Usage:   azprob(z)    where z is a z-value
3633"""
3634    def yfunc(y):
3635        x = (((((((((((((-0.000045255659 * y
3636                         +0.000152529290) * y -0.000019538132) * y
3637                       -0.000676904986) * y +0.001390604284) * y
3638                     -0.000794620820) * y -0.002034254874) * y
3639                   +0.006549791214) * y -0.010557625006) * y
3640                 +0.011630447319) * y -0.009279453341) * y
3641               +0.005353579108) * y -0.002141268741) * y
3642             +0.000535310849) * y +0.999936657524
3643        return x
3644
3645    def wfunc(w):
3646        x = ((((((((0.000124818987 * w
3647                    -0.001075204047) * w +0.005198775019) * w
3648                  -0.019198292004) * w +0.059054035642) * w
3649                -0.151968751364) * w +0.319152932694) * w
3650              -0.531923007300) * w +0.797884560593) * N.sqrt(w) * 2.0
3651        return x
3652
3653    Z_MAX = 6.0    # maximum meaningful z-value
3654    x = N.zeros(z.shape,N.Float) # initialize
3655    y = 0.5 * N.fabs(z)
3656    x = N.where(N.less(y,1.0),wfunc(y*y),yfunc(y-2.0)) # get x's
3657    x = N.where(N.greater(y,Z_MAX*0.5),1.0,x)          # kill those with big Z
3658    prob = N.where(N.greater(z,0),(x+1)*0.5,(1-x)*0.5)
3659    return prob
3660
3661
3662 def aksprob(alam):
3663     """
3664Returns the probability value for a K-S statistic computed via ks_2samp.
3665Adapted from Numerical Recipies.  Can handle multiple dimensions.
3666
3667Usage:   aksprob(alam)
3668"""
3669     if type(alam) == N.ArrayType:
3670         frozen = -1 *N.ones(alam.shape,N.Float64)
3671         alam = alam.astype(N.Float64)
3672         arrayflag = 1
3673     else:
3674         frozen = N.array(-1.)
3675         alam = N.array(alam,N.Float64)
3676     mask = N.zeros(alam.shape)
3677     fac = 2.0 *N.ones(alam.shape,N.Float)
3678     sum = N.zeros(alam.shape,N.Float)
3679     termbf = N.zeros(alam.shape,N.Float)
3680     a2 = N.array(-2.0*alam*alam,N.Float64)
3681     totalelements = N.multiply.reduce(N.array(mask.shape))
3682     for j in range(1,201):
3683         if asum(mask) == totalelements:
3684             break
3685         exponents = (a2*j*j)
3686         overflowmask = N.less(exponents,-746)
3687         frozen = N.where(overflowmask,0,frozen)
3688         mask = mask+overflowmask
3689         term = fac*N.exp(exponents)
3690         sum = sum + term
3691         newmask = N.where(N.less_equal(abs(term),(0.001*termbf)) +
3692                           N.less(abs(term),1.0e-8*sum), 1, 0)
3693         frozen = N.where(newmask*N.equal(mask,0), sum, frozen)
3694         mask = N.clip(mask+newmask,0,1)
3695         fac = -fac
3696         termbf = abs(term)
3697     if arrayflag:
3698         return N.where(N.equal(frozen,-1), 1.0, frozen)  # 1.0 if doesn't converge
3699     else:
3700         return N.where(N.equal(frozen,-1), 1.0, frozen)[0]  # 1.0 if doesn't converge
3701
3702
3703 def afprob (dfnum, dfden, F):
3704    """
3705Returns the 1-tailed significance level (p-value) of an F statistic
3706given the degrees of freedom for the numerator (dfR-dfF) and the degrees
3707of freedom for the denominator (dfF).  Can handle multiple dims for F.
3708
3709Usage:   afprob(dfnum, dfden, F)   where usually dfnum=dfbn, dfden=dfwn
3710"""
3711    if type(F) == N.ArrayType:
3712        return abetai(0.5*dfden, 0.5*dfnum, dfden/(1.0*dfden+dfnum*F))
3713    else:
3714        return abetai(0.5*dfden, 0.5*dfnum, dfden/float(dfden+dfnum*F))
3715
3716
3717 def abetacf(a,b,x,verbose=1):
3718    """
3719Evaluates the continued fraction form of the incomplete Beta function,
3720betai.  (Adapted from: Numerical Recipies in C.)  Can handle multiple
3721dimensions for x.
3722
3723Usage:   abetacf(a,b,x,verbose=1)
3724"""
3725    ITMAX = 200
3726    EPS = 3.0e-7
3727
3728    arrayflag = 1
3729    if type(x) == N.ArrayType:
3730        frozen = N.ones(x.shape,N.Float) *-1  #start out w/ -1s, should replace all
3731    else:
3732        arrayflag = 0
3733        frozen = N.array([-1])
3734        x = N.array([x])
3735    mask = N.zeros(x.shape)
3736    bm = az = am = 1.0
3737    qab = a+b
3738    qap = a+1.0
3739    qam = a-1.0
3740    bz = 1.0-qab*x/qap
3741    for i in range(ITMAX+1):
3742        if N.sum(N.ravel(N.equal(frozen,-1)))==0:
3743            break
3744        em = float(i+1)
3745        tem = em + em
3746        d = em*(b-em)*x/((qam+tem)*(a+tem))
3747        ap = az + d*am
3748        bp = bz+d*bm
3749        d = -(a+em)*(qab+em)*x/((qap+tem)*(a+tem))
3750        app = ap+d*az
3751        bpp = bp+d*bz
3752        aold = az*1
3753        am = ap/bpp
3754        bm = bp/bpp
3755        az = app/bpp
3756        bz = 1.0
3757        newmask = N.less(abs(az-aold),EPS*abs(az))
3758        frozen = N.where(newmask*N.equal(mask,0), az, frozen)
3759        mask = N.clip(mask+newmask,0,1)
3760    noconverge = asum(N.equal(frozen,-1))
3761    if noconverge <> 0 and verbose:
3762        print 'a or b too big, or ITMAX too small in Betacf for ',noconverge,' elements'
3763    if arrayflag:
3764        return frozen
3765    else:
3766        return frozen[0]
3767
3768
3769 def agammln(xx):
3770    """
3771Returns the gamma function of xx.
3772    Gamma(z) = Integral(0,infinity) of t^(z-1)exp(-t) dt.
3773Adapted from: Numerical Recipies in C.  Can handle multiple dims ... but
3774probably doesn't normally have to.
3775
3776Usage:   agammln(xx)
3777"""
3778    coeff = [76.18009173, -86.50532033, 24.01409822, -1.231739516,
3779             0.120858003e-2, -0.536382e-5]
3780    x = xx - 1.0
3781    tmp = x + 5.5
3782    tmp = tmp - (x+0.5)*N.log(tmp)
3783    ser = 1.0
3784    for j in range(len(coeff)):
3785        x = x + 1
3786        ser = ser + coeff[j]/x
3787    return -tmp + N.log(2.50662827465*ser)
3788
3789
3790 def abetai(a,b,x,verbose=1):
3791    """
3792Returns the incomplete beta function:
3793
3794    I-sub-x(a,b) = 1/B(a,b)*(Integral(0,x) of t^(a-1)(1-t)^(b-1) dt)
3795
3796where a,b>0 and B(a,b) = G(a)*G(b)/(G(a+b)) where G(a) is the gamma
3797function of a.  The continued fraction formulation is implemented
3798here, using the betacf function.  (Adapted from: Numerical Recipies in
3799C.)  Can handle multiple dimensions.
3800
3801Usage:   abetai(a,b,x,verbose=1)
3802"""
3803    TINY = 1e-15
3804    if type(a) == N.ArrayType:
3805        if asum(N.less(x,0)+N.greater(x,1)) <> 0:
3806            raise ValueError, 'Bad x in abetai'
3807    x = N.where(N.equal(x,0),TINY,x)
3808    x = N.where(N.equal(x,1.0),1-TINY,x)
3809
3810    bt = N.where(N.equal(x,0)+N.equal(x,1), 0, -1)
3811    exponents = ( gammln(a+b)-gammln(a)-gammln(b)+a*N.log(x)+b*
3812                  N.log(1.0-x) )
3813    # 746 (below) is the MAX POSSIBLE BEFORE OVERFLOW
3814    exponents = N.where(N.less(exponents,-740),-740,exponents)
3815    bt = N.exp(exponents)
3816    if type(x) == N.ArrayType:
3817        ans = N.where(N.less(x,(a+1)/(a+b+2.0)),
3818                      bt*abetacf(a,b,x,verbose)/float(a),
3819                      1.0-bt*abetacf(b,a,1.0-x,verbose)/float(b))
3820    else:
3821        if x<(a+1)/(a+b+2.0):
3822            ans = bt*abetacf(a,b,x,verbose)/float(a)
3823        else:
3824            ans = 1.0-bt*abetacf(b,a,1.0-x,verbose)/float(b)
3825    return ans
3826
3827
3828#####################################
3829#######  AANOVA CALCULATIONS  #######
3830#####################################
3831
3832 import LinearAlgebra, operator
3833 LA = LinearAlgebra
3834
3835 def aglm(data,para):
3836    """
3837Calculates a linear model fit ... anova/ancova/lin-regress/t-test/etc. Taken
3838from:
3839    Peterson et al. Statistical limitations in functional neuroimaging
3840    I. Non-inferential methods and statistical models.  Phil Trans Royal Soc
3841    Lond B 354: 1239-1260.
3842
3843Usage:   aglm(data,para)
3844Returns: statistic, p-value ???
3845"""
3846    if len(para) <> len(data):
3847        print "data and para must be same length in aglm"
3848        return
3849    n = len(para)
3850    p = pstat.aunique(para)
3851    x = N.zeros((n,len(p)))  # design matrix
3852    for l in range(len(p)):
3853        x[:,l] = N.equal(para,p[l])
3854    b = N.dot(N.dot(LA.inverse(N.dot(N.transpose(x),x)),  # i.e., b=inv(X'X)X'Y
3855                    N.transpose(x)),
3856              data)
3857    diffs = (data - N.dot(x,b))
3858    s_sq = 1./(n-len(p)) * N.dot(N.transpose(diffs), diffs)
3859
3860    if len(p) == 2:  # ttest_ind
3861        c = N.array([1,-1])
3862        df = n-2
3863        fact = asum(1.0/asum(x,0))  # i.e., 1/n1 + 1/n2 + 1/n3 ...
3864        t = N.dot(c,b) / N.sqrt(s_sq*fact)
3865        probs = abetai(0.5*df,0.5,float(df)/(df+t*t))
3866        return t, probs
3867
3868
3869 def aF_oneway(*args):
3870    """
3871Performs a 1-way ANOVA, returning an F-value and probability given
3872any number of groups.  From Heiman, pp.394-7.
3873
3874Usage:   aF_oneway (*args)    where *args is 2 or more arrays, one per
3875                                  treatment group
3876Returns: f-value, probability
3877"""
3878    na = len(args)            # ANOVA on 'na' groups, each in it's own array
3879    means = [0]*na
3880    vars = [0]*na
3881    ns = [0]*na
3882    alldata = []
3883    tmp = map(N.array,args)
3884    means = map(amean,tmp)
3885    vars = map(avar,tmp)
3886    ns = map(len,args)
3887    alldata = N.concatenate(args)
3888    bign = len(alldata)
3889    sstot = ass(alldata)-(asquare_of_sums(alldata)/float(bign))
3890    ssbn = 0
3891    for a in args:
3892        ssbn = ssbn + asquare_of_sums(N.array(a))/float(len(a))
3893    ssbn = ssbn - (asquare_of_sums(alldata)/float(bign))
3894    sswn = sstot-ssbn
3895    dfbn = na-1
3896    dfwn = bign - na
3897    msb = ssbn/float(dfbn)
3898    msw = sswn/float(dfwn)
3899    f = msb/msw
3900    prob = fprob(dfbn,dfwn,f)
3901    return f, prob
3902
3903
3904 def aF_value (ER,EF,dfR,dfF):
3905    """
3906Returns an F-statistic given the following:
3907        ER  = error associated with the null hypothesis (the Restricted model)
3908        EF  = error associated with the alternate hypothesis (the Full model)
3909        dfR = degrees of freedom the Restricted model
3910        dfF = degrees of freedom associated with the Restricted model
3911"""
3912    return ((ER-EF)/float(dfR-dfF) / (EF/float(dfF)))
3913
3914
3915 def outputfstats(Enum, Eden, dfnum, dfden, f, prob):
3916     Enum = round(Enum,3)
3917     Eden = round(Eden,3)
3918     dfnum = round(Enum,3)
3919     dfden = round(dfden,3)
3920     f = round(f,3)
3921     prob = round(prob,3)
3922     suffix = ''                       # for *s after the p-value
3923     if  prob < 0.001:  suffix = '  ***'
3924     elif prob < 0.01:  suffix = '  **'
3925     elif prob < 0.05:  suffix = '  *'
3926     title = [['EF/ER','DF','Mean Square','F-value','prob','']]
3927     lofl = title+[[Enum, dfnum, round(Enum/float(dfnum),3), f, prob, suffix],
3928                   [Eden, dfden, round(Eden/float(dfden),3),'','','']]
3929     pstat.printcc(lofl)
3930     return
3931
3932
3933 def F_value_multivariate(ER, EF, dfnum, dfden):
3934     """
3935Returns an F-statistic given the following:
3936        ER  = error associated with the null hypothesis (the Restricted model)
3937        EF  = error associated with the alternate hypothesis (the Full model)
3938        dfR = degrees of freedom the Restricted model
3939        dfF = degrees of freedom associated with the Restricted model
3940where ER and EF are matrices from a multivariate F calculation.
3941"""
3942     if type(ER) in [IntType, FloatType]:
3943         ER = N.array([[ER]])
3944     if type(EF) in [IntType, FloatType]:
3945         EF = N.array([[EF]])
3946     n_um = (LA.determinant(ER) - LA.determinant(EF)) / float(dfnum)
3947     d_en = LA.determinant(EF) / float(dfden)
3948     return n_um / d_en
3949
3950
3951#####################################
3952#######  ASUPPORT FUNCTIONS  ########
3953#####################################
3954
3955 def asign(a):
3956    """
3957Usage:   asign(a)
3958Returns: array shape of a, with -1 where a<0 and +1 where a>=0
3959"""
3960    a = N.asarray(a)
3961    if ((type(a) == type(1.4)) or (type(a) == type(1))):
3962        return a-a-N.less(a,0)+N.greater(a,0)
3963    else:
3964        return N.zeros(N.shape(a))-N.less(a,0)+N.greater(a,0)
3965
3966
3967 def asum (a, dimension=None,keepdims=0):
3968     """
3969An alternative to the Numeric.add.reduce function, which allows one to
3970(1) collapse over multiple dimensions at once, and/or (2) to retain
3971all dimensions in the original array (squashing one down to size.
3972Dimension can equal None (ravel array first), an integer (the
3973dimension over which to operate), or a sequence (operate over multiple
3974dimensions).  If keepdims=1, the resulting array will have as many
3975dimensions as the input array.
3976
3977Usage:   asum(a, dimension=None, keepdims=0)
3978Returns: array summed along 'dimension'(s), same _number_ of dims if keepdims=1
3979"""
3980     if type(a) == N.ArrayType and a.typecode() in ['l','s','b']:
3981         a = a.astype(N.Float)
3982     if dimension == None:
3983         s = N.sum(N.ravel(a))
3984     elif type(dimension) in [IntType,FloatType]:
3985         s = N.add.reduce(a, dimension)
3986         if keepdims == 1:
3987             shp = list(a.shape)
3988             shp[dimension] = 1
3989             s = N.reshape(s,shp)
3990     else: # must be a SEQUENCE of dims to sum over
3991        dims = list(dimension)
3992        dims.sort()
3993        dims.reverse()
3994        s = a *1.0
3995        for dim in dims:
3996            s = N.add.reduce(s,dim)
3997        if keepdims == 1:
3998            shp = list(a.shape)
3999            for dim in dims:
4000                shp[dim] = 1
4001            s = N.reshape(s,shp)
4002     return s
4003
4004
4005 def acumsum (a,dimension=None):
4006    """
4007Returns an array consisting of the cumulative sum of the items in the
4008passed array.  Dimension can equal None (ravel array first), an
4009integer (the dimension over which to operate), or a sequence (operate
4010over multiple dimensions, but this last one just barely makes sense).
4011
4012Usage:   acumsum(a,dimension=None)
4013"""
4014    if dimension == None:
4015        a = N.ravel(a)
4016        dimension = 0
4017    if type(dimension) in [ListType, TupleType, N.ArrayType]:
4018        dimension = list(dimension)
4019        dimension.sort()
4020        dimension.reverse()
4021        for d in dimension:
4022            a = N.add.accumulate(a,d)
4023        return a
4024    else:
4025        return N.add.accumulate(a,dimension)
4026
4027
4028 def ass(inarray, dimension=None, keepdims=0):
4029    """
4030Squares each value in the passed array, adds these squares & returns
4031the result.  Unfortunate function name. :-) Defaults to ALL values in
4032the array.  Dimension can equal None (ravel array first), an integer
4033(the dimension over which to operate), or a sequence (operate over
4034multiple dimensions).  Set keepdims=1 to maintain the original number
4035of dimensions.
4036
4037Usage:   ass(inarray, dimension=None, keepdims=0)
4038Returns: sum-along-'dimension' for (inarray*inarray)
4039"""
4040    if dimension == None:
4041        inarray = N.ravel(inarray)
4042        dimension = 0
4043    return asum(inarray*inarray,dimension,keepdims)
4044
4045
4046 def asummult (array1,array2,dimension=None,keepdims=0):
4047    """
4048Multiplies elements in array1 and array2, element by element, and
4049returns the sum (along 'dimension') of all resulting multiplications.
4050Dimension can equal None (ravel array first), an integer (the
4051dimension over which to operate), or a sequence (operate over multiple
4052dimensions).  A trivial function, but included for completeness.
4053
4054Usage:   asummult(array1,array2,dimension=None,keepdims=0)
4055"""
4056    if dimension == None:
4057        array1 = N.ravel(array1)
4058        array2 = N.ravel(array2)
4059        dimension = 0
4060    return asum(array1*array2,dimension,keepdims)
4061
4062
4063 def asquare_of_sums(inarray, dimension=None, keepdims=0):
4064    """
4065Adds the values in the passed array, squares that sum, and returns the
4066result.  Dimension can equal None (ravel array first), an integer (the
4067dimension over which to operate), or a sequence (operate over multiple
4068dimensions).  If keepdims=1, the returned array will have the same
4069NUMBER of dimensions as the original.
4070
4071Usage:   asquare_of_sums(inarray, dimension=None, keepdims=0)
4072Returns: the square of the sum over dim(s) in dimension
4073"""
4074    if dimension == None:
4075        inarray = N.ravel(inarray)
4076        dimension = 0
4077    s = asum(inarray,dimension,keepdims)
4078    if type(s) == N.ArrayType:
4079        return s.astype(N.Float)*s
4080    else:
4081        return float(s)*s
4082
4083
4084 def asumdiffsquared(a,b, dimension=None, keepdims=0):
4085    """
4086Takes pairwise differences of the values in arrays a and b, squares
4087these differences, and returns the sum of these squares.  Dimension
4088can equal None (ravel array first), an integer (the dimension over
4089which to operate), or a sequence (operate over multiple dimensions).
4090keepdims=1 means the return shape = len(a.shape) = len(b.shape)
4091
4092Usage:   asumdiffsquared(a,b)
4093Returns: sum[ravel(a-b)**2]
4094"""
4095    if dimension == None:
4096        inarray = N.ravel(a)
4097        dimension = 0
4098    return asum((a-b)**2,dimension,keepdims)
4099
4100
4101 def ashellsort(inarray):
4102    """
4103Shellsort algorithm.  Sorts a 1D-array.
4104
4105Usage:   ashellsort(inarray)
4106Returns: sorted-inarray, sorting-index-vector (for original array)
4107"""
4108    n = len(inarray)
4109    svec = inarray *1.0
4110    ivec = range(n)
4111    gap = n/2   # integer division needed
4112    while gap >0:
4113        for i in range(gap,n):
4114            for j in range(i-gap,-1,-gap):
4115                while j>=0 and svec[j]>svec[j+gap]:
4116                    temp        = svec[j]
4117                    svec[j]     = svec[j+gap]
4118                    svec[j+gap] = temp
4119                    itemp       = ivec[j]
4120                    ivec[j]     = ivec[j+gap]
4121                    ivec[j+gap] = itemp
4122        gap = gap / 2  # integer division needed
4123#    svec is now sorted input vector, ivec has the order svec[i] = vec[ivec[i]]
4124    return svec, ivec
4125
4126
4127 def arankdata(inarray):
4128    """
4129Ranks the data in inarray, dealing with ties appropritely.  Assumes
4130a 1D inarray.  Adapted from Gary Perlman's |Stat ranksort.
4131
4132Usage:   arankdata(inarray)
4133Returns: array of length equal to inarray, containing rank scores
4134"""
4135    n = len(inarray)
4136    svec, ivec = ashellsort(inarray)
4137    sumranks = 0
4138    dupcount = 0
4139    newarray = N.zeros(n,N.Float)
4140    for i in range(n):
4141        sumranks = sumranks + i
4142        dupcount = dupcount + 1
4143        if i==n-1 or svec[i] <> svec[i+1]:
4144            averank = sumranks / float(dupcount) + 1
4145            for j in range(i-dupcount+1,i+1):
4146                newarray[ivec[j]] = averank
4147            sumranks = 0
4148            dupcount = 0
4149    return newarray
4150
4151
4152 def afindwithin(data):
4153    """
4154Returns a binary vector, 1=within-subject factor, 0=between.  Input
4155equals the entire data array (i.e., column 1=random factor, last
4156column = measured values.
4157
4158Usage:   afindwithin(data)     data in |Stat format
4159"""
4160    numfact = len(data[0])-2
4161    withinvec = [0]*numfact
4162    for col in range(1,numfact+1):
4163        rows = pstat.linexand(data,col,pstat.unique(pstat.colex(data,1))[0])  # get 1 level of this factor
4164        if len(pstat.unique(pstat.colex(rows,0))) < len(rows):   # if fewer subjects than scores on this factor
4165            withinvec[col-1] = 1
4166    return withinvec
4167
4168
4169 #########################################################
4170 #########################################################
4171 ######  RE-DEFINE DISPATCHES TO INCLUDE ARRAYS  #########
4172 #########################################################
4173 #########################################################
4174
4175## CENTRAL TENDENCY:
4176 geometricmean = Dispatch ( (lgeometricmean, (ListType, TupleType)),
4177                            (ageometricmean, (N.ArrayType,)) )
4178 harmonicmean = Dispatch ( (lharmonicmean, (ListType, TupleType)),
4179                           (aharmonicmean, (N.ArrayType,)) )
4180 mean = Dispatch ( (lmean, (ListType, TupleType)),
4181                   (amean, (N.ArrayType,)) )
4182 median = Dispatch ( (lmedian, (ListType, TupleType)),
4183                     (amedian, (N.ArrayType,)) )
4184 medianscore = Dispatch ( (lmedianscore, (ListType, TupleType)),
4185                          (amedianscore, (N.ArrayType,)) )
4186 mode = Dispatch ( (lmode, (ListType, TupleType)),
4187                   (amode, (N.ArrayType,)) )
4188 tmean = Dispatch ( (atmean, (N.ArrayType,)) )
4189 tvar = Dispatch ( (atvar, (N.ArrayType,)) )
4190 tstdev = Dispatch ( (atstdev, (N.ArrayType,)) )
4191 tsem = Dispatch ( (atsem, (N.ArrayType,)) )
4192
4193## VARIATION:
4194 moment = Dispatch ( (lmoment, (ListType, TupleType)),
4195                     (amoment, (N.ArrayType,)) )
4196 variation = Dispatch ( (lvariation, (ListType, TupleType)),
4197                        (avariation, (N.ArrayType,)) )
4198 skew = Dispatch ( (lskew, (ListType, TupleType)),
4199                   (askew, (N.ArrayType,)) )
4200 kurtosis = Dispatch ( (lkurtosis, (ListType, TupleType)),
4201                       (akurtosis, (N.ArrayType,)) )
4202 describe = Dispatch ( (ldescribe, (ListType, TupleType)),
4203                       (adescribe, (N.ArrayType,)) )
4204
4205## DISTRIBUTION TESTS
4206
4207 skewtest = Dispatch ( (askewtest, (ListType, TupleType)),
4208                       (askewtest, (N.ArrayType,)) )
4209 kurtosistest = Dispatch ( (akurtosistest, (ListType, TupleType)),
4210                           (akurtosistest, (N.ArrayType,)) )
4211 normaltest = Dispatch ( (anormaltest, (ListType, TupleType)),
4212                         (anormaltest, (N.ArrayType,)) )
4213
4214## FREQUENCY STATS:
4215 itemfreq = Dispatch ( (litemfreq, (ListType, TupleType)),
4216                       (aitemfreq, (N.ArrayType,)) )
4217 scoreatpercentile = Dispatch ( (lscoreatpercentile, (ListType, TupleType)),
4218                                (ascoreatpercentile, (N.ArrayType,)) )
4219 percentileofscore = Dispatch ( (lpercentileofscore, (ListType, TupleType)),
4220                                 (apercentileofscore, (N.ArrayType,)) )
4221 histogram = Dispatch ( (lhistogram, (ListType, TupleType)),
4222                        (ahistogram, (N.ArrayType,)) )
4223 cumfreq = Dispatch ( (lcumfreq, (ListType, TupleType)),
4224                      (acumfreq, (N.ArrayType,)) )
4225 relfreq = Dispatch ( (lrelfreq, (ListType, TupleType)),
4226                      (arelfreq, (N.ArrayType,)) )
4227 
4228## VARIABILITY:
4229 obrientransform = Dispatch ( (lobrientransform, (ListType, TupleType)),
4230                              (aobrientransform, (N.ArrayType,)) )
4231 samplevar = Dispatch ( (lsamplevar, (ListType, TupleType)),
4232                        (asamplevar, (N.ArrayType,)) )
4233 samplestdev = Dispatch ( (lsamplestdev, (ListType, TupleType)),
4234                          (asamplestdev, (N.ArrayType,)) )
4235 signaltonoise = Dispatch( (asignaltonoise, (N.ArrayType,)),)
4236 var = Dispatch ( (lvar, (ListType, TupleType)),
4237                  (avar, (N.ArrayType,)) )
4238 stdev = Dispatch ( (lstdev, (ListType, TupleType)),
4239                    (astdev, (N.ArrayType,)) )
4240 sterr = Dispatch ( (lsterr, (ListType, TupleType)),
4241                    (asterr, (N.ArrayType,)) )
4242 sem = Dispatch ( (lsem, (ListType, TupleType)),
4243                  (asem, (N.ArrayType,)) )
4244 z = Dispatch ( (lz, (ListType, TupleType)),
4245                (az, (N.ArrayType,)) )
4246 zs = Dispatch ( (lzs, (ListType, TupleType)),
4247                 (azs, (N.ArrayType,)) )
4248 
4249## TRIMMING FCNS:
4250 threshold = Dispatch( (athreshold, (N.ArrayType,)),)
4251 trimboth = Dispatch ( (ltrimboth, (ListType, TupleType)),
4252                       (atrimboth, (N.ArrayType,)) )
4253 trim1 = Dispatch ( (ltrim1, (ListType, TupleType)),
4254                    (atrim1, (N.ArrayType,)) )
4255 
4256## CORRELATION FCNS:
4257 paired = Dispatch ( (lpaired, (ListType, TupleType)),
4258                     (apaired, (N.ArrayType,)) )
4259 pearsonr = Dispatch ( (lpearsonr, (ListType, TupleType)),
4260                       (apearsonr, (N.ArrayType,)) )
4261 spearmanr = Dispatch ( (lspearmanr, (ListType, TupleType)),
4262                        (aspearmanr, (N.ArrayType,)) )
4263 pointbiserialr = Dispatch ( (lpointbiserialr, (ListType, TupleType)),
4264                             (apointbiserialr, (N.ArrayType,)) )
4265 kendalltau = Dispatch ( (lkendalltau, (ListType, TupleType)),
4266                         (akendalltau, (N.ArrayType,)) )
4267 linregress = Dispatch ( (llinregress, (ListType, TupleType)),
4268                         (alinregress, (N.ArrayType,)) )
4269 
4270## INFERENTIAL STATS:
4271 ttest_1samp = Dispatch ( (lttest_1samp, (ListType, TupleType)),
4272                          (attest_1samp, (N.ArrayType,)) )
4273 ttest_ind = Dispatch ( (lttest_ind, (ListType, TupleType)),
4274                        (attest_ind, (N.ArrayType,)) )
4275 ttest_rel = Dispatch ( (lttest_rel, (ListType, TupleType)),
4276                        (attest_rel, (N.ArrayType,)) )
4277 chisquare = Dispatch ( (lchisquare, (ListType, TupleType)),
4278                        (achisquare, (N.ArrayType,)) )
4279 ks_2samp = Dispatch ( (lks_2samp, (ListType, TupleType)),
4280                       (aks_2samp, (N.ArrayType,)) )
4281 mannwhitneyu = Dispatch ( (lmannwhitneyu, (ListType, TupleType)),
4282                           (amannwhitneyu, (N.ArrayType,)) )
4283 tiecorrect = Dispatch ( (ltiecorrect, (ListType, TupleType)),
4284                         (atiecorrect, (N.ArrayType,)) )
4285 ranksums = Dispatch ( (lranksums, (ListType, TupleType)),
4286                       (aranksums, (N.ArrayType,)) )
4287 wilcoxont = Dispatch ( (lwilcoxont, (ListType, TupleType)),
4288                        (awilcoxont, (N.ArrayType,)) )
4289 kruskalwallish = Dispatch ( (lkruskalwallish, (ListType, TupleType)),
4290                             (akruskalwallish, (N.ArrayType,)) )
4291 friedmanchisquare = Dispatch ( (lfriedmanchisquare, (ListType, TupleType)),
4292                                (afriedmanchisquare, (N.ArrayType,)) )
4293 
4294## PROBABILITY CALCS:
4295 chisqprob = Dispatch ( (lchisqprob, (IntType, FloatType)),
4296                        (achisqprob, (N.ArrayType,)) )
4297 zprob = Dispatch ( (lzprob, (IntType, FloatType)),
4298                    (azprob, (N.ArrayType,)) )
4299 ksprob = Dispatch ( (lksprob, (IntType, FloatType)),
4300                     (aksprob, (N.ArrayType,)) )
4301 fprob = Dispatch ( (lfprob, (IntType, FloatType)),
4302                    (afprob, (N.ArrayType,)) )
4303 betacf = Dispatch ( (lbetacf, (IntType, FloatType)),
4304                     (abetacf, (N.ArrayType,)) )
4305 betai = Dispatch ( (lbetai, (IntType, FloatType)),
4306                    (abetai, (N.ArrayType,)) )
4307 erfcc = Dispatch ( (lerfcc, (IntType, FloatType)),
4308                    (aerfcc, (N.ArrayType,)) )
4309 gammln = Dispatch ( (lgammln, (IntType, FloatType)),
4310                     (agammln, (N.ArrayType,)) )
4311 
4312## ANOVA FUNCTIONS:
4313 F_oneway = Dispatch ( (lF_oneway, (ListType, TupleType)),
4314                       (aF_oneway, (N.ArrayType,)) )
4315 F_value = Dispatch ( (lF_value, (ListType, TupleType)),
4316                      (aF_value, (N.ArrayType,)) )
4317
4318## SUPPORT FUNCTIONS:
4319 incr = Dispatch ( (lincr, (ListType, TupleType, N.ArrayType)), )
4320 sum = Dispatch ( (lsum, (ListType, TupleType)),
4321                  (asum, (N.ArrayType,)) )
4322 cumsum = Dispatch ( (lcumsum, (ListType, TupleType)),
4323                     (acumsum, (N.ArrayType,)) )
4324 ss = Dispatch ( (lss, (ListType, TupleType)),
4325                 (ass, (N.ArrayType,)) )
4326 summult = Dispatch ( (lsummult, (ListType, TupleType)),
4327                      (asummult, (N.ArrayType,)) )
4328 square_of_sums = Dispatch ( (lsquare_of_sums, (ListType, TupleType)),
4329                             (asquare_of_sums, (N.ArrayType,)) )
4330 sumdiffsquared = Dispatch ( (lsumdiffsquared, (ListType, TupleType)),
4331                             (asumdiffsquared, (N.ArrayType,)) )
4332 shellsort = Dispatch ( (lshellsort, (ListType, TupleType)),
4333                        (ashellsort, (N.ArrayType,)) )
4334 rankdata = Dispatch ( (lrankdata, (ListType, TupleType)),
4335                       (arankdata, (N.ArrayType,)) )
4336 findwithin = Dispatch ( (lfindwithin, (ListType, TupleType)),
4337                         (afindwithin, (N.ArrayType,)) )
4338
4339######################  END OF NUMERIC FUNCTION BLOCK  #####################
4340
4341######################  END OF STATISTICAL FUNCTIONS  ######################
4342
4343except ImportError:
4344 pass
Note: See TracBrowser for help on using the repository browser.