source: orange-bioinformatics/Orange/bioinformatics/widgets/Anova.py @ 1625:cefeb35cbfc9

Revision 1625:cefeb35cbfc9, 80.4 KB checked in by mitar, 2 years ago (diff)

Moving files around.

Line 
1## Automatically adapted for numpy.oldnumeric Oct 03, 2007 by
2
3import math
4import numpy.oldnumeric as Numeric, numpy.oldnumeric.ma as MA
5import numpy.oldnumeric.linear_algebra as LinearAlgebra
6import scipy.stats
7import numpyExtn
8
9
10#######################################################################################
11## ANOVA based on Multivariate Linear Regression
12#######################################################################################
13
14
15class AnovaLRBase:
16    """Base class for all ANOVA types based on multivariate linear regression: manipulation with dummy variables.
17    """
18
19    def getDummyRC(self, numVars, numReplicas=1):
20        """Returns 2D array of reference cell encoded dummy variables.
21        """
22        if not isinstance(numReplicas, int):
23            numReplicas = Numeric.asarray(numReplicas)
24            assert len(numReplicas) == numVars, "numReplicas: int or argument of length numVars expected"
25        return Numeric.repeat(Numeric.concatenate((Numeric.zeros((1,numVars-1), Numeric.Float), Numeric.identity(numVars-1, Numeric.Float)), 0), numReplicas, 0)
26       
27
28    def getDummyEE(self, numVars, numReplicas=1):
29        """Returns 2D array of effects endoced dummy variables, shape (n, n-1) where n = numVars*numReplicas | sum(numReplicas).
30        Each row represents a code for a single variable value, encoded by numVar-1 dummy variables.
31        numReplicas = int | list | array of shape (numVars,): the number (or list) of observations per cell.
32        """
33        assert numVars > 0, "numVars <= 0"
34        if isinstance(numReplicas, int):
35            numReplicas = Numeric.ones((numVars,)) * numReplicas
36        else:
37            numReplicas = Numeric.asarray(numReplicas)
38            assert len(numReplicas) == numVars, "numReplicas: int or argument of length numVars expected"
39        if numVars == 1:
40            return Numeric.zeros((numReplicas[0], 0))
41        else:
42            return Numeric.repeat(Numeric.concatenate((Numeric.identity(numVars-1, Numeric.Int), -1*Numeric.ones((1,numVars-1), Numeric.Int)), 0), numReplicas, 0)
43       
44
45    def getSubjectDummyEE(self, numSubjList, numReplicas=1):
46        """Returns 2D array of effects endoced subject dummy variables, shape (sum(ni), sum(ni-1)) where numSubjList=[n0,n1,...].
47        numSubjList: a list of the number of subjects within each level of the non-repeated factor.
48        numReplicas: int | list | array of shape sum(ni): the number of times each subject code (row) is repeated.
49        """
50        numSubjList = Numeric.asarray(numSubjList)
51        assert len(numSubjList.shape) == 1, "len(numSubjList.shape) != 1"
52        if isinstance(numReplicas, int):
53            numReplicas = Numeric.ones((numSubjList.shape[0],)) * numReplicas
54        else:
55            numReplicas = Numeric.asarray(numReplicas)
56            assert len(numReplicas.shape) == 1, "len(numReplicas.shape) != 1"
57            assert numReplicas.shape[0] == numSubjList.shape[0], "numReplicas: int or a list of the same length as numSubjList expected"
58        si0 = Numeric.concatenate(([0], Numeric.add.accumulate(numSubjList)))      # indices for axis 0 of dm
59        si1 = Numeric.concatenate(([0], Numeric.add.accumulate(numSubjList-1)))    # indices for axis 1 of dm
60        dm = Numeric.zeros((si0[-1], si1[-1]))                                     # subject dummy codes matrix
61        numRepeats = Numeric.ones((si0[-1],))
62        for i in range(numSubjList.shape[0]):
63            if numSubjList[i] == 1: print "Warning: a single subject in group %i" % (i)
64            dm[si0[i]:si0[i+1], si1[i]:si1[i+1]] = self.getDummyEE(numSubjList[i],1)
65            numRepeats[si0[i]:si0[i+1]] = Numeric.ones((numSubjList[i],)) * numReplicas[i]
66        return Numeric.repeat(dm, numRepeats, 0)
67
68
69    def getDummiesJoin(self, dummyMtrx1, dummyMtrx2):
70        """Returns a tuple of resized dummy matrices containing the cartesian product of rows.
71        """
72        dummyMtrx1 = Numeric.repeat(dummyMtrx1, dummyMtrx2.shape[0], 0)                         # repeat each row separately
73        dummyMtrx2 = Numeric.resize(dummyMtrx2, (dummyMtrx1.shape[0], dummyMtrx2.shape[1]))     # repeat whole matrix, add new rows
74        return (dummyMtrx1, dummyMtrx2)
75
76
77    def getDummyInteraction(self, dummyMtrx1, dummyMtrx2):
78        """Returns a product of dummy matrices resized to the cartesian product of columns.
79        """
80        dmIntShp = (dummyMtrx1.shape[0], dummyMtrx1.shape[1] * dummyMtrx2.shape[1])
81        dmInt1 = Numeric.repeat(dummyMtrx1, dummyMtrx2.shape[1], 1)                             # repeat each column separately
82        dmInt2 = Numeric.transpose(Numeric.resize(Numeric.transpose(dummyMtrx2, (1,0)), (dmIntShp[1], dmIntShp[0])), (1,0)) # repeat whole matrix, add new columns
83        return dmInt1*dmInt2
84
85
86class Anova2wayLRBase(AnovaLRBase):
87    """Base class for all 2 way ANOVA types (repeated and non-repeated measures) includes various post-hoc tests.
88    The following variables should be defined by base classes:
89        self._addInteraction = addInteraction
90        self._arr2d = arr2d
91        self._groupLens = groupLens
92        self._MSpool = LR_treat.MSres
93        self._DFpool = LR_treat.MSreg
94    """
95
96    def posthoc_tstat_B(self):
97        """Fisher LSD: Compares all levels of factor B and returns (b,b) matrix of t-stat values and a matrix of corresponding p-values.
98        Use when there is no significant interaction.
99        Warning: Statistica computes it WRONG!
100        """
101        b = self._groupLens.shape[0]
102        # precompute averages
103        x_avrg = Numeric.zeros((b,), Numeric.Float)         # mean of cell means over factor A
104        sum_count_x = Numeric.zeros((b,), Numeric.Float)
105        groupLensAcc0 = Numeric.array([0] + Numeric.add.accumulate(self._groupLens).tolist())
106        for idx in range(b):
107            groupInd = range(groupLensAcc0[idx], groupLensAcc0[idx+1])
108            xi = self._arr2d.take(groupInd, 1)
109            x_avrg[idx] = MA.average(MA.average(xi,1))                  # first average over replicas to obtain cell mean, then over factor A
110            sum_count_x[idx] = Numeric.add.reduce(1./MA.count(xi,1))    # first count the number of measurements within cells, then sum inverses over factor A
111            ##x_avrg[idx] = MA.average(MA.ravel(xi))                    # this is how Statistica computes it, but it is WRONG!
112            ##sum_count_x[idx] = 1./MA.count(xi)                        # this is how Statistica computes it, but it is WRONG!
113        # t-statistics
114        x_avrg_2 = MA.resize(x_avrg, (x_avrg.shape[0], x_avrg.shape[0]))
115        sum_count_x_2 = Numeric.resize(sum_count_x, (sum_count_x.shape[0],sum_count_x.shape[0]))
116        tstat = (MA.transpose(x_avrg_2) - x_avrg_2) * self._arr2d.shape[0] / Numeric.sqrt(self._MSpool * (Numeric.transpose(sum_count_x_2) + sum_count_x_2))
117        ##tstat = (MA.transpose(x_avrg_2) - x_avrg_2) / Numeric.sqrt(self._MSpool * (Numeric.transpose(sum_count_x_2) + sum_count_x_2))     # this is how Statistica computes it, but it is WRONG!
118        tprob = numpyExtn.triangularPut(scipy.stats.abetai(0.5*self._DFpool, 0.5, float(self._DFpool) / (self._DFpool + numpyExtn.triangularGet(tstat**2))),1,1)
119        return tstat, tprob
120
121
122    def isSignif_holm_B(self, alpha):
123        """Conduct Holm step-down sequential multiple comparison on factor B.
124        Returns (b,b) matrix indicating significant differences between levels of factor B and the direction of changes [-1|0|1].
125        """
126        tstat, pvals = self.posthoc_tstat_B()
127        pvals = numpyExtn.triangularGet(pvals)
128        sortInd = Numeric.argsort(pvals)
129        k = pvals.shape[0]
130        isSignif = -1*Numeric.ones((k,), Numeric.Int)
131        for j in range(k):
132            isSignif[sortInd[j]] = pvals[sortInd[j]] < float(alpha) / (k-j)
133        return numpyExtn.triangularPut(isSignif, upper=1, lower=1) * (MA.greater(tstat,0) - MA.less(tstat,0))
134
135
136    def posthoc_tstat_BbyA(self):
137        """Fisher LSD: Compares all levels of factor B by each level of factor A separately.
138        Returns: (b,b,a) matrix of t-stat values
139                 (b,b,a) matrix of corresponding p-values
140                 (b,b) matrix of geometric means of p-values over all levels of factor A.
141        Use when there is a significant interaction and factor A is of no interest.
142        """
143        a = self._arr2d.shape[0]
144        b = self._groupLens.shape[0]
145        # precompute averages
146        x_avrg = Numeric.zeros((a,b), Numeric.Float)
147        sum_count_x = Numeric.zeros((a,b), Numeric.Float)
148        groupLensAcc0 = Numeric.array([0] + Numeric.add.accumulate(self._groupLens).tolist())
149        for idx in range(b):
150            groupInd = range(groupLensAcc0[idx], groupLensAcc0[idx+1])
151            xi = self._arr2d.take(groupInd, 1)
152            x_avrg[:,idx] = MA.average(xi, 1)
153            sum_count_x[:,idx] = 1. / MA.count(xi, 1)
154        # t-statistics
155        x_avrg_2 = MA.resize(x_avrg, (b,a,b))
156        sum_count_x_2 = Numeric.resize(sum_count_x, (b,a,b))
157        tstat = (MA.transpose(x_avrg_2, (2,1,0)) - x_avrg_2) / Numeric.sqrt(self._MSpool * (Numeric.transpose(sum_count_x_2, (2,1,0)) + sum_count_x_2))
158        # get p-values for each level of factor a separately (axis 1)
159        tprob = MA.array(-1*Numeric.ones(tstat.shape, Numeric.Float), mask=Numeric.ones(tstat.shape))
160        for idx1 in range(tprob.shape[1]):
161            tprob[:,idx1,:] = numpyExtn.triangularPut(scipy.stats.abetai(0.5*self._DFpool, 0.5, float(self._DFpool) / (self._DFpool + numpyExtn.triangularGet(tstat[:,idx1,:]**2))),1,1)
162        return tstat, tprob
163
164
165    def posthoc_anova(self):
166        """Conduct ANOVA for each pair of levels of factor B.
167        Returns for each pair of factor B levels:
168            (b,b) matrix of F-stat values for effect of factor A
169            (b,b) matrix of p-values for effect of factor A
170            (b,b) matrix of F-stat values for effect of factor B
171            (b,b) matrix of p-values for effect of factor B
172            (b,b) matrix of F-stat values for interaction effect
173            (b,b) matrix of p-values for interaction effect
174        """
175        if self._addInteraction != 1:
176            raise "Error: posthoc_anova can be conducted only when the interaction effect has been tested"
177        b = self._groupLens.shape[0]
178        FA = MA.masked * MA.ones((b,b), Numeric.Float)
179        FAprob = MA.masked * MA.ones((b,b), Numeric.Float)
180        FB = MA.masked * MA.ones((b,b), Numeric.Float)
181        FBprob = MA.masked * MA.ones((b,b), Numeric.Float)
182        FAB = MA.masked * MA.ones((b,b), Numeric.Float)
183        FABprob = MA.masked * MA.ones((b,b), Numeric.Float)
184        groupLensAcc0 = Numeric.array([0] + Numeric.add.accumulate(self._groupLens).tolist())
185        groupInd = map(lambda i,j: range(i,j), groupLensAcc0[:-1],groupLensAcc0[1:])
186        for i in range(b):
187            for j in range(i+1,b):
188                takeInd = groupInd[i] + groupInd[j]
189                an = self.__class__(self._arr2d.take(takeInd, 1), [self._groupLens[i],self._groupLens[j]], addInteraction=1)
190                FA[i,j] = FA[j,i] = an.FA
191                FAprob[i,j] = FAprob[j,i] = an.FAprob
192                FB[i,j] = FB[j,i] = an.FB
193                FBprob[i,j] = FBprob[j,i] = an.FBprob
194                FAB[i,j] = FAB[j,i] = an.FAB
195                FABprob[i,j] = FABprob[j,i] = an.FABprob
196        return FA, FAprob, FB, FBprob, FAB, FABprob
197
198
199class Anova1wayLR(AnovaLRBase):
200    """1 way ANOVA for unbalanced designs using multivariate linear regression.
201    Multiple observations given as a list of indices groups, e.g. [[0,1,2],[3,4]].
202    BUG: the indices are not taken into account, but only the lenghts of the groups!
203    Supports balanced and unbalanced designs and missing (masked) data.
204    """
205
206    def __init__(self, arr1d, replicaGroupLens):
207        """arr1d: 1D masked array: [x1,x2,x3, y1,y2, z1,z2,z3,z4] where x,y,z correspond to factor-levels with 3,2 and 4 replicas, respectively
208        replicaGroupLens: the number of replicas in individual groups, i.e. [3,2,4]
209        """
210        arr1d = MA.asarray(arr1d)
211        assert len(arr1d.shape) == 1, "len(arr1d.shape) != 1"
212
213        # remove missing observations and the corresponding dummy variable codes
214        self.y = MA.asarray(arr1d)
215        self.y, takeInd = numpyExtn.compressIndices(self.y)
216
217        # dummy variables
218        self.dummyA = self.getDummyEE(len(replicaGroupLens), replicaGroupLens)
219        self.dummyA = Numeric.take(self.dummyA, takeInd, 0)
220
221        # run analysis
222        LRA = MultLinReg(self.dummyA, self.y)
223        try:
224            self.F = LRA.MSreg / LRA.MSres
225            self.Fprob = scipy.stats.fprob(LRA.DFreg, LRA.DFres, self.F)
226        except ZeroDivisionError:
227            self.F = 0
228            self.Fprob = 1
229
230
231class Anova1wayLR_2D(AnovaLRBase):
232    """1 way ANOVA for unbalanced designs using multivariate linear regression.
233    Axis 0 correspond to replicas and axis 1 correspond to different factor-levels.
234    Supports balanced and unbalanced designs and missing (masked) data.
235    """
236
237    def __init__(self, arr2d):
238        arr2d = MA.asarray(arr2d)
239        assert len(arr2d.shape) == 2, "len(arr2d.shape) != 2"
240
241        # remove missing observations and the corresponding dummy variable codes
242        self.y = MA.ravel(MA.transpose(arr2d))
243        self.y, takeInd = numpyExtn.compressIndices(self.y)
244
245        # adjust degrees of freedom for factor-leves that contain no data
246        numFactorLevels = int(MA.add.reduce(MA.greater(arr2d.count(0), 0)))
247        zeroLengthFactorLevels = Numeric.compress(MA.equal(arr2d.count(0), 0), range(arr2d.shape[1]))
248        if numFactorLevels > 1:
249
250##            # dummy variables (BUG: index error when the number of levels is reduced)
251##            print numFactorLevels, arr2d.shape[0], takeInd
252##            self.dummyA = self.getDummyEE(numFactorLevels, arr2d.shape[0])
253##            self.dummyA = Numeric.take(self.dummyA, takeInd, 0)
254            # dummy variables
255            self.dummyA = self.getDummyEE(arr2d.shape[1], arr2d.shape[0])   # create all dummies
256            for lev in zeroLengthFactorLevels:                              # reduce length of dummies to adjust for actual number of factor A levels
257                self.dummyA = Numeric.concatenate([self.dummyA[:,0:lev],self.dummyA[:,lev+1:]],1)
258            if len(zeroLengthFactorLevels) > 0 and zeroLengthFactorLevels[-1] == arr2d.shape[1]-1:
259                self.dummyA[self.dummyA.shape[0]-arr2d.shape[1]-1:] = -1    # if the last factor level is missing, manually set the last dummies to matrix of -1's
260            self.dummyA = Numeric.take(self.dummyA, takeInd, 0)             # take dummies where data is not masked
261
262            # run analysis
263            LRA = MultLinReg(self.dummyA, self.y)
264            try:
265                self.F = LRA.MSreg / LRA.MSres
266                self.Fprob = scipy.stats.fprob(LRA.DFreg, LRA.DFres, self.F)
267            except ZeroDivisionError:
268                self.F = 0
269                self.Fprob = 1
270
271        else:
272            print "Giving up ANOVA: a factor with a single level encountered."
273            self.F = 0
274            self.Fprob = 1
275
276
277class _Anova2wayLR_bug_missing_factor_leves(AnovaLRBase):
278    """2 way ANOVA with multiple observations per cell for unbalanced designs using multivariate linear regression.
279    Multiple observations given at axis 1 together with the list of indices groups, e.g. [[0,1,2],[3,4]].
280    Supports balanced and unbalanced designs, i.e. different number of observations per cell and missing (masked) data.
281    TODO: account for empty cells (this implementation results in singular design matrix)
282    """
283
284    def __init__(self, arr2d, replicaGroupInd, addInteraction=0):
285        """arr2d: 2D masked array where replicas are given at axis 1 together with the indices groups.
286        replicaGroupInd: indices of axis 1 that belong to a common factor (multiple observations per cell)
287        addInteraction: include / exclude the interaction between factors
288        """
289        arr2d = MA.asarray(arr2d)
290        assert len(arr2d.shape) == 2, "len(arr2d.shape) != 2"
291
292        # check if data is balanced
293        rgLens = Numeric.array(map(lambda x: len(x), replicaGroupInd), Numeric.Int)
294        isBalanced = Numeric.add.reduce((1.*rgLens/rgLens[0]) == Numeric.ones((len(rgLens)))) / len(rgLens) == 1
295
296        # check for empty cells, raise exception if empty cells exist and addInteraction=1
297        if addInteraction:
298            ax1Ind = Numeric.concatenate(([0], Numeric.add.accumulate(rgLens)))
299            for idx in range(rgLens.shape[0]):
300                if Numeric.add.reduce(MA.count(arr2d[:,ax1Ind[idx]:ax1Ind[idx+1]],1) == 0) > 0:
301                    raise ValueError, "arr2d has empty cells, cannot evaluate the interaction between factors, try addInteraction=0"
302
303        # remove missing observations and the corresponding dummy variable codes
304        self.y = MA.ravel(arr2d) # [a0b0, a0b1, .., a1b0, ...]
305        noMissing = MA.count(self.y) == self.y.shape[0]
306        self.y, takeInd = numpyExtn.compressIndices(self.y)
307
308        # dummy variables
309        self.dummyA = self.getDummyEE(arr2d.shape[0], 1)
310        self.dummyB = self.getDummyEE(len(replicaGroupInd), rgLens)
311        self.dummyA, self.dummyB = self.getDummiesJoin(self.dummyA, self.dummyB)
312        if addInteraction:
313            self.dummyAB = self.getDummyInteraction(self.dummyA, self.dummyB)
314            self.dummyAB = Numeric.take(self.dummyAB, takeInd, 0)
315        self.dummyA = Numeric.take(self.dummyA, takeInd, 0)
316        self.dummyB = Numeric.take(self.dummyB, takeInd, 0)
317
318        # run analysis
319        if addInteraction:
320            LR_treat = MultLinReg(Numeric.concatenate((self.dummyA, self.dummyB, self.dummyAB), 1), self.y)               
321        else:
322            LR_treat = MultLinReg(Numeric.concatenate((self.dummyA, self.dummyB),1), self.y)
323           
324        if isBalanced and noMissing:
325            LR_A = MultLinReg(self.dummyA, self.y)
326            LR_B = MultLinReg(self.dummyB, self.y)
327            self.FA = LR_A.MSreg / LR_treat.MSres
328            self.FB = LR_B.MSreg / LR_treat.MSres
329            if addInteraction:
330                LR_AB = MultLinReg(self.dummyAB, self.y)
331                self.FAB = LR_AB.MSreg / LR_treat.MSres
332        else:
333            if addInteraction:
334                LR_A_B = MultLinReg(Numeric.concatenate((self.dummyA, self.dummyB), 1), self.y)
335                LR_A_AB = MultLinReg(Numeric.concatenate((self.dummyA, self.dummyAB), 1), self.y)
336                LR_B_AB = MultLinReg(Numeric.concatenate((self.dummyB, self.dummyAB), 1), self.y)
337                self.FA = (LR_treat.SSreg - LR_B_AB.SSreg) / self.dummyA.shape[1] / LR_treat.MSres
338                self.FB = (LR_treat.SSreg - LR_A_AB.SSreg) / self.dummyB.shape[1] / LR_treat.MSres
339                self.FAB = (LR_treat.SSreg - LR_A_B.SSreg) / self.dummyAB.shape[1] / LR_treat.MSres
340            else:
341                LR_A = MultLinReg(self.dummyA, self.y)
342                LR_B = MultLinReg(self.dummyB, self.y)
343                self.FA = (LR_treat.SSreg - LR_B.SSreg) / self.dummyA.shape[1] / LR_treat.MSres
344                self.FB = (LR_treat.SSreg - LR_A.SSreg) / self.dummyB.shape[1] / LR_treat.MSres
345
346        self.FAprob = scipy.stats.fprob(self.dummyA.shape[1], LR_treat.DFres, self.FA)
347        self.FBprob = scipy.stats.fprob(self.dummyB.shape[1], LR_treat.DFres, self.FB)
348        if addInteraction:
349            self.FABprob = scipy.stats.fprob(self.dummyAB.shape[1], LR_treat.DFres, self.FAB)
350
351        # store variables needed for posthoc tests
352        self._arr2d = arr2d
353        self._replicaGroupInd = replicaGroupInd
354        self._LR_treat_MSres = LR_treat.MSres
355        self._LR_treat_DFres = LR_treat.DFres
356
357       
358    def posthoc_tstat_B(self):
359        """Fisher LSD: Compares all levels of factor B and returns (b,b) matrix of t-stat values and a matrix of corresponding p-values.
360        Use when there is no significant interaction.
361        Warning: Statistica computes it WRONG!
362        """
363        b = len(self._replicaGroupInd)
364        # precompute averages
365        x_avrg = Numeric.zeros((b,), Numeric.Float)         # mean of cell means over factor A
366        sum_count_x = Numeric.zeros((b,), Numeric.Float)
367        for idx, replicaInd in enumerate(self._replicaGroupInd):
368            xi = self._arr2d.take(replicaInd, 1)
369            x_avrg[idx] = MA.average(MA.average(xi,1))                  # first average over replicas to obtain cell mean, then over factor A
370            sum_count_x[idx] = Numeric.add.reduce(1./MA.count(xi,1))    # first count the number of measurements within cells, then sum inverses over factor A
371            ##x_avrg[idx] = MA.average(MA.ravel(xi))                    # this is how Statistica computes it, but it is WRONG!
372            ##sum_count_x[idx] = 1./MA.count(xi)                        # this is how Statistica computes it, but it is WRONG!
373        # t-statistics
374        x_avrg_2 = MA.resize(x_avrg, (x_avrg.shape[0], x_avrg.shape[0]))
375        sum_count_x_2 = Numeric.resize(sum_count_x, (sum_count_x.shape[0],sum_count_x.shape[0]))
376        tstat = (MA.transpose(x_avrg_2) - x_avrg_2) * self._arr2d.shape[0] / Numeric.sqrt(self._LR_treat_MSres * (Numeric.transpose(sum_count_x_2) + sum_count_x_2))
377        ##tstat = (MA.transpose(x_avrg_2) - x_avrg_2) / Numeric.sqrt(self._LR_treat_MSres * (Numeric.transpose(sum_count_x_2) + sum_count_x_2))     # this is how Statistica computes it, but it is WRONG!
378        tprob = numpyExtn.triangularPut(scipy.stats.abetai(0.5*self._LR_treat_DFres, 0.5, float(self._LR_treat_DFres) / (self._LR_treat_DFres + numpyExtn.triangularGet(tstat**2))),1,1)
379        return tstat, tprob
380
381
382    def isSignif_holm_B(self, alpha):
383        """Conduct Holm step-down sequential multiple comparison on factor B.
384        Returns (b,b) matrix indicating significant differences between levels of factor B and the direction of changes [-1|0|1].
385        """
386        tstat, pvals = self.posthoc_tstat_B()
387        pvals = numpyExtn.triangularGet(pvals)
388        sortInd = Numeric.argsort(pvals)
389        k = pvals.shape[0]
390        isSignif = -1*Numeric.ones((k,), Numeric.Int)
391        for j in range(k):
392            isSignif[sortInd[j]] = pvals[sortInd[j]] < float(alpha) / (k-j)
393        return numpyExtn.triangularPut(isSignif, upper=1, lower=1) * (MA.greater(tstat,0) - MA.less(tstat,0))
394
395
396    def posthoc_tstat_BbyA(self):
397        """Fisher LSD: Compares all levels of factor B by each level of factor A separately.
398        Returns: (b,b,a) matrix of t-stat values
399                 (b,b,a) matrix of corresponding p-values
400                 (b,b) matrix of geometric means of p-values over all levels of factor A.
401        Use when there is a significant interaction and factor A is of no interest.
402        """
403        a = self._arr2d.shape[0]
404        b = len(self._replicaGroupInd)
405        # precompute averages
406        x_avrg = Numeric.zeros((a,b), Numeric.Float)
407        sum_count_x = Numeric.zeros((a,b), Numeric.Float)
408        for idx, replicaInd in enumerate(self._replicaGroupInd):
409            xi = self._arr2d.take(replicaInd, 1)
410            x_avrg[:,idx] = MA.average(xi, 1)
411            sum_count_x[:,idx] = 1. / MA.count(xi, 1)
412       
413        # t-statistics
414        x_avrg_2 = MA.resize(x_avrg, (b,a,b))
415        sum_count_x_2 = Numeric.resize(sum_count_x, (b,a,b))
416
417        tstat = (MA.transpose(x_avrg_2, (2,1,0)) - x_avrg_2) / Numeric.sqrt(self._LR_treat_MSres * (Numeric.transpose(sum_count_x_2, (2,1,0)) + sum_count_x_2))
418        # get p-values for each level of factor a separately (axis 1)
419        tprob = MA.array(-1*Numeric.ones(tstat.shape, Numeric.Float), mask=Numeric.ones(tstat.shape))
420        for idx1 in range(tprob.shape[1]):
421            tprob[:,idx1,:] = numpyExtn.triangularPut(scipy.stats.abetai(0.5*self._LR_treat_DFres, 0.5, float(self._LR_treat_DFres) / (self._LR_treat_DFres + numpyExtn.triangularGet(tstat[:,idx1,:]**2))),1,1)
422        return tstat, tprob
423
424
425    def posthoc_anova_B_AB(self):
426        """Conduct ANOVA for each pair of levels of factor B to detect interaction effect.
427        Returns for each pair of factor B levels:
428            (b,b) matrix of F-stat values for effect of factor B
429            (b,b) matrix of p-values for effect of factor B
430            (b,b) matrix of F-stat values for interaction effect
431            (b,b) matrix of p-values for interaction effect
432        """
433        b = len(self._replicaGroupInd)
434        FB = MA.masked * MA.ones((b,b), Numeric.Float)
435        FBprob = MA.masked * MA.ones((b,b), Numeric.Float)
436        FAB = MA.masked * MA.ones((b,b), Numeric.Float)
437        FABprob = MA.masked * MA.ones((b,b), Numeric.Float)
438        for i in range(b):
439            for j in range(i+1,b):
440                rglSub = [self._replicaGroupInd[i], self._replicaGroupInd[j]]
441                takeInd = self._replicaGroupInd[i] + self._replicaGroupInd[j]
442                an = Anova2wayLR(self._arr2d.take(takeInd, 1), rglSub, addInteraction=1)
443                FB[i,j] = FB[j,i] = an.FB
444                FBprob[i,j] = FBprob[j,i] = an.FBprob
445                FAB[i,j] = FAB[j,i] = an.FAB
446                FABprob[i,j] = FABprob[j,i] = an.FABprob
447        return FB, FBprob, FAB, FABprob
448
449
450class Anova2wayLR(Anova2wayLRBase):
451    """2 way ANOVA with multiple observations per cell for unbalanced designs using multivariate linear regression.
452    Multiple observations given at axis 1 together with the list of group lenghts, e.g. [3,2].
453    Supports balanced and unbalanced designs, i.e. different number of observations per cell and missing (masked) data.
454    TODO: account for empty cells (this implementation results in singular design matrix)
455    """
456
457    def __init__(self, arr2d, groupLens, addInteraction=0, allowReductA=True, allowReductB=False):
458        """arr2d: 2D masked array where replicas are given at axis 1 together with lenghts of the groups (multiple observations per cell);
459        addInteraction: include / exclude the interaction between factors;
460        allowReduct[A|B]: whether to allow or not the reduction of levels of factor A (B).
461        """
462        arr2d = MA.asarray(arr2d)
463        groupLens = Numeric.array(groupLens)    # make a copy for the case if subjects or levels of factor B are removed
464        assert len(arr2d.shape) == 2, "len(arr2d.shape) != 2"
465
466        # check if there exist factor-levels with all values missing (for A and B)
467        # if so, reduce the corresponding DF and fix the number of dummy variables
468        missIndA = Numeric.compress(MA.count(arr2d, 1) == 0, Numeric.arange(arr2d.shape[0]))
469        if missIndA.shape[0] > 0:
470            if allowReductA:
471                print "Warning: removig factor A level(s) %s" % str(missIndA.tolist())
472                takeIndA = Numeric.compress(MA.count(arr2d, 1) != 0, Numeric.arange(arr2d.shape[0]))
473                arr2d = arr2d.take(takeIndA, 0)
474            else:
475                self._giveUp(arr2d, groupLens, addInteraction, output="Giving up ANOVA: the following level(s) of factor A have no values: %s" % str(missIndA.tolist()))
476                return
477        missIndSubj = Numeric.compress(MA.count(arr2d, 0) == 0, Numeric.arange(arr2d.shape[1]))
478        if missIndSubj.shape[0] > 0:
479            takeIndSubj = Numeric.compress(MA.count(arr2d, 0) != 0, Numeric.arange(arr2d.shape[1]))
480            # fix groupLens
481##            mapSubj2Group = -1*Numeric.ones(arr2d.shape[1])
482##            groupLensAcc0 = [0] + Numeric.add.accumulate(groupLens).tolist()
483##            for i in range(len(groupLensAcc0) - 1):
484##                mapSubj2Group[groupLensAcc0[i]:groupLensAcc0[i+1]] = i
485            mapSubj2Group = Numeric.repeat(range(len(groupLens)), groupLens)
486            for subjIdx in missIndSubj:
487                groupLens[mapSubj2Group[subjIdx]] -= 1
488            # remove data columns that are missing all the values
489            arr2d = arr2d.take(takeIndSubj, 1)
490        # fix number of factor B levels (if the length of any group became 0 due to removed subjects)
491        missIndB = Numeric.compress(groupLens <= 0, Numeric.arange(groupLens.shape[0]))
492        if missIndB.shape[0] > 0:
493            if allowReductB:
494                print "Warning: removig factor B level(s) %s" % str(missIndB)
495                takeIndB = Numeric.compress(groupLens > 0, Numeric.arange(groupLens.shape[0]))
496                groupLens = Numeric.take(groupLens, takeIndB)
497                # arr2d has already been taken care of by removing the subjects without observations
498            else:
499                self._giveUp(arr2d, groupLens, addInteraction, output="Giving up ANOVA: the following level(s) of factor B have no values: %s" % str(missIndB))
500                return
501        # remove levels of factor B with single observation per cell
502        grpIndSingle = Numeric.compress(Numeric.equal(groupLens,1), Numeric.arange(groupLens.shape[0]))
503        if grpIndSingle.shape[0] > 0:
504            if allowReductB:
505                # fix arr2d
506                print "Warning: removing factor B level(s) with single observation: %s" % str(grpIndSingle)
507                groupLensAcc = [0] + list(Numeric.add.accumulate(groupLens))
508                arr2dRemInd = map(lambda i: groupLensAcc[i], grpIndSingle)
509                arr2dTakeInd = Numeric.ones(arr2d.shape[1])
510                Numeric.put(arr2dTakeInd, arr2dRemInd, 0)
511                arr2d = MA.compress(arr2dTakeInd, arr2d, 1)
512                # fix groupLens
513                groupLens = Numeric.compress(Numeric.not_equal(groupLens,1), groupLens)
514            else:
515                self._giveUp(arr2d, groupLens, addInteraction, output="Giving up ANOVA: the following level(s) of factor B have single observations: %s" % str(grpIndSingle))
516                return               
517        # check that there exist at least 2 levels of factors A and B
518        # and that there is at least single duplicated observation
519        if arr2d.shape[0] < 2 or len(groupLens) < 2: ##or MA.count(arr2d) <= arr2d.shape[0] * len(groupLens):
520            self._giveUp(arr2d, groupLens, addInteraction, output="Giving up ANOVA: not enough levels of factor A (%i) or factor B (%i)" % (arr2d.shape[0],len(groupLens)))
521            return
522
523        # check if data is balanced
524        isBalanced = (1. * Numeric.add.reduce((1.*groupLens/groupLens[0]) == Numeric.ones(groupLens.shape[0])) / groupLens.shape[0]) == 1
525
526        # check for empty cells; if exist and addInteraction=1, switch to no interaction and continue
527        if addInteraction:
528            ax1Ind = Numeric.concatenate(([0], Numeric.add.accumulate(groupLens)))
529            for idx in range(groupLens.shape[0]):
530                if Numeric.add.reduce(MA.count(arr2d[:,ax1Ind[idx]:ax1Ind[idx+1]],1) == 0) > 0:
531##                    raise ValueError, "arr2d has empty cells, cannot evaluate the interaction between factors, try addInteraction=0"
532                    print ValueError, "Warning: data has empty cells, cannot evaluate the interaction between factors, continuing without interaction"
533                    addInteraction=0
534                    # store member varibales that are stored only if addInteraction==1
535                    self.FAB = 0
536                    self.FABprob = 1
537                    self.addInteractionFailed = True
538
539        # remove missing observations and the corresponding dummy variable codes
540        self.y = MA.ravel(arr2d) # [a0b0, a0b1, .., a1b0, ...]
541        noMissing = MA.count(self.y) == self.y.shape[0]
542        self.y, takeInd = numpyExtn.compressIndices(self.y)
543
544        # dummy variables
545        self.dummyA = self.getDummyEE(arr2d.shape[0], 1)
546        self.dummyB = self.getDummyEE(len(groupLens), groupLens)
547        self.dummyA, self.dummyB = self.getDummiesJoin(self.dummyA, self.dummyB)
548        if addInteraction:
549            self.dummyAB = self.getDummyInteraction(self.dummyA, self.dummyB)
550        else:
551            self.dummyAB = Numeric.zeros((self.dummyA.shape[0],0))
552        self.dummyA = Numeric.take(self.dummyA, takeInd, 0)
553        self.dummyB = Numeric.take(self.dummyB, takeInd, 0)
554        self.dummyAB = Numeric.take(self.dummyAB, takeInd, 0)
555
556        # check that there is enough observations to allow for variability (DFres > 0)
557        if self.dummyA.shape[0] - sum([self.dummyA.shape[1], self.dummyB.shape[1], self.dummyAB.shape[1]]) - 1 <= 0:
558            self._giveUp(arr2d, groupLens, addInteraction, output="Giving up ANOVA: not values (dummy condition)")
559            return
560
561        # run analysis
562        try:
563##            if addInteraction:
564            LR_treat = MultLinReg(Numeric.concatenate((self.dummyA, self.dummyB, self.dummyAB), 1), self.y)               
565##            else:
566##                LR_treat = MultLinReg(Numeric.concatenate((self.dummyA, self.dummyB),1), self.y)
567               
568            if isBalanced and noMissing:
569                LR_A = MultLinReg(self.dummyA, self.y)
570                LR_B = MultLinReg(self.dummyB, self.y)
571                self.FA = LR_A.MSreg / LR_treat.MSres
572                self.FB = LR_B.MSreg / LR_treat.MSres
573                if addInteraction:
574                    LR_AB = MultLinReg(self.dummyAB, self.y)
575                    self.FAB = LR_AB.MSreg / LR_treat.MSres
576            else:
577                if addInteraction:
578                    LR_A_B = MultLinReg(Numeric.concatenate((self.dummyA, self.dummyB), 1), self.y)
579                    LR_A_AB = MultLinReg(Numeric.concatenate((self.dummyA, self.dummyAB), 1), self.y)
580                    LR_B_AB = MultLinReg(Numeric.concatenate((self.dummyB, self.dummyAB), 1), self.y)
581                    self.FA = (LR_treat.SSreg - LR_B_AB.SSreg) / self.dummyA.shape[1] / LR_treat.MSres
582                    self.FB = (LR_treat.SSreg - LR_A_AB.SSreg) / self.dummyB.shape[1] / LR_treat.MSres
583                    self.FAB = (LR_treat.SSreg - LR_A_B.SSreg) / self.dummyAB.shape[1] / LR_treat.MSres
584                else:
585                    LR_A = MultLinReg(self.dummyA, self.y)
586                    LR_B = MultLinReg(self.dummyB, self.y)
587                    self.FA = (LR_treat.SSreg - LR_B.SSreg) / self.dummyA.shape[1] / LR_treat.MSres
588                    self.FB = (LR_treat.SSreg - LR_A.SSreg) / self.dummyB.shape[1] / LR_treat.MSres
589        except:
590            self._giveUp(arr2d, groupLens, addInteraction, output="Giving up ANOVA: miltivariate linear regression failed due to lack of variability")
591            return
592
593        try:
594            self.FAprob = scipy.stats.fprob(self.dummyA.shape[1], LR_treat.DFres, self.FA)
595        except ValueError:
596            self.FAprob = 1
597        try:
598            self.FBprob = scipy.stats.fprob(self.dummyB.shape[1], LR_treat.DFres, self.FB)
599        except ValueError:
600            self.FBprob = 1
601        self.ps = [self.FAprob, self.FBprob]
602        if addInteraction:
603            try:
604                self.FABprob = scipy.stats.fprob(self.dummyAB.shape[1], LR_treat.DFres, self.FAB)
605            except ValueError:
606                self.FABprob = 1
607            self.ps.append(self.FABprob)
608
609        # store variables needed for posthoc tests
610        self._MSpool = LR_treat.MSres
611        self._DFpool = LR_treat.MSreg
612        self._addInteraction = addInteraction
613        self._arr2d = arr2d
614        self._groupLens = groupLens
615
616
617    def _giveUp(self, arr2d, groupLens, addInteraction, output=None):
618        if output != None:
619            print output
620        self.FA = 0
621        self.FAprob = 1
622        self.FB = 0
623        self.FBprob = 1
624        self.ps = [self.FAprob, self.FBprob]
625        if addInteraction:
626            self.FAB = 0
627            self.FABprob = 1
628            self.ps.append(self.FABprob)
629        self._addInteraction = addInteraction
630        self._arr2d = arr2d
631        self._groupLens = groupLens
632       
633
634
635class AnovaRM11LR(AnovaLRBase):
636    """1 way REPEATED MEASURES ANOVA using multivariate linear regression.
637    Axis 0 correspond to subjects, axis 1 correspond to different factor-levels.
638    Does NOT support missing (masked) data.
639    """
640   
641    def __init__(self, arr2d):
642        """arr2d: 2D masked array: [[x1,x2], [y1,y2], [z1,z2]]
643        where xi,yi,zi correspond to measurements of three different subjects (x,y,z) under the i-th factor-level.
644        """
645        arr2d = MA.asarray(arr2d)
646        assert len(arr2d.shape) == 2, "len(arr2d.shape) != 2"
647
648        # remove missing observations and the corresponding dummy variable codes
649        self.y = MA.ravel(arr2d)
650        noMissing = MA.count(self.y) == self.y.shape[0]
651        self.y, takeInd = numpyExtn.compressIndices(self.y)
652
653        # dummy variables for coding subjects and factor-levels
654        self.dummySubj = self.getDummyEE(arr2d.shape[0], 1)
655        self.dummyA = self.getDummyEE(arr2d.shape[1], 1)
656        self.dummySubj, self.dummyA = self.getDummiesJoin(self.dummySubj, self.dummyA)
657        self.dummySubj = Numeric.take(self.dummySubj, takeInd, 0)
658        self.dummyA = Numeric.take(self.dummyA, takeInd, 0)
659
660        # run analysis (better: only 2 LR instead of 3)
661        try:
662            LR_treat = MultLinReg(Numeric.concatenate((self.dummySubj, self.dummyA), 1), self.y)
663            if noMissing:
664                LR_A = MultLinReg(self.dummyA, self.y)
665                self.FA = LR_A.MSreg / LR_treat.MSres
666            else:
667                print "WARNING: missing data with 1-way repeated measures ANOVA"
668                LR_S = MultLinReg(self.dummySubj, self.y)
669                self.FA = (LR_treat.SSreg - LR_S.SSreg) / self.dummyA.shape[1] / LR_treat.MSres
670            self.FAprob = scipy.stats.fprob(self.dummyA.shape[1], LR_treat.DFres, self.FA)
671        except ZeroDivisionError:
672            self.FA = 0
673            self.FAprob = 1
674
675
676class _AnovaRM12LR_bug_missing_factor_leves(AnovaLRBase):
677    """2 way ANOVA with REPEATED MEASURES on factor A using multivariate linear regression.
678    Axis 0 correspond to different levels of factor A, subjects given at axis 1, subjects nested inside factor B according to the given group indices.
679    Factor A is a within-subject effect, factor B is a between-ubject effect.
680    Example:
681        arr2d = [[a0b0 a0b0 a0b1 a0b1 a0b1], [a1b0 a1b0 a1b1 a1b1 a1b1]]: 2 levels of factor A, 2 levels of factor B, 5 subjects
682        groupInd = [[0,1],[2,3,4]]
683    Supports balanced and unbalanced designs, i.e. different number of observations per cell and missing (masked) data.
684    TODO: fix the F and DF for factor B when there are missing values (see Glantz, pp. 462-5)
685    """
686
687    def __init__(self, arr2d, groupInd, addInteraction=0):
688        """arr2d: 2D masked array: [[a0b0 a0b0 a0b1 a0b1 a0b1], [a1b0 a1b0 a1b1 a1b1 a1b1]]
689        groupInd: axis 1 indices that belong to the same level of factor B: [[0,1],[2,3,4]]
690        addInteraction: include / exclude the interaction between factors A and B
691        """
692        arr2d = MA.asarray(arr2d)
693        assert len(arr2d.shape) == 2, "len(arr2d.shape) != 2"
694
695        # remove missing observations and the corresponding dummy variable codes
696        self.y = MA.ravel(arr2d) # [a0b0, a0b1, .., a1b0, ...]
697        noMissing = MA.count(self.y) == self.y.shape[0]
698        self.y, takeInd = numpyExtn.compressIndices(self.y)
699
700        # check if data is balanced
701        rgLens = Numeric.array(map(lambda x: len(x), groupInd), Numeric.Int)
702        assert Numeric.add.reduce(rgLens) == arr2d.shape[1], "arr2d.shape[1] != sum_of_len(groupInd)"
703        isBalanced = Numeric.add.reduce((1.*rgLens/rgLens[0]) == Numeric.ones((len(rgLens)))) / len(rgLens) == 1
704
705        # dummy variables
706        self.dummyA = self.getDummyEE(arr2d.shape[0], 1)
707        self.dummyB = self.getDummyEE(len(groupInd), rgLens)
708        self.dummySubjInB = self.getSubjectDummyEE(rgLens, 1)
709        dummyB_Subj = Numeric.concatenate((self.dummyB, self.dummySubjInB), 1)
710        self.dummyA, dummyB_Subj = self.getDummiesJoin(self.dummyA, dummyB_Subj)
711        self.dummyB = dummyB_Subj[:, 0:self.dummyB.shape[1]]
712        self.dummySubjInB = dummyB_Subj[:, self.dummyB.shape[1]:]
713        if addInteraction:
714            self.dummyAB = self.getDummyInteraction(self.dummyA, self.dummyB)
715            self.dummyAB = Numeric.take(self.dummyAB, takeInd, 0)
716        self.dummyA = Numeric.take(self.dummyA, takeInd, 0)
717        self.dummyB = Numeric.take(self.dummyB, takeInd, 0)
718        self.dummySubjInB = Numeric.take(self.dummySubjInB, takeInd, 0)
719
720        # run analysis
721        if addInteraction:
722            LR_treat = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyA, self.dummyB, self.dummyAB),1), self.y)
723        else:
724            LR_treat = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyA, self.dummyB),1), self.y)           
725
726        if isBalanced and noMissing and False:
727            # non-repeated-measures factor (B)
728            LR_B = MultLinReg(self.dummyB, self.y)
729            LR_SubjInB = MultLinReg(self.dummySubjInB, self.y)
730            self.FB = LR_B.MSreg / LR_SubjInB.MSreg
731            # repeated measures factor (A)
732            LR_A = MultLinReg(self.dummyA, self.y)
733            self.FA = LR_A.MSreg / LR_treat.MSres
734            # interaction (AB)
735            if addInteraction:
736                LR_AB = MultLinReg(self.dummyAB, self.y)
737                self.FAB = LR_AB.MSreg / LR_treat.MSres
738            # store variables needed for posthoc tests
739            MS_SubjInB = LR_SubjInB.MSreg
740        else:
741            ######################################################################################################################################
742            ## print "Warning: missing data with 2 way repeated measures ANOVA: F and DF should be adjusted due to random effect factor (subjects)"
743            ######################################################################################################################################
744            if addInteraction:
745                # non-repeated-measures factor (B)
746                LR_S_A_AB = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyA, self.dummyAB),1), self.y)
747                LR_A_B_AB = MultLinReg(Numeric.concatenate((self.dummyA, self.dummyB, self.dummyAB),1), self.y)
748                self.FB = (LR_treat.SSreg - LR_S_A_AB.SSreg) / self.dummyB.shape[1] / (LR_treat.SSreg - LR_A_B_AB.SSreg) * self.dummySubjInB.shape[1]
749                # repeated measures factor (A)
750                LR_S_B_AB = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyB, self.dummyAB),1), self.y)
751                self.FA = (LR_treat.SSreg - LR_S_B_AB.SSreg) / self.dummyA.shape[1] / LR_treat.MSres
752                # interaction (AB)
753                LR_S_A_B = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyA, self.dummyB),1), self.y)
754                self.FAB = (LR_treat.SSreg - LR_S_A_B.SSreg) / self.dummyAB.shape[1] / LR_treat.MSres
755                # store variables needed for posthoc tests
756                MS_SubjInB = (LR_treat.SSreg - LR_A_B_AB.SSreg) / self.dummySubjInB.shape[1]
757            else:
758                # non-repeated-measures factor (B)
759                LR_S_A = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyA),1), self.y)
760                LR_A_B = MultLinReg(Numeric.concatenate((self.dummyA, self.dummyB),1), self.y)
761                self.FB = (LR_treat.SSreg - LR_S_A.SSreg) / self.dummyB.shape[1] / (LR_treat.SSreg - LR_A_B.SSreg) * self.dummySubjInB.shape[1]
762                # repeated measures factor (A)
763                LR_S_B = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyB),1), self.y)
764                self.FA = (LR_treat.SSreg - LR_S_B.SSreg) / self.dummyA.shape[1] / LR_treat.MSres
765                # store variables needed for posthoc tests
766                MS_SubjInB = (LR_treat.SSreg - LR_A_B.SSreg) / self.dummySubjInB.shape[1]
767
768        self.FBprob = scipy.stats.fprob(self.dummyB.shape[1], self.dummySubjInB.shape[1], self.FB)
769        self.FAprob = scipy.stats.fprob(self.dummyA.shape[1], LR_treat.DFres, self.FA)
770        if addInteraction:
771            self.FABprob = scipy.stats.fprob(self.dummyAB.shape[1], LR_treat.DFres, self.FAB)
772
773        # store variables needed for posthoc tests
774        self._arr2d = arr2d
775        self._groupInd = groupInd
776        self._LR_treat_MSres = LR_treat.MSres
777        self._LR_treat_DFres = LR_treat.DFres
778        # estimate the variance by pooling residual variance for factors A and B
779        MS_res = LR_treat.MSres
780        a = self.dummyA.shape[1] + 1    # number of levels of factor A
781        DF_SubjInB = self.dummySubjInB.shape[1] # = LR_SubjInB.DFreg
782        DF_res = LR_treat.DFres
783        self._MSpool = (MS_SubjInB + (a-1)*MS_res) / a
784        self._DFpool = ((MS_SubjInB + (a-1)*MS_res)**2) / ((MS_SubjInB**2 / DF_SubjInB) + (((a-1)*MS_res)**2 / DF_res))
785
786
787    def posthoc_tstat_B(self):
788        """Fisher LSD: Compares all levels of factor B and returns (b,b) matrix of t-stat values and a matrix of corresponding p-values.
789        Use when there is no significant interaction.
790        Warning: Statistica computes it WRONG!
791        """
792        b = len(self._groupInd)
793        # precompute averages
794        x_avrg = Numeric.zeros((b,), Numeric.Float)
795        sum_count_x = Numeric.zeros((b,), Numeric.Float)
796        for idx, replicaInd in enumerate(self._groupInd):
797            xi = self._arr2d.take(replicaInd, 1)
798            x_avrg[idx] = MA.average(MA.average(xi,1))                  # first average over replicas to obtain cell mean, then over factor A
799            sum_count_x[idx] = Numeric.add.reduce(1./MA.count(xi,1))    # first count the number of measurements within cells, then sum inverses over factor A
800            ##x_avrg[idx] = MA.average(MA.ravel(xi))                    # this is how Statistica computes it, but it is WRONG!
801            ##sum_count_x[idx] = 1./MA.count(xi)                        # this is how Statistica computes it, but it is WRONG!
802        # t-statistics
803        x_avrg_2 = MA.resize(x_avrg, (x_avrg.shape[0], x_avrg.shape[0]))
804        sum_count_x_2 = Numeric.resize(sum_count_x, (sum_count_x.shape[0],sum_count_x.shape[0]))
805        tstat = (MA.transpose(x_avrg_2) - x_avrg_2) * self._arr2d.shape[0] / Numeric.sqrt(self._MSpool * (Numeric.transpose(sum_count_x_2) + sum_count_x_2))
806        ##tstat = (MA.transpose(x_avrg_2) - x_avrg_2) / Numeric.sqrt(self._MSpool * (Numeric.transpose(sum_count_x_2) + sum_count_x_2))     # this is how Statistica computes it, but it is WRONG!
807        tprob = numpyExtn.triangularPut(scipy.stats.abetai(0.5*self._DFpool, 0.5, float(self._DFpool) / (self._DFpool + numpyExtn.triangularGet(tstat**2))),1,1)
808        return tstat, tprob
809       
810           
811    def posthoc_tstat_BbyA(self):
812        """Fisher LSD: Compares all levels of factor B by each level of factor A separately.
813        Returns: (b,b,a) matrix of t-stat values
814                 (b,b,a) matrix of corresponding p-values
815                 (b,b) matrix of geometric means of p-values over all levels of factor A.
816        Use when there is a significant interaction and factor A is of no interest.
817        """
818        a = self._arr2d.shape[0]
819        b = len(self._groupInd)
820        # precompute averages
821        x_avrg = Numeric.zeros((a,b), Numeric.Float)
822        sum_count_x = Numeric.zeros((a,b), Numeric.Float)
823        for idx, replicaInd in enumerate(self._groupInd):
824            xi = self._arr2d.take(replicaInd, 1)
825            x_avrg[:,idx] = MA.average(xi, 1)
826            sum_count_x[:,idx] = 1. / MA.count(xi, 1)
827       
828        # t-statistics
829        x_avrg_2 = MA.resize(x_avrg, (b,a,b))
830        sum_count_x_2 = Numeric.resize(sum_count_x, (b,a,b))
831
832        tstat = (MA.transpose(x_avrg_2, (2,1,0)) - x_avrg_2) / Numeric.sqrt(self._MSpool * (Numeric.transpose(sum_count_x_2, (2,1,0)) + sum_count_x_2))
833        # get p-values for each level of factor a separately (axis 1)
834        tprob = MA.array(-1*Numeric.ones(tstat.shape, Numeric.Float), mask=Numeric.ones(tstat.shape))
835        for idx1 in range(tprob.shape[1]):
836            tprob[:,idx1,:] = numpyExtn.triangularPut(scipy.stats.abetai(0.5*self._DFpool, 0.5, float(self._DFpool) / (self._DFpool + numpyExtn.triangularGet(tstat[:,idx1,:]**2))),1,1)
837        return tstat, tprob
838
839    def posthoc_anova_B_AB(self):
840        """Conduct ANOVA for each pair of levels of factor B to detect interaction effect.
841        Returns for each pair of factor B levels:
842            (b,b) matrix of F-stat values for effect of factor B
843            (b,b) matrix of p-values for effect of factor B
844            (b,b) matrix of F-stat values for interaction effect
845            (b,b) matrix of p-values for interaction effect
846        """
847        b = len(self._groupInd)
848        FB = MA.masked * MA.ones((b,b), Numeric.Float)
849        FBprob = MA.masked * MA.ones((b,b), Numeric.Float)
850        FAB = MA.masked * MA.ones((b,b), Numeric.Float)
851        FABprob = MA.masked * MA.ones((b,b), Numeric.Float)
852        for i in range(b):
853            for j in range(i+1,b):
854                rglSub = [self._groupInd[i], self._groupInd[j]]
855                takeInd = self._groupInd[i] + self._groupInd[j]
856                an = AnovaRM12LR(self._arr2d.take(takeInd, 1), rglSub, addInteraction=1)
857                FB[i,j] = FB[j,i] = an.FB
858                FBprob[i,j] = FBprob[j,i] = an.FBprob
859                FAB[i,j] = FAB[j,i] = an.FAB
860                FABprob[i,j] = FABprob[j,i] = an.FABprob
861        return FB, FBprob, FAB, FABprob
862   
863
864class AnovaRM12LR(Anova2wayLRBase):
865    """2 way ANOVA with REPEATED MEASURES on factor A using multivariate linear regression.
866    Axis 0 correspond to different levels of factor A, subjects given at axis 1, subjects nested inside factor B according to the given group lenghts.
867    Factor A is a within-subject effect, factor B is a between-subject effect.
868    Example:
869        arr2d = [[a0b0 a0b0 a0b1 a0b1 a0b1], [a1b0 a1b0 a1b1 a1b1 a1b1]]: 2 levels of factor A, 2 levels of factor B, 5 subjects
870        groupLens = [2,3]
871    Supports balanced and unbalanced designs, i.e. different number of observations per cell and missing (masked) data.
872    TODO: fix the F and DF for factor B when there are missing values (see Glantz, pp. 462-5)
873    """
874
875    def __init__(self, arr2d, groupLens, addInteraction=0, allowReductA=True, allowReductB=False):
876        """arr2d: 2D masked array: [[a0b0 a0b0 a0b1 a0b1 a0b1], [a1b0 a1b0 a1b1 a1b1 a1b1]];
877        groupLens: lenghts of axis 1 indices that belong to the same level of factor B, e.g. [2,3];
878        addInteraction: include / exclude the interaction between factors A and B;
879        allowReduct[A|B]: whether to allow or not the reduction of levels of factor A (B).
880        """
881        arr2d = MA.asarray(arr2d)
882        groupLens = Numeric.array(groupLens)    # make a copy for the case if subjects or levels of factor B are removed
883        assert len(arr2d.shape) == 2, "len(arr2d.shape) != 2"
884        assert arr2d.shape[1] == Numeric.add.reduce(groupLens), "arr2d.shape[1] != Numeric.add.reduce(groupLens)"
885
886        # check if there exist factor-levels with all values missing (for A, B and Subj)
887        # if so, reduce the corresponding DF and fix the number of dummy variables
888        missIndA = Numeric.compress(MA.count(arr2d, 1) == 0, Numeric.arange(arr2d.shape[0]))
889        if missIndA.shape[0] > 0:
890            if allowReductA:
891                print "Warning: removig factor A level(s) %s" % str(missIndA.tolist())
892                takeIndA = Numeric.compress(MA.count(arr2d, 1) != 0, Numeric.arange(arr2d.shape[0]))
893                arr2d = arr2d.take(takeIndA, 0)
894            else:
895                self._giveUp(arr2d, groupLens, addInteraction, output="Giving up ANOVA: the following level(s) of factor A have no values: %s" % str(missIndA.tolist()))
896                return
897        missIndSubj = Numeric.compress(MA.count(arr2d, 0) == 0, Numeric.arange(arr2d.shape[1]))
898        if missIndSubj.shape[0] > 0:
899            print "Warning: removig subject(s) %s" % str(missIndSubj.tolist())
900            takeIndSubj = Numeric.compress(MA.count(arr2d, 0) != 0, Numeric.arange(arr2d.shape[1]))
901            # fix groupLens
902##            mapSubj2Group = -1*Numeric.ones(arr2d.shape[1])
903##            groupLensAcc0 = [0] + Numeric.add.accumulate(groupLens).tolist()
904##            for i in range(len(groupLensAcc0) - 1):
905##                mapSubj2Group[groupLensAcc0[i]:groupLensAcc0[i+1]] = i
906            mapSubj2Group = Numeric.repeat(range(len(groupLens)), groupLens)
907            for subjIdx in missIndSubj:
908                groupLens[mapSubj2Group[subjIdx]] -= 1
909            # remove data columns that are missing all the values
910            arr2d = arr2d.take(takeIndSubj, 1)
911        # fix number of factor B levels (if the length of any group became 0 due to removed subjects)
912        missIndB = Numeric.compress(groupLens <= 0, Numeric.arange(groupLens.shape[0]))
913        if missIndB.shape[0] > 0:
914            if allowReductB:
915                print "Warning: removig factor B level(s) %s" % str(missIndB.tolist())
916                takeIndB = Numeric.compress(groupLens > 0, Numeric.arange(groupLens.shape[0]))
917                groupLens = Numeric.take(groupLens, takeIndB)
918                # arr2d has already been taken care of by removing the subjects without observations
919            else:
920                self._giveUp(arr2d, groupLens, addInteraction, output="Giving up ANOVA: the following level(s) of factor B have no values: %s" % str(missIndB))
921                return
922        # remove levels of factor B with single observation per cell
923        grpIndSingle = Numeric.compress(Numeric.equal(groupLens,1), Numeric.arange(groupLens.shape[0]))
924        if grpIndSingle.shape[0] > 0:
925            if allowReductB:
926                # fix arr2d
927                print "Warning: removing factor B level(s) with single observation: %s" % str(grpIndSingle)
928                groupLensAcc = [0] + list(Numeric.add.accumulate(groupLens))
929                arr2dRemInd = map(lambda i: groupLensAcc[i], grpIndSingle)
930                arr2dTakeInd = Numeric.ones(arr2d.shape[1])
931                Numeric.put(arr2dTakeInd, arr2dRemInd, 0)
932                arr2d = MA.compress(arr2dTakeInd, arr2d, 1)
933                # fix groupLens
934                groupLens = Numeric.compress(Numeric.not_equal(groupLens,1), groupLens)
935            else:
936                self._giveUp(arr2d, groupLens, addInteraction, output="Giving up ANOVA: the following level(s) of factor B have single observations: %s" % str(grpIndSingle))
937                return               
938        # check that there exist at least 2 levels of factors A and B
939        # and that there is at least single duplicated observation
940        if arr2d.shape[0] < 2 or len(groupLens) < 2: ## or MA.count(arr2d) <= arr2d.shape[0] * len(groupLens):
941            self._giveUp(arr2d, groupLens, addInteraction, output="Giving up ANOVA: not enough levels of factor A (%i) or factor B (%i)" % (arr2d.shape[0],len(groupLens)))
942            return
943
944        # remove other missing observations and the corresponding dummy variable codes
945        self.y = MA.ravel(arr2d) # [a0b0, a0b1, .., a1b0, ...]
946        noMissing = MA.count(self.y) == self.y.shape[0]
947        self.y, takeInd = numpyExtn.compressIndices(self.y)
948
949        # check if data is balanced
950        isBalanced = (1. * Numeric.add.reduce((1.*groupLens/groupLens[0]) == Numeric.ones(groupLens.shape[0])) / groupLens.shape[0]) == 1
951
952        # dummy variables
953        self.dummyA = self.getDummyEE(arr2d.shape[0], 1)
954        self.dummyB = self.getDummyEE(groupLens.shape[0], groupLens)
955        self.dummySubjInB = self.getSubjectDummyEE(groupLens, 1)
956        dummyB_Subj = Numeric.concatenate((self.dummyB, self.dummySubjInB), 1)
957        self.dummyA, dummyB_Subj = self.getDummiesJoin(self.dummyA, dummyB_Subj)
958        self.dummyB = dummyB_Subj[:, 0:self.dummyB.shape[1]]
959        self.dummySubjInB = dummyB_Subj[:, self.dummyB.shape[1]:]
960        if addInteraction:
961            self.dummyAB = self.getDummyInteraction(self.dummyA, self.dummyB)
962            self.dummyAB = Numeric.take(self.dummyAB, takeInd, 0)
963        else:
964            self.dummyAB = Numeric.zeros((self.dummyA.shape[0],0))
965        self.dummyA = Numeric.take(self.dummyA, takeInd, 0)
966        self.dummyB = Numeric.take(self.dummyB, takeInd, 0)
967        self.dummySubjInB = Numeric.take(self.dummySubjInB, takeInd, 0)
968
969        # check that there is enough observations to allow for variability (DFres > 0)
970        if self.dummyA.shape[0] - sum([self.dummySubjInB.shape[1], self.dummyA.shape[1], self.dummyB.shape[1], self.dummyAB.shape[1]]) - 1 <= 0:
971            self._giveUp(arr2d, groupLens, addInteraction, output="Giving up ANOVA: not values (dummy condition)")
972            return
973
974        # run analysis
975        try:
976##            if addInteraction:
977            LR_treat = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyA, self.dummyB, self.dummyAB),1), self.y)
978##            else:
979##                LR_treat = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyA, self.dummyB),1), self.y)           
980
981            if isBalanced and noMissing and False:
982                # non-repeated-measures factor (B)
983                LR_B = MultLinReg(self.dummyB, self.y)
984                LR_SubjInB = MultLinReg(self.dummySubjInB, self.y)
985                self.FB = LR_B.MSreg / LR_SubjInB.MSreg
986                # repeated measures factor (A)
987                LR_A = MultLinReg(self.dummyA, self.y)
988                self.FA = LR_A.MSreg / LR_treat.MSres
989                # interaction (AB)
990                if addInteraction:
991                    LR_AB = MultLinReg(self.dummyAB, self.y)
992                    self.FAB = LR_AB.MSreg / LR_treat.MSres
993                # store variables needed for posthoc tests
994                MS_SubjInB = LR_SubjInB.MSreg
995            else:
996                ######################################################################################################################################
997                ## print "Warning: missing data with 2 way repeated measures ANOVA: F and DF should be adjusted due to random effect factor (subjects)"
998                ######################################################################################################################################
999                if addInteraction:
1000                    # non-repeated-measures factor (B)
1001                    LR_S_A_AB = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyA, self.dummyAB),1), self.y)
1002                    LR_A_B_AB = MultLinReg(Numeric.concatenate((self.dummyA, self.dummyB, self.dummyAB),1), self.y)
1003                    self.FB = (LR_treat.SSreg - LR_S_A_AB.SSreg) / self.dummyB.shape[1] / (LR_treat.SSreg - LR_A_B_AB.SSreg) * self.dummySubjInB.shape[1]
1004                    # repeated measures factor (A)
1005                    LR_S_B_AB = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyB, self.dummyAB),1), self.y)
1006                    self.FA = (LR_treat.SSreg - LR_S_B_AB.SSreg) / self.dummyA.shape[1] / LR_treat.MSres
1007                    # interaction (AB)
1008                    LR_S_A_B = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyA, self.dummyB),1), self.y)
1009                    self.FAB = (LR_treat.SSreg - LR_S_A_B.SSreg) / self.dummyAB.shape[1] / LR_treat.MSres
1010                    # store variables needed for posthoc tests
1011                    MS_SubjInB = (LR_treat.SSreg - LR_A_B_AB.SSreg) / self.dummySubjInB.shape[1]
1012                else:
1013                    # non-repeated-measures factor (B)
1014                    LR_S_A = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyA),1), self.y)
1015                    LR_A_B = MultLinReg(Numeric.concatenate((self.dummyA, self.dummyB),1), self.y)
1016                    self.FB = (LR_treat.SSreg - LR_S_A.SSreg) / self.dummyB.shape[1] / (LR_treat.SSreg - LR_A_B.SSreg) * self.dummySubjInB.shape[1]
1017                    # repeated measures factor (A)
1018                    LR_S_B = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyB),1), self.y)
1019                    self.FA = (LR_treat.SSreg - LR_S_B.SSreg) / self.dummyA.shape[1] / LR_treat.MSres
1020                    # store variables needed for posthoc tests
1021                    MS_SubjInB = (LR_treat.SSreg - LR_A_B.SSreg) / self.dummySubjInB.shape[1]
1022        except:
1023            self._giveUp(arr2d, groupLens, addInteraction, output="Giving up ANOVA: miltivariate linear regression failed, probably due to lack of variability")
1024            return
1025       
1026        try:
1027            self.FBprob = scipy.stats.fprob(self.dummyB.shape[1], self.dummySubjInB.shape[1], self.FB)
1028        except ValueError:
1029            self.FBprob = 1
1030        try:
1031            self.FAprob = scipy.stats.fprob(self.dummyA.shape[1], LR_treat.DFres, self.FA)
1032        except ValueError:
1033            self.FAprob = 1
1034        self.ps = [self.FAprob, self.FBprob]
1035        if addInteraction:
1036            try:
1037                self.FABprob = scipy.stats.fprob(self.dummyAB.shape[1], LR_treat.DFres, self.FAB)
1038            except ValueError:
1039                self.FABprob = 1
1040            self.ps.append(self.FABprob)
1041
1042        # estimate the variance and degrees of freedom by pooling residual variance for factors A and B
1043        MS_res = LR_treat.MSres
1044        a = self.dummyA.shape[1] + 1    # number of levels of factor A
1045        DF_SubjInB = self.dummySubjInB.shape[1] # = LR_SubjInB.DFreg
1046        DF_res = LR_treat.DFres
1047        self._MSpool = (MS_SubjInB + (a-1)*MS_res) / a
1048        self._DFpool = ((MS_SubjInB + (a-1)*MS_res)**2) / ((MS_SubjInB**2 / DF_SubjInB) + (((a-1)*MS_res)**2 / DF_res))
1049
1050        # store some variables
1051        self._addInteraction = addInteraction
1052        self._arr2d = arr2d
1053        self._groupLens = groupLens
1054
1055
1056    def _giveUp(self, arr2d, groupLens, addInteraction, output=None):
1057        if output != None:
1058            print output
1059        self.FA = 0
1060        self.FAprob = 1
1061        self.FB = 0
1062        self.FBprob = 1
1063        self.ps = [self.FAprob, self.FBprob]
1064        if addInteraction:
1065            self.FAB = 0
1066            self.FABprob = 1
1067            self.ps.append(self.FABprob)
1068        self._addInteraction = addInteraction
1069        self._arr2d = arr2d
1070        self._groupLens = groupLens
1071
1072
1073#######################################################################################
1074## Traditional ANOVA
1075#######################################################################################
1076
1077class Anova2way:
1078    """2 way ANOVA with multiple observations per cell for balanced designs.
1079    Multiple observations given at axis 1 together with the number of repeats.
1080    """
1081
1082    def __init__(self, arr2d, numRepeats):
1083        """Input: 2d array with multiple observations per cell ginven at axis 1.
1084        """
1085        arr2d = MA.asarray(arr2d)
1086        assert math.modf(arr2d.shape[1] / numRepeats)[0] == 0.0, "arr2d.shape[1] / numRepeats not a whole number"
1087        self.a = arr2d.shape[0]
1088        self.b = arr2d.shape[1] / numRepeats
1089        self.n = numRepeats     # number of observations per cell
1090        self.Y = arr2d
1091        self._init_sums()
1092        self._init_SS()
1093        self._init_F()
1094
1095
1096    def _init_sums(self):
1097        """Initializes sums of Y.
1098        """
1099        self.dfA = self.a - 1
1100        self.dfB = self.b - 1
1101        self.dfAB = self.dfA * self.dfB
1102        self.dfTotal = self.a * self.b * self.- 1
1103        self.dfError = self.dfTotal - self.dfA - self.dfB - self.dfAB
1104        # YijK
1105        self.YijK = Numeric.zeros((self.a,self.b), Numeric.Float)
1106        for idx, bIdx in enumerate(range(0, self.b*self.n, self.n)):
1107            self.YijK[:,idx] = Numeric.add.reduce(self.Y[:,bIdx:bIdx+self.n], 1)
1108        # YiJK
1109        self.YiJK = Numeric.add.reduce(self.YijK, 1)
1110        # YIjK
1111        self.YIjK = Numeric.add.reduce(self.YijK, 0)
1112        # YIJK
1113        self.YIJK = Numeric.add.reduce(self.YIjK, 0)
1114
1115       
1116    def _init_SS(self):
1117        y2IJKdivnab = Numeric.power(self.YIJK,2) / (self.n * self.a * self.b)
1118        self.TSS = Numeric.add.reduce(Numeric.add.reduce(Numeric.power(self.Y,2))) - y2IJKdivnab
1119        self.SSA = Numeric.add.reduce(Numeric.power(self.YiJK,2)) / (self.n * self.b) - y2IJKdivnab
1120        self.SSB = Numeric.add.reduce(Numeric.power(self.YIjK,2)) / (self.n * self.a) - y2IJKdivnab
1121        self.SSAB = Numeric.add.reduce(Numeric.add.reduce(Numeric.power(self.YijK,2))) / (self.n) - y2IJKdivnab - self.SSA - self.SSB
1122        self.SSE = self.TSS - self.SSA - self.SSB - self.SSAB
1123       
1124
1125    def _init_F(self):
1126        self.FA = (self.SSA / self.dfA) / (self.SSE / self.dfError)
1127        self.FAprob = scipy.stats.fprob(self.dfA, self.dfError, self.FA)
1128        self.FB = (self.SSB / self.dfB) / (self.SSE / self.dfError)
1129        self.FBprob = scipy.stats.fprob(self.dfB, self.dfError, self.FB)
1130        self.FAB = (self.SSAB / self.dfAB) / (self.SSE / self.dfError)
1131        self.FABprob = scipy.stats.fprob(self.dfAB, self.dfError, self.FAB)
1132       
1133       
1134class Anova3way:
1135    """3 way ANOVA, optional AB interaction, single observation per cell, balanced design.
1136    Removes factor-levels with mising observations.
1137    """
1138
1139    def __init__(self, arr3d, addInterAB):
1140        """Input: 3d masked array where axis correspond factors.
1141        Leave out the factor-levels with masked values.
1142        """
1143        arr3d = MA.asarray(arr3d)
1144        assert len(arr3d.shape) == 3, "len(arr3d.shape) != 3"
1145        a,b,c = arr3d.shape
1146        self.Y = arr3d
1147
1148        self.YijK = Numeric.add.reduce(self.Y, 2)
1149        self.YiJK = Numeric.add.reduce(self.YijK, 1)
1150        self.YIjK = Numeric.add.reduce(self.YijK, 0)
1151        self.YIJk = Numeric.add.reduce(Numeric.add.reduce(self.Y))
1152        self.YIJK2_abc = Numeric.power(Numeric.add.reduce(self.YIJk),2) / (a*b*c)
1153
1154        self.TSS = Numeric.add.reduce(Numeric.add.reduce(Numeric.add.reduce(Numeric.power(self.Y,2)))) - self.YIJK2_abc
1155        self.SSA = Numeric.add.reduce(Numeric.power(self.YiJK,2)) / (b*c) - self.YIJK2_abc
1156        self.SSB = Numeric.add.reduce(Numeric.power(self.YIjK,2)) / (a*c) - self.YIJK2_abc
1157        self.SSC = Numeric.add.reduce(Numeric.power(self.YIJk,2)) / (a*b) - self.YIJK2_abc
1158
1159        self.dfA = a - 1
1160        self.dfB = b - 1
1161        self.dfC = c - 1
1162        self.dfTotal = a * b * c  - 1
1163
1164        if addInterAB:
1165
1166            self.SSAB = Numeric.add.reduce(Numeric.add.reduce(Numeric.power(self.YijK,2))) / c - self.YIJK2_abc - self.SSA - self.SSB
1167            self.dfAB = self.dfA * self.dfB
1168           
1169            self.SSE = self.TSS - self.SSA - self.SSB - self.SSC - self.SSAB
1170            self.dfError = self.dfTotal - self.dfA - self.dfB - self.dfC - self.dfAB
1171
1172            self.FAB = (self.SSAB / self.dfAB) / (self.SSE / self.dfError)
1173            self.FABprob = scipy.stats.fprob(self.dfAB, self.dfError, self.FAB)
1174
1175        else:           
1176
1177            self.SSE = self.TSS - self.SSA - self.SSB - self.SSC
1178            self.dfError = self.dfTotal - self.dfA - self.dfB - self.dfC
1179
1180        self.FA = (self.SSA / self.dfA) / (self.SSE / self.dfError)
1181        self.FAprob = scipy.stats.fprob(self.dfA, self.dfError, self.FA)
1182        self.FB = (self.SSB / self.dfB) / (self.SSE / self.dfError)
1183        self.FBprob = scipy.stats.fprob(self.dfB, self.dfError, self.FB)
1184        self.FC = (self.SSC / self.dfC) / (self.SSE / self.dfError)
1185        self.FCprob = scipy.stats.fprob(self.dfC, self.dfError, self.FC)
1186
1187
1188#######################################################################################
1189## Multivariate Linear Regression
1190#######################################################################################
1191
1192
1193class MultLinReg:
1194    """Multivariate Linear Regression.
1195    """
1196    def __init__(self, X, y, summary=0):
1197        """
1198        x[i,j]: ith observation for the jth independent variable, shape (n,k)
1199        y[i]: ith observations of the dependent variable, shape (n,)
1200        """
1201        X = Numeric.asarray(X)
1202        y = Numeric.asarray(y)
1203        assert len(X.shape) == 2, "len(X.shape) != 2"
1204        assert len(y.shape) == 1, "len(y.shape) != 1"
1205        self.DFreg = X.shape[1]                         # number of independent variables (k)
1206        self.DFres = X.shape[0] - (X.shape[1]) - 1      # number of observations (n) - number of ind. variables (k) - 1
1207        self.DFtot = X.shape[0] - 1                     # number of observations (n) - 1
1208        X = Numeric.concatenate((Numeric.ones((X.shape[0],1), Numeric.Float), X), 1)
1209        XT = Numeric.transpose(X)
1210        try:
1211            self._XTXinv = LinearAlgebra.inverse(Numeric.dot(XT,X))
1212        except LinearAlgebra.LinAlgError:
1213            print "Warning: singular matrix, using generalized_inverse"
1214            self._XTX = Numeric.dot(XT,X)   # store the singuar matrix
1215            self._XTXinv = LinearAlgebra.generalized_inverse(self._XTX)
1216        y_mean = Numeric.average(y)
1217        # regression
1218        self.b = Numeric.dot(Numeric.dot(self._XTXinv, XT), y)
1219        self.y_hat = Numeric.dot(X, self.b)
1220        # release the variables not needed
1221        X = None
1222        XT = None
1223        # SS
1224        self.SSreg = Numeric.add.reduce((self.y_hat - y_mean)**2)
1225        self.MSreg = self.SSreg / self.DFreg
1226        self.SSres = Numeric.add.reduce((y - self.y_hat)**2)
1227##        self.MSres = self.SSres / self.DFres                    # equals to square std. error: s^2_{y|x}
1228##        if self.MSres == 0.0:
1229##            print "Warning MultLinReg: MSres equals 0, replaced by 1e-20"
1230##            self.MSres = 1e-20
1231        if self.DFres > 0:
1232            self.MSres = self.SSres / self.DFres                    # equals to square std. error: s^2_{y|x}
1233        else:
1234            print "Warning MultLinReg: SSres=%f, DFres=%f" % (self.SSres, self.DFres)
1235            self.MSres = 0
1236        self.SStot = self.SSreg + self.SSres                    # equal to: self.SStot = Numeric.add.reduce((y - y_mean)**2)
1237        self.MStot = self.SStot / self.DFtot
1238        # regression summary
1239        if summary: self.getRegressionSummary()
1240
1241
1242    def __call__(self, X):
1243        X = Numeric.asarray(X)
1244        if len(X.shape) == 1:
1245            X = Numeric.concatenate(([1], X),0)
1246        elif len(X.shape) == 2:
1247            X = Numeric.concatenate((Numeric.ones((X.shape[0],1), Numeric.Float), X), 1)
1248        return Numeric.dot(X, self.b)
1249
1250 
1251    def getRegressionSummary(self):
1252        self.stdErr = math.sqrt(self.MSres)                         # std. error of the regression estimate: s_{x|y}
1253        self.VarCovar = self.MSres * self._XTXinv                   # variance-covariance matrix: s^2_b
1254        self.b_se = Numeric.sqrt(Numeric.diagonal(self.VarCovar))   # std. error of regression coef.: s_b_i
1255        self.b_t = self.b / self.b_se                               # t-test whether each regression coef. is significantly different from 0
1256        self.b_tprob = scipy.stats.abetai(0.5*self.DFres, 0.5, float(self.DFres)/(self.DFres + self.b_t**2))
1257        # correlation between coefficients (b's)
1258        # ERROR in Glantz et al.: self.Corr = Numeric.sqrt(self.VarCovar) / Numeric.sqrt(Numeric.dot(self.b_se[:,Numeric.NewAxis],self.b_se[Numeric.NewAxis,:]))
1259        self.Corr = self.VarCovar / Numeric.dot(self.b_se[:,Numeric.NewAxis],self.b_se[Numeric.NewAxis,:])
1260        self.F = self.MSreg / self.MSres        # overall goodness of fit
1261        self.Fprob = scipy.stats.fprob(self.DFreg, self.DFres, self.F)
1262        self.R2 = self.SSreg/self.SStot         # coef. of determination: fraction of variance explained by regression plane
1263        self.R2_adj = 1-(self.MSres/self.MStot) # adjusted coef. of determination (unbiased estimator of population R2; larget the better)
1264        self.R = math.sqrt(self.R2)             # multiple correlation coefficeint: how tight the data points cluster around the regression plane
1265
1266       
1267
1268#######################################################################################
1269## FDR and q-value
1270#######################################################################################
1271
1272def qVals(pVals, estimatePi0=False, verbose=False):
1273    """Returns q-values calculated from given p-values.
1274    Input:  array of p-values.
1275            estimate pi0: False (pi0==1)
1276                          "spline": cubic spline fit on pi0(lambda) curve, estimete at lambda=1
1277                          "loess": loess fit on pi0(lambda) curve, estimate at lambda=1
1278    Reference:  Storey et al. (2003) Statistical significance for genomewide studies.
1279                PNAS 100(16), pp. 9440-5.
1280    Changes: loess fit instead of cubic spline
1281    Added:  handles missing values -> it compresses it out
1282            lambda values depend on pValues:
1283                min(pVals) <= lmbd < max(pVals)
1284                len(lmbd) == len(pVals)
1285    TODO: remove calculation of flmbd in all points except for lmbd=1
1286    """
1287    assert estimatePi0 in [False, "spline", "loess"]
1288    pValsMA = MA.asarray(pVals)
1289    qVals = MA.ones(pValsMA.shape, Numeric.Float) * MA.masked
1290    putInd = Numeric.compress(Numeric.logical_not(MA.getmaskarray(pValsMA)), Numeric.arange(pValsMA.shape[0]))
1291    pValsMA = pValsMA.compressed()
1292    minp = min(pValsMA); maxp = max(pValsMA)
1293    m = pValsMA.shape[0]
1294    x = Numeric.arange(0,1.01,0.01)
1295    if estimatePi0 == "spline":
1296        import scipy.interpolate
1297        lmbd = Numeric.arange(0,0.96,0.01,Numeric.Float)
1298        pi0lmbd = map(lambda l: Numeric.add.reduce(Numeric.greater(pValsMA,l)) / (m*(1-l)), lmbd)
1299        splineRep = scipy.interpolate.splrep(lmbd, pi0lmbd, k=3, s=0.01)   # spline representation: (knots, coefficeints, degree)
1300        flmbd = scipy.interpolate.splev(x, splineRep, der=0)
1301        pi0 = flmbd[-1]
1302        if pi0 <= 0:
1303            print "Warning: pi0<=0, trying smaller lambda..."
1304            while pi0 <= 0:
1305                lmbd = lmbd[:-1]
1306                pi0lmbd = map(lambda l: Numeric.add.reduce(Numeric.greater(pValsMA,l)) / (m*(1-l)), lmbd)
1307                splineRep = scipy.interpolate.splrep(lmbd, pi0lmbd, k=3, s=0.1)   # spline representation: (knots, coefficeints, degree)
1308                flmbd = scipy.interpolate.splev(x, splineRep, der=0)
1309                pi0 = flmbd[-1]
1310    elif estimatePi0 == "loess":
1311        lmbd = Numeric.arange(0,1.0,0.01,Numeric.Float)
1312        pi0lmbd = map(lambda l: Numeric.add.reduce(Numeric.greater(pValsMA,l)) / (m*(1-l)), lmbd)
1313        flmbd = Numeric.asarray(statc.loess(zip(lmbd, pi0lmbd), list(x), 0.4))[:,1]
1314        pi0 = flmbd[-1]
1315        if pi0 <= 0:
1316            print "Warning: pi0<=0, trying smaller lambda..."
1317            while pi0 <= 0:
1318                lmbd = lmbd[:-1]
1319                pi0lmbd = map(lambda l: Numeric.add.reduce(Numeric.greater(pValsMA,l)) / (m*(1-l)), lmbd)
1320                flmbd = Numeric.asarray(statc.loess(zip(lmbd, pi0lmbd), list(x), 0.4))[:,1]
1321                pi0 = flmbd[-1]
1322    else:
1323        pi0=1
1324    if verbose and estimatePi0:
1325        M.plot(lmbd, pi0lmbd)
1326        M.plot(x,flmbd)
1327        M.legend(["pi0(lambda", "fit, len(lmbd)==%i"%len(lmbd)])
1328        M.title("pi0: %.4f"%pi0)
1329        M.show()
1330    args = Numeric.argsort(pValsMA)
1331    q = Numeric.ones(pValsMA.shape, Numeric.Float)
1332    q[args[m-1]] = pi0 * pValsMA[args[m-1]]
1333    for i in range(m-1, 0, -1):
1334        q[args[i-1]] = min(pi0*m*pValsMA[args[i-1]]/i, q[args[i]])
1335    MA.put(qVals, putInd, q)
1336    return qVals
1337
1338
1339
1340if __name__=="__main__":
1341
1342    #====================== MULTIVARIATE LINEAR REGRESSION ==========================
1343
1344    # MLR: compare with orange.LinRegLearner
1345##    import Dicty.DData, Meda.Preproc, orange
1346##    DD = Dicty.DData.DData_Nancy()
1347##    p = DD.getMean2d("pkaC")
1348##    po = Meda.Preproc.ma2orng(p, Meda.Preproc.getTcDomain(13))
1349##    po.domain.classVar = po.domain.attributes[-1]
1350##    lr = orange.LinRegLearner(po)
1351##    pn = Numeric.array(p)
1352##    mlr = MultLinReg(pn[:,0:12], pn[:,12])
1353
1354    # mlr: simple test
1355##    xy1 = Numeric.array([[1,1.1],[0,-0.01],[-1,-1.1],[2,2.3],[-2,-1.5]], Numeric.Float)
1356##    ml1 = MultLinReg(xy1[:,:-1],xy1[:,-1])
1357##    et1 = Meda.Preproc.ma2orng(xy1, Meda.Preproc.getTcDomain(2,False,[]))
1358##    et1.domain.classVar = et1.domain.attributes[-1]
1359##    lr1 = orange.LinRegLearner(et1)
1360
1361    # mlr: test with random numbers
1362##    import numpy.oldnumeric.random_array as RandomArray
1363##    xy2 = RandomArray.random((5,4))
1364##    ml2 = MultLinReg(xy2[:,:-1],xy2[:,-1],1)
1365##    et2 = Meda.Preproc.ma2orng(xy2, Meda.Preproc.getTcDomain(4,False,[]))
1366##    et2.domain.classVar = et2.domain.attributes[-1]
1367##    d3 = orange.Domain(et2.domain.attributes[:-1], et2.domain.attributes[-1])
1368##    et3 = orange.ExampleTable(d3, et2)
1369##    lr2 = orange.LinRegLearner(et3)
1370
1371
1372    #====================== 2 WAY ANOVA ==========================
1373
1374    def print3d(arr3d):
1375        for k in range(arr3d.shape[1]):
1376            print "----",k,"----"
1377            for j in range(arr3d.shape[2]):
1378                for i in range(arr3d.shape[0]):
1379                    print arr3d[i,k,j]
1380
1381##    import Dicty.DData
1382##    DN = Dicty.DData.DData_Nancy()
1383##    rawN = DN.getRaw3dAll()
1384##    # subset of data
1385##    rawSub = rawN[0,0:3,0:6]
1386##    rgiSub = [[0,1,2],[3,4,5]]
1387
1388##    # compare traditional ANOVA and ANOVA using linear regression on a balanced design
1389##    as = Anova2way(rawSub, 3)
1390##    print "\ntrad.anova, interact", as.FAprob, as.FBprob, as.FABprob
1391##    ar0 = Anova2wayLR(rawSub, rgiSub, 0)
1392##    print "regr.anova, no inter", ar0.FAprob, ar0.FBprob
1393##    ar1 = Anova2wayLR(rawSub, rgiSub, 1)
1394##    print "regr.anova, interact", ar1.FAprob, ar1.FBprob, ar1.FABprob
1395##   
1396##    # missing data, no empty cells
1397##    dSubM1 = MA.array(rawSub)
1398##    dSubM1[-1,-1] = MA.masked
1399##    arM1_0 = Anova2wayLR(dSubM1, rgiSub, 0)
1400##    print "\nregr.anova, no inter", arM1_0.FAprob, arM1_0.FBprob
1401##    arM1_1 = Anova2wayLR(dSubM1, rgiSub, 1)
1402##    print "regr.anova, interact", arM1_1.FAprob, arM1_1.FBprob, arM1_1.FABprob
1403##
1404##    # missing data and empty cell   
1405##    dSubM3 = MA.array(rawSub)
1406##    dSubM3[-1,-3:] = MA.masked
1407##    arM3_0 = Anova2wayLR(dSubM3, rgiSub, 0)
1408##    print "\nregr.anova, no inter", arM3_0.FAprob, arM3_0.FBprob
1409##    arM3_1 = Anova2wayLR(dSubM3, rgiSub, 1)
1410##    print "regr.anova, interact", arM3_1.FAprob, arM3_1.FBprob, arM3_1.FABprob
1411
1412    # test LR ANOVA for selection of genes on a whole dataset
1413    import time
1414
1415    def anova_2fact_testSpeed(rawAll, rgi):
1416        """factors: time, strain
1417        """
1418        as1 = Numeric.zeros((rawAll.shape[0],2), Numeric.Float)
1419        print "2 fact start", time.ctime()
1420        for i in range(rawAll.shape[0]):
1421            a0 = Anova2wayLR(rawAll[i], rgi, 0)
1422            as1[i] = [a0.FAprob, a0.FBprob]
1423        print "2 fact end  ", time.ctime()
1424        return as1
1425       
1426    def anova_fulfact_testSpeed(rawAll, rgi):
1427        """factors: time, strain & their interaction
1428        """
1429        try:
1430            as1 = Numeric.zeros((rawAll.shape[0],3), Numeric.Float)
1431            print "2 fact start", time.ctime()
1432            for i in range(rawAll.shape[0]):
1433                a0 = Anova2wayLR(rawAll[i], rgi, 1)
1434                as1[i] = [a0.FAprob, a0.FBprob, a0.FABprob]
1435            print "2 fact end  ", time.ctime()
1436        except:
1437            print "gene", i, time.ctime()
1438            raise
1439        return as1
1440
1441##    asff = anova_fulfact_testSpeed(rawN, DN.replicaGroupInd)  # takes appx. 3 hours!
1442##    as2f = anova_2fact_testSpeed(rawN, DN.replicaGroupInd)      # takes appx 2 minutes
1443
1444    #====================== 3 WAY ANOVA ==========================
1445   
1446##    import Dicty.DData
1447##    DN = Dicty.DData.DData_Nancy()
1448##    rawN = DN.getRaw3dAll()
1449
1450    # small test: Anova3way
1451##    rawSub = rawN[0:3, 0:3, 0:3]
1452##    a3Sub = Anova3way(rawSub, True)
1453##    print "FprobGene, FprobTime, FprobRep", a3Sub.FAprob, a3Sub.FBprob, a3Sub.FCprob
1454
1455    # are replicas equal? if FprobRep == 0: different, else equal
1456##    DN = Dicty.DData.DData_Nancy()
1457##    DDPJ = Dicty.DData.DData_PJ1()
1458##    DDMEDSTD = Dicty.DData.DData_MedStd()
1459##    def anova_testRepEq(DD, addInterAB):
1460##        print
1461##        for st in DD.strains:
1462##            a3 = Anova3way(DD.getRaw3dStrains([st]).filled(0),addInterAB)
1463##            print "FpG %.10f FpT %.10f FpGxT %.10f FpRep(F) %.20f(%3.5f)\t%12s (%i rep)" % (a3.FAprob, a3.FBprob, a3.FABprob, a3.FCprob, a3.FC, st, len(DD.replicaGroupInd[DD.getIdxStrain(st)]))
1464##
1465##    print "------------------ START ---------------"
1466##    anova_testRepEq(DN, True)
1467##    anova_testRepEq(DDPJ, True)
1468##    anova_testRepEq(DDMEDSTD, True)
1469##    print "------------------ END ---------------"
1470
1471    #====================== 1 WAY ANOVA LR ==========================
1472
1473##    a1 = Anova1wayLR(Numeric.ravel(rawSub), [range(0,10), range(10,15), range(15,27)])
1474##    subT = MA.transpose(rawN[0,:,8:11])
1475
1476    #====================== 1 WAY REPEATED MEASURES ANOVA LR ==========================
1477
1478##    a1rm = AnovaRM11LR(subT)
1479##    a1rmf = AnovaRM11LR(subT.filled(-1))
1480
1481    #====================== 2 WAY REPEATED MEASURES (on factor A) ANOVA LR ==========================
1482
1483##    ab = AnovaLRBase()
1484##    print
1485##    print Numeric.concatenate((a2rm1.dummyA,a2rm1.dummyB,a2rm1.dummySubjInB),1)
1486##    print
1487##
1488##    import Dicty.DData
1489##    DN = Dicty.DData.DData_Nancy()
1490##    rawN = DN.getRaw3dAll()
1491##    rawSub = rawN[0, 0:5, 0:9]
1492##    gi = [[0,1,2],[3,4,5],[6,7,8]]
1493
1494##    a2rm1 = AnovaRM12LR(rawSub,gi,0)
1495##    print "---no missing---"
1496##    print a2rm1.FAprob, a2rm1.FBprob#, a2rm1.FABprob
1497##
1498##    rawMiss = MA.array(rawSub)
1499##    rawMiss[0,-1] = MA.masked
1500##    a2m = AnovaRM12LR(rawMiss,gi,1)
1501##    print "---with missing---"
1502##    print a2m.FAprob, a2m.FBprob, a2m.FABprob
1503
1504
1505    #====================== q-values ==========================
1506
1507    import matplotlib.pylab as M
1508
1509    for st, epist in epist_MMAG_nonRM.items():
1510        for i in range(3):
1511            print st, i
1512            ps1 = epist.posthoc_pIntByStrains[:,i].compressed()
1513            qs1 = qVals(ps, "loess", True)
1514            ps2 = epist.posthoc_pIntByStrains[:,i].compressed()
1515            qs2 = qVals(ps2, "loess", True)
1516            M.title("%s, %i" %(st,i))
1517            M.subplot(1,2,1)
1518            M.scatter(ps1, qs1, 0.001)
1519            M.subplot(1,2,2)
1520            M.scatter(ps2, qs2, 0.001)
1521            M.show()
1522
1523     # increasing pi0lmbd function
1524##    qqc = qVals(ppc)
1525
1526     # negative pi0
1527##    pp = epist_DS_nonRM["pkaRregA"].posthoc_pIntByStrains[:,0].compressed()
1528##    qq = qVals(pp)
1529##    M.scatter(pp,qq)
1530##    M.show()
1531   
Note: See TracBrowser for help on using the repository browser.