Revision 1636:10d234fdadb9, 80.5 KB checked in by mitar, 2 years ago (diff)

Restructuring because we will not be using namespaces.

RevLine
[188]1## Automatically adapted for numpy.oldnumeric Oct 03, 2007 by
2
[1632]3from __future__ import absolute_import
4
[188]5import math
[1632]6
[188]7import numpy.oldnumeric as Numeric, numpy.oldnumeric.ma as MA
8import numpy.oldnumeric.linear_algebra as LinearAlgebra
[1632]9
[188]10import scipy.stats
11
[1632]12from . import numpyExtn
[188]13
14#######################################################################################
15## ANOVA based on Multivariate Linear Regression
16#######################################################################################
17
18class AnovaLRBase:
19    """Base class for all ANOVA types based on multivariate linear regression: manipulation with dummy variables.
20    """
21
22    def getDummyRC(self, numVars, numReplicas=1):
23        """Returns 2D array of reference cell encoded dummy variables.
24        """
25        if not isinstance(numReplicas, int):
26            numReplicas = Numeric.asarray(numReplicas)
27            assert len(numReplicas) == numVars, "numReplicas: int or argument of length numVars expected"
28        return Numeric.repeat(Numeric.concatenate((Numeric.zeros((1,numVars-1), Numeric.Float), Numeric.identity(numVars-1, Numeric.Float)), 0), numReplicas, 0)
29
30
31    def getDummyEE(self, numVars, numReplicas=1):
32        """Returns 2D array of effects endoced dummy variables, shape (n, n-1) where n = numVars*numReplicas | sum(numReplicas).
33        Each row represents a code for a single variable value, encoded by numVar-1 dummy variables.
34        numReplicas = int | list | array of shape (numVars,): the number (or list) of observations per cell.
35        """
36        assert numVars > 0, "numVars <= 0"
37        if isinstance(numReplicas, int):
38            numReplicas = Numeric.ones((numVars,)) * numReplicas
39        else:
40            numReplicas = Numeric.asarray(numReplicas)
41            assert len(numReplicas) == numVars, "numReplicas: int or argument of length numVars expected"
42        if numVars == 1:
43            return Numeric.zeros((numReplicas[0], 0))
44        else:
45            return Numeric.repeat(Numeric.concatenate((Numeric.identity(numVars-1, Numeric.Int), -1*Numeric.ones((1,numVars-1), Numeric.Int)), 0), numReplicas, 0)
46
47
48    def getSubjectDummyEE(self, numSubjList, numReplicas=1):
49        """Returns 2D array of effects endoced subject dummy variables, shape (sum(ni), sum(ni-1)) where numSubjList=[n0,n1,...].
50        numSubjList: a list of the number of subjects within each level of the non-repeated factor.
51        numReplicas: int | list | array of shape sum(ni): the number of times each subject code (row) is repeated.
52        """
53        numSubjList = Numeric.asarray(numSubjList)
54        assert len(numSubjList.shape) == 1, "len(numSubjList.shape) != 1"
55        if isinstance(numReplicas, int):
56            numReplicas = Numeric.ones((numSubjList.shape[0],)) * numReplicas
57        else:
58            numReplicas = Numeric.asarray(numReplicas)
59            assert len(numReplicas.shape) == 1, "len(numReplicas.shape) != 1"
60            assert numReplicas.shape[0] == numSubjList.shape[0], "numReplicas: int or a list of the same length as numSubjList expected"
61        si0 = Numeric.concatenate(([0], Numeric.add.accumulate(numSubjList)))      # indices for axis 0 of dm
62        si1 = Numeric.concatenate(([0], Numeric.add.accumulate(numSubjList-1)))    # indices for axis 1 of dm
63        dm = Numeric.zeros((si0[-1], si1[-1]))                                     # subject dummy codes matrix
64        numRepeats = Numeric.ones((si0[-1],))
65        for i in range(numSubjList.shape[0]):
66            if numSubjList[i] == 1: print "Warning: a single subject in group %i" % (i)
67            dm[si0[i]:si0[i+1], si1[i]:si1[i+1]] = self.getDummyEE(numSubjList[i],1)
68            numRepeats[si0[i]:si0[i+1]] = Numeric.ones((numSubjList[i],)) * numReplicas[i]
69        return Numeric.repeat(dm, numRepeats, 0)
70
71
72    def getDummiesJoin(self, dummyMtrx1, dummyMtrx2):
73        """Returns a tuple of resized dummy matrices containing the cartesian product of rows.
74        """
75        dummyMtrx1 = Numeric.repeat(dummyMtrx1, dummyMtrx2.shape[0], 0)                         # repeat each row separately
76        dummyMtrx2 = Numeric.resize(dummyMtrx2, (dummyMtrx1.shape[0], dummyMtrx2.shape[1]))     # repeat whole matrix, add new rows
77        return (dummyMtrx1, dummyMtrx2)
78
79
80    def getDummyInteraction(self, dummyMtrx1, dummyMtrx2):
81        """Returns a product of dummy matrices resized to the cartesian product of columns.
82        """
83        dmIntShp = (dummyMtrx1.shape[0], dummyMtrx1.shape[1] * dummyMtrx2.shape[1])
84        dmInt1 = Numeric.repeat(dummyMtrx1, dummyMtrx2.shape[1], 1)                             # repeat each column separately
85        dmInt2 = Numeric.transpose(Numeric.resize(Numeric.transpose(dummyMtrx2, (1,0)), (dmIntShp[1], dmIntShp[0])), (1,0)) # repeat whole matrix, add new columns
86        return dmInt1*dmInt2
87
88
89class Anova2wayLRBase(AnovaLRBase):
90    """Base class for all 2 way ANOVA types (repeated and non-repeated measures) includes various post-hoc tests.
91    The following variables should be defined by base classes:
93        self._arr2d = arr2d
94        self._groupLens = groupLens
95        self._MSpool = LR_treat.MSres
96        self._DFpool = LR_treat.MSreg
97    """
98
99    def posthoc_tstat_B(self):
100        """Fisher LSD: Compares all levels of factor B and returns (b,b) matrix of t-stat values and a matrix of corresponding p-values.
101        Use when there is no significant interaction.
102        Warning: Statistica computes it WRONG!
103        """
104        b = self._groupLens.shape[0]
105        # precompute averages
106        x_avrg = Numeric.zeros((b,), Numeric.Float)         # mean of cell means over factor A
107        sum_count_x = Numeric.zeros((b,), Numeric.Float)
108        groupLensAcc0 = Numeric.array([0] + Numeric.add.accumulate(self._groupLens).tolist())
109        for idx in range(b):
110            groupInd = range(groupLensAcc0[idx], groupLensAcc0[idx+1])
111            xi = self._arr2d.take(groupInd, 1)
112            x_avrg[idx] = MA.average(MA.average(xi,1))                  # first average over replicas to obtain cell mean, then over factor A
113            sum_count_x[idx] = Numeric.add.reduce(1./MA.count(xi,1))    # first count the number of measurements within cells, then sum inverses over factor A
114            ##x_avrg[idx] = MA.average(MA.ravel(xi))                    # this is how Statistica computes it, but it is WRONG!
115            ##sum_count_x[idx] = 1./MA.count(xi)                        # this is how Statistica computes it, but it is WRONG!
116        # t-statistics
117        x_avrg_2 = MA.resize(x_avrg, (x_avrg.shape[0], x_avrg.shape[0]))
118        sum_count_x_2 = Numeric.resize(sum_count_x, (sum_count_x.shape[0],sum_count_x.shape[0]))
119        tstat = (MA.transpose(x_avrg_2) - x_avrg_2) * self._arr2d.shape[0] / Numeric.sqrt(self._MSpool * (Numeric.transpose(sum_count_x_2) + sum_count_x_2))
120        ##tstat = (MA.transpose(x_avrg_2) - x_avrg_2) / Numeric.sqrt(self._MSpool * (Numeric.transpose(sum_count_x_2) + sum_count_x_2))     # this is how Statistica computes it, but it is WRONG!
121        tprob = numpyExtn.triangularPut(scipy.stats.abetai(0.5*self._DFpool, 0.5, float(self._DFpool) / (self._DFpool + numpyExtn.triangularGet(tstat**2))),1,1)
122        return tstat, tprob
123
124
125    def isSignif_holm_B(self, alpha):
126        """Conduct Holm step-down sequential multiple comparison on factor B.
127        Returns (b,b) matrix indicating significant differences between levels of factor B and the direction of changes [-1|0|1].
128        """
129        tstat, pvals = self.posthoc_tstat_B()
130        pvals = numpyExtn.triangularGet(pvals)
131        sortInd = Numeric.argsort(pvals)
132        k = pvals.shape[0]
133        isSignif = -1*Numeric.ones((k,), Numeric.Int)
134        for j in range(k):
135            isSignif[sortInd[j]] = pvals[sortInd[j]] < float(alpha) / (k-j)
136        return numpyExtn.triangularPut(isSignif, upper=1, lower=1) * (MA.greater(tstat,0) - MA.less(tstat,0))
137
138
139    def posthoc_tstat_BbyA(self):
140        """Fisher LSD: Compares all levels of factor B by each level of factor A separately.
141        Returns: (b,b,a) matrix of t-stat values
142                 (b,b,a) matrix of corresponding p-values
143                 (b,b) matrix of geometric means of p-values over all levels of factor A.
144        Use when there is a significant interaction and factor A is of no interest.
145        """
146        a = self._arr2d.shape[0]
147        b = self._groupLens.shape[0]
148        # precompute averages
149        x_avrg = Numeric.zeros((a,b), Numeric.Float)
150        sum_count_x = Numeric.zeros((a,b), Numeric.Float)
151        groupLensAcc0 = Numeric.array([0] + Numeric.add.accumulate(self._groupLens).tolist())
152        for idx in range(b):
153            groupInd = range(groupLensAcc0[idx], groupLensAcc0[idx+1])
154            xi = self._arr2d.take(groupInd, 1)
155            x_avrg[:,idx] = MA.average(xi, 1)
156            sum_count_x[:,idx] = 1. / MA.count(xi, 1)
157        # t-statistics
158        x_avrg_2 = MA.resize(x_avrg, (b,a,b))
159        sum_count_x_2 = Numeric.resize(sum_count_x, (b,a,b))
160        tstat = (MA.transpose(x_avrg_2, (2,1,0)) - x_avrg_2) / Numeric.sqrt(self._MSpool * (Numeric.transpose(sum_count_x_2, (2,1,0)) + sum_count_x_2))
161        # get p-values for each level of factor a separately (axis 1)
162        tprob = MA.array(-1*Numeric.ones(tstat.shape, Numeric.Float), mask=Numeric.ones(tstat.shape))
163        for idx1 in range(tprob.shape[1]):
164            tprob[:,idx1,:] = numpyExtn.triangularPut(scipy.stats.abetai(0.5*self._DFpool, 0.5, float(self._DFpool) / (self._DFpool + numpyExtn.triangularGet(tstat[:,idx1,:]**2))),1,1)
165        return tstat, tprob
166
167
168    def posthoc_anova(self):
169        """Conduct ANOVA for each pair of levels of factor B.
170        Returns for each pair of factor B levels:
171            (b,b) matrix of F-stat values for effect of factor A
172            (b,b) matrix of p-values for effect of factor A
173            (b,b) matrix of F-stat values for effect of factor B
174            (b,b) matrix of p-values for effect of factor B
175            (b,b) matrix of F-stat values for interaction effect
176            (b,b) matrix of p-values for interaction effect
177        """
179            raise "Error: posthoc_anova can be conducted only when the interaction effect has been tested"
180        b = self._groupLens.shape[0]
181        FA = MA.masked * MA.ones((b,b), Numeric.Float)
182        FAprob = MA.masked * MA.ones((b,b), Numeric.Float)
183        FB = MA.masked * MA.ones((b,b), Numeric.Float)
184        FBprob = MA.masked * MA.ones((b,b), Numeric.Float)
185        FAB = MA.masked * MA.ones((b,b), Numeric.Float)
186        FABprob = MA.masked * MA.ones((b,b), Numeric.Float)
187        groupLensAcc0 = Numeric.array([0] + Numeric.add.accumulate(self._groupLens).tolist())
188        groupInd = map(lambda i,j: range(i,j), groupLensAcc0[:-1],groupLensAcc0[1:])
189        for i in range(b):
190            for j in range(i+1,b):
191                takeInd = groupInd[i] + groupInd[j]
192                an = self.__class__(self._arr2d.take(takeInd, 1), [self._groupLens[i],self._groupLens[j]], addInteraction=1)
193                FA[i,j] = FA[j,i] = an.FA
194                FAprob[i,j] = FAprob[j,i] = an.FAprob
195                FB[i,j] = FB[j,i] = an.FB
196                FBprob[i,j] = FBprob[j,i] = an.FBprob
197                FAB[i,j] = FAB[j,i] = an.FAB
198                FABprob[i,j] = FABprob[j,i] = an.FABprob
199        return FA, FAprob, FB, FBprob, FAB, FABprob
200
201
202class Anova1wayLR(AnovaLRBase):
203    """1 way ANOVA for unbalanced designs using multivariate linear regression.
204    Multiple observations given as a list of indices groups, e.g. [[0,1,2],[3,4]].
205    BUG: the indices are not taken into account, but only the lenghts of the groups!
206    Supports balanced and unbalanced designs and missing (masked) data.
207    """
208
209    def __init__(self, arr1d, replicaGroupLens):
210        """arr1d: 1D masked array: [x1,x2,x3, y1,y2, z1,z2,z3,z4] where x,y,z correspond to factor-levels with 3,2 and 4 replicas, respectively
211        replicaGroupLens: the number of replicas in individual groups, i.e. [3,2,4]
212        """
213        arr1d = MA.asarray(arr1d)
214        assert len(arr1d.shape) == 1, "len(arr1d.shape) != 1"
215
216        # remove missing observations and the corresponding dummy variable codes
217        self.y = MA.asarray(arr1d)
218        self.y, takeInd = numpyExtn.compressIndices(self.y)
219
220        # dummy variables
221        self.dummyA = self.getDummyEE(len(replicaGroupLens), replicaGroupLens)
222        self.dummyA = Numeric.take(self.dummyA, takeInd, 0)
223
224        # run analysis
225        LRA = MultLinReg(self.dummyA, self.y)
226        try:
227            self.F = LRA.MSreg / LRA.MSres
228            self.Fprob = scipy.stats.fprob(LRA.DFreg, LRA.DFres, self.F)
229        except ZeroDivisionError:
230            self.F = 0
231            self.Fprob = 1
232
233
234class Anova1wayLR_2D(AnovaLRBase):
235    """1 way ANOVA for unbalanced designs using multivariate linear regression.
236    Axis 0 correspond to replicas and axis 1 correspond to different factor-levels.
237    Supports balanced and unbalanced designs and missing (masked) data.
238    """
239
240    def __init__(self, arr2d):
241        arr2d = MA.asarray(arr2d)
242        assert len(arr2d.shape) == 2, "len(arr2d.shape) != 2"
243
244        # remove missing observations and the corresponding dummy variable codes
245        self.y = MA.ravel(MA.transpose(arr2d))
246        self.y, takeInd = numpyExtn.compressIndices(self.y)
247
248        # adjust degrees of freedom for factor-leves that contain no data
250        zeroLengthFactorLevels = Numeric.compress(MA.equal(arr2d.count(0), 0), range(arr2d.shape[1]))
251        if numFactorLevels > 1:
252
253##            # dummy variables (BUG: index error when the number of levels is reduced)
254##            print numFactorLevels, arr2d.shape[0], takeInd
255##            self.dummyA = self.getDummyEE(numFactorLevels, arr2d.shape[0])
256##            self.dummyA = Numeric.take(self.dummyA, takeInd, 0)
257            # dummy variables
258            self.dummyA = self.getDummyEE(arr2d.shape[1], arr2d.shape[0])   # create all dummies
259            for lev in zeroLengthFactorLevels:                              # reduce length of dummies to adjust for actual number of factor A levels
260                self.dummyA = Numeric.concatenate([self.dummyA[:,0:lev],self.dummyA[:,lev+1:]],1)
261            if len(zeroLengthFactorLevels) > 0 and zeroLengthFactorLevels[-1] == arr2d.shape[1]-1:
262                self.dummyA[self.dummyA.shape[0]-arr2d.shape[1]-1:] = -1    # if the last factor level is missing, manually set the last dummies to matrix of -1's
263            self.dummyA = Numeric.take(self.dummyA, takeInd, 0)             # take dummies where data is not masked
264
265            # run analysis
266            LRA = MultLinReg(self.dummyA, self.y)
267            try:
268                self.F = LRA.MSreg / LRA.MSres
269                self.Fprob = scipy.stats.fprob(LRA.DFreg, LRA.DFres, self.F)
270            except ZeroDivisionError:
271                self.F = 0
272                self.Fprob = 1
273
274        else:
275            print "Giving up ANOVA: a factor with a single level encountered."
276            self.F = 0
277            self.Fprob = 1
278
279
280class _Anova2wayLR_bug_missing_factor_leves(AnovaLRBase):
281    """2 way ANOVA with multiple observations per cell for unbalanced designs using multivariate linear regression.
282    Multiple observations given at axis 1 together with the list of indices groups, e.g. [[0,1,2],[3,4]].
283    Supports balanced and unbalanced designs, i.e. different number of observations per cell and missing (masked) data.
284    TODO: account for empty cells (this implementation results in singular design matrix)
285    """
286
287    def __init__(self, arr2d, replicaGroupInd, addInteraction=0):
288        """arr2d: 2D masked array where replicas are given at axis 1 together with the indices groups.
289        replicaGroupInd: indices of axis 1 that belong to a common factor (multiple observations per cell)
290        addInteraction: include / exclude the interaction between factors
291        """
292        arr2d = MA.asarray(arr2d)
293        assert len(arr2d.shape) == 2, "len(arr2d.shape) != 2"
294
295        # check if data is balanced
296        rgLens = Numeric.array(map(lambda x: len(x), replicaGroupInd), Numeric.Int)
297        isBalanced = Numeric.add.reduce((1.*rgLens/rgLens[0]) == Numeric.ones((len(rgLens)))) / len(rgLens) == 1
298
299        # check for empty cells, raise exception if empty cells exist and addInteraction=1
302            for idx in range(rgLens.shape[0]):
303                if Numeric.add.reduce(MA.count(arr2d[:,ax1Ind[idx]:ax1Ind[idx+1]],1) == 0) > 0:
304                    raise ValueError, "arr2d has empty cells, cannot evaluate the interaction between factors, try addInteraction=0"
305
306        # remove missing observations and the corresponding dummy variable codes
307        self.y = MA.ravel(arr2d) # [a0b0, a0b1, .., a1b0, ...]
308        noMissing = MA.count(self.y) == self.y.shape[0]
309        self.y, takeInd = numpyExtn.compressIndices(self.y)
310
311        # dummy variables
312        self.dummyA = self.getDummyEE(arr2d.shape[0], 1)
313        self.dummyB = self.getDummyEE(len(replicaGroupInd), rgLens)
314        self.dummyA, self.dummyB = self.getDummiesJoin(self.dummyA, self.dummyB)
316            self.dummyAB = self.getDummyInteraction(self.dummyA, self.dummyB)
317            self.dummyAB = Numeric.take(self.dummyAB, takeInd, 0)
318        self.dummyA = Numeric.take(self.dummyA, takeInd, 0)
319        self.dummyB = Numeric.take(self.dummyB, takeInd, 0)
320
321        # run analysis
323            LR_treat = MultLinReg(Numeric.concatenate((self.dummyA, self.dummyB, self.dummyAB), 1), self.y)
324        else:
325            LR_treat = MultLinReg(Numeric.concatenate((self.dummyA, self.dummyB),1), self.y)
326
327        if isBalanced and noMissing:
328            LR_A = MultLinReg(self.dummyA, self.y)
329            LR_B = MultLinReg(self.dummyB, self.y)
330            self.FA = LR_A.MSreg / LR_treat.MSres
331            self.FB = LR_B.MSreg / LR_treat.MSres
333                LR_AB = MultLinReg(self.dummyAB, self.y)
334                self.FAB = LR_AB.MSreg / LR_treat.MSres
335        else:
337                LR_A_B = MultLinReg(Numeric.concatenate((self.dummyA, self.dummyB), 1), self.y)
338                LR_A_AB = MultLinReg(Numeric.concatenate((self.dummyA, self.dummyAB), 1), self.y)
339                LR_B_AB = MultLinReg(Numeric.concatenate((self.dummyB, self.dummyAB), 1), self.y)
340                self.FA = (LR_treat.SSreg - LR_B_AB.SSreg) / self.dummyA.shape[1] / LR_treat.MSres
341                self.FB = (LR_treat.SSreg - LR_A_AB.SSreg) / self.dummyB.shape[1] / LR_treat.MSres
342                self.FAB = (LR_treat.SSreg - LR_A_B.SSreg) / self.dummyAB.shape[1] / LR_treat.MSres
343            else:
344                LR_A = MultLinReg(self.dummyA, self.y)
345                LR_B = MultLinReg(self.dummyB, self.y)
346                self.FA = (LR_treat.SSreg - LR_B.SSreg) / self.dummyA.shape[1] / LR_treat.MSres
347                self.FB = (LR_treat.SSreg - LR_A.SSreg) / self.dummyB.shape[1] / LR_treat.MSres
348
349        self.FAprob = scipy.stats.fprob(self.dummyA.shape[1], LR_treat.DFres, self.FA)
350        self.FBprob = scipy.stats.fprob(self.dummyB.shape[1], LR_treat.DFres, self.FB)
352            self.FABprob = scipy.stats.fprob(self.dummyAB.shape[1], LR_treat.DFres, self.FAB)
353
354        # store variables needed for posthoc tests
355        self._arr2d = arr2d
356        self._replicaGroupInd = replicaGroupInd
357        self._LR_treat_MSres = LR_treat.MSres
358        self._LR_treat_DFres = LR_treat.DFres
359
360
361    def posthoc_tstat_B(self):
362        """Fisher LSD: Compares all levels of factor B and returns (b,b) matrix of t-stat values and a matrix of corresponding p-values.
363        Use when there is no significant interaction.
364        Warning: Statistica computes it WRONG!
365        """
366        b = len(self._replicaGroupInd)
367        # precompute averages
368        x_avrg = Numeric.zeros((b,), Numeric.Float)         # mean of cell means over factor A
369        sum_count_x = Numeric.zeros((b,), Numeric.Float)
370        for idx, replicaInd in enumerate(self._replicaGroupInd):
371            xi = self._arr2d.take(replicaInd, 1)
372            x_avrg[idx] = MA.average(MA.average(xi,1))                  # first average over replicas to obtain cell mean, then over factor A
373            sum_count_x[idx] = Numeric.add.reduce(1./MA.count(xi,1))    # first count the number of measurements within cells, then sum inverses over factor A
374            ##x_avrg[idx] = MA.average(MA.ravel(xi))                    # this is how Statistica computes it, but it is WRONG!
375            ##sum_count_x[idx] = 1./MA.count(xi)                        # this is how Statistica computes it, but it is WRONG!
376        # t-statistics
377        x_avrg_2 = MA.resize(x_avrg, (x_avrg.shape[0], x_avrg.shape[0]))
378        sum_count_x_2 = Numeric.resize(sum_count_x, (sum_count_x.shape[0],sum_count_x.shape[0]))
379        tstat = (MA.transpose(x_avrg_2) - x_avrg_2) * self._arr2d.shape[0] / Numeric.sqrt(self._LR_treat_MSres * (Numeric.transpose(sum_count_x_2) + sum_count_x_2))
380        ##tstat = (MA.transpose(x_avrg_2) - x_avrg_2) / Numeric.sqrt(self._LR_treat_MSres * (Numeric.transpose(sum_count_x_2) + sum_count_x_2))     # this is how Statistica computes it, but it is WRONG!
381        tprob = numpyExtn.triangularPut(scipy.stats.abetai(0.5*self._LR_treat_DFres, 0.5, float(self._LR_treat_DFres) / (self._LR_treat_DFres + numpyExtn.triangularGet(tstat**2))),1,1)
382        return tstat, tprob
383
384
385    def isSignif_holm_B(self, alpha):
386        """Conduct Holm step-down sequential multiple comparison on factor B.
387        Returns (b,b) matrix indicating significant differences between levels of factor B and the direction of changes [-1|0|1].
388        """
389        tstat, pvals = self.posthoc_tstat_B()
390        pvals = numpyExtn.triangularGet(pvals)
391        sortInd = Numeric.argsort(pvals)
392        k = pvals.shape[0]
393        isSignif = -1*Numeric.ones((k,), Numeric.Int)
394        for j in range(k):
395            isSignif[sortInd[j]] = pvals[sortInd[j]] < float(alpha) / (k-j)
396        return numpyExtn.triangularPut(isSignif, upper=1, lower=1) * (MA.greater(tstat,0) - MA.less(tstat,0))
397
398
399    def posthoc_tstat_BbyA(self):
400        """Fisher LSD: Compares all levels of factor B by each level of factor A separately.
401        Returns: (b,b,a) matrix of t-stat values
402                 (b,b,a) matrix of corresponding p-values
403                 (b,b) matrix of geometric means of p-values over all levels of factor A.
404        Use when there is a significant interaction and factor A is of no interest.
405        """
406        a = self._arr2d.shape[0]
407        b = len(self._replicaGroupInd)
408        # precompute averages
409        x_avrg = Numeric.zeros((a,b), Numeric.Float)
410        sum_count_x = Numeric.zeros((a,b), Numeric.Float)
411        for idx, replicaInd in enumerate(self._replicaGroupInd):
412            xi = self._arr2d.take(replicaInd, 1)
413            x_avrg[:,idx] = MA.average(xi, 1)
414            sum_count_x[:,idx] = 1. / MA.count(xi, 1)
415
416        # t-statistics
417        x_avrg_2 = MA.resize(x_avrg, (b,a,b))
418        sum_count_x_2 = Numeric.resize(sum_count_x, (b,a,b))
419
420        tstat = (MA.transpose(x_avrg_2, (2,1,0)) - x_avrg_2) / Numeric.sqrt(self._LR_treat_MSres * (Numeric.transpose(sum_count_x_2, (2,1,0)) + sum_count_x_2))
421        # get p-values for each level of factor a separately (axis 1)
422        tprob = MA.array(-1*Numeric.ones(tstat.shape, Numeric.Float), mask=Numeric.ones(tstat.shape))
423        for idx1 in range(tprob.shape[1]):
424            tprob[:,idx1,:] = numpyExtn.triangularPut(scipy.stats.abetai(0.5*self._LR_treat_DFres, 0.5, float(self._LR_treat_DFres) / (self._LR_treat_DFres + numpyExtn.triangularGet(tstat[:,idx1,:]**2))),1,1)
425        return tstat, tprob
426
427
428    def posthoc_anova_B_AB(self):
429        """Conduct ANOVA for each pair of levels of factor B to detect interaction effect.
430        Returns for each pair of factor B levels:
431            (b,b) matrix of F-stat values for effect of factor B
432            (b,b) matrix of p-values for effect of factor B
433            (b,b) matrix of F-stat values for interaction effect
434            (b,b) matrix of p-values for interaction effect
435        """
436        b = len(self._replicaGroupInd)
437        FB = MA.masked * MA.ones((b,b), Numeric.Float)
438        FBprob = MA.masked * MA.ones((b,b), Numeric.Float)
439        FAB = MA.masked * MA.ones((b,b), Numeric.Float)
440        FABprob = MA.masked * MA.ones((b,b), Numeric.Float)
441        for i in range(b):
442            for j in range(i+1,b):
443                rglSub = [self._replicaGroupInd[i], self._replicaGroupInd[j]]
444                takeInd = self._replicaGroupInd[i] + self._replicaGroupInd[j]
445                an = Anova2wayLR(self._arr2d.take(takeInd, 1), rglSub, addInteraction=1)
446                FB[i,j] = FB[j,i] = an.FB
447                FBprob[i,j] = FBprob[j,i] = an.FBprob
448                FAB[i,j] = FAB[j,i] = an.FAB
449                FABprob[i,j] = FABprob[j,i] = an.FABprob
450        return FB, FBprob, FAB, FABprob
451
452
453class Anova2wayLR(Anova2wayLRBase):
454    """2 way ANOVA with multiple observations per cell for unbalanced designs using multivariate linear regression.
455    Multiple observations given at axis 1 together with the list of group lenghts, e.g. [3,2].
456    Supports balanced and unbalanced designs, i.e. different number of observations per cell and missing (masked) data.
457    TODO: account for empty cells (this implementation results in singular design matrix)
458    """
459
460    def __init__(self, arr2d, groupLens, addInteraction=0, allowReductA=True, allowReductB=False):
461        """arr2d: 2D masked array where replicas are given at axis 1 together with lenghts of the groups (multiple observations per cell);
462        addInteraction: include / exclude the interaction between factors;
463        allowReduct[A|B]: whether to allow or not the reduction of levels of factor A (B).
464        """
465        arr2d = MA.asarray(arr2d)
466        groupLens = Numeric.array(groupLens)    # make a copy for the case if subjects or levels of factor B are removed
467        assert len(arr2d.shape) == 2, "len(arr2d.shape) != 2"
468
469        # check if there exist factor-levels with all values missing (for A and B)
470        # if so, reduce the corresponding DF and fix the number of dummy variables
471        missIndA = Numeric.compress(MA.count(arr2d, 1) == 0, Numeric.arange(arr2d.shape[0]))
472        if missIndA.shape[0] > 0:
473            if allowReductA:
474                print "Warning: removig factor A level(s) %s" % str(missIndA.tolist())
475                takeIndA = Numeric.compress(MA.count(arr2d, 1) != 0, Numeric.arange(arr2d.shape[0]))
476                arr2d = arr2d.take(takeIndA, 0)
477            else:
478                self._giveUp(arr2d, groupLens, addInteraction, output="Giving up ANOVA: the following level(s) of factor A have no values: %s" % str(missIndA.tolist()))
479                return
480        missIndSubj = Numeric.compress(MA.count(arr2d, 0) == 0, Numeric.arange(arr2d.shape[1]))
481        if missIndSubj.shape[0] > 0:
482            takeIndSubj = Numeric.compress(MA.count(arr2d, 0) != 0, Numeric.arange(arr2d.shape[1]))
483            # fix groupLens
484##            mapSubj2Group = -1*Numeric.ones(arr2d.shape[1])
485##            groupLensAcc0 = [0] + Numeric.add.accumulate(groupLens).tolist()
486##            for i in range(len(groupLensAcc0) - 1):
487##                mapSubj2Group[groupLensAcc0[i]:groupLensAcc0[i+1]] = i
488            mapSubj2Group = Numeric.repeat(range(len(groupLens)), groupLens)
489            for subjIdx in missIndSubj:
490                groupLens[mapSubj2Group[subjIdx]] -= 1
491            # remove data columns that are missing all the values
492            arr2d = arr2d.take(takeIndSubj, 1)
493        # fix number of factor B levels (if the length of any group became 0 due to removed subjects)
494        missIndB = Numeric.compress(groupLens <= 0, Numeric.arange(groupLens.shape[0]))
495        if missIndB.shape[0] > 0:
496            if allowReductB:
497                print "Warning: removig factor B level(s) %s" % str(missIndB)
498                takeIndB = Numeric.compress(groupLens > 0, Numeric.arange(groupLens.shape[0]))
499                groupLens = Numeric.take(groupLens, takeIndB)
500                # arr2d has already been taken care of by removing the subjects without observations
501            else:
502                self._giveUp(arr2d, groupLens, addInteraction, output="Giving up ANOVA: the following level(s) of factor B have no values: %s" % str(missIndB))
503                return
504        # remove levels of factor B with single observation per cell
505        grpIndSingle = Numeric.compress(Numeric.equal(groupLens,1), Numeric.arange(groupLens.shape[0]))
506        if grpIndSingle.shape[0] > 0:
507            if allowReductB:
508                # fix arr2d
509                print "Warning: removing factor B level(s) with single observation: %s" % str(grpIndSingle)
510                groupLensAcc = [0] + list(Numeric.add.accumulate(groupLens))
511                arr2dRemInd = map(lambda i: groupLensAcc[i], grpIndSingle)
512                arr2dTakeInd = Numeric.ones(arr2d.shape[1])
513                Numeric.put(arr2dTakeInd, arr2dRemInd, 0)
514                arr2d = MA.compress(arr2dTakeInd, arr2d, 1)
515                # fix groupLens
516                groupLens = Numeric.compress(Numeric.not_equal(groupLens,1), groupLens)
517            else:
518                self._giveUp(arr2d, groupLens, addInteraction, output="Giving up ANOVA: the following level(s) of factor B have single observations: %s" % str(grpIndSingle))
519                return
520        # check that there exist at least 2 levels of factors A and B
521        # and that there is at least single duplicated observation
522        if arr2d.shape[0] < 2 or len(groupLens) < 2: ##or MA.count(arr2d) <= arr2d.shape[0] * len(groupLens):
523            self._giveUp(arr2d, groupLens, addInteraction, output="Giving up ANOVA: not enough levels of factor A (%i) or factor B (%i)" % (arr2d.shape[0],len(groupLens)))
524            return
525
526        # check if data is balanced
527        isBalanced = (1. * Numeric.add.reduce((1.*groupLens/groupLens[0]) == Numeric.ones(groupLens.shape[0])) / groupLens.shape[0]) == 1
528
529        # check for empty cells; if exist and addInteraction=1, switch to no interaction and continue
532            for idx in range(groupLens.shape[0]):
533                if Numeric.add.reduce(MA.count(arr2d[:,ax1Ind[idx]:ax1Ind[idx+1]],1) == 0) > 0:
534##                    raise ValueError, "arr2d has empty cells, cannot evaluate the interaction between factors, try addInteraction=0"
535                    print ValueError, "Warning: data has empty cells, cannot evaluate the interaction between factors, continuing without interaction"
537                    # store member varibales that are stored only if addInteraction==1
538                    self.FAB = 0
539                    self.FABprob = 1
541
542        # remove missing observations and the corresponding dummy variable codes
543        self.y = MA.ravel(arr2d) # [a0b0, a0b1, .., a1b0, ...]
544        noMissing = MA.count(self.y) == self.y.shape[0]
545        self.y, takeInd = numpyExtn.compressIndices(self.y)
546
547        # dummy variables
548        self.dummyA = self.getDummyEE(arr2d.shape[0], 1)
549        self.dummyB = self.getDummyEE(len(groupLens), groupLens)
550        self.dummyA, self.dummyB = self.getDummiesJoin(self.dummyA, self.dummyB)
552            self.dummyAB = self.getDummyInteraction(self.dummyA, self.dummyB)
553        else:
554            self.dummyAB = Numeric.zeros((self.dummyA.shape[0],0))
555        self.dummyA = Numeric.take(self.dummyA, takeInd, 0)
556        self.dummyB = Numeric.take(self.dummyB, takeInd, 0)
557        self.dummyAB = Numeric.take(self.dummyAB, takeInd, 0)
558
559        # check that there is enough observations to allow for variability (DFres > 0)
560        if self.dummyA.shape[0] - sum([self.dummyA.shape[1], self.dummyB.shape[1], self.dummyAB.shape[1]]) - 1 <= 0:
561            self._giveUp(arr2d, groupLens, addInteraction, output="Giving up ANOVA: not values (dummy condition)")
562            return
563
564        # run analysis
565        try:
567            LR_treat = MultLinReg(Numeric.concatenate((self.dummyA, self.dummyB, self.dummyAB), 1), self.y)
568##            else:
569##                LR_treat = MultLinReg(Numeric.concatenate((self.dummyA, self.dummyB),1), self.y)
570
571            if isBalanced and noMissing:
572                LR_A = MultLinReg(self.dummyA, self.y)
573                LR_B = MultLinReg(self.dummyB, self.y)
574                self.FA = LR_A.MSreg / LR_treat.MSres
575                self.FB = LR_B.MSreg / LR_treat.MSres
577                    LR_AB = MultLinReg(self.dummyAB, self.y)
578                    self.FAB = LR_AB.MSreg / LR_treat.MSres
579            else:
581                    LR_A_B = MultLinReg(Numeric.concatenate((self.dummyA, self.dummyB), 1), self.y)
582                    LR_A_AB = MultLinReg(Numeric.concatenate((self.dummyA, self.dummyAB), 1), self.y)
583                    LR_B_AB = MultLinReg(Numeric.concatenate((self.dummyB, self.dummyAB), 1), self.y)
584                    self.FA = (LR_treat.SSreg - LR_B_AB.SSreg) / self.dummyA.shape[1] / LR_treat.MSres
585                    self.FB = (LR_treat.SSreg - LR_A_AB.SSreg) / self.dummyB.shape[1] / LR_treat.MSres
586                    self.FAB = (LR_treat.SSreg - LR_A_B.SSreg) / self.dummyAB.shape[1] / LR_treat.MSres
587                else:
588                    LR_A = MultLinReg(self.dummyA, self.y)
589                    LR_B = MultLinReg(self.dummyB, self.y)
590                    self.FA = (LR_treat.SSreg - LR_B.SSreg) / self.dummyA.shape[1] / LR_treat.MSres
591                    self.FB = (LR_treat.SSreg - LR_A.SSreg) / self.dummyB.shape[1] / LR_treat.MSres
592        except:
593            self._giveUp(arr2d, groupLens, addInteraction, output="Giving up ANOVA: miltivariate linear regression failed due to lack of variability")
594            return
595
596        try:
597            self.FAprob = scipy.stats.fprob(self.dummyA.shape[1], LR_treat.DFres, self.FA)
598        except ValueError:
599            self.FAprob = 1
600        try:
601            self.FBprob = scipy.stats.fprob(self.dummyB.shape[1], LR_treat.DFres, self.FB)
602        except ValueError:
603            self.FBprob = 1
604        self.ps = [self.FAprob, self.FBprob]
606            try:
607                self.FABprob = scipy.stats.fprob(self.dummyAB.shape[1], LR_treat.DFres, self.FAB)
608            except ValueError:
609                self.FABprob = 1
610            self.ps.append(self.FABprob)
611
612        # store variables needed for posthoc tests
613        self._MSpool = LR_treat.MSres
614        self._DFpool = LR_treat.MSreg
616        self._arr2d = arr2d
617        self._groupLens = groupLens
618
619
620    def _giveUp(self, arr2d, groupLens, addInteraction, output=None):
621        if output != None:
622            print output
623        self.FA = 0
624        self.FAprob = 1
625        self.FB = 0
626        self.FBprob = 1
627        self.ps = [self.FAprob, self.FBprob]
629            self.FAB = 0
630            self.FABprob = 1
631            self.ps.append(self.FABprob)
633        self._arr2d = arr2d
634        self._groupLens = groupLens
635
636
637
638class AnovaRM11LR(AnovaLRBase):
639    """1 way REPEATED MEASURES ANOVA using multivariate linear regression.
640    Axis 0 correspond to subjects, axis 1 correspond to different factor-levels.
641    Does NOT support missing (masked) data.
642    """
643
644    def __init__(self, arr2d):
645        """arr2d: 2D masked array: [[x1,x2], [y1,y2], [z1,z2]]
646        where xi,yi,zi correspond to measurements of three different subjects (x,y,z) under the i-th factor-level.
647        """
648        arr2d = MA.asarray(arr2d)
649        assert len(arr2d.shape) == 2, "len(arr2d.shape) != 2"
650
651        # remove missing observations and the corresponding dummy variable codes
652        self.y = MA.ravel(arr2d)
653        noMissing = MA.count(self.y) == self.y.shape[0]
654        self.y, takeInd = numpyExtn.compressIndices(self.y)
655
656        # dummy variables for coding subjects and factor-levels
657        self.dummySubj = self.getDummyEE(arr2d.shape[0], 1)
658        self.dummyA = self.getDummyEE(arr2d.shape[1], 1)
659        self.dummySubj, self.dummyA = self.getDummiesJoin(self.dummySubj, self.dummyA)
660        self.dummySubj = Numeric.take(self.dummySubj, takeInd, 0)
661        self.dummyA = Numeric.take(self.dummyA, takeInd, 0)
662
663        # run analysis (better: only 2 LR instead of 3)
664        try:
665            LR_treat = MultLinReg(Numeric.concatenate((self.dummySubj, self.dummyA), 1), self.y)
666            if noMissing:
667                LR_A = MultLinReg(self.dummyA, self.y)
668                self.FA = LR_A.MSreg / LR_treat.MSres
669            else:
670                print "WARNING: missing data with 1-way repeated measures ANOVA"
671                LR_S = MultLinReg(self.dummySubj, self.y)
672                self.FA = (LR_treat.SSreg - LR_S.SSreg) / self.dummyA.shape[1] / LR_treat.MSres
673            self.FAprob = scipy.stats.fprob(self.dummyA.shape[1], LR_treat.DFres, self.FA)
674        except ZeroDivisionError:
675            self.FA = 0
676            self.FAprob = 1
677
678
679class _AnovaRM12LR_bug_missing_factor_leves(AnovaLRBase):
680    """2 way ANOVA with REPEATED MEASURES on factor A using multivariate linear regression.
681    Axis 0 correspond to different levels of factor A, subjects given at axis 1, subjects nested inside factor B according to the given group indices.
682    Factor A is a within-subject effect, factor B is a between-ubject effect.
683    Example:
684        arr2d = [[a0b0 a0b0 a0b1 a0b1 a0b1], [a1b0 a1b0 a1b1 a1b1 a1b1]]: 2 levels of factor A, 2 levels of factor B, 5 subjects
685        groupInd = [[0,1],[2,3,4]]
686    Supports balanced and unbalanced designs, i.e. different number of observations per cell and missing (masked) data.
687    TODO: fix the F and DF for factor B when there are missing values (see Glantz, pp. 462-5)
688    """
689
690    def __init__(self, arr2d, groupInd, addInteraction=0):
691        """arr2d: 2D masked array: [[a0b0 a0b0 a0b1 a0b1 a0b1], [a1b0 a1b0 a1b1 a1b1 a1b1]]
692        groupInd: axis 1 indices that belong to the same level of factor B: [[0,1],[2,3,4]]
693        addInteraction: include / exclude the interaction between factors A and B
694        """
695        arr2d = MA.asarray(arr2d)
696        assert len(arr2d.shape) == 2, "len(arr2d.shape) != 2"
697
698        # remove missing observations and the corresponding dummy variable codes
699        self.y = MA.ravel(arr2d) # [a0b0, a0b1, .., a1b0, ...]
700        noMissing = MA.count(self.y) == self.y.shape[0]
701        self.y, takeInd = numpyExtn.compressIndices(self.y)
702
703        # check if data is balanced
704        rgLens = Numeric.array(map(lambda x: len(x), groupInd), Numeric.Int)
705        assert Numeric.add.reduce(rgLens) == arr2d.shape[1], "arr2d.shape[1] != sum_of_len(groupInd)"
706        isBalanced = Numeric.add.reduce((1.*rgLens/rgLens[0]) == Numeric.ones((len(rgLens)))) / len(rgLens) == 1
707
708        # dummy variables
709        self.dummyA = self.getDummyEE(arr2d.shape[0], 1)
710        self.dummyB = self.getDummyEE(len(groupInd), rgLens)
711        self.dummySubjInB = self.getSubjectDummyEE(rgLens, 1)
712        dummyB_Subj = Numeric.concatenate((self.dummyB, self.dummySubjInB), 1)
713        self.dummyA, dummyB_Subj = self.getDummiesJoin(self.dummyA, dummyB_Subj)
714        self.dummyB = dummyB_Subj[:, 0:self.dummyB.shape[1]]
715        self.dummySubjInB = dummyB_Subj[:, self.dummyB.shape[1]:]
717            self.dummyAB = self.getDummyInteraction(self.dummyA, self.dummyB)
718            self.dummyAB = Numeric.take(self.dummyAB, takeInd, 0)
719        self.dummyA = Numeric.take(self.dummyA, takeInd, 0)
720        self.dummyB = Numeric.take(self.dummyB, takeInd, 0)
721        self.dummySubjInB = Numeric.take(self.dummySubjInB, takeInd, 0)
722
723        # run analysis
725            LR_treat = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyA, self.dummyB, self.dummyAB),1), self.y)
726        else:
727            LR_treat = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyA, self.dummyB),1), self.y)
728
729        if isBalanced and noMissing and False:
730            # non-repeated-measures factor (B)
731            LR_B = MultLinReg(self.dummyB, self.y)
732            LR_SubjInB = MultLinReg(self.dummySubjInB, self.y)
733            self.FB = LR_B.MSreg / LR_SubjInB.MSreg
734            # repeated measures factor (A)
735            LR_A = MultLinReg(self.dummyA, self.y)
736            self.FA = LR_A.MSreg / LR_treat.MSres
737            # interaction (AB)
739                LR_AB = MultLinReg(self.dummyAB, self.y)
740                self.FAB = LR_AB.MSreg / LR_treat.MSres
741            # store variables needed for posthoc tests
742            MS_SubjInB = LR_SubjInB.MSreg
743        else:
744            ######################################################################################################################################
745            ## print "Warning: missing data with 2 way repeated measures ANOVA: F and DF should be adjusted due to random effect factor (subjects)"
746            ######################################################################################################################################
748                # non-repeated-measures factor (B)
749                LR_S_A_AB = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyA, self.dummyAB),1), self.y)
750                LR_A_B_AB = MultLinReg(Numeric.concatenate((self.dummyA, self.dummyB, self.dummyAB),1), self.y)
751                self.FB = (LR_treat.SSreg - LR_S_A_AB.SSreg) / self.dummyB.shape[1] / (LR_treat.SSreg - LR_A_B_AB.SSreg) * self.dummySubjInB.shape[1]
752                # repeated measures factor (A)
753                LR_S_B_AB = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyB, self.dummyAB),1), self.y)
754                self.FA = (LR_treat.SSreg - LR_S_B_AB.SSreg) / self.dummyA.shape[1] / LR_treat.MSres
755                # interaction (AB)
756                LR_S_A_B = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyA, self.dummyB),1), self.y)
757                self.FAB = (LR_treat.SSreg - LR_S_A_B.SSreg) / self.dummyAB.shape[1] / LR_treat.MSres
758                # store variables needed for posthoc tests
759                MS_SubjInB = (LR_treat.SSreg - LR_A_B_AB.SSreg) / self.dummySubjInB.shape[1]
760            else:
761                # non-repeated-measures factor (B)
762                LR_S_A = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyA),1), self.y)
763                LR_A_B = MultLinReg(Numeric.concatenate((self.dummyA, self.dummyB),1), self.y)
764                self.FB = (LR_treat.SSreg - LR_S_A.SSreg) / self.dummyB.shape[1] / (LR_treat.SSreg - LR_A_B.SSreg) * self.dummySubjInB.shape[1]
765                # repeated measures factor (A)
766                LR_S_B = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyB),1), self.y)
767                self.FA = (LR_treat.SSreg - LR_S_B.SSreg) / self.dummyA.shape[1] / LR_treat.MSres
768                # store variables needed for posthoc tests
769                MS_SubjInB = (LR_treat.SSreg - LR_A_B.SSreg) / self.dummySubjInB.shape[1]
770
771        self.FBprob = scipy.stats.fprob(self.dummyB.shape[1], self.dummySubjInB.shape[1], self.FB)
772        self.FAprob = scipy.stats.fprob(self.dummyA.shape[1], LR_treat.DFres, self.FA)
774            self.FABprob = scipy.stats.fprob(self.dummyAB.shape[1], LR_treat.DFres, self.FAB)
775
776        # store variables needed for posthoc tests
777        self._arr2d = arr2d
778        self._groupInd = groupInd
779        self._LR_treat_MSres = LR_treat.MSres
780        self._LR_treat_DFres = LR_treat.DFres
781        # estimate the variance by pooling residual variance for factors A and B
782        MS_res = LR_treat.MSres
783        a = self.dummyA.shape[1] + 1    # number of levels of factor A
784        DF_SubjInB = self.dummySubjInB.shape[1] # = LR_SubjInB.DFreg
785        DF_res = LR_treat.DFres
786        self._MSpool = (MS_SubjInB + (a-1)*MS_res) / a
787        self._DFpool = ((MS_SubjInB + (a-1)*MS_res)**2) / ((MS_SubjInB**2 / DF_SubjInB) + (((a-1)*MS_res)**2 / DF_res))
788
789
790    def posthoc_tstat_B(self):
791        """Fisher LSD: Compares all levels of factor B and returns (b,b) matrix of t-stat values and a matrix of corresponding p-values.
792        Use when there is no significant interaction.
793        Warning: Statistica computes it WRONG!
794        """
795        b = len(self._groupInd)
796        # precompute averages
797        x_avrg = Numeric.zeros((b,), Numeric.Float)
798        sum_count_x = Numeric.zeros((b,), Numeric.Float)
799        for idx, replicaInd in enumerate(self._groupInd):
800            xi = self._arr2d.take(replicaInd, 1)
801            x_avrg[idx] = MA.average(MA.average(xi,1))                  # first average over replicas to obtain cell mean, then over factor A
802            sum_count_x[idx] = Numeric.add.reduce(1./MA.count(xi,1))    # first count the number of measurements within cells, then sum inverses over factor A
803            ##x_avrg[idx] = MA.average(MA.ravel(xi))                    # this is how Statistica computes it, but it is WRONG!
804            ##sum_count_x[idx] = 1./MA.count(xi)                        # this is how Statistica computes it, but it is WRONG!
805        # t-statistics
806        x_avrg_2 = MA.resize(x_avrg, (x_avrg.shape[0], x_avrg.shape[0]))
807        sum_count_x_2 = Numeric.resize(sum_count_x, (sum_count_x.shape[0],sum_count_x.shape[0]))
808        tstat = (MA.transpose(x_avrg_2) - x_avrg_2) * self._arr2d.shape[0] / Numeric.sqrt(self._MSpool * (Numeric.transpose(sum_count_x_2) + sum_count_x_2))
809        ##tstat = (MA.transpose(x_avrg_2) - x_avrg_2) / Numeric.sqrt(self._MSpool * (Numeric.transpose(sum_count_x_2) + sum_count_x_2))     # this is how Statistica computes it, but it is WRONG!
810        tprob = numpyExtn.triangularPut(scipy.stats.abetai(0.5*self._DFpool, 0.5, float(self._DFpool) / (self._DFpool + numpyExtn.triangularGet(tstat**2))),1,1)
811        return tstat, tprob
812
813
814    def posthoc_tstat_BbyA(self):
815        """Fisher LSD: Compares all levels of factor B by each level of factor A separately.
816        Returns: (b,b,a) matrix of t-stat values
817                 (b,b,a) matrix of corresponding p-values
818                 (b,b) matrix of geometric means of p-values over all levels of factor A.
819        Use when there is a significant interaction and factor A is of no interest.
820        """
821        a = self._arr2d.shape[0]
822        b = len(self._groupInd)
823        # precompute averages
824        x_avrg = Numeric.zeros((a,b), Numeric.Float)
825        sum_count_x = Numeric.zeros((a,b), Numeric.Float)
826        for idx, replicaInd in enumerate(self._groupInd):
827            xi = self._arr2d.take(replicaInd, 1)
828            x_avrg[:,idx] = MA.average(xi, 1)
829            sum_count_x[:,idx] = 1. / MA.count(xi, 1)
830
831        # t-statistics
832        x_avrg_2 = MA.resize(x_avrg, (b,a,b))
833        sum_count_x_2 = Numeric.resize(sum_count_x, (b,a,b))
834
835        tstat = (MA.transpose(x_avrg_2, (2,1,0)) - x_avrg_2) / Numeric.sqrt(self._MSpool * (Numeric.transpose(sum_count_x_2, (2,1,0)) + sum_count_x_2))
836        # get p-values for each level of factor a separately (axis 1)
837        tprob = MA.array(-1*Numeric.ones(tstat.shape, Numeric.Float), mask=Numeric.ones(tstat.shape))
838        for idx1 in range(tprob.shape[1]):
839            tprob[:,idx1,:] = numpyExtn.triangularPut(scipy.stats.abetai(0.5*self._DFpool, 0.5, float(self._DFpool) / (self._DFpool + numpyExtn.triangularGet(tstat[:,idx1,:]**2))),1,1)
840        return tstat, tprob
841
842    def posthoc_anova_B_AB(self):
843        """Conduct ANOVA for each pair of levels of factor B to detect interaction effect.
844        Returns for each pair of factor B levels:
845            (b,b) matrix of F-stat values for effect of factor B
846            (b,b) matrix of p-values for effect of factor B
847            (b,b) matrix of F-stat values for interaction effect
848            (b,b) matrix of p-values for interaction effect
849        """
850        b = len(self._groupInd)
851        FB = MA.masked * MA.ones((b,b), Numeric.Float)
852        FBprob = MA.masked * MA.ones((b,b), Numeric.Float)
853        FAB = MA.masked * MA.ones((b,b), Numeric.Float)
854        FABprob = MA.masked * MA.ones((b,b), Numeric.Float)
855        for i in range(b):
856            for j in range(i+1,b):
857                rglSub = [self._groupInd[i], self._groupInd[j]]
858                takeInd = self._groupInd[i] + self._groupInd[j]
859                an = AnovaRM12LR(self._arr2d.take(takeInd, 1), rglSub, addInteraction=1)
860                FB[i,j] = FB[j,i] = an.FB
861                FBprob[i,j] = FBprob[j,i] = an.FBprob
862                FAB[i,j] = FAB[j,i] = an.FAB
863                FABprob[i,j] = FABprob[j,i] = an.FABprob
864        return FB, FBprob, FAB, FABprob
865
866
867class AnovaRM12LR(Anova2wayLRBase):
868    """2 way ANOVA with REPEATED MEASURES on factor A using multivariate linear regression.
869    Axis 0 correspond to different levels of factor A, subjects given at axis 1, subjects nested inside factor B according to the given group lenghts.
870    Factor A is a within-subject effect, factor B is a between-subject effect.
871    Example:
872        arr2d = [[a0b0 a0b0 a0b1 a0b1 a0b1], [a1b0 a1b0 a1b1 a1b1 a1b1]]: 2 levels of factor A, 2 levels of factor B, 5 subjects
873        groupLens = [2,3]
874    Supports balanced and unbalanced designs, i.e. different number of observations per cell and missing (masked) data.
875    TODO: fix the F and DF for factor B when there are missing values (see Glantz, pp. 462-5)
876    """
877
878    def __init__(self, arr2d, groupLens, addInteraction=0, allowReductA=True, allowReductB=False):
879        """arr2d: 2D masked array: [[a0b0 a0b0 a0b1 a0b1 a0b1], [a1b0 a1b0 a1b1 a1b1 a1b1]];
880        groupLens: lenghts of axis 1 indices that belong to the same level of factor B, e.g. [2,3];
881        addInteraction: include / exclude the interaction between factors A and B;
882        allowReduct[A|B]: whether to allow or not the reduction of levels of factor A (B).
883        """
884        arr2d = MA.asarray(arr2d)
885        groupLens = Numeric.array(groupLens)    # make a copy for the case if subjects or levels of factor B are removed
886        assert len(arr2d.shape) == 2, "len(arr2d.shape) != 2"
888
889        # check if there exist factor-levels with all values missing (for A, B and Subj)
890        # if so, reduce the corresponding DF and fix the number of dummy variables
891        missIndA = Numeric.compress(MA.count(arr2d, 1) == 0, Numeric.arange(arr2d.shape[0]))
892        if missIndA.shape[0] > 0:
893            if allowReductA:
894                print "Warning: removig factor A level(s) %s" % str(missIndA.tolist())
895                takeIndA = Numeric.compress(MA.count(arr2d, 1) != 0, Numeric.arange(arr2d.shape[0]))
896                arr2d = arr2d.take(takeIndA, 0)
897            else:
898                self._giveUp(arr2d, groupLens, addInteraction, output="Giving up ANOVA: the following level(s) of factor A have no values: %s" % str(missIndA.tolist()))
899                return
900        missIndSubj = Numeric.compress(MA.count(arr2d, 0) == 0, Numeric.arange(arr2d.shape[1]))
901        if missIndSubj.shape[0] > 0:
902            print "Warning: removig subject(s) %s" % str(missIndSubj.tolist())
903            takeIndSubj = Numeric.compress(MA.count(arr2d, 0) != 0, Numeric.arange(arr2d.shape[1]))
904            # fix groupLens
905##            mapSubj2Group = -1*Numeric.ones(arr2d.shape[1])
906##            groupLensAcc0 = [0] + Numeric.add.accumulate(groupLens).tolist()
907##            for i in range(len(groupLensAcc0) - 1):
908##                mapSubj2Group[groupLensAcc0[i]:groupLensAcc0[i+1]] = i
909            mapSubj2Group = Numeric.repeat(range(len(groupLens)), groupLens)
910            for subjIdx in missIndSubj:
911                groupLens[mapSubj2Group[subjIdx]] -= 1
912            # remove data columns that are missing all the values
913            arr2d = arr2d.take(takeIndSubj, 1)
914        # fix number of factor B levels (if the length of any group became 0 due to removed subjects)
915        missIndB = Numeric.compress(groupLens <= 0, Numeric.arange(groupLens.shape[0]))
916        if missIndB.shape[0] > 0:
917            if allowReductB:
918                print "Warning: removig factor B level(s) %s" % str(missIndB.tolist())
919                takeIndB = Numeric.compress(groupLens > 0, Numeric.arange(groupLens.shape[0]))
920                groupLens = Numeric.take(groupLens, takeIndB)
921                # arr2d has already been taken care of by removing the subjects without observations
922            else:
923                self._giveUp(arr2d, groupLens, addInteraction, output="Giving up ANOVA: the following level(s) of factor B have no values: %s" % str(missIndB))
924                return
925        # remove levels of factor B with single observation per cell
926        grpIndSingle = Numeric.compress(Numeric.equal(groupLens,1), Numeric.arange(groupLens.shape[0]))
927        if grpIndSingle.shape[0] > 0:
928            if allowReductB:
929                # fix arr2d
930                print "Warning: removing factor B level(s) with single observation: %s" % str(grpIndSingle)
931                groupLensAcc = [0] + list(Numeric.add.accumulate(groupLens))
932                arr2dRemInd = map(lambda i: groupLensAcc[i], grpIndSingle)
933                arr2dTakeInd = Numeric.ones(arr2d.shape[1])
934                Numeric.put(arr2dTakeInd, arr2dRemInd, 0)
935                arr2d = MA.compress(arr2dTakeInd, arr2d, 1)
936                # fix groupLens
937                groupLens = Numeric.compress(Numeric.not_equal(groupLens,1), groupLens)
938            else:
939                self._giveUp(arr2d, groupLens, addInteraction, output="Giving up ANOVA: the following level(s) of factor B have single observations: %s" % str(grpIndSingle))
940                return
941        # check that there exist at least 2 levels of factors A and B
942        # and that there is at least single duplicated observation
943        if arr2d.shape[0] < 2 or len(groupLens) < 2: ## or MA.count(arr2d) <= arr2d.shape[0] * len(groupLens):
944            self._giveUp(arr2d, groupLens, addInteraction, output="Giving up ANOVA: not enough levels of factor A (%i) or factor B (%i)" % (arr2d.shape[0],len(groupLens)))
945            return
946
947        # remove other missing observations and the corresponding dummy variable codes
948        self.y = MA.ravel(arr2d) # [a0b0, a0b1, .., a1b0, ...]
949        noMissing = MA.count(self.y) == self.y.shape[0]
950        self.y, takeInd = numpyExtn.compressIndices(self.y)
951
952        # check if data is balanced
953        isBalanced = (1. * Numeric.add.reduce((1.*groupLens/groupLens[0]) == Numeric.ones(groupLens.shape[0])) / groupLens.shape[0]) == 1
954
955        # dummy variables
956        self.dummyA = self.getDummyEE(arr2d.shape[0], 1)
957        self.dummyB = self.getDummyEE(groupLens.shape[0], groupLens)
958        self.dummySubjInB = self.getSubjectDummyEE(groupLens, 1)
959        dummyB_Subj = Numeric.concatenate((self.dummyB, self.dummySubjInB), 1)
960        self.dummyA, dummyB_Subj = self.getDummiesJoin(self.dummyA, dummyB_Subj)
961        self.dummyB = dummyB_Subj[:, 0:self.dummyB.shape[1]]
962        self.dummySubjInB = dummyB_Subj[:, self.dummyB.shape[1]:]
964            self.dummyAB = self.getDummyInteraction(self.dummyA, self.dummyB)
965            self.dummyAB = Numeric.take(self.dummyAB, takeInd, 0)
966        else:
967            self.dummyAB = Numeric.zeros((self.dummyA.shape[0],0))
968        self.dummyA = Numeric.take(self.dummyA, takeInd, 0)
969        self.dummyB = Numeric.take(self.dummyB, takeInd, 0)
970        self.dummySubjInB = Numeric.take(self.dummySubjInB, takeInd, 0)
971
972        # check that there is enough observations to allow for variability (DFres > 0)
973        if self.dummyA.shape[0] - sum([self.dummySubjInB.shape[1], self.dummyA.shape[1], self.dummyB.shape[1], self.dummyAB.shape[1]]) - 1 <= 0:
974            self._giveUp(arr2d, groupLens, addInteraction, output="Giving up ANOVA: not values (dummy condition)")
975            return
976
977        # run analysis
978        try:
980            LR_treat = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyA, self.dummyB, self.dummyAB),1), self.y)
981##            else:
982##                LR_treat = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyA, self.dummyB),1), self.y)
983
984            if isBalanced and noMissing and False:
985                # non-repeated-measures factor (B)
986                LR_B = MultLinReg(self.dummyB, self.y)
987                LR_SubjInB = MultLinReg(self.dummySubjInB, self.y)
988                self.FB = LR_B.MSreg / LR_SubjInB.MSreg
989                # repeated measures factor (A)
990                LR_A = MultLinReg(self.dummyA, self.y)
991                self.FA = LR_A.MSreg / LR_treat.MSres
992                # interaction (AB)
994                    LR_AB = MultLinReg(self.dummyAB, self.y)
995                    self.FAB = LR_AB.MSreg / LR_treat.MSres
996                # store variables needed for posthoc tests
997                MS_SubjInB = LR_SubjInB.MSreg
998            else:
999                ######################################################################################################################################
1000                ## print "Warning: missing data with 2 way repeated measures ANOVA: F and DF should be adjusted due to random effect factor (subjects)"
1001                ######################################################################################################################################
1003                    # non-repeated-measures factor (B)
1004                    LR_S_A_AB = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyA, self.dummyAB),1), self.y)
1005                    LR_A_B_AB = MultLinReg(Numeric.concatenate((self.dummyA, self.dummyB, self.dummyAB),1), self.y)
1006                    self.FB = (LR_treat.SSreg - LR_S_A_AB.SSreg) / self.dummyB.shape[1] / (LR_treat.SSreg - LR_A_B_AB.SSreg) * self.dummySubjInB.shape[1]
1007                    # repeated measures factor (A)
1008                    LR_S_B_AB = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyB, self.dummyAB),1), self.y)
1009                    self.FA = (LR_treat.SSreg - LR_S_B_AB.SSreg) / self.dummyA.shape[1] / LR_treat.MSres
1010                    # interaction (AB)
1011                    LR_S_A_B = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyA, self.dummyB),1), self.y)
1012                    self.FAB = (LR_treat.SSreg - LR_S_A_B.SSreg) / self.dummyAB.shape[1] / LR_treat.MSres
1013                    # store variables needed for posthoc tests
1014                    MS_SubjInB = (LR_treat.SSreg - LR_A_B_AB.SSreg) / self.dummySubjInB.shape[1]
1015                else:
1016                    # non-repeated-measures factor (B)
1017                    LR_S_A = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyA),1), self.y)
1018                    LR_A_B = MultLinReg(Numeric.concatenate((self.dummyA, self.dummyB),1), self.y)
1019                    self.FB = (LR_treat.SSreg - LR_S_A.SSreg) / self.dummyB.shape[1] / (LR_treat.SSreg - LR_A_B.SSreg) * self.dummySubjInB.shape[1]
1020                    # repeated measures factor (A)
1021                    LR_S_B = MultLinReg(Numeric.concatenate((self.dummySubjInB, self.dummyB),1), self.y)
1022                    self.FA = (LR_treat.SSreg - LR_S_B.SSreg) / self.dummyA.shape[1] / LR_treat.MSres
1023                    # store variables needed for posthoc tests
1024                    MS_SubjInB = (LR_treat.SSreg - LR_A_B.SSreg) / self.dummySubjInB.shape[1]
1025        except:
1026            self._giveUp(arr2d, groupLens, addInteraction, output="Giving up ANOVA: miltivariate linear regression failed, probably due to lack of variability")
1027            return
1028
1029        try:
1030            self.FBprob = scipy.stats.fprob(self.dummyB.shape[1], self.dummySubjInB.shape[1], self.FB)
1031        except ValueError:
1032            self.FBprob = 1
1033        try:
1034            self.FAprob = scipy.stats.fprob(self.dummyA.shape[1], LR_treat.DFres, self.FA)
1035        except ValueError:
1036            self.FAprob = 1
1037        self.ps = [self.FAprob, self.FBprob]
1039            try:
1040                self.FABprob = scipy.stats.fprob(self.dummyAB.shape[1], LR_treat.DFres, self.FAB)
1041            except ValueError:
1042                self.FABprob = 1
1043            self.ps.append(self.FABprob)
1044
1045        # estimate the variance and degrees of freedom by pooling residual variance for factors A and B
1046        MS_res = LR_treat.MSres
1047        a = self.dummyA.shape[1] + 1    # number of levels of factor A
1048        DF_SubjInB = self.dummySubjInB.shape[1] # = LR_SubjInB.DFreg
1049        DF_res = LR_treat.DFres
1050        self._MSpool = (MS_SubjInB + (a-1)*MS_res) / a
1051        self._DFpool = ((MS_SubjInB + (a-1)*MS_res)**2) / ((MS_SubjInB**2 / DF_SubjInB) + (((a-1)*MS_res)**2 / DF_res))
1052
1053        # store some variables
1055        self._arr2d = arr2d
1056        self._groupLens = groupLens
1057
1058
1059    def _giveUp(self, arr2d, groupLens, addInteraction, output=None):
1060        if output != None:
1061            print output
1062        self.FA = 0
1063        self.FAprob = 1
1064        self.FB = 0
1065        self.FBprob = 1
1066        self.ps = [self.FAprob, self.FBprob]
1068            self.FAB = 0
1069            self.FABprob = 1
1070            self.ps.append(self.FABprob)
1072        self._arr2d = arr2d
1073        self._groupLens = groupLens
1074
1075
1076#######################################################################################
1078#######################################################################################
1079
1080class Anova2way:
1081    """2 way ANOVA with multiple observations per cell for balanced designs.
1082    Multiple observations given at axis 1 together with the number of repeats.
1083    """
1084
1085    def __init__(self, arr2d, numRepeats):
1086        """Input: 2d array with multiple observations per cell ginven at axis 1.
1087        """
1088        arr2d = MA.asarray(arr2d)
1089        assert math.modf(arr2d.shape[1] / numRepeats)[0] == 0.0, "arr2d.shape[1] / numRepeats not a whole number"
1090        self.a = arr2d.shape[0]
1091        self.b = arr2d.shape[1] / numRepeats
1092        self.n = numRepeats     # number of observations per cell
1093        self.Y = arr2d
1094        self._init_sums()
1095        self._init_SS()
1096        self._init_F()
1097
1098
1099    def _init_sums(self):
1100        """Initializes sums of Y.
1101        """
1102        self.dfA = self.a - 1
1103        self.dfB = self.b - 1
1104        self.dfAB = self.dfA * self.dfB
1105        self.dfTotal = self.a * self.b * self.- 1
1106        self.dfError = self.dfTotal - self.dfA - self.dfB - self.dfAB
1107        # YijK
1108        self.YijK = Numeric.zeros((self.a,self.b), Numeric.Float)
1109        for idx, bIdx in enumerate(range(0, self.b*self.n, self.n)):
1111        # YiJK
1113        # YIjK
1115        # YIJK
1117
1118
1119    def _init_SS(self):
1120        y2IJKdivnab = Numeric.power(self.YIJK,2) / (self.n * self.a * self.b)
1122        self.SSA = Numeric.add.reduce(Numeric.power(self.YiJK,2)) / (self.n * self.b) - y2IJKdivnab
1123        self.SSB = Numeric.add.reduce(Numeric.power(self.YIjK,2)) / (self.n * self.a) - y2IJKdivnab
1124        self.SSAB = Numeric.add.reduce(Numeric.add.reduce(Numeric.power(self.YijK,2))) / (self.n) - y2IJKdivnab - self.SSA - self.SSB
1125        self.SSE = self.TSS - self.SSA - self.SSB - self.SSAB
1126
1127
1128    def _init_F(self):
1129        self.FA = (self.SSA / self.dfA) / (self.SSE / self.dfError)
1130        self.FAprob = scipy.stats.fprob(self.dfA, self.dfError, self.FA)
1131        self.FB = (self.SSB / self.dfB) / (self.SSE / self.dfError)
1132        self.FBprob = scipy.stats.fprob(self.dfB, self.dfError, self.FB)
1133        self.FAB = (self.SSAB / self.dfAB) / (self.SSE / self.dfError)
1134        self.FABprob = scipy.stats.fprob(self.dfAB, self.dfError, self.FAB)
1135
1136
1137class Anova3way:
1138    """3 way ANOVA, optional AB interaction, single observation per cell, balanced design.
1139    Removes factor-levels with mising observations.
1140    """
1141
1143        """Input: 3d masked array where axis correspond factors.
1144        Leave out the factor-levels with masked values.
1145        """
1146        arr3d = MA.asarray(arr3d)
1147        assert len(arr3d.shape) == 3, "len(arr3d.shape) != 3"
1148        a,b,c = arr3d.shape
1149        self.Y = arr3d
1150
1155        self.YIJK2_abc = Numeric.power(Numeric.add.reduce(self.YIJk),2) / (a*b*c)
1156
1158        self.SSA = Numeric.add.reduce(Numeric.power(self.YiJK,2)) / (b*c) - self.YIJK2_abc
1159        self.SSB = Numeric.add.reduce(Numeric.power(self.YIjK,2)) / (a*c) - self.YIJK2_abc
1160        self.SSC = Numeric.add.reduce(Numeric.power(self.YIJk,2)) / (a*b) - self.YIJK2_abc
1161
1162        self.dfA = a - 1
1163        self.dfB = b - 1
1164        self.dfC = c - 1
1165        self.dfTotal = a * b * c  - 1
1166
1168
1169            self.SSAB = Numeric.add.reduce(Numeric.add.reduce(Numeric.power(self.YijK,2))) / c - self.YIJK2_abc - self.SSA - self.SSB
1170            self.dfAB = self.dfA * self.dfB
1171
1172            self.SSE = self.TSS - self.SSA - self.SSB - self.SSC - self.SSAB
1173            self.dfError = self.dfTotal - self.dfA - self.dfB - self.dfC - self.dfAB
1174
1175            self.FAB = (self.SSAB / self.dfAB) / (self.SSE / self.dfError)
1176            self.FABprob = scipy.stats.fprob(self.dfAB, self.dfError, self.FAB)
1177
1178        else:
1179
1180            self.SSE = self.TSS - self.SSA - self.SSB - self.SSC
1181            self.dfError = self.dfTotal - self.dfA - self.dfB - self.dfC
1182
1183        self.FA = (self.SSA / self.dfA) / (self.SSE / self.dfError)
1184        self.FAprob = scipy.stats.fprob(self.dfA, self.dfError, self.FA)
1185        self.FB = (self.SSB / self.dfB) / (self.SSE / self.dfError)
1186        self.FBprob = scipy.stats.fprob(self.dfB, self.dfError, self.FB)
1187        self.FC = (self.SSC / self.dfC) / (self.SSE / self.dfError)
1188        self.FCprob = scipy.stats.fprob(self.dfC, self.dfError, self.FC)
1189
1190
1191#######################################################################################
1192## Multivariate Linear Regression
1193#######################################################################################
1194
1195
1196class MultLinReg:
1197    """Multivariate Linear Regression.
1198    """
1199    def __init__(self, X, y, summary=0):
1200        """
1201        x[i,j]: ith observation for the jth independent variable, shape (n,k)
1202        y[i]: ith observations of the dependent variable, shape (n,)
1203        """
1204        X = Numeric.asarray(X)
1205        y = Numeric.asarray(y)
1206        assert len(X.shape) == 2, "len(X.shape) != 2"
1207        assert len(y.shape) == 1, "len(y.shape) != 1"
1208        self.DFreg = X.shape[1]                         # number of independent variables (k)
1209        self.DFres = X.shape[0] - (X.shape[1]) - 1      # number of observations (n) - number of ind. variables (k) - 1
1210        self.DFtot = X.shape[0] - 1                     # number of observations (n) - 1
1211        X = Numeric.concatenate((Numeric.ones((X.shape[0],1), Numeric.Float), X), 1)
1212        XT = Numeric.transpose(X)
1213        try:
1214            self._XTXinv = LinearAlgebra.inverse(Numeric.dot(XT,X))
1215        except LinearAlgebra.LinAlgError:
1216            print "Warning: singular matrix, using generalized_inverse"
1217            self._XTX = Numeric.dot(XT,X)   # store the singuar matrix
1218            self._XTXinv = LinearAlgebra.generalized_inverse(self._XTX)
1219        y_mean = Numeric.average(y)
1220        # regression
1221        self.b = Numeric.dot(Numeric.dot(self._XTXinv, XT), y)
1222        self.y_hat = Numeric.dot(X, self.b)
1223        # release the variables not needed
1224        X = None
1225        XT = None
1226        # SS
1227        self.SSreg = Numeric.add.reduce((self.y_hat - y_mean)**2)
1228        self.MSreg = self.SSreg / self.DFreg
1229        self.SSres = Numeric.add.reduce((y - self.y_hat)**2)
1230##        self.MSres = self.SSres / self.DFres                    # equals to square std. error: s^2_{y|x}
1231##        if self.MSres == 0.0:
1232##            print "Warning MultLinReg: MSres equals 0, replaced by 1e-20"
1233##            self.MSres = 1e-20
1234        if self.DFres > 0:
1235            self.MSres = self.SSres / self.DFres                    # equals to square std. error: s^2_{y|x}
1236        else:
1237            print "Warning MultLinReg: SSres=%f, DFres=%f" % (self.SSres, self.DFres)
1238            self.MSres = 0
1239        self.SStot = self.SSreg + self.SSres                    # equal to: self.SStot = Numeric.add.reduce((y - y_mean)**2)
1240        self.MStot = self.SStot / self.DFtot
1241        # regression summary
1242        if summary: self.getRegressionSummary()
1243
1244
1245    def __call__(self, X):
1246        X = Numeric.asarray(X)
1247        if len(X.shape) == 1:
1248            X = Numeric.concatenate(([1], X),0)
1249        elif len(X.shape) == 2:
1250            X = Numeric.concatenate((Numeric.ones((X.shape[0],1), Numeric.Float), X), 1)
1251        return Numeric.dot(X, self.b)
1252
1253
1254    def getRegressionSummary(self):
1255        self.stdErr = math.sqrt(self.MSres)                         # std. error of the regression estimate: s_{x|y}
1256        self.VarCovar = self.MSres * self._XTXinv                   # variance-covariance matrix: s^2_b
1257        self.b_se = Numeric.sqrt(Numeric.diagonal(self.VarCovar))   # std. error of regression coef.: s_b_i
1258        self.b_t = self.b / self.b_se                               # t-test whether each regression coef. is significantly different from 0
1259        self.b_tprob = scipy.stats.abetai(0.5*self.DFres, 0.5, float(self.DFres)/(self.DFres + self.b_t**2))
1260        # correlation between coefficients (b's)
1261        # ERROR in Glantz et al.: self.Corr = Numeric.sqrt(self.VarCovar) / Numeric.sqrt(Numeric.dot(self.b_se[:,Numeric.NewAxis],self.b_se[Numeric.NewAxis,:]))
1262        self.Corr = self.VarCovar / Numeric.dot(self.b_se[:,Numeric.NewAxis],self.b_se[Numeric.NewAxis,:])
1263        self.F = self.MSreg / self.MSres        # overall goodness of fit
1264        self.Fprob = scipy.stats.fprob(self.DFreg, self.DFres, self.F)
1265        self.R2 = self.SSreg/self.SStot         # coef. of determination: fraction of variance explained by regression plane
1266        self.R2_adj = 1-(self.MSres/self.MStot) # adjusted coef. of determination (unbiased estimator of population R2; larget the better)
1267        self.R = math.sqrt(self.R2)             # multiple correlation coefficeint: how tight the data points cluster around the regression plane
1268
1269
1270
1271#######################################################################################
1272## FDR and q-value
1273#######################################################################################
1274
1275def qVals(pVals, estimatePi0=False, verbose=False):
1276    """Returns q-values calculated from given p-values.
1277    Input:  array of p-values.
1278            estimate pi0: False (pi0==1)
1279                          "spline": cubic spline fit on pi0(lambda) curve, estimete at lambda=1
1280                          "loess": loess fit on pi0(lambda) curve, estimate at lambda=1
1281    Reference:  Storey et al. (2003) Statistical significance for genomewide studies.
1282                PNAS 100(16), pp. 9440-5.
1283    Changes: loess fit instead of cubic spline
1284    Added:  handles missing values -> it compresses it out
1285            lambda values depend on pValues:
1286                min(pVals) <= lmbd < max(pVals)
1287                len(lmbd) == len(pVals)
1288    TODO: remove calculation of flmbd in all points except for lmbd=1
1289    """
1290    assert estimatePi0 in [False, "spline", "loess"]
1291    pValsMA = MA.asarray(pVals)
1292    qVals = MA.ones(pValsMA.shape, Numeric.Float) * MA.masked
1294    pValsMA = pValsMA.compressed()
1295    minp = min(pValsMA); maxp = max(pValsMA)
1296    m = pValsMA.shape[0]
1297    x = Numeric.arange(0,1.01,0.01)
1298    if estimatePi0 == "spline":
1299        import scipy.interpolate
1300        lmbd = Numeric.arange(0,0.96,0.01,Numeric.Float)
1301        pi0lmbd = map(lambda l: Numeric.add.reduce(Numeric.greater(pValsMA,l)) / (m*(1-l)), lmbd)
1302        splineRep = scipy.interpolate.splrep(lmbd, pi0lmbd, k=3, s=0.01)   # spline representation: (knots, coefficeints, degree)
1303        flmbd = scipy.interpolate.splev(x, splineRep, der=0)
1304        pi0 = flmbd[-1]
1305        if pi0 <= 0:
1306            print "Warning: pi0<=0, trying smaller lambda..."
1307            while pi0 <= 0:
1308                lmbd = lmbd[:-1]
1309                pi0lmbd = map(lambda l: Numeric.add.reduce(Numeric.greater(pValsMA,l)) / (m*(1-l)), lmbd)
1310                splineRep = scipy.interpolate.splrep(lmbd, pi0lmbd, k=3, s=0.1)   # spline representation: (knots, coefficeints, degree)
1311                flmbd = scipy.interpolate.splev(x, splineRep, der=0)
1312                pi0 = flmbd[-1]
1313    elif estimatePi0 == "loess":
1314        lmbd = Numeric.arange(0,1.0,0.01,Numeric.Float)
1315        pi0lmbd = map(lambda l: Numeric.add.reduce(Numeric.greater(pValsMA,l)) / (m*(1-l)), lmbd)
1316        flmbd = Numeric.asarray(statc.loess(zip(lmbd, pi0lmbd), list(x), 0.4))[:,1]
1317        pi0 = flmbd[-1]
1318        if pi0 <= 0:
1319            print "Warning: pi0<=0, trying smaller lambda..."
1320            while pi0 <= 0:
1321                lmbd = lmbd[:-1]
1322                pi0lmbd = map(lambda l: Numeric.add.reduce(Numeric.greater(pValsMA,l)) / (m*(1-l)), lmbd)
1323                flmbd = Numeric.asarray(statc.loess(zip(lmbd, pi0lmbd), list(x), 0.4))[:,1]
1324                pi0 = flmbd[-1]
1325    else:
1326        pi0=1
1327    if verbose and estimatePi0:
1328        M.plot(lmbd, pi0lmbd)
1329        M.plot(x,flmbd)
1330        M.legend(["pi0(lambda", "fit, len(lmbd)==%i"%len(lmbd)])
1331        M.title("pi0: %.4f"%pi0)
1332        M.show()
1333    args = Numeric.argsort(pValsMA)
1334    q = Numeric.ones(pValsMA.shape, Numeric.Float)
1335    q[args[m-1]] = pi0 * pValsMA[args[m-1]]
1336    for i in range(m-1, 0, -1):
1337        q[args[i-1]] = min(pi0*m*pValsMA[args[i-1]]/i, q[args[i]])
1338    MA.put(qVals, putInd, q)
1339    return qVals
1340
1341
1342
1343if __name__=="__main__":
1344
1345    #====================== MULTIVARIATE LINEAR REGRESSION ==========================
1346
1347    # MLR: compare with orange.LinRegLearner
1348##    import Dicty.DData, Meda.Preproc, orange
1349##    DD = Dicty.DData.DData_Nancy()
1350##    p = DD.getMean2d("pkaC")
1351##    po = Meda.Preproc.ma2orng(p, Meda.Preproc.getTcDomain(13))
1352##    po.domain.classVar = po.domain.attributes[-1]
1353##    lr = orange.LinRegLearner(po)
1354##    pn = Numeric.array(p)
1355##    mlr = MultLinReg(pn[:,0:12], pn[:,12])
1356
1357    # mlr: simple test
1358##    xy1 = Numeric.array([[1,1.1],[0,-0.01],[-1,-1.1],[2,2.3],[-2,-1.5]], Numeric.Float)
1359##    ml1 = MultLinReg(xy1[:,:-1],xy1[:,-1])
1360##    et1 = Meda.Preproc.ma2orng(xy1, Meda.Preproc.getTcDomain(2,False,[]))
1361##    et1.domain.classVar = et1.domain.attributes[-1]
1362##    lr1 = orange.LinRegLearner(et1)
1363
1364    # mlr: test with random numbers
1365##    import numpy.oldnumeric.random_array as RandomArray
1366##    xy2 = RandomArray.random((5,4))
1367##    ml2 = MultLinReg(xy2[:,:-1],xy2[:,-1],1)
1368##    et2 = Meda.Preproc.ma2orng(xy2, Meda.Preproc.getTcDomain(4,False,[]))
1369##    et2.domain.classVar = et2.domain.attributes[-1]
1370##    d3 = orange.Domain(et2.domain.attributes[:-1], et2.domain.attributes[-1])
1371##    et3 = orange.ExampleTable(d3, et2)
1372##    lr2 = orange.LinRegLearner(et3)
1373
1374
1375    #====================== 2 WAY ANOVA ==========================
1376
1377    def print3d(arr3d):
1378        for k in range(arr3d.shape[1]):
1379            print "----",k,"----"
1380            for j in range(arr3d.shape[2]):
1381                for i in range(arr3d.shape[0]):
1382                    print arr3d[i,k,j]
1383
1384##    import Dicty.DData
1385##    DN = Dicty.DData.DData_Nancy()
1386##    rawN = DN.getRaw3dAll()
1387##    # subset of data
1388##    rawSub = rawN[0,0:3,0:6]
1389##    rgiSub = [[0,1,2],[3,4,5]]
1390
1391##    # compare traditional ANOVA and ANOVA using linear regression on a balanced design
1392##    as = Anova2way(rawSub, 3)
1393##    print "\ntrad.anova, interact", as.FAprob, as.FBprob, as.FABprob
1394##    ar0 = Anova2wayLR(rawSub, rgiSub, 0)
1395##    print "regr.anova, no inter", ar0.FAprob, ar0.FBprob
1396##    ar1 = Anova2wayLR(rawSub, rgiSub, 1)
1397##    print "regr.anova, interact", ar1.FAprob, ar1.FBprob, ar1.FABprob
1398##
1399##    # missing data, no empty cells
1400##    dSubM1 = MA.array(rawSub)
1402##    arM1_0 = Anova2wayLR(dSubM1, rgiSub, 0)
1403##    print "\nregr.anova, no inter", arM1_0.FAprob, arM1_0.FBprob
1404##    arM1_1 = Anova2wayLR(dSubM1, rgiSub, 1)
1405##    print "regr.anova, interact", arM1_1.FAprob, arM1_1.FBprob, arM1_1.FABprob
1406##
1407##    # missing data and empty cell
1408##    dSubM3 = MA.array(rawSub)
1410##    arM3_0 = Anova2wayLR(dSubM3, rgiSub, 0)
1411##    print "\nregr.anova, no inter", arM3_0.FAprob, arM3_0.FBprob
1412##    arM3_1 = Anova2wayLR(dSubM3, rgiSub, 1)
1413##    print "regr.anova, interact", arM3_1.FAprob, arM3_1.FBprob, arM3_1.FABprob
1414
1415    # test LR ANOVA for selection of genes on a whole dataset
1416    import time
1417
1418    def anova_2fact_testSpeed(rawAll, rgi):
1419        """factors: time, strain
1420        """
[1073]1421        as1 = Numeric.zeros((rawAll.shape[0],2), Numeric.Float)
[188]1422        print "2 fact start", time.ctime()
1423        for i in range(rawAll.shape[0]):
1424            a0 = Anova2wayLR(rawAll[i], rgi, 0)
[1073]1425            as1[i] = [a0.FAprob, a0.FBprob]
[188]1426        print "2 fact end  ", time.ctime()
[1073]1427        return as1
[188]1428
1429    def anova_fulfact_testSpeed(rawAll, rgi):
1430        """factors: time, strain & their interaction
1431        """
1432        try:
[1073]1433            as1 = Numeric.zeros((rawAll.shape[0],3), Numeric.Float)
[188]1434            print "2 fact start", time.ctime()
1435            for i in range(rawAll.shape[0]):
1436                a0 = Anova2wayLR(rawAll[i], rgi, 1)
[1073]1437                as1[i] = [a0.FAprob, a0.FBprob, a0.FABprob]
[188]1438            print "2 fact end  ", time.ctime()
1439        except:
1440            print "gene", i, time.ctime()
1441            raise
[1073]1442        return as1
[188]1443
1444##    asff = anova_fulfact_testSpeed(rawN, DN.replicaGroupInd)  # takes appx. 3 hours!
1445##    as2f = anova_2fact_testSpeed(rawN, DN.replicaGroupInd)      # takes appx 2 minutes
1446
1447    #====================== 3 WAY ANOVA ==========================
1448
1449##    import Dicty.DData
1450##    DN = Dicty.DData.DData_Nancy()
1451##    rawN = DN.getRaw3dAll()
1452
1453    # small test: Anova3way
1454##    rawSub = rawN[0:3, 0:3, 0:3]
1455##    a3Sub = Anova3way(rawSub, True)
1456##    print "FprobGene, FprobTime, FprobRep", a3Sub.FAprob, a3Sub.FBprob, a3Sub.FCprob
1457
1458    # are replicas equal? if FprobRep == 0: different, else equal
1459##    DN = Dicty.DData.DData_Nancy()
1460##    DDPJ = Dicty.DData.DData_PJ1()
1461##    DDMEDSTD = Dicty.DData.DData_MedStd()
1463##        print
1464##        for st in DD.strains:
1466##            print "FpG %.10f FpT %.10f FpGxT %.10f FpRep(F) %.20f(%3.5f)\t%12s (%i rep)" % (a3.FAprob, a3.FBprob, a3.FABprob, a3.FCprob, a3.FC, st, len(DD.replicaGroupInd[DD.getIdxStrain(st)]))
1467##
1468##    print "------------------ START ---------------"
1469##    anova_testRepEq(DN, True)
1470##    anova_testRepEq(DDPJ, True)
1471##    anova_testRepEq(DDMEDSTD, True)
1472##    print "------------------ END ---------------"
1473
1474    #====================== 1 WAY ANOVA LR ==========================
1475
1476##    a1 = Anova1wayLR(Numeric.ravel(rawSub), [range(0,10), range(10,15), range(15,27)])
1477##    subT = MA.transpose(rawN[0,:,8:11])
1478
1479    #====================== 1 WAY REPEATED MEASURES ANOVA LR ==========================
1480
1481##    a1rm = AnovaRM11LR(subT)
1482##    a1rmf = AnovaRM11LR(subT.filled(-1))
1483
1484    #====================== 2 WAY REPEATED MEASURES (on factor A) ANOVA LR ==========================
1485
1486##    ab = AnovaLRBase()
1487##    print
1488##    print Numeric.concatenate((a2rm1.dummyA,a2rm1.dummyB,a2rm1.dummySubjInB),1)
1489##    print
1490##
1491##    import Dicty.DData
1492##    DN = Dicty.DData.DData_Nancy()
1493##    rawN = DN.getRaw3dAll()
1494##    rawSub = rawN[0, 0:5, 0:9]
1495##    gi = [[0,1,2],[3,4,5],[6,7,8]]
1496
1497##    a2rm1 = AnovaRM12LR(rawSub,gi,0)
1498##    print "---no missing---"
1499##    print a2rm1.FAprob, a2rm1.FBprob#, a2rm1.FABprob
1500##
1501##    rawMiss = MA.array(rawSub)
1503##    a2m = AnovaRM12LR(rawMiss,gi,1)
1504##    print "---with missing---"
1505##    print a2m.FAprob, a2m.FBprob, a2m.FABprob
1506
1507
1508    #====================== q-values ==========================
1509
1510    import matplotlib.pylab as M
1511
1512    for st, epist in epist_MMAG_nonRM.items():
1513        for i in range(3):
1514            print st, i
1515            ps1 = epist.posthoc_pIntByStrains[:,i].compressed()
1516            qs1 = qVals(ps, "loess", True)
1517            ps2 = epist.posthoc_pIntByStrains[:,i].compressed()
1518            qs2 = qVals(ps2, "loess", True)
1519            M.title("%s, %i" %(st,i))
1520            M.subplot(1,2,1)
1521            M.scatter(ps1, qs1, 0.001)
1522            M.subplot(1,2,2)
1523            M.scatter(ps2, qs2, 0.001)
1524            M.show()
1525
1526     # increasing pi0lmbd function
1527##    qqc = qVals(ppc)
1528
1529     # negative pi0
1530##    pp = epist_DS_nonRM["pkaRregA"].posthoc_pIntByStrains[:,0].compressed()
1531##    qq = qVals(pp)
1532##    M.scatter(pp,qq)
1533##    M.show()
1534
Note: See TracBrowser for help on using the repository browser.