# source:orange-bioinformatics/obiExpression.py@1385:b4403b620014

Revision 1385:b4403b620014, 32.0 KB checked in by ales_erjavec <ales.erjavec@…>, 3 years ago (diff)

Changed the way target labels are selected by the user in Vulcano Plot.

Line
1import stats, orange, numpy, statc
2
def mean(l):
    """Return the arithmetic mean of the values in `l` as a float."""
    return sum(l) / float(len(l))
5
class MA_pearsonCorrelation:
    """
    Calling an object of this class computes Pearson correlation of all
    attributes against class.
    """
    def __call__(self, i, data):
        # Project the data onto attribute i plus the class, so the numeric
        # conversion only has to handle two columns.
        narrow_domain = orange.Domain([data.domain.attributes[i]], data.domain.classVar)
        narrow_data = orange.ExampleTable(narrow_domain, data)
        a, c = narrow_data.toNumpy("A/C")
        return numpy.corrcoef(c, a[:, 0])[0, 1]
16
class MA_signalToNoise:
    """
    Returns signal to noise measurement: difference of means of two classes
    divided by the sum of standard deviations for both classes.

    Usage similar to MeasureAttribute*.

    Standard deviation used for now returns minimally 0.2*|mi|, where mi=0
    is adjusted to mi=1 (as in the gsea implementation).

    Can work only on data with two classes. If there are multiple classes,
    then relevant class values can be specified on object initialization.
    By default the relevant classes are the first and second class value
    from the domain.
    """

    def __init__(self, a=None, b=None):
        """
        a and b are the chosen class values.
        """
        self.a = a
        self.b = b

    def __call__(self, i, data):
        """Return the signal-to-noise ratio of attribute `i` in `data`."""
        cv = data.domain.classVar

        # Fall back to the first two class values, but do NOT cache the
        # defaults on self: the original assigned them back to self.a/self.b,
        # which silently reused the first data set's class values on later
        # calls with different data.
        a = cv.values[0] if self.a is None else self.a
        b = cv.values[1] if self.b is None else self.b

        def stdevm(l):
            # standard deviation, but at least 0.2*|mean|, with mean 0
            # adjusted to 1 (as in the gsea implementation)
            m = statc.mean(l)
            return max(statc.std(l), 0.2 * abs(1.0 if m == 0 else m))

        def values_for_class(value):
            # non-missing values of attribute i for examples of the given class
            return [ex[i].value for ex in data
                    if ex[-1].value == value and not ex[i].isSpecial()]

        exa = values_for_class(a)
        exb = values_for_class(b)

        try:
            return (statc.mean(exa) - statc.mean(exb)) / (stdevm(exa) + stdevm(exb))
        except:
            # empty class or zero denominator: return a "middle" value
            # TODO rather throw exception?
            return 0
72
class MA_t_test(object):
    """Two-sample Student t-test scorer for attribute `i` against the class.

    Returns the t statistic, or the p-value when constructed with prob=True.
    Class values `a` and `b` default to the first two values of the domain's
    class variable.
    """
    def __init__(self, a=None, b=None, prob=False):
        self.a = a
        self.b = b
        self.prob = prob

    def __call__(self, i, data):
        cv = data.domain.classVar

        # for faster computation: project onto the single attribute to avoid
        # dragging many attributes along
        dom2 = orange.Domain([data.domain[i]], data.domain.classVar)
        data = orange.ExampleTable(dom2, data)
        i = 0

        # Default to the first two class values without caching them on self
        # (the original mutated self.a/self.b, leaking the first data set's
        # class values into subsequent calls).
        a = cv.values[0] if self.a is None else self.a
        b = cv.values[1] if self.b is None else self.b

        def values_for_class(value):
            # non-missing attribute values for examples of the given class
            return [ex[i].value for ex in data if ex[cv] == value and not ex[i].isSpecial()]

        exa = values_for_class(a)
        exb = values_for_class(b)

        try:
            t, prob = stats.lttest_ind(exa, exb)
            return prob if self.prob else t
        except:
            # degenerate groups: neutral score (p=1 / t=0)
            return 1.0 if self.prob else 0.0
101
class MA_fold_change(object):
    """Fold-change scorer: ratio of the two class means of attribute `i`.

    Class values `a` and `b` default to the first two values of the domain's
    class variable.
    """
    def __init__(self, a=None, b=None):
        self.a = a
        self.b = b

    def __call__(self, i, data):
        cv = data.domain.classVar

        # for faster computation: project onto the single attribute to avoid
        # dragging many attributes along
        dom2 = orange.Domain([data.domain[i]], data.domain.classVar)
        data = orange.ExampleTable(dom2, data)
        i = 0

        # Default to the first two class values without caching them on self
        # (the original mutated self.a/self.b, leaking the first data set's
        # class values into subsequent calls).
        a = cv.values[0] if self.a is None else self.a
        b = cv.values[1] if self.b is None else self.b

        def values_for_class(value):
            # non-missing attribute values for examples of the given class
            return [ex[i].value for ex in data if ex[cv] == value and not ex[i].isSpecial()]

        exa = values_for_class(a)
        exb = values_for_class(b)

        try:
            return mean(exa) / mean(exb)
        except:
            # empty group or zero denominator: neutral fold change
            return 1
128
class MA_anova(object):
    """One-way ANOVA scorer for attribute `i` across all class values.

    Returns the F statistic, or the p-value when constructed with prob=True.
    """
    def __init__(self, prob=False):
        self.prob = prob

    def __call__(self, i, data):
        cv = data.domain.classVar

        # project onto the single attribute for speed (avoids dragging all
        # other attributes along in the conversion)
        narrow = orange.ExampleTable(orange.Domain([data.domain[i]], cv), data)

        def column_for(value):
            # non-missing attribute values for examples of the given class
            return [ex[0].value for ex in narrow if ex[cv] == value and not ex[0].isSpecial()]

        groups = [column_for(value) for value in cv.values]

        try:
            f, prob = stats.lF_oneway(*tuple(groups))
            return prob if self.prob else f
        except:
            # degenerate groups: neutral score (p=1 / F=0)
            return 1.0 if self.prob else 0.0
151
152import numpy as np
153import numpy.ma as ma
154
class ExpressionSignificance_Test(object):
    # Base class for the ExpressionSignificance_* scorers below.  Holds the
    # data as a masked array plus the per-row class assignments; subclasses
    # implement __call__(target) and return (key, score) pairs.
    def __new__(cls, data, useAttributeLabels, **kwargs):
        self = object.__new__(cls)
        if kwargs:
            # shortcut: passing extra keyword args (e.g. target=...) both
            # constructs the instance and immediately returns the scores
            self.__init__(data, useAttributeLabels)
            return self.__call__(**kwargs)
        else:
            return self

    def __init__(self, data, useAttributeLabels=False):
        self.data = data
        # when useAttributeLabels is set, tests run over examples grouped by
        # attribute labels instead of attributes grouped by class
        self.useAttributeLabels = useAttributeLabels
        self.attr_labels, self.data_classes = self._data_info(data)
        self.attributes = [attr for attr in self.data.domain.attributes if attr.varType in [orange.VarTypes.Continuous, orange.VarTypes.Discrete]]
        self.classes = np.array(self.attr_labels if useAttributeLabels else self.data_classes)
        self.keys = range(len(data)) if useAttributeLabels else self.attributes
        self.array, _, _ = data.toNumpyMA()
        if self.useAttributeLabels:
            # transpose so rows correspond to the tested entities either way
            self.array = ma.transpose(self.array)
#        self.dim = 1 if useAttributeLabels else 0
        self.dim = 0

    def _data_info(self, data):
        # Return (per-attribute label sets, per-example class values);
        # classes are None placeholders when the data has no class variable.
        return [set(attr.attributes.items()) for attr in data.domain.attributes], [ex.getclass() for ex in data] if data.domain.classVar else [None]*len(data)

    def test_indices(self, target, classes=None):
        # Map `target` -- a (key, value) label pair, a class value, a set of
        # them, or a list of such -- to lists of row indices into self.array.
        # A non-list target yields [matching, non-matching].
        classes = self.classes if classes is None else classes

        def target_set(target):
            if isinstance(target, tuple):
                return set([target])
            else:
                assert(isinstance(target, set))
                return target

        if self.useAttributeLabels:
            if isinstance(target, list):
                ind = [[i for i, cl in enumerate(self.classes) if target_set(t).intersection(cl)] for t in target]
            else:
                target = target_set(target)

                ind1 = [i for i, cl in enumerate(self.classes) if target.intersection(cl)]
                ind2 = [i for i, cl in enumerate(self.classes) if not target.intersection(cl)]
                ind = [ind1, ind2]
        else:
            if isinstance(target, list):
                ind = [ma.nonzero(self.classes == t)[0] for t in target]
            else:
                if isinstance(target, (basestring, orange.Variable)):
                    target = set([target])
                else:
                    assert(isinstance(target, set))
                target = list(target)
                ind1 = [i for i, cl in enumerate(self.classes) if cl in target]
                ind2 = [i for i, cl in enumerate(self.classes) if cl not in target]
                ind = [ind1, ind2]

        return ind

    def __call__(self, target):
        # Subclasses must return (key, score) pairs for `target`.
        raise NotImplementedError()

    def null_distribution(self, num, *args, **kwargs):
        # Estimate a null distribution of scores by shuffling the class
        # assignments `num` times and re-running the test; the original
        # classes are restored afterwards.
        # NOTE(review): this changeset view drops a few original lines here
        # (219-222 and 228) -- possibly progress-callback handling; verify
        # against the complete file before relying on this method.
        kwargs = dict(kwargs)
        results = []
        originalClasses = self.classes.copy()
        for i in range(num):
            np.random.shuffle(self.classes)
            results.append(self.__call__(*args, **kwargs))
        self.classes = originalClasses
        return results
231
class ExpressionSignificance_TTest(ExpressionSignificance_Test):
    def __call__(self, target):
        """Return (key, (t, p-value)) pairs for the two groups selected by `target`."""
        group1, group2 = self.test_indices(target)
        t, pval = attest_ind(self.array[group1, :], self.array[group2, :], dim=self.dim)
        return zip(self.keys, zip(t, pval))
237
class ExpressionSignificance_FoldChange(ExpressionSignificance_Test):
    def __call__(self, target):
        """Return (key, fold-change) pairs: ratio of group means per column."""
        group1, group2 = self.test_indices(target)
        ratios = ma.mean(self.array[group1, :], self.dim) / ma.mean(self.array[group2, :], self.dim)
        return zip(self.keys, ratios)
244
class ExpressionSignificance_SignalToNoise(ExpressionSignificance_Test):
    def __call__(self, target):
        """Return (key, signal-to-noise) pairs: mean difference over summed std devs."""
        group1, group2 = self.test_indices(target)
        sub1, sub2 = self.array[group1, :], self.array[group2, :]
        mean_diff = ma.mean(sub1, self.dim) - ma.mean(sub2, self.dim)
        std_sum = ma.sqrt(ma.var(sub1, self.dim)) + ma.sqrt(ma.var(sub2, self.dim))
        return zip(self.keys, mean_diff / std_sum)
251
class ExpressionSignificance_ANOVA(ExpressionSignificance_Test):
    def __call__(self, target=None):
        """Return (key, (F, p-value)) pairs from a one-way ANOVA over `target` groups."""
        indices = self.test_indices(target) if target is not None else []
        groups = [self.array[ind, :] for ind in indices]
        f, prob = aF_oneway(*groups, **dict(dim=0))
        return zip(self.keys, zip(f, prob))
260
class ExpressionSignificance_ChiSquare(ExpressionSignificance_Test):
    def __call__(self, target):
        """Return (key, chi-square) pairs from per-column 2x5 contingency tables
        built over 5 equal-frequency discretization bins."""
        binned = equi_n_discretization(self.array.copy(), intervals=5, dim=0)
        group1, group2 = self.test_indices(target)
        sub1, sub2 = binned[group1, :], binned[group2, :]
        # counts[col, group, bin] = how many values of `col` in `group` fall in `bin`
        counts = ma.zeros((binned.shape[1], 2, 5))
        for bin_idx in range(5):
            counts[:, 0, bin_idx] = ma.sum(ma.ones(sub1.shape) * (sub1 == bin_idx), 0)
            counts[:, 1, bin_idx] = ma.sum(ma.ones(sub2.shape) * (sub2 == bin_idx), 0)
        return zip(self.keys, achisquare_indtest(np.array(counts), dim=1))
274
class ExpressionSignificance_Info(ExpressionSignificance_Test):
    def __call__(self, target):
        """Return (key, information-gain) pairs: class entropy minus the expected
        entropy over 5 equal-frequency discretization bins."""
        binned = equi_n_discretization(self.array.copy(), intervals=5, dim=1)

        ind1, ind2 = self.test_indices(target)
        sub1, sub2 = binned[ind1, :], binned[ind2, :]
        # counts[col, group, bin] = how many values of `col` in `group` fall in `bin`
        counts = ma.zeros((binned.shape[1], 2, 5))
        for bin_idx in range(5):
            counts[:, 0, bin_idx] = ma.sum(ma.ones(sub1.shape) * (sub1 == bin_idx), 0)
            counts[:, 1, bin_idx] = ma.sum(ma.ones(sub2.shape) * (sub2 == bin_idx), 0)
        classinfo = entropy(np.array([len(ind1), len(ind2)]))
        E = ma.sum(entropy(counts, dim=1) * ma.sum(counts, 1), 1) / ma.sum(ma.sum(counts, 1), 1)
        return zip(self.keys, classinfo - E)
291
class ExpressionSignificance_MannWhitneyu(ExpressionSignificance_Test):
    def __call__(self, target):
        """Return (key, (u, p-value)) pairs from a Mann-Whitney U test per column."""
        group1, group2 = self.test_indices(target)
        a, b = self.array[group1, :], self.array[group2, :]
        # statc.mannwhitneyu expects plain lists, so feed it one column at a time
        results = [statc.mannwhitneyu(list(a[:, col]), list(b[:, col]))
                   for col in range(a.shape[1])]
        return zip(self.keys, results)
300
def attest_ind(a, b, dim=None):
    """ Return the t-test statistics on arrays a and b over the dim axis.
    Returns both the t statistic as well as the p-value
    """
    mean1, mean2 = ma.mean(a, dim), ma.mean(b, dim)
    var1, var2 = ma.var(a, dim), ma.var(b, dim)
    if dim is not None:
        n1, n2 = a.shape[dim], b.shape[dim]
    else:
        n1, n2 = a.size, b.size
    df = float(n1 + n2 - 2)
    pooled_var = ((n1 - 1) * var1 + (n2 - 1) * var2) / df
    t = (mean1 - mean2) / ma.sqrt(pooled_var * (1.0 / n1 + 1.0 / n2))

    def p_value(tsq):
        # two-tailed p via the incomplete beta function; masked or
        # degenerate entries stay masked
        return (statc.betai(0.5 * df, 0.5, df / (df + tsq))
                if tsq is not ma.masked and df / (df + tsq) <= 1.0 else ma.masked)

    if t.ndim == 0:
        return (t, p_value(t ** 2))
    else:
        return t, [p_value(tsq) for tsq in t * t]
317
def aF_oneway(*args, **kwargs):
    """One-way ANOVA on masked arrays; returns (F, p-value) computed along
    the `dim` keyword axis (element counts honour the masks).
    """
    dim = kwargs.get("dim", None)
    arrays = args
    # Per-group and overall counts of non-masked elements.
    # (restored: this changeset view dropped the lines defining lens/bign,
    # which the sums-of-squares below reference)
    lens = [ma.sum(ma.ones(a.shape), dim) for a in arrays]
    alldata = ma.concatenate(arrays, dim if dim is not None else 0)
    bign = ma.sum(ma.ones(alldata.shape), dim)
    # total, between-group and within-group sums of squares
    sstot = ma.sum(alldata ** 2, dim) - (ma.sum(alldata, dim) ** 2) / bign
    ssbn = ma.sum([(ma.sum(a, dim) ** 2) / L for a, L in zip(arrays, lens)], dim)
    ssbn -= ma.sum(alldata, dim) ** 2 / bign
    sswn = sstot - ssbn
    dfbn = dfnum = float(len(args) - 1.0)
    dfwn = bign - len(args) # + 1.0
    F = (ssbn / dfbn) / (sswn / dfwn)
    if F.ndim == 0 and dfwn.ndim == 0:
        # scalar case: p via the incomplete beta function, masked when degenerate
        return (F,statc.betai(0.5 * dfwn, 0.5 * dfnum, dfwn/float(dfwn+dfnum*F)) if F is not ma.masked and dfwn/float(dfwn+dfnum*F) <= 1.0 \
                and dfwn/float(dfwn+dfnum*F) >= 0.0 else ma.masked)
    else:
        prob = [statc.betai(0.5 * dfden, 0.5 * dfnum, dfden/float(dfden+dfnum*f)) if f is not ma.masked and dfden/float(dfden+dfnum*f) <= 1.0 \
            and dfden/float(dfden+dfnum*f) >= 0.0 else ma.masked for dfden, f in zip (dfwn, F)]
        return F, prob
341
def achisquare_indtest(observed, dim=None):
    """Chi-square test-of-independence statistic for a stack of contingency
    tables held in the masked array `observed`; a single 2-d table is
    promoted to a stack of one.
    """
    if observed.ndim == 2:
        observed = ma.array([observed])
        if dim is not None:
            dim += 1
    if dim is None:
        dim = observed.ndim - 2
    row_totals = ma.sum(observed, dim + 1)
    col_totals = ma.sum(observed, dim)
    grand_total = ma.sum(row_totals, dim)
    ones = ma.array(ma.ones(observed.shape))
    # expected[i][r][c] = row_total[r] * col_total[c] / grand_total
    expected = ones * row_totals.reshape(row_totals.shape[:dim] + (-1, 1))
    col_broadcast = ones * col_totals[..., np.zeros(observed.shape[dim], dtype=int), :]
    expected = expected * col_broadcast / grand_total.reshape((-1, 1, 1))
    return ma.sum(ma.sum((observed - expected) ** 2 / expected, dim + 1), dim)
358
def equi_n_discretization(array, intervals=5, dim=1):
    """Discretize the masked `array` into `intervals` equal-frequency bins
    along axis `dim`, returning an integer-labelled array (labels 0..intervals-1).
    """
    # number of non-masked values per slice along `dim`
    # (restored: this changeset view dropped the lines defining count/darray,
    # which the code below references)
    count = ma.sum(ma.ones(array.shape, dtype=int), dim)
    cut = ma.zeros(len(count), dtype=int)
    sarray = ma.sort(array, dim)
    r = count % intervals
    pointsshape = list(array.shape)
    pointsshape[dim] = 1
    points = []
    for i in range(intervals):
        # the first `r` bins get one extra element; `//` keeps the original
        # Python 2 floor-division semantics for the integer counts
        cutend = cut + count // intervals + numpy.ones(len(r)) * (r > i)
        if dim == 1:
            p = sarray[range(len(cutend)), numpy.array(cutend, dtype=int) - 1]
        else:
            p = sarray[numpy.array(cutend, dtype=int) - 1, range(len(cutend))]
        points.append(p.reshape(pointsshape))
        cut = cutend
    darray = ma.zeros(array.shape, dtype=int)
    darray[ma.nonzero(array <= points[0])] = 0
    for i in range(0, intervals):
        darray[ma.nonzero((array > points[i]))] = i + 1
    return darray
380
def entropy(array, dim=None):
    """Shannon entropy (in bits) of the counts in `array` along axis `dim`;
    the whole (flattened) array is used when `dim` is None."""
    if dim is None:
        array = array.ravel()
        dim = 0
    total = ma.sum(array, dim)
    weighted_logs = ma.log(array) * array
    # H = log(n) - sum(x*log(x))/n, converted from nats to bits
    return (ma.log(total) - ma.sum(weighted_logs, dim) / total) / ma.log(2.0)
389
390"""\
391MA - Plot
392=========
393
Functions for normalization of expression arrays and plotting
395MA - Plots
396
397Example::
398    ## Load data from GEO
399    >>> data = orange.ExampleTable("GDS1210.tab")
400    ## Split data by columns into normal and cancer subsets
401    >>> cancer, normal = data_split(data, [("disease state", "cancer"), ("disease state", "normal")])
402    ## Convert to numpy MaskedArrays
403    >>> cancer, normal = cancer.toNumpyMA("A")[0], normal.toNumpyMA("A")[0]
404    ## Merge by averaging
405    >>> cancer = merge_replicates(cancer)
406    >>> normal = merge_replicates(normal)
407    ## Plot MA-plot
408    >>> MA_plot(cancer, normal)
409
410"""
411
412import orngMisc
413from numpy import median
def lowess(x, y, f=2./3., iter=3, progressCallback=None):
    """ Lowess taken from Bio.Statistics.lowess, modified to compute pairwise
    distances inplace.

    lowess(x, y, f=2./3., iter=3) -> yest

    Lowess smoother: Robust locally weighted regression.
    The lowess function fits a nonparametric regression curve to a scatterplot.
    The arrays x and y contain an equal number of elements; each pair
    (x[i], y[i]) defines a data point in the scatterplot. The function returns
    the estimated (smooth) values of y.

    The smoothing span is given by f. A larger value for f will result in a
    smoother curve. The number of robustifying iterations is given by iter. The
    function will run faster with a smaller number of iterations.

    x and y should be numpy float arrays of equal length.  The return value is
    also a numpy float array of that length.

    e.g.
    >>> import numpy
    >>> x = numpy.array([4,  4,  7,  7,  8,  9, 10, 10, 10, 11, 11, 12, 12, 12,
    ...                 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 16, 16,
    ...                 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 20, 20, 20, 20,
    ...                 20, 22, 23, 24, 24, 24, 24, 25], numpy.float)
    >>> y = numpy.array([2, 10,  4, 22, 16, 10, 18, 26, 34, 17, 28, 14, 20, 24,
    ...                 28, 26, 34, 34, 46, 26, 36, 60, 80, 20, 26, 54, 32, 40,
    ...                 28, 26, 34, 34, 46, 26, 36, 60, 80, 20, 26, 54, 32, 40,
    ...                 32, 40, 50, 42, 56, 76, 84, 36, 46, 68, 32, 48, 52, 56,
    ...                 64, 66, 54, 70, 92, 93, 120, 85], numpy.float)
    >>> result = lowess(x, y)
    >>> len(result)
    50
    >>> print "[%0.2f, ..., %0.2f]" % (result[0], result[-1])
    [4.85, ..., 84.98]
    """
    n = len(x)
    # r: number of neighbours taken into each local regression
    r = min(int(numpy.ceil(f*n)), n - 1)

#    h = [numpy.sort(numpy.abs(x-x[i]))[r] for i in range(n)]
#    h, xtmp = numpy.zeros_like(x), numpy.zeros_like(x)
#    for i in range(n):
#        xtmp = numpy.abs(x - x[i], xtmp)
#        h[i] = numpy.sort(xtmp)[r]
#    w = numpy.clip(numpy.abs(([x]-numpy.transpose([x]))/h),0.0,1.0)
    # h[i] = distance to the r-th nearest neighbour of x[i]; the pairwise
    # distance matrix is built and sorted in place to limit memory use
    dist = [x] - numpy.transpose([x])
    dist = numpy.abs(dist, dist)
    dist.sort(axis=1)
    h = dist[:, r]
    del dist

    # tricube weights w = (1 - |dx/h|^3)^3, assembled with in-place ops
    w = [x]-numpy.transpose([x])
    w /= h
    w = numpy.abs(w, w)
    w = numpy.clip(w, 0.0, 1.0, w)
#    w = 1-w*w*w
    w **= 3
    w *= -1
    w += 1
#    w = w*w*w
    w **= 3
    yest = numpy.zeros(n)
    delta = numpy.ones(n)
    milestones = orngMisc.progressBarMilestones(iter*n)
    for iteration in range(iter):
        for i in xrange(n):
            # weighted linear least-squares fit at x[i]: solve the 2x2
            # normal equations for intercept beta1 and slope beta2
            weights = delta * w[:,i]
            weights_mul_x = weights * x
            b1 = numpy.ma.dot(weights,y)
            b2 = numpy.ma.dot(weights_mul_x,y)
            A11 = sum(weights)
            A12 = sum(weights_mul_x)
            A21 = A12
            A22 = numpy.ma.dot(weights_mul_x,x)
            determinant = A11*A22 - A12*A21
            beta1 = (A22*b1-A12*b2) / determinant
            beta2 = (A11*b2-A21*b1) / determinant
            yest[i] = beta1 + beta2*x[i]
            if progressCallback and (iteration*n + i) in milestones:
                # NOTE(review): operator precedence evaluates this as
                # (100*iteration*n + i)/(iter*n), not 100*(iteration*n+i)/(iter*n),
                # so reported progress is off -- confirm intent before changing.
                progressCallback((100. * iteration*n + i) /  (iter * n))
        # robustifying step: down-weight points with large residuals via a
        # bisquare function of the scaled residuals
        residuals = y-yest
        s = median(abs(residuals))
        delta[:] = numpy.clip(residuals/(6*s),-1,1)
        delta[:] = 1-delta*delta
        delta[:] = delta*delta
    return yest
500
501
502
def lowess2(x, y, xest, f=2./3., iter=3, progressCallback=None):
    """Returns estimated values of y in data points xest (or None if estimation fails).
    Lowess smoother: Robust locally weighted regression.
    The lowess function fits a nonparametric regression curve to a scatterplot.
    The arrays x and y contain an equal number of elements; each pair
    (x[i], y[i]) defines a data point in the scatterplot. The function returns
    the estimated (smooth) values of y.

    The smoothing span is given by f. A larger value for f will result in a
    smoother curve. The number of robustifying iterations is given by iter. The
    function will run faster with a smaller number of iterations.

    Taken from Peter Juvan's numpyExtn.py, modified for numpy, computes pairwise
    distances inplace
    """
    x = numpy.asarray(x, 'f')
    y = numpy.asarray(y, 'f')
    xest = numpy.asarray(xest, 'f')
    n = len(x)
    nest = len(xest)
    r = min(int(numpy.ceil(f*n)),n-1) # radius: num. of points to take into LR
#    h = [numpy.sort(numpy.abs(x-x[i]))[r] for i in range(n)]    # distance of the r-th point from x[i]
    # pairwise distances among x, sorted in place to find each point's
    # r-th-nearest-neighbour distance h[i]
    dist = [x] - numpy.transpose([x])
    dist = numpy.abs(dist, dist)
    dist.sort(axis=1)
    h = dist[:, r]
    del dist # to free memory
    # tricube weights for fitting at the original x positions
    w = [x] - numpy.transpose([x])
    w /= h
    w = numpy.abs(w, w)
    w = numpy.clip(w, 0.0, 1.0, w)
#    w = numpy.clip(numpy.abs(([x]-numpy.transpose([x]))/h),0.0,1.0)
    w **= 3
    w *= -1
    w += 1
#    w = 1 - w**3 #1-w*w*w
    w **= 3
#    w = w**3 #w*w*w
#    hest = [numpy.sort(numpy.abs(x-xest[i]))[r] for i in range(nest)]    # r-th min. distance from xest[i] to x
    # same neighbourhood radius, but measured from the estimation points xest
    dist = [x] - numpy.transpose([xest])
    dist = numpy.abs(dist, dist)
    dist.sort(axis=1)
    hest = dist[:, r]
    del dist # to free memory
#    west = numpy.clip(numpy.abs(([xest]-numpy.transpose([x]))/hest),0.0,1.0)  # shape: (len(x), len(xest)
    # tricube weights for fitting at the xest positions
    west = [xest]-numpy.transpose([x])
    west /= hest
    west = numpy.abs(west, west)
    west = numpy.clip(west, 0.0, 1.0, west)
#    west = 1 - west**3 #1-west*west*west
    west **= 3
    west *= -1
    west += 1
#    west = west**3 #west*west*west
    west **= 3
    yest = numpy.zeros(n,'f')
    yest2 = numpy.zeros(nest,'f')
    delta = numpy.ones(n,'f')
    # the robustness weights (delta) only change when iter > 1, so the fit
    # at the original x points is skipped otherwise
    iter_count = iter*(nest + n) if iter > 1 else nest
    milestones = orngMisc.progressBarMilestones(iter_count)
    curr_iter = 0
    for iteration in range(iter):
        # fit xest
        for i in range(nest):
            # weighted linear least squares at xest[i] via 2x2 normal equations
            weights = delta * west[:,i]
            b = numpy.array([numpy.sum(weights*y), numpy.sum(weights*y*x)])
            A = numpy.array([[numpy.sum(weights), numpy.sum(weights*x)], [numpy.sum(weights*x), numpy.sum(weights*x*x)]])
            beta = numpy.linalg.solve(A, b)
            yest2[i] = beta[0] + beta[1]*xest[i]
            if progressCallback and curr_iter in milestones:
                progressCallback(100. * curr_iter / iter_count)
            curr_iter += 1

        # fit x (to calculate residuals and delta)
        if iter > 1:
            for i in range(n):
                weights = delta * w[:,i]
                b = numpy.array([numpy.sum(weights*y), numpy.sum(weights*y*x)])
                A = numpy.array([[numpy.sum(weights), numpy.sum(weights*x)], [numpy.sum(weights*x), numpy.sum(weights*x*x)]])
                beta = numpy.linalg.solve(A,b)
                yest[i] = beta[0] + beta[1]*x[i]
                if progressCallback and curr_iter in milestones:
                    progressCallback(100. * curr_iter / iter_count)
                curr_iter += 1
            # robustifying step: bisquare down-weighting of large residuals
            residuals = y-yest
            s = numpy.median(numpy.abs(residuals))
            delta = numpy.clip(residuals/(6*s), -1, 1)
            delta = 1-delta*delta
            delta = delta*delta
    return yest2
593
594
def attr_group_indices(data, label_groups):
    """ Return a two or more lists of indices into `data.domain` based on `label_groups`

    Example::
        cancer_indices, no_cancer_indices = attr_group_indices(data, [("disease state", "cancer"), ("disease state", "normal")])
    """
    return [[i for i, attr in enumerate(data.domain.attributes)
             if attr.attributes.get(key, None) == val]
            for key, val in label_groups]
606
607
def example_group_indices(data, attr, values):
    """ Return lists of indices into `data` for each `values` item that matches
    the example value at `attr` attribute

    Example::
        cls_ind1, cls_ind2 = example_group_indices(data, data.domain.classVar, ["Class 1", "Class 2"])
    """
    ret = [[] for _ in values]
    # map the string form of each wanted value to its output slot
    slot_of = {}
    for slot, value in enumerate(values):
        slot_of[str(value)] = slot
    for row, ex in enumerate(data):
        slot = slot_of.get(str(ex[attr]), None)
        if slot is not None:
            ret[slot].append(row)
    return ret
622
623
def data_group_split(data, label_groups):
    """ Split an `data` example table into two or more based on
    contents of iterable `label_groups` containing (key, value)
    pairs matching the labels of data attributes.

    Example::
        cancer, no_cancer = data_group_split(data, [("disease state", "cancer"), ("disease state", "normal")])
    """
    ret = []
    group_indices = attr_group_indices(data, label_groups)
    for indices in group_indices:
        # build a new domain containing only the matching attributes
        attrs = [data.domain[i] for i in indices]
        domain = orange.Domain(attrs, data.domain.classVar)
        # NOTE(review): this changeset view drops original line 637 here --
        # possibly meta-attribute handling (cf. select_data); verify against
        # the complete file.
        ret.append(orange.ExampleTable(domain, data))
    return ret
640
641
def select_indices(data, key, value, axis=1):
    """ Return indices into `data` (ExampleTable) along specified `axis`
    where:
        - if axis == 0 match data[i][key] == value
        - if axis == 1 match data.domain[i].attributes[key] == value

    Example::
        cancer_ind = select_indices(data, key="disease state", value="cancer"), axis=1)
        normal_ind = select_indices(data, key="disease state", value=["normal"], axis=1) # value can be a list to specify more then one value

    """
    values = value if isinstance(value, list) else [value]
    if axis == 0:
        groups = example_group_indices(data, key, values)
    else:
        groups = attr_group_indices(data, [(key, val) for val in values])

    # set().union(*groups) replaces reduce(set.union, groups, set()): the
    # result is identical, but it does not rely on the `reduce` builtin
    # (moved to functools in Python 3)
    return sorted(set().union(*groups))
660
661
def select_data(data, key, value, axis=1):
    """ Return `data` (ExampleTable) subset along specified `axis`
    where:
        - if axis == 0 match data[i][key] == value
        - if axis == 1 match data.domain[i].attributes[key] == value
        .. note:: This preserves all meta attributes of the domain
    Example::
        cancer = select_data(data, "disease state", "cancer", axis=1)
        normal = select_data(data, "disease state", ["normal"], axis=1) # value can be a list to specify more then one value

    """
    indices = select_indices(data, key, value, axis)
    if axis == 0:
        # row selection: keep the original domain, subset the examples
        examples = [data[i] for i in indices]
        return orange.ExampleTable(data.domain, examples)
    else:
        # column selection: build a class-less domain of the matching attributes
        attrs = [data.domain[i] for i in indices]
        domain = orange.Domain(attrs, False)
        # NOTE(review): this changeset view drops original line 680 here --
        # given the docstring's meta-preservation claim it is likely
        # domain.addmetas(data.domain.getmetas()); verify against the full file.
        return orange.ExampleTable(domain, data)
682
683
def split_data(data, groups, axis=1):
    """ Split data (ExampleTable) along specified axis, where elements of
    `groups` match `key` and `value` arguments of the `select_data`
    function

    Example::
        cancer, normal = split_data(data, [("disease state", "cancer"), ("disease state", ["normal"])], axis=1)
    """
    return [select_data(data, key, value, axis) for key, value in groups]
696
697
def geometric_mean(array):
    """ Return a geometric mean computed on a 1d masked array
    (masked entries are filled with 1.0, i.e. they do not affect the product).
    """
    array = numpy.ma.asanyarray(array)
    # numpy.prod replaces reduce(lambda a, b: a*b, ..., 1.0): the product is
    # identical, computed in C, and does not rely on the `reduce` builtin
    # (moved to functools in Python 3)
    return numpy.power(numpy.prod(array.filled(1.)), 1. / len(array))
703
704
def harmonic_mean(array):
    """ Return the harmonic mean computed on a 1d masked array. """
    marray = numpy.ma.asanyarray(array)
    reciprocal_sum = numpy.ma.sum(1. / marray)
    return len(marray) / reciprocal_sum
710
711
def merge_replicates(replicates, axis=0, merge_function=numpy.ma.average):
    """Collapse `replicates` (numpy.array) by applying `merge_function` to
    each 1-d slice taken along `axis`.
    """
    merged = numpy.ma.apply_along_axis(merge_function, axis, replicates)
    return merged
716
717
def ratio_intensity(G, R):
    """ return the log2(R/G), log10(R*G) as a tuple
    """
    m = numpy.ma.log(R / G) / numpy.log(2)   # M: log-ratio
    a = numpy.ma.log10(R * G)                # A: log-intensity
    return m, a
724
725
def MA_center_average(G, R):
    """ return the G, R by centering the average log2 ratio
    """
    log2_ratio = numpy.ma.log(R / G) / numpy.log(2)
    shift = numpy.ma.average(log2_ratio)
    # scaling G by 2**shift moves the mean log2 ratio to zero
    return G * numpy.exp2(shift), R.copy()
732
733
def MA_center_lowess(G, R, f=2./3., iter=1, progressCallback=None):
    """ return the G, R by centering the average log2 ratio locally
    depending on the intensity using lowess (locally weighted linear regression)

    Only points where both the log-ratio and the log-intensity are valid
    (non-masked) are used for the fit and re-centered.
    """
#    from Bio.Statistics.lowess import lowess
    ratio, intensity = ratio_intensity(G, R)
    # restored: this changeset view dropped the line defining `valid`
    # (the boolean index of points where neither M nor A is masked)
    valid = ~(numpy.ma.getmaskarray(ratio) | numpy.ma.getmaskarray(intensity))
    center_est = lowess(intensity[valid], ratio[valid], f=f, iter=iter, progressCallback=progressCallback)
    Gc, R = G.copy(), R.copy()
    # shift G by the locally estimated log2 ratio to center the MA plot
    Gc[valid] *= numpy.exp2(center_est)
    return Gc, R
747
748
def MA_center_lowess_fast(G, R, f=2./3., iter=1, resolution=100, progressCallback=None):
    """return the G, R by centering the average log2 ratio locally
    depending on the intensity using lowess (locally weighted linear regression),
    approximated only in a limited resolution.
    """

    ratio, intensity = ratio_intensity(G, R)
    # restored: this changeset view dropped the line defining `valid`
    # (the boolean index of points where neither M nor A is masked)
    valid = ~(numpy.ma.getmaskarray(ratio) | numpy.ma.getmaskarray(intensity))
    # clamp the resolution to the number of valid points; the original
    # assigned a misspelled `resoluiton`, so the clamp never took effect and
    # small inputs produced a zero bin count below
    resolution = min(resolution, len(intensity[valid]))
    hist, edges = numpy.histogram(intensity[valid], len(intensity[valid]) // resolution)

    # first pass: lowess of the ratio at the histogram edges only
    progressCallback2 = (lambda val: progressCallback(val/2)) if progressCallback else None
    centered = lowess2(intensity[valid], ratio[valid], edges, f, iter, progressCallback=progressCallback2)

    # second pass: interpolate the edge estimates back onto all valid points
    progressCallback2 = (lambda val: progressCallback(50 + val/2)) if progressCallback else None
    centered = lowess2(edges, centered, intensity[valid], f, iter, progressCallback=progressCallback2)

    Gc, R = G.copy(), R.copy()
    # shift G by the locally estimated log2 ratio to center the MA plot
    Gc[valid] *= numpy.exp2(centered)
    return Gc, R
770
771
def MA_plot(G, R, format="b."):
    """ Plot G, R on a MA-plot using matplotlib
    """
    import matplotlib.pyplot as plt
    ratio, intensity = ratio_intensity(G, R)
    plt.plot(intensity, ratio, format)
    # fixed: the M axis label was missing its closing parenthesis
    plt.ylabel('M = log2(R/G)')
    plt.xlabel('A = log10(R*G)')
780
781
def normalize_expression_data(data, groups, axis=1, merge_function=numpy.ma.average, center_function=MA_center_lowess_fast):
    """ A helper function that normalizes expression array example table, by centering the MA plot.

    `groups` must contain exactly two (key, value) selectors; replicates in
    each group are merged with `merge_function`, the two merged channels are
    centered with `center_function`, and the scaling factors are applied back
    to the first group's columns (axis == 1) or rows (axis == 0).
    """
    # NOTE(review): label_groups/array are only bound inside this isinstance
    # branch -- a non-ExampleTable `data` raises NameError below; confirm
    # whether other inputs were ever intended.
    if isinstance(data, orange.ExampleTable):
        label_groups = [select_indices(data, key, value, axis) for key, value in groups]
        array, _, _ = data.toNumpyMA()

    # merge replicate columns within each group into a single profile
    merged = []
    for indices in label_groups:
        replicates = numpy.take(array, indices, axis=1)
        merged.append(merge_replicates(replicates, axis=1, merge_function=merge_function))

    ind1, ind2 = label_groups
    G, R = merged
    Gc, Rc = center_function(G, R)

    # work on a fresh copy of the data so the input table is not mutated
    domain = orange.Domain(data.domain.attributes, data.domain.classVar)
    # NOTE(review): this changeset view drops original line 800 here --
    # possibly meta-attribute copying; verify against the complete file.
    data = orange.ExampleTable(domain, data)

    # per-point multiplicative correction for the G channel
    GFactors = Gc/G

    if axis == 0:
        for i, gf in zip(ind1, GFactors):
            for attr in range(len(data[i])):
                if not data[i][attr].isSpecial():
                    data[i][attr] = float(data[i][attr]) * gf
    else:
        for ex, gf in zip(data, GFactors):
            for i in ind1:
                if not ex[i].isSpecial():
                    ex[i] = float(ex[i]) * gf
    return data
816
817
818def MA_zscore(G, R, window=1./5., padded=False, progressCallback=None):
819    """ Return the Z-score of log2 fold ratio estimated from local
820    distribution of log2 fold ratio values on the MA-plot
821    """
822    ratio, intensity = ratio_intensity(G, R)
823
824    z_scores = numpy.ma.zeros(G.shape)
825    sorted = list(numpy.ma.argsort(intensity))
826    import math, random
827    r = int(math.ceil(len(sorted)*window)) # number of window elements
828    def local_indices(i, sorted):
829        """ local indices in sorted (mirror padded if out of bounds)
830        """
831        start, end = i - r/2, i + r/2 + r%2
833        if start < 0:
836            start = 0
837        if end > len(sorted):
838            pad_end = sorted[end - len(sorted):]
840            end = len(sorted)
841
844        else:
845            return sorted[start:end]
846
847    milestones = orngMisc.progressBarMilestones(len(sorted))
848    for i in range(len(sorted)):
849        indices = local_indices(i, sorted)
850        localRatio = numpy.take(ratio, indices)
851        local_std = numpy.ma.std(localRatio)
852        ind = sorted[i]
853        z_scores[ind] = ratio[ind] / local_std
854        if progressCallback and i in milestones:
855            progressCallback(100. * i / len(sorted))
856