source: orange-bioinformatics/orangecontrib/bio/obiExpression.py @ 1873:0810c5708cc5

from __future__ import absolute_import

import numpy

import orange, statc

from . import stats

def mean(l):
    return float(sum(l))/len(l)

class MA_pearsonCorrelation:
    """
    Calling an object of this class computes Pearson correlation of all
    attributes against class.
    """
    def __call__(self, i, data):
        dom2 = orange.Domain([data.domain.attributes[i]], data.domain.classVar)
        data2 = orange.ExampleTable(dom2, data)
        a,c = data2.toNumpy("A/C")
        return numpy.corrcoef(c,a[:,0])[0,1]

class MA_signalToNoise:
    """
    Returns the signal-to-noise measurement: the difference of the means of
    the two classes divided by the sum of the standard deviations of both
    classes.

    Usage similar to MeasureAttribute*.

    The standard deviation used is at least 0.2*|mi|, where mi=0 is adjusted
    to mi=1 (as in the GSEA implementation).

    Can work only on data with two classes. If there are multiple class
    values, the relevant class values can be specified on object
    initialization. By default the relevant classes are the first and second
    class value from the domain.
    """

    def __init__(self, a=None, b=None):
        """
        a and b are the chosen class values.
        """
        self.a = a
        self.b = b

    def __call__(self, i, data):
        cv = data.domain.classVar
        #print data.domain

        if self.a is None: self.a = cv.values[0]
        if self.b is None: self.b = cv.values[1]

        def stdev(l):
            return statc.std(l)

        def mean(l):
            return statc.mean(l)

        def stdevm(l):
            m = mean(l)
            std = stdev(l)
            # return minimally 0.2*|mi|, where mi=0 is adjusted to mi=1
            return max(std, 0.2*abs(1.0 if m == 0 else m))

        def avWCVal(value):
            return [ex[i].value for ex in data if ex[-1].value == value and not ex[i].isSpecial() ]

        exa = avWCVal(self.a)
        exb = avWCVal(self.b)

        try:
            rval = (mean(exa)-mean(exb))/(stdevm(exa)+stdevm(exb))
            return rval
        except:
            # return some "middle" value -
            # TODO rather throw an exception?
            return 0

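# Example (sketch, not part of the original module): the MA_* measures above
# follow the MeasureAttribute calling convention measure(i, data), where i is
# an attribute index. The data file name below is purely illustrative.
#
#   data = orange.ExampleTable("two-class-expression.tab")
#   s2n = MA_signalToNoise()
#   scores = sorted((s2n(i, data), attr.name)
#                   for i, attr in enumerate(data.domain.attributes))
#   print scores[-10:]   # ten attributes with the highest signal-to-noise
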
class MA_t_test(object):
    def __init__(self, a=None, b=None, prob=False):
        self.a = a
        self.b = b
        self.prob = prob
    def __call__(self, i, data):
        cv = data.domain.classVar
        #print data.domain

        # for faster computation: avoid dragging many attributes along
        dom2 = orange.Domain([data.domain[i]], data.domain.classVar)
        data = orange.ExampleTable(dom2, data)
        i = 0

        if self.a is None: self.a = cv.values[0]
        if self.b is None: self.b = cv.values[1]

        def avWCVal(value):
            return [ex[i].value for ex in data if ex[cv] == value and not ex[i].isSpecial() ]

        exa = avWCVal(self.a)
        exb = avWCVal(self.b)

        try:
            t, prob = stats.lttest_ind(exa, exb)
            return prob if self.prob else t
        except:
            return 1.0 if self.prob else 0.0

class MA_fold_change(object):
    def __init__(self, a=None, b=None):
        self.a = a
        self.b = b
    def __call__(self, i, data):
        cv = data.domain.classVar
        #print data.domain

        # for faster computation: avoid dragging many attributes along
        dom2 = orange.Domain([data.domain[i]], data.domain.classVar)
        data = orange.ExampleTable(dom2, data)
        i = 0

        if self.a is None: self.a = cv.values[0]
        if self.b is None: self.b = cv.values[1]

        def avWCVal(value):
            return [ex[i].value for ex in data if ex[cv] == value and not ex[i].isSpecial() ]

        exa = avWCVal(self.a)
        exb = avWCVal(self.b)

        try:
            return mean(exa)/mean(exb)
        except:
            return 1

class MA_anova(object):
    def __init__(self, prob=False):
        self.prob = prob
    def __call__(self, i, data):
        cv = data.domain.classVar
        #print data.domain

        # for faster computation: avoid dragging many attributes along
        dom2 = orange.Domain([data.domain[i]], data.domain.classVar)
        data = orange.ExampleTable(dom2, data)
        i = 0

        def avWCVal(value):
            return [ex[i].value for ex in data if ex[cv] == value and not ex[i].isSpecial() ]

        data = [avWCVal(val) for val in cv.values]

        try:
            f, prob = stats.lF_oneway(*tuple(data))
            return prob if self.prob else f
        except:
            return 1.0 if self.prob else 0.0

import numpy as np
import numpy.ma as ma

class ExpressionSignificance_Test(object):
    def __new__(cls, data, useAttributeLabels, **kwargs):
        self = object.__new__(cls)
        if kwargs:
            self.__init__(data, useAttributeLabels)
            return self.__call__(**kwargs)
        else:
            return self

    def __init__(self, data, useAttributeLabels=False):
        self.data = data
        self.useAttributeLabels = useAttributeLabels
        self.attr_labels, self.data_classes = self._data_info(data)
        self.attributes = [attr for attr in self.data.domain.attributes if attr.varType in [orange.VarTypes.Continuous, orange.VarTypes.Discrete]]
        self.classes = np.array(self.attr_labels if useAttributeLabels else self.data_classes)
        self.keys = range(len(data)) if useAttributeLabels else self.attributes
        self.array, _, _ = data.toNumpyMA()
        if self.useAttributeLabels:
            self.array = ma.transpose(self.array)
#        self.dim = 1 if useAttributeLabels else 0
        self.dim = 0

    def _data_info(self, data):
        return [set(attr.attributes.items()) for attr in data.domain.attributes], [ex.getclass() for ex in data] if data.domain.classVar else [None]*len(data)

    def test_indices(self, target, classes=None):
        classes = self.classes if classes is None else classes

        def target_set(target):
            if isinstance(target, tuple):
                return set([target])
            else:
                assert(isinstance(target, set))
                return target

        if self.useAttributeLabels:
            if isinstance(target, list):
                ind = [[i for i, cl in enumerate(self.classes) if target_set(t).intersection(cl)] for t in target]
            else:
                target = target_set(target)

                ind1 = [i for i, cl in enumerate(self.classes) if target.intersection(cl)]
                ind2 = [i for i, cl in enumerate(self.classes) if not target.intersection(cl)]
                ind = [ind1, ind2]
        else:
            if isinstance(target, list):
                ind = [ma.nonzero(self.classes == t)[0] for t in target]
            else:
                if isinstance(target, (basestring, orange.Variable)):
                    target = set([target])
                else:
                    assert(isinstance(target, set))
                target = list(target)
                ind1 = [i for i, cl in enumerate(self.classes) if cl in target]
                ind2 = [i for i, cl in enumerate(self.classes) if cl not in target]
                ind = [ind1, ind2]

        return ind

    def __call__(self, target):
        raise NotImplementedError()

    def null_distribution(self, num, *args, **kwargs):
        kwargs = dict(kwargs)
        advance = lambda: None
        if "advance" in kwargs:
            advance = kwargs["advance"]
            del kwargs["advance"]
        results = []
        originalClasses = self.classes.copy()
        for i in range(num):
            np.random.shuffle(self.classes)
            results.append(self.__call__(*args, **kwargs))
            advance()
        self.classes = originalClasses
        return results

class ExpressionSignificance_TTest(ExpressionSignificance_Test):
    def __call__(self, target):
        ind1, ind2 = self.test_indices(target)
        t, pval = attest_ind(self.array[ind1, :], self.array[ind2, :], dim=self.dim)
        return zip(self.keys,  zip(t, pval))

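# Example (sketch, illustrative names only): the ExpressionSignificance_*
# scorers are constructed from an ExampleTable and then called with a target
# class value; null_distribution() re-scores under permuted class labels.
#
#   scorer = ExpressionSignificance_TTest(data, useAttributeLabels=False)
#   scores = scorer(target=data.domain.classVar.values[0])   # [(attr, (t, p)), ...]
#   permuted = scorer.null_distribution(100, data.domain.classVar.values[0])
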
class ExpressionSignificance_FoldChange(ExpressionSignificance_Test):
    def __call__(self, target):
        ind1, ind2 = self.test_indices(target)
        a1, a2 = self.array[ind1, :], self.array[ind2, :]
        fold = ma.mean(a1, self.dim)/ma.mean(a2, self.dim)
        return zip(self.keys, fold)

class ExpressionSignificance_SignalToNoise(ExpressionSignificance_Test):
    def __call__(self, target):
        ind1, ind2 = self.test_indices(target)
        a1, a2 = self.array[ind1, :], self.array[ind2, :]
        stn = (ma.mean(a1, self.dim) - ma.mean(a2, self.dim)) / (ma.sqrt(ma.var(a1, self.dim)) + ma.sqrt(ma.var(a2, self.dim)))
        return zip(self.keys, stn)

class ExpressionSignificance_ANOVA(ExpressionSignificance_Test):
    def __call__(self, target=None):
        if target is not None:
            indices = self.test_indices(target)
        else:
            indices = []
        f, prob = aF_oneway(*[self.array[ind, :] for ind in indices], **dict(dim=0))
        return zip(self.keys, zip(f, prob))

class ExpressionSignificance_ChiSquare(ExpressionSignificance_Test):
    def __call__(self, target):
        array = equi_n_discretization(self.array.copy(), intervals=5, dim=0)
        ind1, ind2 = self.test_indices(target)
        a1, a2 = array[ind1, :], array[ind2, :]
        dist1, dist2 = [], []
        dist = ma.zeros((array.shape[1], 2, 5))
        for i in range(5):
            dist1.append(ma.sum(ma.ones(a1.shape) * (a1 == i), 0))
            dist2.append(ma.sum(ma.ones(a2.shape) * (a2 == i), 0))
            dist[:, 0, i] = dist1[-1]
            dist[:, 1, i] = dist2[-1]
        return zip(self.keys, achisquare_indtest(np.array(dist), dim=1))

class ExpressionSignificance_Info(ExpressionSignificance_Test):
    def __call__(self, target):
        array = equi_n_discretization(self.array.copy(), intervals=5, dim=1)

        ind1, ind2 = self.test_indices(target)
        a1, a2 = array[ind1, :], array[ind2, :]
        dist1, dist2 = [], []
        dist = ma.zeros((array.shape[1], 2, 5))
        for i in range(5):
            dist1.append(ma.sum(ma.ones(a1.shape) * (a1 == i), 0))
            dist2.append(ma.sum(ma.ones(a2.shape) * (a2 == i), 0))
            dist[:, 0, i] = dist1[-1]
            dist[:, 1, i] = dist2[-1]
        classinfo = entropy(np.array([len(ind1), len(ind2)]))
        E = ma.sum(entropy(dist, dim=1) * ma.sum(dist, 1), 1) / ma.sum(ma.sum(dist, 1), 1)
        return zip(self.keys, classinfo - E)

class ExpressionSignificance_MannWhitneyu(ExpressionSignificance_Test):
    def __call__(self, target):
        ind1, ind2 = self.test_indices(target)
        a, b = self.array[ind1, :], self.array[ind2, :]
#        results = [amannwhitneyu(a[:, i],b[:, i]) for i in range(a.shape[1])]
        results = [statc.mannwhitneyu(list(a[:, i]),list(b[:, i])) for i in range(a.shape[1])]

        return zip(self.keys, results)

def attest_ind(a, b, dim=None):
    """ Return the t-test statistic on arrays a and b over the dim axis.
    Returns both the t statistic and the p-value.
    """
#    dim = a.ndim - 1 if dim is None else dim
    x1, x2 = ma.mean(a, dim), ma.mean(b, dim)
    v1, v2 = ma.var(a, dim), ma.var(b, dim)
    n1, n2 = (a.shape[dim], b.shape[dim]) if dim is not None else (a.size, b.size)
    df = float(n1+n2-2)
    svar = ((n1-1)*v1+(n2-1)*v2) / df
    t = (x1-x2)/ma.sqrt(svar*(1.0/n1 + 1.0/n2))
    if t.ndim == 0:
        return (t, statc.betai(0.5*df,0.5,df/(df+t**2)) if t is not ma.masked and df/(df+t**2) <= 1.0 else ma.masked)
    else:
        prob = [statc.betai(0.5*df,0.5,df/(df+tsq)) if tsq is not ma.masked and df/(df+tsq) <= 1.0 else ma.masked  for tsq in t*t]
        return t, prob

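# Example (sketch): attest_ind on two small masked arrays, column by column
# (dim=0). The numbers are illustrative only.
#
#   a = ma.masked_array([[1.0, 2.0], [1.2, 2.5], [0.9, 1.8]])
#   b = ma.masked_array([[2.0, 3.1], [2.2, 2.9], [1.9, 3.3], [2.1, 3.0]])
#   t, prob = attest_ind(a, b, dim=0)   # one (t, p) pair per column
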
def aF_oneway(*args, **kwargs):
    dim = kwargs.get("dim", None)
    arrays = args
    means = [ma.mean(a, dim) for a in arrays]
    vars = [ma.var(a, dim) for a in arrays]
    lens = [ma.sum(ma.array(ma.ones(a.shape), mask=ma.asarray(a).mask), dim) for a in arrays]
    alldata = ma.concatenate(arrays, dim if dim is not None else 0)
    bign =  ma.sum(ma.array(ma.ones(alldata.shape), mask=alldata.mask), dim)
    sstot = ma.sum(alldata ** 2, dim) - (ma.sum(alldata, dim) ** 2) / bign
    ssbn = ma.sum([(ma.sum(a, dim) ** 2) / L for a, L in zip(arrays, lens)], dim)
#    print ma.sum(alldata, dim) ** 2 / bign, ssbn
    ssbn -= ma.sum(alldata, dim) ** 2 / bign
    sswn = sstot - ssbn
    dfbn = dfnum = float(len(args) - 1.0)
    dfwn = bign - len(args) # + 1.0
    F = (ssbn / dfbn) / (sswn / dfwn)
    if F.ndim == 0 and dfwn.ndim == 0:
        return (F,statc.betai(0.5 * dfwn, 0.5 * dfnum, dfwn/float(dfwn+dfnum*F)) if F is not ma.masked and dfwn/float(dfwn+dfnum*F) <= 1.0 \
                and dfwn/float(dfwn+dfnum*F) >= 0.0 else ma.masked)
    else:
        prob = [statc.betai(0.5 * dfden, 0.5 * dfnum, dfden/float(dfden+dfnum*f)) if f is not ma.masked and dfden/float(dfden+dfnum*f) <= 1.0 \
            and dfden/float(dfden+dfnum*f) >= 0.0 else ma.masked for dfden, f in zip (dfwn, F)]
        return F, prob

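# Example (sketch): one-way ANOVA across three groups of expression profiles,
# computed per column (dim=0). Group sizes and values are illustrative.
#
#   g1 = ma.masked_array(np.random.normal(0.0, 1.0, (5, 10)))
#   g2 = ma.masked_array(np.random.normal(0.5, 1.0, (4, 10)))
#   g3 = ma.masked_array(np.random.normal(1.0, 1.0, (6, 10)))
#   F, prob = aF_oneway(g1, g2, g3, dim=0)   # one (F, p) pair per column
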
def achisquare_indtest(observed, dim=None):
    if observed.ndim == 2:
        observed = ma.array([observed])
        if dim is not None:
            dim += 1
    if dim is None:
        dim = observed.ndim - 2
    rowtotal = ma.sum(observed, dim + 1)
    coltotal = ma.sum(observed, dim)
    total = ma.sum(rowtotal, dim)
    ones = ma.array(ma.ones(observed.shape))
    expected = ones * rowtotal.reshape(rowtotal.shape[:dim] + (-1, 1))
    a = ones * coltotal[..., np.zeros(observed.shape[dim], dtype=int),:]
    expected = expected * (a) / total.reshape((-1, 1, 1))
    chisq = ma.sum(ma.sum((observed - expected) ** 2 / expected, dim + 1), dim)
    return chisq

def equi_n_discretization(array, intervals=5, dim=1):
    count = ma.sum(ma.array(ma.ones(array.shape, dtype=int), mask=array.mask), dim)
    cut = ma.zeros(len(count), dtype=int)
    sarray = ma.sort(array, dim)
    r = count % intervals
    pointsshape = list(array.shape)
    pointsshape[dim] = 1
    points = []
    for i in range(intervals):
        cutend = cut + count / intervals + numpy.ones(len(r)) * (r > i)
        if dim == 1:
            p = sarray[range(len(cutend)), numpy.array(cutend, dtype=int) -1]
        else:
            p = sarray[numpy.array(cutend, dtype=int) -1, range(len(cutend))]
        points.append(p.reshape(pointsshape))
        cut = cutend
    darray = ma.array(ma.zeros(array.shape) - 1, mask=array.mask)
    darray[ma.nonzero(array <= points[0])] = 0
    for i in range(0, intervals):
        darray[ma.nonzero((array > points[i]))] = i + 1
    return darray

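# Example (sketch): equal-frequency discretization of each row into 5
# intervals (dim=1), i.e. each gene's expressions are binned into codes 0-4.
# The matrix below is illustrative.
#
#   expr = ma.masked_array(np.random.normal(0, 1, (3, 20)))
#   codes = equi_n_discretization(expr, intervals=5, dim=1)
#   # each row of `codes` now holds roughly four 0s, four 1s, ..., four 4s
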
def entropy(array, dim=None):
    if dim is None:
        array = array.ravel()
        dim = 0
    n = ma.sum(array, dim)
    array = ma.log(array) * array
    sum = ma.sum(array, dim)
    return (ma.log(n) - sum / n) / ma.log(2.0)

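# Example (sketch): entropy of a count vector, in bits. Four equally likely
# outcomes carry log2(4) = 2 bits; the counts below are illustrative.
#
#   entropy(np.array([25, 25, 25, 25]))   # -> 2.0
#   entropy(np.array([99, 1, 0, 0]))      # -> ~0.08 (one dominant class)
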
396"""\
397MA - Plot
398=========
399
400Functions for normalization of expression arrays and ploting
401MA - Plots
402
403Example::
404    ## Load data from GEO
405    >>> data = orange.ExampleTable("GDS1210.tab")
406    ## Split data by columns into normal and cancer subsets
407    >>> cancer, normal = data_split(data, [("disease state", "cancer"), ("disease state", "normal")])
408    ## Convert to numpy MaskedArrays
409    >>> cancer, normal = cancer.toNumpyMA("A")[0], normal.toNumpyMA("A")[0]
410    ## Merge by averaging
411    >>> cancer = merge_replicates(cancer)
412    >>> normal = merge_replicates(normal)
413    ## Plot MA-plot
414    >>> MA_plot(cancer, normal)
415   
416"""

from Orange.orng import orngMisc
from numpy import median
def lowess(x, y, f=2./3., iter=3, progressCallback=None):
    """ Lowess taken from Bio.Statistics.lowess, modified to compute pairwise
    distances in place.

    lowess(x, y, f=2./3., iter=3) -> yest

    Lowess smoother: Robust locally weighted regression.
    The lowess function fits a nonparametric regression curve to a scatterplot.
    The arrays x and y contain an equal number of elements; each pair
    (x[i], y[i]) defines a data point in the scatterplot. The function returns
    the estimated (smooth) values of y.

    The smoothing span is given by f. A larger value for f will result in a
    smoother curve. The number of robustifying iterations is given by iter. The
    function will run faster with a smaller number of iterations.

    x and y should be numpy float arrays of equal length.  The return value is
    also a numpy float array of that length.

    e.g.
    >>> import numpy
    >>> x = numpy.array([4,  4,  7,  7,  8,  9, 10, 10, 10, 11, 11, 12, 12, 12,
    ...                 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 16, 16,
    ...                 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 20, 20, 20, 20,
    ...                 20, 22, 23, 24, 24, 24, 24, 25], numpy.float)
    >>> y = numpy.array([2, 10,  4, 22, 16, 10, 18, 26, 34, 17, 28, 14, 20, 24,
    ...                 28, 26, 34, 34, 46, 26, 36, 60, 80, 20, 26, 54, 32, 40,
    ...                 32, 40, 50, 42, 56, 76, 84, 36, 46, 68, 32, 48, 52, 56,
    ...                 64, 66, 54, 70, 92, 93, 120, 85], numpy.float)
    >>> result = lowess(x, y)
    >>> len(result)
    50
    >>> print "[%0.2f, ..., %0.2f]" % (result[0], result[-1])
    [4.85, ..., 84.98]
    """
    n = len(x)
    r = min(int(numpy.ceil(f*n)), n - 1)

#    h = [numpy.sort(numpy.abs(x-x[i]))[r] for i in range(n)]
#    h, xtmp = numpy.zeros_like(x), numpy.zeros_like(x)
#    for i in range(n):
#        xtmp = numpy.abs(x - x[i], xtmp)
#        h[i] = numpy.sort(xtmp)[r]
#    w = numpy.clip(numpy.abs(([x]-numpy.transpose([x]))/h),0.0,1.0)
    dist = [x] - numpy.transpose([x])
    dist = numpy.abs(dist, dist)
    dist.sort(axis=1)
    h = dist[:, r]
    del dist

    w = [x]-numpy.transpose([x])
    w /= h
    w = numpy.abs(w, w)
    w = numpy.clip(w, 0.0, 1.0, w)
#    w = 1-w*w*w
    w **= 3
    w *= -1
    w += 1
#    w = w*w*w
    w **= 3
    yest = numpy.zeros(n)
    delta = numpy.ones(n)
    milestones = orngMisc.progressBarMilestones(iter*n)
    for iteration in range(iter):
        for i in xrange(n):
            weights = delta * w[:,i]
            weights_mul_x = weights * x
            b1 = numpy.ma.dot(weights,y)
            b2 = numpy.ma.dot(weights_mul_x,y)
            A11 = sum(weights)
            A12 = sum(weights_mul_x)
            A21 = A12
            A22 = numpy.ma.dot(weights_mul_x,x)
            determinant = A11*A22 - A12*A21
            beta1 = (A22*b1-A12*b2) / determinant
            beta2 = (A11*b2-A21*b1) / determinant
            yest[i] = beta1 + beta2*x[i]
            if progressCallback and (iteration*n + i) in milestones:
                progressCallback(100. * (iteration*n + i) / (iter * n))
        residuals = y-yest
        s = median(abs(residuals))
        delta[:] = numpy.clip(residuals/(6*s),-1,1)
        delta[:] = 1-delta*delta
        delta[:] = delta*delta
    return yest


def lowess2(x, y, xest, f=2./3., iter=3, progressCallback=None):
    """Returns estimated values of y in data points xest (or None if estimation fails).
    Lowess smoother: Robust locally weighted regression.
    The lowess function fits a nonparametric regression curve to a scatterplot.
    The arrays x and y contain an equal number of elements; each pair
    (x[i], y[i]) defines a data point in the scatterplot. The function returns
    the estimated (smooth) values of y.

    The smoothing span is given by f. A larger value for f will result in a
    smoother curve. The number of robustifying iterations is given by iter. The
    function will run faster with a smaller number of iterations.

    Taken from Peter Juvan's numpyExtn.py, modified for numpy; computes pairwise
    distances in place.
    """
    x = numpy.asarray(x, 'f')
    y = numpy.asarray(y, 'f')
    xest = numpy.asarray(xest, 'f')
    n = len(x)
    nest = len(xest)
    r = min(int(numpy.ceil(f*n)),n-1) # radius: num. of points to take into LR
#    h = [numpy.sort(numpy.abs(x-x[i]))[r] for i in range(n)]    # distance of the r-th point from x[i]
    dist = [x] - numpy.transpose([x])
    dist = numpy.abs(dist, dist)
    dist.sort(axis=1)
    h = dist[:, r]
    del dist # to free memory
    w = [x] - numpy.transpose([x])
    w /= h
    w = numpy.abs(w, w)
    w = numpy.clip(w, 0.0, 1.0, w)
#    w = numpy.clip(numpy.abs(([x]-numpy.transpose([x]))/h),0.0,1.0)
    w **= 3
    w *= -1
    w += 1
#    w = 1 - w**3 #1-w*w*w
    w **= 3
#    w = w**3 #w*w*w
#    hest = [numpy.sort(numpy.abs(x-xest[i]))[r] for i in range(nest)]    # r-th min. distance from xest[i] to x
    dist = [x] - numpy.transpose([xest])
    dist = numpy.abs(dist, dist)
    dist.sort(axis=1)
    hest = dist[:, r]
    del dist # to free memory
#    west = numpy.clip(numpy.abs(([xest]-numpy.transpose([x]))/hest),0.0,1.0)  # shape: (len(x), len(xest)
    west = [xest]-numpy.transpose([x])
    west /= hest
    west = numpy.abs(west, west)
    west = numpy.clip(west, 0.0, 1.0, west)
#    west = 1 - west**3 #1-west*west*west
    west **= 3
    west *= -1
    west += 1
#    west = west**3 #west*west*west
    west **= 3
    yest = numpy.zeros(n,'f')
    yest2 = numpy.zeros(nest,'f')
    delta = numpy.ones(n,'f')
    iter_count = iter*(nest + n) if iter > 1 else nest
    milestones = orngMisc.progressBarMilestones(iter_count)
    curr_iter = 0
    for iteration in range(iter):
        # fit xest
        for i in range(nest):
            weights = delta * west[:,i]
            b = numpy.array([numpy.sum(weights*y), numpy.sum(weights*y*x)])
            A = numpy.array([[numpy.sum(weights), numpy.sum(weights*x)], [numpy.sum(weights*x), numpy.sum(weights*x*x)]])
            beta = numpy.linalg.solve(A, b)
            yest2[i] = beta[0] + beta[1]*xest[i]
            if progressCallback and curr_iter in milestones:
                progressCallback(100. * curr_iter / iter_count)
            curr_iter += 1

        # fit x (to calculate residuals and delta)
        if iter > 1:
            for i in range(n):
                weights = delta * w[:,i]
                b = numpy.array([numpy.sum(weights*y), numpy.sum(weights*y*x)])
                A = numpy.array([[numpy.sum(weights), numpy.sum(weights*x)], [numpy.sum(weights*x), numpy.sum(weights*x*x)]])
                beta = numpy.linalg.solve(A,b)
                yest[i] = beta[0] + beta[1]*x[i]
                if progressCallback and curr_iter in milestones:
                    progressCallback(100. * curr_iter / iter_count)
                curr_iter += 1
            residuals = y-yest
            s = numpy.median(numpy.abs(residuals))
            delta = numpy.clip(residuals/(6*s), -1, 1)
            delta = 1-delta*delta
            delta = delta*delta
    return yest2


def attr_group_indices(data, label_groups):
    """ Return two or more lists of indices into `data.domain` based on `label_groups`.

    Example::
        cancer_indices, no_cancer_indices = attr_group_indices(data, [("disease state", "cancer"), ("disease state", "normal")])
    """
    ret = []
    for key, val in label_groups:
        ind = [i for i, attr in enumerate(data.domain.attributes) if attr.attributes.get(key, None) == val]
        ret.append(ind)
    return ret


def example_group_indices(data, attr, values):
    """ Return a list of indices into `data` for each item of `values` that
    matches the example's value of the `attr` attribute.

    Example::
        cls_ind1, cls_ind2 = example_group_indices(data, data.domain.classVar, ["Class 1", "Class 2"])
    """
    ret = [[] for _ in values]
    values_id = dict([(str(value), i) for i, value in enumerate(values)])
    for i, ex in enumerate(data):
        id = values_id.get(str(ex[attr]), None)
        if id is not None:
            ret[id].append(i)
    return ret


def data_group_split(data, label_groups):
    """ Split a `data` example table into two or more tables based on the
    contents of the iterable `label_groups`, containing (key, value)
    pairs matching the labels of data attributes.

    Example::
        cancer, no_cancer = data_group_split(data, [("disease state", "cancer"), ("disease state", "normal")])
    """
    ret = []
    group_indices = attr_group_indices(data, label_groups)
    for indices in group_indices:
        attrs = [data.domain[i] for i in indices]
        domain = orange.Domain(attrs, data.domain.classVar)
        domain.addmetas(data.domain.getmetas())
        ret.append(orange.ExampleTable(domain, data))
    return ret


def select_indices(data, key, value, axis=1):
    """ Return indices into `data` (ExampleTable) along the specified `axis`
    where:
        - if axis == 0 match data[i][key] == value
        - if axis == 1 match data.domain[i].attributes[key] == value

    Example::
        cancer_ind = select_indices(data, key="disease state", value="cancer", axis=1)
        normal_ind = select_indices(data, key="disease state", value=["normal"], axis=1) # value can be a list to specify more than one value

    """
    values = value if isinstance(value, list) else [value]
    if axis == 0:
        groups = example_group_indices(data, key, values)
    else:
        groups = attr_group_indices(data, [(key, val) for val in values])

    return sorted(reduce(set.union, groups, set()))


def select_data(data, key, value, axis=1):
    """ Return a `data` (ExampleTable) subset along the specified `axis`
    where:
        - if axis == 0 match data[i][key] == value
        - if axis == 1 match data.domain[i].attributes[key] == value

    .. note:: This preserves all meta attributes of the domain

    Example::
        cancer = select_data(data, "disease state", "cancer", axis=1)
        normal = select_data(data, "disease state", ["normal"], axis=1) # value can be a list to specify more than one value

    """
    indices = select_indices(data, key, value, axis)
    if axis == 0:
        examples = [data[i] for i in indices]
        return orange.ExampleTable(data.domain, examples)
    else:
        attrs = [data.domain[i] for i in indices]
        domain = orange.Domain(attrs, False)
        domain.addmetas(data.domain.getmetas())
        return orange.ExampleTable(domain, data)


def split_data(data, groups, axis=1):
    """ Split data (ExampleTable) along the specified axis, where elements of
    `groups` match the `key` and `value` arguments of the `select_data`
    function.

    Example::
        cancer, normal = split_data(data, [("disease state", "cancer"), ("disease state", ["normal"])], axis=1)
    """
    res = []
    for key, value in groups:
        res.append(select_data(data, key, value, axis))
    return res


def geometric_mean(array):
    """ Return the geometric mean computed on a 1d masked array.
    """
    array = numpy.ma.asanyarray(array)
    return numpy.power(reduce(lambda a,b: a*b, array.filled(1.), 1.0), 1./len(array))


def harmonic_mean(array):
    """ Return the harmonic mean computed on a 1d masked array.
    """
    array = numpy.ma.asanyarray(array)
    return len(array) / numpy.ma.sum(1. / array)

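# Example (sketch): geometric vs. harmonic mean of a small masked array
# (values are illustrative).
#
#   a = numpy.ma.masked_array([1.0, 2.0, 4.0])
#   geometric_mean(a)   # -> 2.0
#   harmonic_mean(a)    # -> 1.714...  (3 / (1/1 + 1/2 + 1/4))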

def merge_replicates(replicates, axis=0, merge_function=numpy.ma.average):
    """ Merge `replicates` (numpy.array) along `axis` using `merge_function`.
    """
    return numpy.ma.apply_along_axis(merge_function, axis, replicates)


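# Example (sketch): averaging three replicate rows into a single profile
# (the matrix is illustrative).
#
#   reps = numpy.ma.masked_array([[1.0, 2.0, 3.0],
#                                 [1.2, 1.8, 3.3],
#                                 [0.8, 2.2, 2.7]])
#   merged = merge_replicates(reps, axis=0)   # -> one value per column
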
def ratio_intensity(G, R):
    """ Return log2(R/G) and log10(R*G) as a tuple.
    """
    log2Ratio = numpy.ma.log(R/G) / numpy.log(2)
    log10Intensity = numpy.ma.log10(R*G)
    return log2Ratio, log10Intensity

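# Example (sketch): M and A values for two illustrative channel vectors.
#
#   G = numpy.ma.masked_array([100., 200., 400.])
#   R = numpy.ma.masked_array([200., 200., 100.])
#   M, A = ratio_intensity(G, R)   # M = log2(R/G), A = log10(R*G)
#   # M -> [1, 0, -2];  A -> [~4.3, ~4.6, ~4.6]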

def MA_center_average(G, R):
    """ Return G, R with the average log2 ratio centered.
    """
    center_est = numpy.ma.average(numpy.ma.log(R/G) / numpy.log(2))
    G = G * numpy.exp2(center_est)
    return G, R.copy()


def MA_center_lowess(G, R, f=2./3., iter=1, progressCallback=None):
    """ Return G, R with the average log2 ratio centered locally,
    depending on the intensity, using lowess (locally weighted linear regression).
    """
#    from Bio.Statistics.lowess import lowess
    ratio, intensity = ratio_intensity(G, R)
    valid = - (ratio.mask & intensity.mask)
    valid_ind = numpy.ma.where(valid)
    center_est = lowess(intensity[valid], ratio[valid], f=f, iter=iter, progressCallback=progressCallback)
    Gc, R = G.copy(), R.copy()
    Gc[valid] *= numpy.exp2(center_est)
    Gc.mask, R.mask = -valid, -valid
    return Gc, R


def MA_center_lowess_fast(G, R, f=2./3., iter=1, resolution=100, progressCallback=None):
    """ Return G, R with the average log2 ratio centered locally,
    depending on the intensity, using lowess (locally weighted linear regression),
    approximated only at a limited resolution.
    """

    ratio, intensity = ratio_intensity(G, R)
    valid = - (ratio.mask & intensity.mask)
    resolution = min(resolution, len(intensity[valid]))
    hist, edges = numpy.histogram(intensity[valid], len(intensity[valid])/resolution)

    progressCallback2 = (lambda val: progressCallback(val/2)) if progressCallback else None
    centered = lowess2(intensity[valid], ratio[valid], edges, f, iter, progressCallback=progressCallback2)

    progressCallback2 = (lambda val: progressCallback(50 + val/2)) if progressCallback else None
    centered = lowess2(edges, centered, intensity[valid], f, iter, progressCallback=progressCallback2)

    Gc, R = G.copy(), R.copy()
    Gc[valid] *= numpy.exp2(centered)
    Gc.mask, R.mask = -valid, -valid
    return Gc, R


def MA_plot(G, R, format="b."):
    """ Plot G, R on an MA-plot using matplotlib.
    """
    import matplotlib.pyplot as plt
    ratio, intensity = ratio_intensity(G, R)
    plt.plot(intensity, ratio, format)
    plt.ylabel('M = log2(R/G)')
    plt.xlabel('A = log10(R*G)')


def normalize_expression_data(data, groups, axis=1, merge_function=numpy.ma.average, center_function=MA_center_lowess_fast):
    """ A helper function that normalizes an expression array example table
    by centering the MA plot.
    """
    if isinstance(data, orange.ExampleTable):
        label_groups = [select_indices(data, key, value, axis) for key, value in groups]
        array, _, _ = data.toNumpyMA()

    merged = []
    for indices in label_groups:
        replicates = numpy.take(array, indices, axis=1)
        merged.append(merge_replicates(replicates, axis=1, merge_function=merge_function))

    ind1, ind2 = label_groups
    G, R = merged
    Gc, Rc = center_function(G, R)

    domain = orange.Domain(data.domain.attributes, data.domain.classVar)
    domain.addmetas(data.domain.getmetas())
    data = orange.ExampleTable(domain, data)

    GFactors = Gc/G

    if axis == 0:
        for i, gf in zip(ind1, GFactors):
            for attr in range(len(data[i])):
                if not data[i][attr].isSpecial():
                    data[i][attr] = float(data[i][attr]) * gf
    else:
        for ex, gf in zip(data, GFactors):
            for i in ind1:
                if not ex[i].isSpecial():
                    ex[i] = float(ex[i]) * gf
    return data


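# Example (sketch): normalizing a two-group experiment described by attribute
# labels; the label key/values mirror the MA-plot docstring example above and
# are illustrative.
#
#   normalized = normalize_expression_data(
#       data, [("disease state", "cancer"), ("disease state", "normal")])
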
def MA_zscore(G, R, window=1./5., padded=False, progressCallback=None):
    """ Return the Z-score of the log2 fold ratio estimated from the local
    distribution of log2 fold ratio values on the MA-plot.
    """
    ratio, intensity = ratio_intensity(G, R)

    z_scores = numpy.ma.zeros(G.shape)
    sorted = list(numpy.ma.argsort(intensity))
    import math, random
    r = int(math.ceil(len(sorted)*window)) # number of window elements
    def local_indices(i, sorted):
        """ local indices in sorted (mirror padded if out of bounds)
        """
        start, end = i - r/2, i + r/2 + r%2
        pad_start , pad_end = [], []
        if start < 0:
            pad_start = sorted[:abs(start)]
            random.shuffle(pad_start)
            start = 0
        if end > len(sorted):
            pad_end = sorted[end - len(sorted):]
            random.shuffle(pad_end)
            end = len(sorted)

        if padded:
            return pad_start + sorted[start: end] + pad_end
        else:
            return sorted[start:end]

    milestones = orngMisc.progressBarMilestones(len(sorted))
    for i in range(len(sorted)):
        indices = local_indices(i, sorted)
        localRatio = numpy.take(ratio, indices)
        local_std = numpy.ma.std(localRatio)
        ind = sorted[i]
        z_scores[ind] = ratio[ind] / local_std
        if progressCallback and i in milestones:
            progressCallback(100. * i / len(sorted))

    z_scores._mask = - numpy.isfinite(z_scores)
    return z_scores

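# Example (sketch): flagging probes with extreme local z-scores. `cancer` and
# `normal` are merged masked arrays as in the MA-plot docstring example above;
# the 1.96 cutoff is only a rough two-sided 5% guideline.
#
#   z = MA_zscore(cancer, normal)
#   differential = numpy.ma.absolute(z) > 1.96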