Changeset 1962:d35f315a7d08 in orange-bioinformatics


Timestamp: 02/18/14 16:02:08
Author: markotoplak
Branch: default
Message:

Moved obiExpression -> utils.expression. Removed dependencies on old Gary Strangman stats.py.
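
For callers the move is intended to be backward compatible: obiExpression.py is reduced to a re-export of the new module (see the diff below). A minimal sketch of the two equivalent import paths, assuming the orangecontrib.bio package is importable:

    from orangecontrib.bio.utils import expression   # new location
    from orangecontrib.bio import obiExpression      # legacy name, now a thin re-export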

Location: orangecontrib/bio
Files: 1 added, 1 edited, 1 copied

  • orangecontrib/bio/obiExpression.py

    r1873 r1962  
    1 1 from __future__ import absolute_import 
    2 2  
    3 import numpy 
    4  
    5 import orange, statc 
    6  
    7 from . import stats 
    8  
    9 def mean(l): 
    10     return float(sum(l))/len(l) 
    11  
    12 class MA_pearsonCorrelation: 
    13     """ 
    14     Calling an object of this class computes the Pearson correlation of the 
    15     chosen attribute against the class. 
    16     """ 
    17     def __call__(self, i, data): 
    18         dom2 = orange.Domain([data.domain.attributes[i]], data.domain.classVar) 
    19         data2 = orange.ExampleTable(dom2, data) 
    20         a,c = data2.toNumpy("A/C") 
    21         return numpy.corrcoef(c,a[:,0])[0,1] 
    22  
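    # A minimal usage sketch of the measure above (assumes `data` is an Orange
    # ExampleTable whose class values can be coerced to numbers, e.g. a binary class):
    # r = MA_pearsonCorrelation()(0, data)   # correlation of attribute 0 with the class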
    23 class MA_signalToNoise: 
    24     """ 
    25     Returns signal to noise measurement: difference of means of two classes 
    26     divided by the sum of standard deviations for both classes.  
    27  
    28     Usage is similar to MeasureAttribute*. 
    29  
    30     The standard deviation used is floored at 0.2*|mi|, where mi=0 is adjusted to mi=1 
    31     (as in the GSEA implementation). 
    32  
    33     Works only on data with two classes. If there are multiple classes, the 
    34     relevant class values can be specified at object initialization. 
    35     By default the relevant classes are the first and second class values 
    36     of the domain. 
    37     """ 
    38  
    39     def __init__(self, a=None, b=None): 
    40         """ 
    41         a and b are the chosen class values. 
    42         """ 
    43         self.a = a 
    44         self.b = b 
    45  
    46     def __call__(self, i, data): 
    47         cv = data.domain.classVar 
    48         #print data.domain 
    49  
    50         if self.a == None: self.a = cv.values[0] 
    51         if self.b == None: self.b = cv.values[1] 
    52  
    53         def stdev(l): 
    54             return statc.std(l) 
    55  
    56         def mean(l): 
    57             return statc.mean(l) 
    58  
    59         def stdevm(l): 
    60             m = mean(l) 
    61             std = stdev(l) 
    62             #return at least 0.2*|mi|, where mi=0 is adjusted to mi=1 
    63             return max(std, 0.2*abs(1.0 if m == 0 else m)) 
    64  
    65         def avWCVal(value): 
    66             return [ex[i].value for ex in data if ex[-1].value == value and not ex[i].isSpecial() ] 
    67  
    68         exa = avWCVal(self.a) 
    69         exb = avWCVal(self.b) 
    70  
    71         try: 
    72             rval = (mean(exa)-mean(exb))/(stdevm(exa)+stdevm(exb)) 
    73             return rval 
    74         except: 
    75             #return some "middle" value - 
    76             #TODO rather throw exception?  
    77             return 0 
    78  
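    # A rough, self-contained sketch of the same score using numpy (statc.std may
    # use a different normalization); the floor mirrors stdevm's 0.2*|mean| rule,
    # with a zero mean treated as 1:
    exa, exb = numpy.array([1.0, 2.0, 3.0]), numpy.array([3.0, 4.0, 5.0])
    floor = lambda v: max(numpy.std(v), 0.2 * abs(numpy.mean(v) or 1.0))
    score = (numpy.mean(exa) - numpy.mean(exb)) / (floor(exa) + floor(exb))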
    79 class MA_t_test(object): 
    80     def __init__(self, a=None, b=None, prob=False): 
    81         self.a = a 
    82         self.b = b 
    83         self.prob = prob 
    84     def __call__(self, i, data): 
    85         cv = data.domain.classVar 
    86         #print data.domain 
    87  
    88         #for faster computation. to save dragging many attributes along 
    89         dom2 = orange.Domain([data.domain[i]], data.domain.classVar) 
    90         data = orange.ExampleTable(dom2, data) 
    91         i = 0 
    92  
    93         if self.a == None: self.a = cv.values[0] 
    94         if self.b == None: self.b = cv.values[1] 
    95  
    96         def avWCVal(value): 
    97             return [ex[i].value for ex in data if ex[cv] == value and not ex[i].isSpecial() ] 
    98  
    99         exa = avWCVal(self.a) 
    100         exb = avWCVal(self.b) 
    101  
    102         try: 
    103             t, prob = stats.lttest_ind(exa, exb) 
    104             return prob if self.prob else t 
    105         except: 
    106             return 1.0 if self.prob else 0.0 
    107  
    108 class MA_fold_change(object): 
    109     def __init__(self, a=None, b=None): 
    110         self.a = a 
    111         self.b = b 
    112     def __call__(self, i, data): 
    113         cv = data.domain.classVar 
    114         #print data.domain 
    115  
    116         #for faster computation. to save dragging many attributes along 
    117         dom2 = orange.Domain([data.domain[i]], data.domain.classVar) 
    118         data = orange.ExampleTable(dom2, data) 
    119         i = 0 
    120  
    121         if self.a == None: self.a = cv.values[0] 
    122         if self.b == None: self.b = cv.values[1] 
    123  
    124         def avWCVal(value): 
    125             return [ex[i].value for ex in data if ex[cv] == value and not ex[i].isSpecial() ] 
    126  
    127         exa = avWCVal(self.a) 
    128         exb = avWCVal(self.b) 
    129  
    130         try: 
    131             return mean(exa)/mean(exb) 
    132         except: 
    133             return 1 
    134  
    135 class MA_anova(object): 
    136     def __init__(self, prob=False): 
    137         self.prob = prob 
    138     def __call__(self, i, data): 
    139         cv = data.domain.classVar 
    140         #print data.domain 
    141  
    142         #for faster computation. to save dragging many attributes along 
    143         dom2 = orange.Domain([data.domain[i]], data.domain.classVar) 
    144         data = orange.ExampleTable(dom2, data) 
    145         i = 0 
    146  
    147         def avWCVal(value): 
    148             return [ex[i].value for ex in data if ex[cv] == value and not ex[i].isSpecial() ] 
    149  
    150         data = [avWCVal(val) for val in cv.values] 
    151  
    152         try: 
    153             f, prob = stats.lF_oneway(*tuple(data)) 
    154             return prob if self.prob else f 
    155         except: 
    156             return 1.0 if self.prob else 0.0 
    157  
    158 import numpy as np 
    159 import numpy.ma as ma 
    160  
    161 class ExpressionSignificance_Test(object): 
    162     def __new__(cls, data, useAttributeLabels, **kwargs): 
    163         self = object.__new__(cls) 
    164         if kwargs: 
    165             self.__init__(data, useAttributeLabels) 
    166             return self.__call__(**kwargs) 
    167         else: 
    168             return self 
    169      
    170     def __init__(self, data, useAttributeLabels=False): 
    171         self.data = data 
    172         self.useAttributeLabels = useAttributeLabels 
    173         self.attr_labels, self.data_classes = self._data_info(data) 
    174         self.attributes = [attr for attr in self.data.domain.attributes if attr.varType in [orange.VarTypes.Continuous, orange.VarTypes.Discrete]] 
    175         self.classes = np.array(self.attr_labels if useAttributeLabels else self.data_classes) 
    176         self.keys = range(len(data)) if useAttributeLabels else self.attributes 
    177         self.array, _, _ = data.toNumpyMA() 
    178         if self.useAttributeLabels: 
    179             self.array = ma.transpose(self.array) 
    180 #        self.dim = 1 if useAttributeLabels else 0   
    181         self.dim = 0 
    182          
    183     def _data_info(self, data): 
    184         return [set(attr.attributes.items()) for attr in data.domain.attributes], [ex.getclass() for ex in data] if data.domain.classVar else [None]*len(data) 
    185          
    186     def test_indices(self, target, classes=None): 
    187         classes = self.classes if classes is None else classes 
    188          
    189         def target_set(target): 
    190             if isinstance(target, tuple): 
    191                 return set([target]) 
    192             else: 
    193                 assert(isinstance(target, set)) 
    194                 return target 
    195              
    196         if self.useAttributeLabels: 
    197             if isinstance(target, list): 
    198                 ind = [[i for i, cl in enumerate(self.classes) if target_set(t).intersection(cl)] for t in target] 
    199             else: 
    200                 target = target_set(target) 
    201                  
    202                 ind1 = [i for i, cl in enumerate(self.classes) if target.intersection(cl)] 
    203                 ind2 = [i for i, cl in enumerate(self.classes) if not target.intersection(cl)] 
    204                 ind = [ind1, ind2] 
    205         else: 
    206             if isinstance(target, list): 
    207                 ind = [ma.nonzero(self.classes == t)[0] for t in target] 
    208             else: 
    209                 if isinstance(target, (basestring, orange.Variable)): 
    210                     target = set([target]) 
    211                 else: 
    212                     assert(isinstance(target, set)) 
    213                 target = list(target) 
    214                 ind1 = [i for i, cl in enumerate(self.classes) if cl in target] 
    215                 ind2 = [i for i, cl in enumerate(self.classes) if cl not in target] 
    216                 ind = [ind1, ind2] 
    217                  
    218         return ind 
    219      
    220     def __call__(self, target): 
    221         raise NotImplementedError() 
    222      
    223     def null_distribution(self, num, *args, **kwargs): 
    224         kwargs = dict(kwargs) 
    225         advance = lambda: None 
    226         if "advance" in kwargs: 
    227             advance = kwargs["advance"] 
    228             del kwargs["advance"] 
    229         results = [] 
    230         originalClasses = self.classes.copy() 
    231         for i in range(num): 
    232             np.random.shuffle(self.classes) 
    233             results.append(self.__call__(*args, **kwargs)) 
    234             advance() 
    235         self.classes = originalClasses 
    236         return results 
    237      
    238 class ExpressionSignificance_TTest(ExpressionSignificance_Test): 
    239     def __call__(self, target): 
    240         ind1, ind2 = self.test_indices(target) 
    241         t, pval = attest_ind(self.array[ind1, :], self.array[ind2, :], dim=self.dim) 
    242         return zip(self.keys,  zip(t, pval)) 
    243          
    244 class ExpressionSignificance_FoldChange(ExpressionSignificance_Test): 
    245     def __call__(self, target): 
    246         ind1, ind2 = self.test_indices(target) 
    247         a1, a2 = self.array[ind1, :], self.array[ind2, :] 
    248         fold = ma.mean(a1, self.dim)/ma.mean(a2, self.dim) 
    249         return zip(self.keys, fold) 
    250      
    251 class ExpressionSignificance_SignalToNoise(ExpressionSignificance_Test): 
    252     def __call__(self, target): 
    253         ind1, ind2 = self.test_indices(target) 
    254         a1, a2 = self.array[ind1, :], self.array[ind2, :] 
    255         stn = (ma.mean(a1, self.dim) - ma.mean(a2, self.dim)) / (ma.sqrt(ma.var(a1, self.dim)) + ma.sqrt(ma.var(a2, self.dim))) 
    256         return zip(self.keys, stn) 
    257      
    258 class ExpressionSignificance_ANOVA(ExpressionSignificance_Test): 
    259     def __call__(self, target=None): 
    260         if target is not None: 
    261             indices = self.test_indices(target) 
    262         else: 
    263             indices = [] 
    264         f, prob = aF_oneway(*[self.array[ind, :] for ind in indices], **dict(dim=0)) 
    265         return zip(self.keys, zip(f, prob)) 
    266          
    267 class ExpressionSignificance_ChiSquare(ExpressionSignificance_Test): 
    268     def __call__(self, target): 
    269         array = equi_n_discretization(self.array.copy(), intervals=5, dim=0) 
    270         ind1, ind2 = self.test_indices(target) 
    271         a1, a2 = array[ind1, :], array[ind2, :] 
    272         dist1, dist2  = [], [] 
    273         dist = ma.zeros((array.shape[1], 2, 5)) 
    274         for i in range(5): 
    275             dist1.append(ma.sum(ma.ones(a1.shape) * (a1 == i), 0)) 
    276             dist2.append(ma.sum(ma.ones(a2.shape) * (a2 == i), 0)) 
    277             dist[:, 0, i] = dist1[-1] 
    278             dist[:, 1, i] = dist2[-1]  
    279         return zip(self.keys, achisquare_indtest(np.array(dist), dim=1)) 
    280          
    281 class ExpressionSignificance_Info(ExpressionSignificance_Test): 
    282     def __call__(self, target): 
    283         array = equi_n_discretization(self.array.copy(), intervals=5, dim=1) 
    284          
    285         ind1, ind2 = self.test_indices(target) 
    286         a1, a2 = array[ind1, :], array[ind2, :] 
    287         dist1, dist2 = [], [] 
    288         dist = ma.zeros((array.shape[1], 2, 5)) 
    289         for i in range(5): 
    290             dist1.append(ma.sum(ma.ones(a1.shape) * (a1 == i), 0)) 
    291             dist2.append(ma.sum(ma.ones(a2.shape) * (a2 == i), 0)) 
    292             dist[:, 0, i] = dist1[-1] 
    293             dist[:, 1, i] = dist2[-1] 
    294         classinfo = entropy(np.array([len(ind1), len(ind2)])) 
    295         E = ma.sum(entropy(dist, dim=1) * ma.sum(dist, 1), 1) / ma.sum(ma.sum(dist, 1), 1) 
    296         return zip(self.keys, classinfo - E) 
    297      
    298 class ExpressionSignificance_MannWhitneyu(ExpressionSignificance_Test): 
    299     def __call__(self, target): 
    300         ind1, ind2 = self.test_indices(target) 
    301         a, b = self.array[ind1, :], self.array[ind2, :] 
    302 #        results = [amannwhitneyu(a[:, i],b[:, i]) for i in range(a.shape[1])] 
    303         results = [statc.mannwhitneyu(list(a[:, i]),list(b[:, i])) for i in range(a.shape[1])] 
    304          
    305         return zip(self.keys, results) 
    306  
    307 def attest_ind(a, b, dim=None): 
    308     """ Return the two-sample t-test statistics of arrays a and b along the dim axis. 
    309     Returns both the t statistic and the p-value. 
    310     """ 
    311 #    dim = a.ndim - 1 if dim is None else dim 
    312     x1, x2 = ma.mean(a, dim), ma.mean(b, dim) 
    313     v1, v2 = ma.var(a, dim), ma.var(b, dim) 
    314     n1, n2 = (a.shape[dim], b.shape[dim]) if dim is not None else (a.size, b.size) 
    315     df = float(n1+n2-2) 
    316     svar = ((n1-1)*v1+(n2-1)*v2) / df 
    317     t = (x1-x2)/ma.sqrt(svar*(1.0/n1 + 1.0/n2)) 
    318     if t.ndim == 0: 
    319         return (t, statc.betai(0.5*df,0.5,df/(df+t**2)) if t is not ma.masked and df/(df+t**2) <= 1.0 else ma.masked) 
    320     else: 
    321         prob = [statc.betai(0.5*df,0.5,df/(df+tsq)) if tsq is not ma.masked and df/(df+tsq) <= 1.0 else ma.masked  for tsq in t*t] 
    322         return t, prob 
    323  
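    # A minimal sketch of attest_ind on synthetic data: samples in rows, one test
    # per column (dim=0), mirroring the call in ExpressionSignificance_TTest above.
    g1 = ma.masked_invalid(np.random.normal(0.0, 1.0, (5, 3)))
    g2 = ma.masked_invalid(np.random.normal(1.0, 1.0, (5, 3)))
    t_stat, p_vals = attest_ind(g1, g2, dim=0)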
    324 def aF_oneway(*args, **kwargs): 
    325     dim = kwargs.get("dim", None) 
    326     arrays = args 
    327     means = [ma.mean(a, dim) for a in arrays] 
    328     vars = [ma.var(a, dim) for a in arrays] 
    329     lens = [ma.sum(ma.array(ma.ones(a.shape), mask=ma.asarray(a).mask), dim) for a in arrays] 
    330     alldata = ma.concatenate(arrays, dim if dim is not None else 0) 
    331     bign =  ma.sum(ma.array(ma.ones(alldata.shape), mask=alldata.mask), dim) 
    332     sstot = ma.sum(alldata ** 2, dim) - (ma.sum(alldata, dim) ** 2) / bign 
    333     ssbn = ma.sum([(ma.sum(a, dim) ** 2) / L for a, L in zip(arrays, lens)], dim) 
    334 #    print ma.sum(alldata, dim) ** 2 / bign, ssbn 
    335     ssbn -= ma.sum(alldata, dim) ** 2 / bign 
    336     sswn = sstot - ssbn 
    337     dfbn = dfnum = float(len(args) - 1.0) 
    338     dfwn = bign - len(args) # + 1.0 
    339     F = (ssbn / dfbn) / (sswn / dfwn) 
    340     if F.ndim == 0 and dfwn.ndim == 0: 
    341         return (F,statc.betai(0.5 * dfwn, 0.5 * dfnum, dfwn/float(dfwn+dfnum*F)) if F is not ma.masked and dfwn/float(dfwn+dfnum*F) <= 1.0 \ 
    342                 and dfwn/float(dfwn+dfnum*F) >= 0.0 else ma.masked) 
    343     else: 
    344         prob = [statc.betai(0.5 * dfden, 0.5 * dfnum, dfden/float(dfden+dfnum*f)) if f is not ma.masked and dfden/float(dfden+dfnum*f) <= 1.0 \ 
    345             and dfden/float(dfden+dfnum*f) >= 0.0 else ma.masked for dfden, f in zip (dfwn, F)] 
    346         return F, prob 
    347      
    348 def achisquare_indtest(observed, dim=None): 
    349     if observed.ndim == 2: 
    350         observed = ma.array([observed]) 
    351         if dim is not None: 
    352             dim += 1  
    353     if dim is None: 
    354         dim = observed.ndim - 2 
    355     rowtotal = ma.sum(observed, dim + 1) 
    356     coltotal = ma.sum(observed, dim) 
    357     total = ma.sum(rowtotal, dim) 
    358     ones = ma.array(ma.ones(observed.shape)) 
    359     expected = ones * rowtotal.reshape(rowtotal.shape[:dim] + (-1, 1)) 
    360     a = ones * coltotal[..., np.zeros(observed.shape[dim], dtype=int),:] 
    361     expected = expected * (a) / total.reshape((-1, 1, 1)) 
    362     chisq = ma.sum(ma.sum((observed - expected) ** 2 / expected, dim + 1), dim) 
    363     return chisq 
    364      
    365 def equi_n_discretization(array, intervals=5, dim=1): 
    366     count = ma.sum(ma.array(ma.ones(array.shape, dtype=int), mask=array.mask), dim) 
    367     cut = ma.zeros(len(count), dtype=int) 
    368     sarray = ma.sort(array, dim) 
    369     r = count % intervals 
    370     pointsshape = list(array.shape) 
    371     pointsshape[dim] = 1 
    372     points = [] 
    373     for i in range(intervals): 
    374         cutend = cut + count / intervals + numpy.ones(len(r)) * (r > i) 
    375         if dim == 1: 
    376             p = sarray[range(len(cutend)), numpy.array(cutend, dtype=int) -1] 
    377         else: 
    378             p = sarray[numpy.array(cutend, dtype=int) -1, range(len(cutend))] 
    379         points.append(p.reshape(pointsshape)) 
    380         cut = cutend 
    381     darray = ma.array(ma.zeros(array.shape) - 1, mask=array.mask) 
    382     darray[ma.nonzero(array <= points[0])] = 0 
    383     for i in range(0, intervals): 
    384         darray[ma.nonzero((array > points[i]))] = i + 1  
    385     return darray 
    386  
    387 def entropy(array, dim=None): 
    388     if dim is None: 
    389         array = array.ravel() 
    390         dim = 0 
    391     n = ma.sum(array, dim) 
    392     array = ma.log(array) * array 
    393     sum = ma.sum(array, dim) 
    394     return (ma.log(n) - sum / n) / ma.log(2.0) 
    395  
    396 """\ 
    397 MA - Plot 
    398 ========= 
    399  
    400 Functions for normalization of expression arrays and plotting 
    401 MA - Plots 
    402  
    403 Example:: 
    404     ## Load data from GEO 
    405     >>> data = orange.ExampleTable("GDS1210.tab") 
    406     ## Split data by columns into normal and cancer subsets 
    407     >>> cancer, normal = data_split(data, [("disease state", "cancer"), ("disease state", "normal")]) 
    408     ## Convert to numpy MaskedArrays 
    409     >>> cancer, normal = cancer.toNumpyMA("A")[0], normal.toNumpyMA("A")[0] 
    410     ## Merge by averaging 
    411     >>> cancer = merge_replicates(cancer) 
    412     >>> normal = merge_replicates(normal) 
    413     ## Plot MA-plot 
    414     >>> MA_plot(cancer, normal) 
    415      
    416 """ 
    417  
    418 from Orange.orng import orngMisc 
    419 from numpy import median 
    420 def lowess(x, y, f=2./3., iter=3, progressCallback=None): 
    421     """ Lowess taken from Bio.Statistics.lowess, modified to compute pairwise  
    422     distances in place. 
    423       
    424     lowess(x, y, f=2./3., iter=3) -> yest 
    425  
    426     Lowess smoother: Robust locally weighted regression. 
    427     The lowess function fits a nonparametric regression curve to a scatterplot. 
    428     The arrays x and y contain an equal number of elements; each pair 
    429     (x[i], y[i]) defines a data point in the scatterplot. The function returns 
    430     the estimated (smooth) values of y. 
    431  
    432     The smoothing span is given by f. A larger value for f will result in a 
    433     smoother curve. The number of robustifying iterations is given by iter. The 
    434     function will run faster with a smaller number of iterations. 
    435  
    436     x and y should be numpy float arrays of equal length.  The return value is 
    437     also a numpy float array of that length. 
    438  
    439     e.g. 
    440     >>> import numpy 
    441     >>> x = numpy.array([4,  4,  7,  7,  8,  9, 10, 10, 10, 11, 11, 12, 12, 12, 
    442     ...                 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 16, 16, 
    443     ...                 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 20, 20, 20, 20, 
    444     ...                 20, 22, 23, 24, 24, 24, 24, 25], numpy.float) 
    445     >>> y = numpy.array([2, 10,  4, 22, 16, 10, 18, 26, 34, 17, 28, 14, 20, 24, 
    446     ...                 28, 26, 34, 34, 46, 26, 36, 60, 80, 20, 26, 54, 32, 40, 
    447     ...                 28, 26, 34, 34, 46, 26, 36, 60, 80, 20, 26, 54, 32, 40, 
    448     ...                 32, 40, 50, 42, 56, 76, 84, 36, 46, 68, 32, 48, 52, 56, 
    449     ...                 64, 66, 54, 70, 92, 93, 120, 85], numpy.float) 
    450     >>> result = lowess(x, y) 
    451     >>> len(result) 
    452     50 
    453     >>> print "[%0.2f, ..., %0.2f]" % (result[0], result[-1]) 
    454     [4.85, ..., 84.98] 
    455     """ 
    456     n = len(x) 
    457     r = min(int(numpy.ceil(f*n)), n - 1) 
    458      
    459 #    h = [numpy.sort(numpy.abs(x-x[i]))[r] for i in range(n)] 
    460 #    h, xtmp = numpy.zeros_like(x), numpy.zeros_like(x) 
    461 #    for i in range(n): 
    462 #        xtmp = numpy.abs(x - x[i], xtmp) 
    463 #        h[i] = numpy.sort(xtmp)[r] 
    464 #    w = numpy.clip(numpy.abs(([x]-numpy.transpose([x]))/h),0.0,1.0) 
    465     dist = [x] - numpy.transpose([x]) 
    466     dist = numpy.abs(dist, dist) 
    467     dist.sort(axis=1) 
    468     h = dist[:, r] 
    469     del dist 
    470  
    471     w = [x]-numpy.transpose([x]) 
    472     w /= h 
    473     w = numpy.abs(w, w) 
    474     w = numpy.clip(w, 0.0, 1.0, w) 
    475 #    w = 1-w*w*w 
    476     w **= 3 
    477     w *= -1 
    478     w += 1 
    479 #    w = w*w*w 
    480     w **= 3 
    481     yest = numpy.zeros(n) 
    482     delta = numpy.ones(n) 
    483     milestones = orngMisc.progressBarMilestones(iter*n) 
    484     for iteration in range(iter): 
    485         for i in xrange(n): 
    486             weights = delta * w[:,i] 
    487             weights_mul_x = weights * x 
    488             b1 = numpy.ma.dot(weights,y) 
    489             b2 = numpy.ma.dot(weights_mul_x,y) 
    490             A11 = sum(weights) 
    491             A12 = sum(weights_mul_x) 
    492             A21 = A12 
    493             A22 = numpy.ma.dot(weights_mul_x,x) 
    494             determinant = A11*A22 - A12*A21 
    495             beta1 = (A22*b1-A12*b2) / determinant 
    496             beta2 = (A11*b2-A21*b1) / determinant 
    497             yest[i] = beta1 + beta2*x[i] 
    498             if progressCallback and (iteration*n + i) in milestones: 
    499                 progressCallback((100. * iteration*n + i) /  (iter * n)) 
    500         residuals = y-yest 
    501         s = median(abs(residuals)) 
    502         delta[:] = numpy.clip(residuals/(6*s),-1,1) 
    503         delta[:] = 1-delta*delta 
    504         delta[:] = delta*delta 
    505     return yest 
    506  
    507  
    508  
    509 def lowess2(x, y, xest, f=2./3., iter=3, progressCallback=None): 
    510     """Returns estimated values of y in data points xest (or None if estimation fails). 
    511     Lowess smoother: Robust locally weighted regression. 
    512     The lowess function fits a nonparametric regression curve to a scatterplot. 
    513     The arrays x and y contain an equal number of elements; each pair 
    514     (x[i], y[i]) defines a data point in the scatterplot. The function returns 
    515     the estimated (smooth) values of y. 
    516  
    517     The smoothing span is given by f. A larger value for f will result in a 
    518     smoother curve. The number of robustifying iterations is given by iter. The 
    519     function will run faster with a smaller number of iterations. 
    520      
    521     Taken from Peter Juvan's numpyExtn.py, modified for numpy, computes pairwise 
    522     distances in place. 
    523     """ 
    524     x = numpy.asarray(x, 'f') 
    525     y = numpy.asarray(y, 'f') 
    526     xest = numpy.asarray(xest, 'f') 
    527     n = len(x) 
    528     nest = len(xest) 
    529     r = min(int(numpy.ceil(f*n)),n-1) # radius: num. of points to take into LR 
    530 #    h = [numpy.sort(numpy.abs(x-x[i]))[r] for i in range(n)]    # distance of the r-th point from x[i] 
    531     dist = [x] - numpy.transpose([x]) 
    532     dist = numpy.abs(dist, dist) 
    533     dist.sort(axis=1) 
    534     h = dist[:, r] 
    535     del dist # to free memory 
    536     w = [x] - numpy.transpose([x]) 
    537     w /= h 
    538     w = numpy.abs(w, w) 
    539     w = numpy.clip(w, 0.0, 1.0, w) 
    540 #    w = numpy.clip(numpy.abs(([x]-numpy.transpose([x]))/h),0.0,1.0) 
    541     w **= 3 
    542     w *= -1 
    543     w += 1 
    544 #    w = 1 - w**3 #1-w*w*w 
    545     w **= 3 
    546 #    w = w**3 #w*w*w 
    547 #    hest = [numpy.sort(numpy.abs(x-xest[i]))[r] for i in range(nest)]    # r-th min. distance from xest[i] to x 
    548     dist = [x] - numpy.transpose([xest]) 
    549     dist = numpy.abs(dist, dist) 
    550     dist.sort(axis=1) 
    551     hest = dist[:, r] 
    552     del dist # to free memory 
    553 #    west = numpy.clip(numpy.abs(([xest]-numpy.transpose([x]))/hest),0.0,1.0)  # shape: (len(x), len(xest) 
    554     west = [xest]-numpy.transpose([x]) 
    555     west /= hest 
    556     west = numpy.abs(west, west) 
    557     west = numpy.clip(west, 0.0, 1.0, west) 
    558 #    west = 1 - west**3 #1-west*west*west 
    559     west **= 3 
    560     west *= -1 
    561     west += 1 
    562 #    west = west**3 #west*west*west 
    563     west **= 3 
    564     yest = numpy.zeros(n,'f') 
    565     yest2 = numpy.zeros(nest,'f') 
    566     delta = numpy.ones(n,'f') 
    567     iter_count = iter*(nest + n) if iter > 1 else nest 
    568     milestones = orngMisc.progressBarMilestones(iter_count) 
    569     curr_iter = 0 
    570     for iteration in range(iter): 
    571         # fit xest 
    572         for i in range(nest): 
    573             weights = delta * west[:,i] 
    574             b = numpy.array([numpy.sum(weights*y), numpy.sum(weights*y*x)]) 
    575             A = numpy.array([[numpy.sum(weights), numpy.sum(weights*x)], [numpy.sum(weights*x), numpy.sum(weights*x*x)]]) 
    576             beta = numpy.linalg.solve(A, b) 
    577             yest2[i] = beta[0] + beta[1]*xest[i] 
    578             if progressCallback and curr_iter in milestones: 
    579                 progressCallback(100. * curr_iter / iter_count) 
    580             curr_iter += 1 
    581                  
    582         # fit x (to calculate residuals and delta) 
    583         if iter > 1: 
    584             for i in range(n): 
    585                 weights = delta * w[:,i] 
    586                 b = numpy.array([numpy.sum(weights*y), numpy.sum(weights*y*x)]) 
    587                 A = numpy.array([[numpy.sum(weights), numpy.sum(weights*x)], [numpy.sum(weights*x), numpy.sum(weights*x*x)]]) 
    588                 beta = numpy.linalg.solve(A,b) 
    589                 yest[i] = beta[0] + beta[1]*x[i] 
    590                 if progressCallback and curr_iter in milestones: 
    591                     progressCallback(100. * curr_iter / iter_count) 
    592                 curr_iter += 1 
    593             residuals = y-yest 
    594             s = numpy.median(numpy.abs(residuals)) 
    595             delta = numpy.clip(residuals/(6*s), -1, 1) 
    596             delta = 1-delta*delta 
    597             delta = delta*delta 
    598     return yest2 
    599  
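    # A minimal sketch of lowess2 on synthetic data: smooth y over x and evaluate
    # the fit at a coarser grid xest (values are illustrative only).
    x = numpy.linspace(0.0, 10.0, 50)
    y = numpy.sin(x) + numpy.random.normal(0.0, 0.1, 50)
    xest = numpy.linspace(0.0, 10.0, 10)
    yest = lowess2(x, y, xest, f=2./3., iter=3)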
    600  
    601 def attr_group_indices(data, label_groups): 
    602     """ Return two or more lists of indices into `data.domain` based on `label_groups`. 
    603      
    604     Example:: 
    605         cancer_indices, no_cancer_indices = attr_group_indices(data, [("disease state", "cancer"), ("disease state", "normal")]) 
    606     """ 
    607     ret = [] 
    608     for key, val in label_groups: 
    609         ind = [i for i, attr in enumerate(data.domain.attributes) if attr.attributes.get(key, None) == val] 
    610         ret.append(ind) 
    611     return ret 
    612  
    613  
    614 def example_group_indices(data, attr, values): 
    615     """ Return lists of indices into `data`, one for each item in `values` that matches 
    616     the example's value of the `attr` attribute. 
    617      
    618     Example:: 
    619         cls_ind1, cls_ind2 = example_group_indices(data, data.domain.classVar, ["Class 1", "Class 2"]) 
    620     """ 
    621     ret = [[] for _ in values] 
    622     values_id = dict([(str(value), i) for i, value in enumerate(values)]) 
    623     for i, ex in enumerate(data): 
    624         id = values_id.get(str(ex[attr]), None) 
    625         if id is not None: 
    626             ret[id].append(i) 
    627     return ret 
    628      
    629      
    630 def data_group_split(data, label_groups): 
    631     """ Split the `data` example table into two or more tables based on the 
    632     contents of the iterable `label_groups`, which contains (key, value) 
    633     pairs matching the labels of data attributes. 
    634      
    635     Example:: 
    636         cancer, no_cancer = data_group_split(data, [("disease state", "cancer"), ("disease state", "normal")]) 
    637     """ 
    638     ret = [] 
    639     group_indices = attr_group_indices(data, label_groups) 
    640     for indices in group_indices: 
    641         attrs = [data.domain[i] for i in indices] 
    642         domain = orange.Domain(attrs, data.domain.classVar) 
    643         domain.addmetas(data.domain.getmetas()) 
    644         ret.append(orange.ExampleTable(domain, data)) 
    645     return ret 
    646      
    647      
    648 def select_indices(data, key, value, axis=1): 
    649     """ Return indices into `data` (ExampleTable) along specified `axis` 
    650     where: 
    651         - if axis == 0 match data[i][key] == value 
    652         - if axis == 1 match data.domain[i].attributes[key] == value  
    653      
    654     Example:: 
    655         cancer_ind = select_indices(data, key="disease state", value="cancer", axis=1) 
    656         normal_ind = select_indices(data, key="disease state", value=["normal"], axis=1) # value can be a list to specify more than one value 
    657          
    658     """ 
    659     values = value if isinstance(value, list) else [value] 
    660     if axis == 0: 
    661         groups = example_group_indices(data, key, values) 
    662     else: 
    663         groups = attr_group_indices(data, [(key, val) for val in values]) 
    664          
    665     return sorted(reduce(set.union, groups, set())) 
    666  
    667  
    668 def select_data(data, key, value, axis=1): 
    669     """ Return a `data` (ExampleTable) subset along the specified `axis` 
    670     where: 
    671         - if axis == 0 match data[i][key] == value 
    672         - if axis == 1 match data.domain[i].attributes[key] == value  
    673         .. note:: This preserves all meta attributes of the domain 
    674     Example:: 
    675         cancer = select_data(data, "disease state", "cancer", axis=1) 
    676         normal = select_data(data, "disease state", ["normal"], axis=1) # value can be a list to specify more than one value 
    677          
    678     """ 
    679     indices = select_indices(data, key, value, axis) 
    680     if axis == 0: 
    681         examples = [data[i] for i in indices] 
    682         return orange.ExampleTable(data.domain, examples) 
    683     else: 
    684         attrs = [data.domain[i] for i in indices] 
    685         domain = orange.Domain(attrs, False) 
    686         domain.addmetas(data.domain.getmetas()) 
    687         return orange.ExampleTable(domain, data) 
    688      
    689      
    690 def split_data(data, groups, axis=1): 
    691     """ Split data (ExampleTable) along specified axis, where elements of  
    692     `groups` match `key` and `value` arguments of the `select_data` 
    693     function  
    694      
    695     Example:: 
    696         cancer, normal = split_data(data, [("disease state", "cancer"), ("disease state", ["normal"])], axis=1) 
    697     """ 
    698     res = [] 
    699     for key, value in groups: 
    700         res.append(select_data(data, key, value, axis)) 
    701     return res 
    702  
    703      
    704 def geometric_mean(array): 
    705     """ Return a geometric mean computed on a 1d masked array 
    706     """ 
    707     array = numpy.ma.asanyarray(array) 
    708     return numpy.power(reduce(lambda a,b: a*b, array.filled(1.), 1.0), 1./len(array)) 
    709  
    710  
    711 def harmonic_mean(array): 
    712     """ Return a harmonic mean computed on a 1d masked array 
    713     """ 
    714     array = numpy.ma.asanyarray(array) 
    715     return len(array) / numpy.ma.sum(1. / array) 
    716  
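    # A small worked sketch: for the masked values [1, 4, 16] the geometric mean is
    # (1*4*16)**(1/3) = 4.0 and the harmonic mean is 3/(1/1 + 1/4 + 1/16), about 2.29.
    vals = numpy.ma.array([1.0, 4.0, 16.0])
    gm = geometric_mean(vals)   # -> 4.0
    hm = harmonic_mean(vals)    # -> ~2.286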
    717  
    718 def merge_replicates(replicates, axis=0, merge_function=numpy.ma.average): 
    719     """ Merge `replicates` (numpy.array) along `axis` using `merge_function` 
    720     """ 
    721     return numpy.ma.apply_along_axis(merge_function, axis, replicates) 
    722  
    723  
    724 def ratio_intensity(G, R): 
    725     """ return the log2(R/G), log10(R*G) as a tuple 
    726     """ 
    727     log2Ratio = numpy.ma.log(R/G) / numpy.log(2) 
    728     log10Intensity = numpy.ma.log10(R*G) 
    729     return log2Ratio, log10Intensity 
    730  
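    # A small numeric sketch of the two values above: with G = 2 and R = 8,
    # M = log2(8/2) = 2.0 and A = log10(8*2), about 1.2.
    M, A = ratio_intensity(numpy.ma.array([2.0]), numpy.ma.array([8.0]))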
    731  
    732 def MA_center_average(G, R): 
    733     """ return the G, R by centering the average log2 ratio 
    734     """ 
    735     center_est = numpy.ma.average(numpy.ma.log(R/G) / numpy.log(2)) 
    736     G = G * numpy.exp2(center_est) 
    737     return G, R.copy() 
    738  
    739  
    740 def MA_center_lowess(G, R, f=2./3., iter=1, progressCallback=None): 
    741     """ return the G, R by centering the average log2 ratio locally 
    742     depending on the intensity using lowess (locally weighted linear regression) 
    743     """ 
    744 #    from Bio.Statistics.lowess import lowess 
    745     ratio, intensity = ratio_intensity(G, R) 
    746     valid = - (ratio.mask & intensity.mask) 
    747     valid_ind = numpy.ma.where(valid) 
    748     center_est = lowess(intensity[valid], ratio[valid], f=f, iter=iter, progressCallback=progressCallback) 
    749     Gc, R = G.copy(), R.copy() 
    750     Gc[valid] *= numpy.exp2(center_est) 
    751     Gc.mask, R.mask = -valid, -valid 
    752     return Gc, R 
    753  
    754  
    755 def MA_center_lowess_fast(G, R, f=2./3., iter=1, resolution=100, progressCallback=None): 
    756     """return the G, R by centering the average log2 ratio locally 
    757     depending on the intensity using lowess (locally weighted linear regression), 
    758     approximated at a limited resolution only. 
    759     """ 
    760      
    761     ratio, intensity = ratio_intensity(G, R) 
    762     valid = - (ratio.mask & intensity.mask) 
    763     resolution = min(resolution, len(intensity[valid])) 
    764     hist, edges = numpy.histogram(intensity[valid], len(intensity[valid])/resolution) 
    765      
    766     progressCallback2 = (lambda val: progressCallback(val/2)) if progressCallback else None  
    767     centered = lowess2(intensity[valid], ratio[valid], edges, f, iter, progressCallback=progressCallback2) 
    768  
    769     progressCallback2 = (lambda val: progressCallback(50 + val/2)) if progressCallback else None 
    770     centered = lowess2(edges, centered, intensity[valid], f, iter, progressCallback=progressCallback2) 
    771      
    772     Gc, R = G.copy(), R.copy() 
    773     Gc[valid] *= numpy.exp2(centered) 
    774     Gc.mask, R.mask = -valid, -valid 
    775     return Gc, R 
    776  
    777  
    778 def MA_plot(G, R, format="b."): 
    779     """ Plot G, R on a MA-plot using matplotlib 
    780     """ 
    781     import matplotlib.pyplot as plt 
    782     ratio, intensity = ratio_intensity(G, R) 
    783     plt.plot(intensity, ratio, format) 
    784     plt.ylabel('M = log2(R/G)') 
    785     plt.xlabel('A = log10(R*G)') 
    786  
    787  
    788 def normalize_expression_data(data, groups, axis=1, merge_function=numpy.ma.average, center_function=MA_center_lowess_fast): 
    789     """ A helper function that normalizes an expression-array example table by centering the MA plot. 
    790      
    791     """ 
    792     if isinstance(data, orange.ExampleTable): 
    793         label_groups = [select_indices(data, key, value, axis) for key, value in groups] 
    794         array, _, _ = data.toNumpyMA() 
    795      
    796     merged = [] 
    797     for indices in label_groups: 
    798         replicates = numpy.take(array, indices, axis=1) 
    799         merged.append(merge_replicates(replicates, axis=1, merge_function=merge_function)) 
    800          
    801     ind1, ind2 = label_groups 
    802     G, R = merged 
    803     Gc, Rc = center_function(G, R) 
    804      
    805     domain = orange.Domain(data.domain.attributes, data.domain.classVar) 
    806     domain.addmetas(data.domain.getmetas()) 
    807     data = orange.ExampleTable(domain, data) 
    808      
    809     GFactors = Gc/G 
    810      
    811     if axis == 0: 
    812         for i, gf in zip(ind1, GFactors): 
    813             for attr in range(len(data[i])): 
    814                 if not data[i][attr].isSpecial(): 
    815                     data[i][attr] = float(data[i][attr]) * gf 
    816     else:    
    817         for ex, gf in zip(data, GFactors): 
    818             for i in ind1: 
    819                 if not ex[i].isSpecial(): 
    820                     ex[i] = float(ex[i]) * gf 
    821     return data 
    822      
    823      
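    # A minimal usage sketch, reusing the (key, value) group convention of
    # select_indices and the "disease state" labels from the module docstring
    # (assumes `data` is the corresponding ExampleTable):
    # normalized = normalize_expression_data(data,
    #     [("disease state", "cancer"), ("disease state", "normal")])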
    824 def MA_zscore(G, R, window=1./5., padded=False, progressCallback=None): 
    825     """ Return the Z-score of the log2 fold ratio, estimated from the local 
    826     distribution of log2 fold ratio values on the MA-plot. 
    827     """ 
    828     ratio, intensity = ratio_intensity(G, R) 
    829      
    830     z_scores = numpy.ma.zeros(G.shape) 
    831     sorted = list(numpy.ma.argsort(intensity)) 
    832     import math, random 
    833     r = int(math.ceil(len(sorted)*window)) # number of window elements 
    834     def local_indices(i, sorted): 
    835         """ local indices in sorted (padded with shuffled indices if out of bounds) 
    836         """ 
    837         start, end = i - r/2, i + r/2 + r%2 
    838         pad_start , pad_end = [], [] 
    839         if start < 0: 
    840             pad_start = sorted[:abs(start)] 
    841             random.shuffle(pad_start) 
    842             start = 0 
    843         if end > len(sorted): 
    844             pad_end = sorted[end - len(sorted):] 
    845             random.shuffle(pad_end) 
    846             end = len(sorted) 
    847          
    848         if padded: 
    849             return pad_start + sorted[start: end] + pad_end 
    850         else: 
    851             return sorted[start:end] 
    852      
    853     milestones = orngMisc.progressBarMilestones(len(sorted)) 
    854     for i in range(len(sorted)): 
    855         indices = local_indices(i, sorted) 
    856         localRatio = numpy.take(ratio, indices) 
    857         local_std = numpy.ma.std(localRatio) 
    858         ind = sorted[i] 
    859         z_scores[ind] = ratio[ind] / local_std 
    860         if progressCallback and i in milestones: 
    861             progressCallback(100. * i / len(sorted)) 
    862          
    863     z_scores._mask = - numpy.isfinite(z_scores) 
    864     return z_scores 
    865      
     3 from .utils.expression import * 
  • orangecontrib/bio/utils/expression.py

    r1873 r1962  
    4 4  
    5 5 import orange, statc 
    6  
    7 from . import stats 
     6 import scipy.stats 
     7  
     8 print dir(stats) 
    8 9  
    9 10 def mean(l): 
    …
    101 102  
    102 103        try: 
    103             t, prob = stats.lttest_ind(exa, exb) 
    104             return prob if self.prob else t 
     104            t, prob = scipy.stats.ttest_ind(exa, exb) 
     105            return prob if self.prob else float(t) 
    105 106        except: 
    106 107            return 1.0 if self.prob else 0.0 
    …
    151 152  
    152 153        try: 
    153             f, prob = stats.lF_oneway(*tuple(data)) 
     154            f, prob = scipy.stats.f_oneway(*tuple(data)) 
    154 155            return prob if self.prob else f 
    155 156        except: 
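
The two hunks above swap Gary Strangman's list-based tests for their SciPy equivalents. A minimal sketch of the replacement calls (sample values are illustrative; each function returns a statistic and a p-value):

    import scipy.stats

    exa, exb, exc = [1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [4.0, 5.0, 6.0]
    t, prob = scipy.stats.ttest_ind(exa, exb)       # replaces stats.lttest_ind
    f, prob = scipy.stats.f_oneway(exa, exb, exc)   # replaces stats.lF_oneway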