source: orange-bioinformatics/_bioinformatics/obiDifscale.py @ 1636:10d234fdadb9

Revision 1636:10d234fdadb9, 12.7 KB, checked in by mitar, 2 years ago

Restructuring because we will not be using namespaces.

from __future__ import absolute_import

import random
from math import log
from operator import itemgetter

import numpy

import Orange

from . import obiGEO
from .obiExpression import ExpressionSignificance_Test

# Utility functions

log2 = lambda x: log(x, 2.)

def my_ratio(x, y):
    """ Compute the log-ratio log2(x/y). """
    return log2(x / y)

def sign(x):
    return cmp(x, 0)

def common_domain(data1, data2):
    """Use only attributes that are in both data sets."""
    atts = sorted(set([a.name for a in data1.domain.attributes]).intersection(
               [a.name for a in data2.domain.attributes]))
    new1 = Orange.data.Table(Orange.data.Domain(atts + [data1.domain.classVar], data1.domain), data1)
    new2 = Orange.data.Table(Orange.data.Domain(atts + [data2.domain.classVar], data2.domain), data2)
    return new1, new2

def common_genes(data1, data2, att='gene'):
    """Restrict both data sets to the genes present in each of them."""
    common_set = list(set(ex[att].value for ex in data1).intersection(ex[att].value for ex in data2))
    print len(set(ex[att].value for ex in data1))
    print len(common_set)
    return data1.filter(**{att: common_set}), data2.filter(**{att: common_set})

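# A minimal usage sketch (hypothetical variable names; assumes tables carrying
# a 'gene' value per example, as produced by obiGEO with report_genes=True):
#
#   d1, d2 = common_genes(data1, data2)    # keep only the shared genes
#   t1, t2 = common_domain(td1, td2)       # keep only the shared attributes
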
# Normalization

def quantilenorm(data):
    """Normalization of microarray data to obtain the same distribution in all chips."""
    chips = data.domain.attributes
    genes = [str(d["gene"]) for d in data]
    # sort the (value, gene) pairs of each chip independently
    d_sort = {}
    for c in chips:
        dc = [(float(d[c]), str(d["gene"])) for d in data]
        dc.sort()
        d_sort[c.name] = dc
    # replace each value with the mean of the equally ranked values across chips
    d_sort2 = dict([(str(d["gene"]), {}) for d in data])
    for i in range(len(data)):
        genes_c = [(d_sort[c.name][i][1], c.name) for c in chips]
        mean_row = numpy.mean([d_sort[c.name][i][0] for c in chips])
        for g, d in genes_c:
            d_sort2.get(g, {}).update(dict([(d, mean_row)]))
    data_norm = Orange.data.Table(data.domain)
    for i, d in enumerate(data):
        g = str(d["gene"])
        ex = [d_sort2[g][c.name] for c in chips]
        data_norm.append(ex)
        data_norm[i]["gene"] = g
    return data_norm

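# A minimal usage sketch (the GDS id is the one used in the demo block at the
# bottom of this module):
#
#   data = obiGEO.GDS('GDS2666').getdata(report_genes=True, transpose=False)
#   ndata = quantilenorm(data)   # every chip now has the same value distribution
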
def get_mediannorm(modeldata):
    """Return a function for median scale normalization, calibrated on modeldata."""
    globalmedian = numpy.median([float(d[c]) for d in modeldata for c in modeldata.domain.attributes])
    def normalize(data):
        chips = data.domain.attributes
        medians = [numpy.median([float(d[c]) for d in data]) for c in chips]
        data_norm = Orange.data.Table(data.domain)
        data_norm.domain.add_metas(data.domain.get_metas())
        for i, d in enumerate(data):
            # rescale each chip so that its median matches the global median of modeldata
            ex = [(d[c.name].value * globalmedian) / medians[k] for k, c in enumerate(chips)]
            data_norm.append(ex)
            for m in data.domain.get_metas():
                data_norm[i][m] = data[i][m]
        return data_norm
    return normalize

def medianscalenorm(data, newsamples=None, intersection=True):
    """Normalization of microarray data to center the distributions of all chips on the same global median."""
    if not newsamples:
        n = get_mediannorm(data)
        return n(data), None
    else:
        if intersection:
            idata, inewsamples = common_genes(data, newsamples)
        else:
            # without the intersection step, normalize the two sets as given
            idata, inewsamples = data, newsamples
        n = get_mediannorm(idata)
        return n(idata), n(inewsamples)

def normalize(data1, data2=None, type='median'):
    if type == 'quantile':
        ndata1 = quantilenorm(data1)
        return ndata1, medianscalenorm(ndata1, data2)[1] if data2 else None
    elif type == 'median':
        return medianscalenorm(data1, data2)
    else:
        raise ValueError("unknown normalization type: %r" % type)

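# A minimal usage sketch (data1/data2 as in the demo block at the bottom of
# this module):
#
#   ndata1, ndata2 = normalize(data1, data2, type='median')
#   ndata1, _ = normalize(data1, type='quantile')
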
# Gene filtering

def compute_diff_pairs(at_a, d):
    """ Compute the pairwise differences between the replicates. """
    differ = []
    for i in range(len(at_a) - 1):
        for j in range(i + 1, len(at_a)):
            differ.append(log2(d[at_a[i]]) - log2(d[at_a[j]]))
    return differ

def costruct_series(data, attr_set, differences=False):
    """ Construct the time series by averaging the replicates of the same time point. """
    serie = dict([(str(d["gene"]), [numpy.mean([d[at] for at in at_samples])
                    for at_name, at_samples in attr_set]) for d in data])
    if differences:
        # also store the differences between replicates while creating the time series
        differences = []
        for d in data:
            for at_name, at_samples in attr_set:
                differences.extend(compute_diff_pairs(at_samples, d))
        return serie, differences
    else:
        return serie

def costruct_control_series(serie, num_points, control):
    """ Create the control series (for each gene) as a constant profile, equal either
    to the expression at time 0 ('t0') or to the average expression ('avg'). """
    serie_0 = {}
    if control == "avg":
        serie_0.update(dict([(g, [numpy.mean(serie[g]) for i in range(num_points)]) for g in serie]))
    elif control == "t0":
        serie_0.update(dict([(g, [serie[g][0] for i in range(num_points)]) for g in serie]))
    return serie_0

def compute_area(vals, t, baseline=0.):
    return sum(a_pair((t[i], vals[i]), (t[i + 1], vals[i + 1]), baseline)
               for i in xrange(len(vals) - 1))

def a_pair(p1, p2, baseline):
    """Area between the segment through two points (x, y) and the baseline.
    The parts on either side of the baseline both count as positive."""
    x1, y1 = p1
    x2, y2 = p2
    x2 = x2 - x1  # same start
    x1 = 0.0
    a = y1 - baseline
    b = y2 - baseline
    if a * b >= 0:  # both points on the same side: a trapezoid
        return abs(x2 * (b + a) / 2.0)
    else:  # the segment crosses the baseline at xp: two triangles
        xp = -a * x2 / float(y2 - y1)
        return (abs(xp * a) + abs((x2 - xp) * b)) / 2.0

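# A worked example: between p1 = (0, 1) and p2 = (1, -1) with baseline 0 the
# segment crosses the baseline at xp = 0.5, so
#
#   a_pair((0., 1.), (1., -1.), 0.)   # == (|0.5*1| + |0.5*(-1)|)/2 = 0.5
#
# and compute_area sums these pairwise areas over consecutive time points.
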
def uniform_time_scale(attr_set):
    """ Convert all time points to a single unit: the smallest one (among min, h, d) present in the data. """
    ord_measures = ["min", "h", "d"]
    converter = {"min": {"h": 60, "d": 24 * 60}, "h": {"d": 24}}
    measures = list(set([t.split(" ")[1] for t, s in attr_set]))
    if len(measures) == 1:
        time_points = [float(t.split(" ")[0]) for t, s in attr_set]
    else:
        first_measure = min([(ord_measures.index(m), m) for m in measures])[1]
        time_points = [float(t.split(" ")[0]) for t, s in attr_set
                       if t.split(" ")[1] == first_measure]
        time_points.extend([float(t.split(" ")[0]) * converter[first_measure][t.split(" ")[1]]
                            for t, s in attr_set if t.split(" ")[1] != first_measure])
    return time_points

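# A worked example (hypothetical labels): for an attr_set keyed by
# ["0 min", "2 h", "1 d"] the smallest unit present is "min", so the result is
# [0.0, 120.0, 1440.0] (points measured in the smallest unit come first).
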
def AREA(data, attr_set, control='t0', weighted=False, auto=False, perc=99):
    """ AREA (Area Under the Curve) filtering method. """
    from matplotlib.mlab import prctile
    if weighted:
        time_points = uniform_time_scale(attr_set)
    else:
        time_points = range(len(attr_set))

    # Monte Carlo approach to create the null distribution of the areas
    if auto:  # null distribution
        serie, differences = costruct_series(data, attr_set, auto)
        area = []
        for i in range(20000):
            serie_campionata = random.sample(differences, len(time_points))
            area.append(compute_area(serie_campionata, time_points))
        area_threshold = prctile(area, perc)
    else:
        serie = costruct_series(data, attr_set, auto)

    serie_0 = costruct_control_series(serie, len(time_points), control)

    # Gene filtering
    areas = []
    for g in serie:
        diff_s = [log2(serie[g][i]) - log2(serie_0[g][i]) for i in range(len(time_points))]
        area_diff = compute_area(diff_s, time_points)
        if not auto or area_diff > area_threshold:
            areas.append((g, area_diff))
    return areas

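# A minimal usage sketch (the same pattern the scorer class below and the demo
# block at the bottom use; attributes are assumed to carry a 'time' label):
#
#   attr_set = {}
#   for a in data.domain.attributes:
#       attr_set[a.attributes['time']] = attr_set.get(a.attributes['time'], []) + [a.name]
#   scores = AREA(data, sorted(attr_set.items()))   # list of (gene, area) pairs
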
class ExpressionSignificance_AREA(ExpressionSignificance_Test):
    def __call__(self, target=None):
        attr_set = {}
        for a in self.data.domain.attributes:
            attr_set[a.attributes['time']] = attr_set.get(a.attributes['time'], []) + [a.name]
        scores = AREA(self.data, sorted(attr_set.items()))
        gene2ind = dict((g, i) for i, g in enumerate(ex['gene'].value for ex in self.data))
        return [(gene2ind[g], s) for g, s in scores]

def FC(data, attr_set, control='t0', thr=2, auto=False, p_thr=0.2):
    """ Gene filtering based on the number of time points whose fold change (FC)
    against the control series reaches thr. """
    serie = costruct_series(data, attr_set, False)
    num_points = len(serie.values()[0])
    serie_0 = costruct_control_series(serie, num_points, control)
    fc = [(g, len([0 for i in range(num_points) if abs(my_ratio(s[i], serie_0[g][i])) >= thr]))
          for g, s in serie.items()]
    if auto:
        thr_points = round(p_thr * num_points)
        fc = [(g, v) for g, v in fc if v >= thr_points]
    return fc

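# A minimal usage sketch (attr_set built as for AREA above):
#
#   scores = FC(data, sorted(attr_set.items()), thr=2)
#   # -> list of (gene, number of time points with |log2 fold change| >= thr)
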
class ExpressionSignificance_FCts(ExpressionSignificance_Test):
    def __call__(self, target=None):
        attr_set = {}
        for a in self.data.domain.attributes:
            attr_set[a.attributes['time']] = attr_set.get(a.attributes['time'], []) + [a.name]
        scores = FC(self.data, sorted(attr_set.items()))
        return zip(self.keys, map(itemgetter(1), scores))

def spearmanr_filter(data, limit=1000):
    """ Gene filtering by the absolute Spearman rank correlation between each
    expression profile and time. """
    from scipy.stats import spearmanr
    time = [a.attributes['time'] for a in data.domain.attributes]
    exs = sorted(data, reverse=True, key=lambda ex: abs(spearmanr([a.value for a in ex], time)[0]))
    return [str(ex['gene']) for ex in exs[:limit]]

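# A minimal usage sketch (assumes numeric 'time' labels, e.g. after applying
# conttime below):
#
#   top_genes = spearmanr_filter(data, limit=1000)   # names of the 1000 best genes
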
def signed_PCA(data):
    pca = Orange.projection.pca.Pca(data, standardize=False, max_components=1)
    classifier = lambda X: [x[0].value for x in pca(X)]
    predictions = classifier(data)
    classes = [ex.getclass().value for ex in data]
    # count concordant minus discordant (class, projection) pairs to fix the sign
    n = 0
    for i1, c1 in enumerate(classes):
        for i2, c2 in enumerate(classes[:i1]):
            n += cmp(c1, c2) * cmp(predictions[i1], predictions[i2])
    if n < 0:
        # the first principal component is anti-correlated with the class order: flip it
        def invert(X):
            y = classifier(X)
            return -y if type(y) == float else [-x for x in y]
        return invert
    else:
        return classifier

signed_PCA.name = 'PCA'

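# A minimal usage sketch (a transposed data set, so samples are rows and genes
# are attributes, with a class variable giving the sample ordering):
#
#   tdata = obiGEO.transpose(data)
#   classifier = signed_PCA(tdata)
#   proj = classifier(tdata)   # one signed scalar per sample
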
def conttime(data, d):
    """Replace each attribute's 'time' label with its value in the mapping d (see conv below)."""
    for a in data.domain.attributes:
        a.attributes['time'] = d[a.attributes['time']]

def conv(attr_set, ticks=True):
    """Map each time label to a number: its rank if ticks is true, otherwise its value
    converted to a single unit (the smallest one among min, h, d present in the data)."""
    ord_measures = ["min", "h", "d"]
    converter = {"min": {"h": 60, "d": 24 * 60}, "h": {"d": 24}}
    measures = list(set([t.split(" ")[1] for t in attr_set]))
    if len(measures) == 1:
        time_points = [(t, float(t.split(" ")[0])) for t in attr_set]
    else:
        first_measure = min([(ord_measures.index(m), m) for m in measures])[1]
        time_points = [(t, float(t.split(" ")[0])) for t in attr_set if t.split(" ")[1] == first_measure]
        time_points.extend([(t, float(t.split(" ")[0]) * converter[first_measure][t.split(" ")[1]])
                            for t in attr_set if t.split(" ")[1] != first_measure])
    time_points.sort(key=itemgetter(1))
    if ticks:
        time_points = [(t[0], float(i)) for i, t in enumerate(time_points)]
    return dict(time_points)

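# A worked example (hypothetical labels):
#
#   conv(["0 min", "2 h", "1 d"])               # {'0 min': 0.0, '2 h': 1.0, '1 d': 2.0}
#   conv(["0 min", "2 h", "1 d"], ticks=False)  # {'0 min': 0.0, '2 h': 120.0, '1 d': 1440.0}
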
def get_projections(data1, data2=None):
    labels1 = list(a.attributes['time'] for a in data1.domain.attributes)
    tdata1 = obiGEO.transpose(data1)
    if data2:
        labels2 = list('[%s]' % a.attributes['time'] for a in data2.domain.attributes)
        tdata2 = obiGEO.transpose(data2)
        tdata1, tdata2 = common_domain(tdata1, tdata2)
        classifier = signed_PCA(tdata1)
        proj1 = classifier(tdata1)
        proj2 = classifier(tdata2)
    else:
        classifier = signed_PCA(tdata1)
        proj1 = classifier(tdata1)
        proj2, labels2 = [], []
    return proj1, labels1, proj2, labels2

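# A minimal usage sketch (data1/data2 loaded as in the demo block below):
#
#   proj1, labels1, proj2, labels2 = get_projections(data1, data2)
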
############

if False and __name__ == '__main__':
    from . import obiGEO
    # Data set 1
    data1 = obiGEO.GDS('GDS2666').getdata(report_genes=True, transpose=False)
    labels1 = list(a.attributes['time'] for a in data1.domain.attributes)
    attr_set = list(set(a.attributes['time'] for a in data1.domain.attributes))
    convd = conv(attr_set)
    conttime(data1, convd)
    # Data set 2
    data2 = obiGEO.GDS('GDS2667').getdata(report_genes=True, transpose=False)
    labels2 = list(a.attributes['time'] for a in data2.domain.attributes)
    attr_set = list(set(a.attributes['time'] for a in data2.domain.attributes))
    convd = conv(attr_set)
    conttime(data2, convd)
    # Normalize data set 1
    ndata1, _ = normalize(data1, type='quantile')
    # Filtering
    attr_set = {}
    for a in ndata1.domain.attributes:
        attr_set[a.attributes['time']] = attr_set.get(a.attributes['time'], []) + [a.name]
    scores = AREA(ndata1, sorted(attr_set.items()))
    genes = map(itemgetter(0), sorted(scores, key=itemgetter(1))[:1000])
    fdata1 = ndata1.filter(gene=genes)
    # Rescale data set 2
    ttrain, ttest = normalize(fdata1, data2)
    # Model construction and prediction
    # train = obiGEO.transpose(ttrain)
    # test = obiGEO.transpose(ttest)
    # cdtrain, cdtest = common_domain(train, test)
    # classifier = signed_PCA(cdtrain)
    # proj1 = classifier(cdtrain)
    # proj2 = classifier(cdtest)