#
source:
orange-bioinformatics/orangecontrib/bio/obiDifscale.py
@
1873:0810c5708cc5

Revision 1873:0810c5708cc5, 12.7 KB checked in by Ales Erjavec <ales.erjavec@…>, 7 months ago (diff) |
---|

Rev | Line | |
---|---|---|

[1632] | 1 | from __future__ import absolute_import |

2 | ||

[1442] | 3 | import random |

4 | from math import log | |

[1448] | 5 | from operator import itemgetter |

[1442] | 6 | |

7 | import numpy | |

8 | ||

9 | import Orange | |

10 | ||

[1632] | 11 | from . import obiGEO |

12 | from .obiExpression import ExpressionSignificance_Test | |

[1442] | 13 | |

[1450] | 14 | # Utility functions |

15 | ||

def log2(x):
    """Return the base-2 logarithm of x.

    PEP 8: a named function instead of a lambda bound to a name.
    """
    return log(x, 2.)

17 | ||

def my_ratio(x, y):
    """ compute the log-ratio """
    # log2(x / y), written directly in terms of math.log.
    return log(x / y, 2.)

21 | ||

def sign(x):
    """Return -1, 0 or 1 according to the sign of x.

    Portable replacement for cmp(x, 0); the builtin cmp() was removed
    in Python 3.
    """
    return (x > 0) - (x < 0)

24 | ||

def common_domain(data1, data2):
    """Use only attributes that are in both data sets"""
    names1 = set(a.name for a in data1.domain.attributes)
    names2 = [a.name for a in data2.domain.attributes]
    # Shared attribute names, sorted for a deterministic domain order.
    atts = sorted(names1.intersection(names2))
    domain1 = Orange.data.Domain(atts + [data1.domain.classVar], data1.domain)
    domain2 = Orange.data.Domain(atts + [data2.domain.classVar], data2.domain)
    new1 = Orange.data.Table(domain1, data1)
    new2 = Orange.data.Table(domain2, data2)
    return new1, new2

32 | ||

def common_genes(data1, data2, att='gene'):
    """Filter both data sets down to the genes present in each.

    Genes are matched on the `att` meta value.  Returns the filtered
    (data1, data2) pair.  The print() calls (function form, so the code
    also runs on Python 3) report how many genes survive the
    intersection.
    """
    common_set = list(set(ex[att].value for ex in data1).intersection(ex[att].value for ex in data2))
    print(len(set(ex[att].value for ex in data1)))
    print(len(common_set))
    return data1.filter(**{att: common_set}), data2.filter(**{att: common_set})

38 | ||

[1442] | 39 | # Normalization |

40 | ||

def quantilenorm(data):
    """normalization of microarray data to obtain the same distribution in all chips

    Classic quantile normalization: values occupying the same rank on
    each chip are all replaced by the mean of that rank's values across
    chips.  Returns a new table; gene labels are preserved.
    """
    chips = data.domain.attributes
    # Per chip: expression values sorted ascending, tagged with gene name.
    d_sort = {}
    for c in chips:
        d_sort[c.name] = sorted((float(d[c]), str(d["gene"])) for d in data)
    # gene -> {chip name -> normalized value}; every gene gets an entry.
    d_sort2 = dict((str(d["gene"]), {}) for d in data)
    for i in range(len(data)):
        # Genes holding rank i on each chip, and the mean of rank-i values.
        genes_c = [(d_sort[c.name][i][1], c.name) for c in chips]
        mean_row = numpy.mean([d_sort[c.name][i][0] for c in chips])
        for g, chip_name in genes_c:
            # Direct assignment: d_sort2 was built with a key for every
            # gene, so the original .get(g, {}).update(...) (which would
            # silently discard updates on a missing key) is unnecessary.
            d_sort2[g][chip_name] = mean_row
    # Rebuild a table with the normalized values.
    data_norm = Orange.data.Table(data.domain)
    for i, d in enumerate(data):
        g = str(d["gene"])
        data_norm.append([d_sort2[g][c.name] for c in chips])
        data_norm[i]["gene"] = g
    return data_norm

63 | ||

def get_mediannorm(modeldata):
    """returns the function for median scale normalization"""
    # Median over ALL expression values of the model data; the returned
    # closure rescales other data sets onto this global median.
    globalmedian = numpy.median([float(d[c]) for d in modeldata for c in modeldata.domain.attributes])
    def normalize(data):
        chips = data.domain.attributes
        # Per-chip median of the data being normalized.
        medians = [numpy.median([float(d[c]) for d in data]) for c in chips]
        data_norm = Orange.data.Table(data.domain)
        data_norm.domain.add_metas(data.domain.get_metas())
        for i, d in enumerate(data):
            # Scale each value so that its chip's median maps onto the
            # global median of the model data.
            ex = [(d[c.name].value*globalmedian)/medians[k] for k,c in enumerate(chips)]
            data_norm.append(ex)
            # Copy meta attributes (e.g. the gene name) to the new row.
            for m in data.domain.get_metas():
                data_norm[i][m] = data[i][m]
        return data_norm
    return normalize

79 | ||

def medianscalenorm(data, newsamples=None, intersection=True):
    """normalization of microarray data to center the distributions in all chips on the same global median

    Returns (normalized data, normalized newsamples) — the second item is
    None when no newsamples are given.  With intersection=True both sets
    are first restricted to their common genes.

    Bug fix: the original fell through with an undefined `idata` (a
    NameError) when newsamples was given together with intersection=False;
    that case now normalizes the sets as-is.
    """
    if not newsamples:
        n = get_mediannorm(data)
        return n(data), None
    if intersection:
        # Restrict both sets to their common genes before fitting.
        data, newsamples = common_genes(data, newsamples)
    n = get_mediannorm(data)
    return n(data), n(newsamples)

[1442] | 90 | |

def normalize(data1, data2=None, type='median'):
    """Normalize data1 and optionally rescale data2 against it.

    type='quantile': quantile-normalize data1; data2 (if given) is then
    median-rescaled against the normalized data1.
    type='median':   median scale normalization of both.

    Raises ValueError for an unknown type.  (The original `return Error`
    referenced an undefined name and raised an accidental NameError.)
    """
    if type == 'quantile':
        ndata1 = quantilenorm(data1)
        return ndata1, medianscalenorm(ndata1, data2)[1] if data2 else None
    elif type == 'median':
        return medianscalenorm(data1, data2)
    else:
        raise ValueError("unknown normalization type: %r" % (type,))

99 | ||

100 | ||

101 | # Gene filtering | |

102 | ||

def compute_diff_pairs(at_a, d):
    """ computes the pairwise differences between the replicates """
    # log2-difference for every unordered pair (i < j) of replicates.
    differ = []
    n = len(at_a)
    for i in range(n - 1):
        for j in range(i + 1, n):
            differ.append(log(d[at_a[i]], 2.) - log(d[at_a[j]], 2.))
    return differ

110 | ||

def costruct_series(data, attr_set, differences=False):
    """ Constructs the time series by averaging the replicates of the same time point

    `attr_set` is a list of (time_label, [attribute names]) pairs.
    Returns a dict gene -> list of per-time-point means.  With
    differences=True, also returns the flat list of pairwise replicate
    differences, as (serie, differences).

    The original built the differences via a side-effect list
    comprehension (`r = [[differences.extend(...)]]`); replaced with an
    explicit loop.
    """
    serie = dict((str(d["gene"]),
                  [numpy.mean([d[at] for at in at_samples])
                   for at_name, at_samples in attr_set]) for d in data)
    if differences:
        # Collect pairwise differences between replicates of each time
        # point, across all rows (used for the Monte Carlo null model).
        diffs = []
        for d in data:
            for at_name, at_samples in attr_set:
                diffs.extend(compute_diff_pairs(at_samples, d))
        return serie, diffs
    return serie

123 | ||

def costruct_control_series(serie, num_points, control):
    """ Creation of the control series (for each gene) as a constant profile equal to expression at time 0 """
    # 'avg': constant profile at the gene's mean expression.
    # 't0':  constant profile at the gene's time-0 expression.
    # Any other value yields an empty dict (unchanged behaviour).
    serie_0 = {}
    if control == "avg":
        for g, values in serie.items():
            serie_0[g] = [numpy.mean(values)] * num_points
    elif control == "t0":
        for g, values in serie.items():
            serie_0[g] = [values[0]] * num_points
    return serie_0

133 | ||

def compute_area(vals, t, baseline=0.):
    """Total unsigned area between the piecewise-linear curve through
    (t[i], vals[i]) and the horizontal baseline.

    Sums a_pair over consecutive point pairs.  `range` replaces the
    Python-2-only `xrange` (identical semantics here, and required on
    Python 3).
    """
    return sum(a_pair((t[i], vals[i]), (t[i + 1], vals[i + 1]), baseline)
               for i in range(len(vals) - 1))

[1442] | 137 | |

def a_pair(p1, p2, baseline):
    """Area under the line bounded by a pair of two points (x,y) with
    respect to baseline. Both parts around the diagonal are
    positive."""
    x1, y1 = p1
    x2, y2 = p2
    width = x2 - x1  # shift so the segment starts at x = 0
    a = y1 - baseline
    b = y2 - baseline
    if a * b >= 0:
        # Both endpoints on the same side of the baseline: a trapezoid.
        return abs(width * (b + a) / 2.0)
    # Segment crosses the baseline at x = xp: sum the two triangles.
    xp = -a * width / float(y2 - y1)
    return (abs(xp * a) + abs((width - xp) * b)) / 2.0

[1442] | 153 | |

def uniform_time_scale(attr_set):
    """ Obtains time points with a unique measure (the lowest among [min,h,d]) present in data"""
    order = ["min", "h", "d"]
    to_base = {"min": {"h": 60, "d": 24 * 60}, "h": {"d": 24}}
    # Each label is "<value> <unit>", e.g. "30 min".
    parsed = [(float(t.split(" ")[0]), t.split(" ")[1]) for t, s in attr_set]
    measures = set(m for v, m in parsed)
    if len(measures) == 1:
        return [v for v, m in parsed]
    # Convert everything to the smallest unit present; points in the base
    # unit come first, then the converted ones (original ordering).
    base = min(measures, key=order.index)
    time_points = [v for v, m in parsed if m == base]
    time_points.extend(v * to_base[base][m] for v, m in parsed if m != base)
    return time_points

168 | ||

def AREA(data, attr_set, control='t0', weighted=False, auto=False, perc=99):
    """ AREA (Area Under the Curve) filtering method

    Scores each gene by the area between its log2 time-series profile and
    a constant control profile (`control`: 't0' or 'avg').  With
    weighted=True the real time scale is used; otherwise time points are
    equally spaced ticks.  With auto=True a Monte Carlo null distribution
    of areas is sampled from the replicate differences, and only genes
    whose area exceeds its `perc`-th percentile are returned.

    Returns a list of (gene, area) pairs.
    """
    if weighted:
        time_points = uniform_time_scale(attr_set)
    else:
        time_points = range(len(attr_set))

    # Monte Carlo approach to create the null distribution of the areas
    if auto:  # null distribution
        serie, differences = costruct_series(data, attr_set, auto)
        area = []
        for i in range(20000):
            sampled = random.sample(differences, len(time_points))
            area.append(compute_area(sampled, time_points))
        # numpy.percentile replaces matplotlib.mlab.prctile, which was
        # removed from matplotlib (deprecated in 2.2, gone since 3.1).
        area_threshold = numpy.percentile(area, perc)
    else:
        serie = costruct_series(data, attr_set, auto)

    serie_0 = costruct_control_series(serie, len(time_points), control)

    # Gene filtering
    areas = []
    for g in serie:
        diff_s = [log2(serie[g][i]) - log2(serie_0[g][i]) for i in range(len(time_points))]
        area_diff = compute_area(diff_s, time_points)
        if not auto or area_diff > area_threshold:
            areas.append((g, area_diff))
    return areas

[1442] | 199 | |

class ExpressionSignificance_AREA(ExpressionSignificance_Test):
    """Scoring wrapper that ranks genes with the AREA filtering method."""
    def __call__(self, target=None):
        # Group attribute (chip) names by their 'time' annotation:
        # {time_label: [names of attributes measured at that time]}.
        attr_set = {}
        for a in self.data.domain.attributes:
            attr_set[a.attributes['time']] = attr_set.get(a.attributes['time'], []) + [a.name]
        # (gene, area) pairs, time points sorted by their labels.
        scores = AREA(self.data, sorted(attr_set.items()))
        # Map gene names back to row indices of self.data.
        gene2ind = dict((g, i) for i,g in enumerate(ex['gene'].value for ex in self.data))
        return [(gene2ind[g], s) for g, s in scores]

[1448] | 208 | |

def FC(data, attr_set, control='t0', thr=2, auto=False, p_thr=0.2):
    """ Gene filtering based on the number of FC of all time points with the control series > thr

    Returns (gene, count) pairs where count is the number of time points
    whose |log-ratio| against the control series reaches `thr`.  With
    auto=True only genes exceeding the threshold at >= p_thr of the
    points are kept.
    """
    serie = costruct_series(data, attr_set, False)
    # Profile length == number of time points.  next(iter(...)) replaces
    # serie.values()[0], which breaks on Python 3 dict views.
    num_points = len(next(iter(serie.values())))
    serie_0 = costruct_control_series(serie, num_points, control)
    fc = [(g, sum(1 for i in range(num_points)
                  if abs(my_ratio(s[i], serie_0[g][i])) >= thr))
          for g, s in serie.items()]
    if auto:
        thr_points = round(p_thr * num_points)
        fc = [(g, v) for g, v in fc if v >= thr_points]
    return fc

[1442] | 220 | |

class ExpressionSignificance_FCts(ExpressionSignificance_Test):
    """Scoring wrapper that ranks genes with the fold-change (FC) filter."""
    def __call__(self, target=None):
        # Group attribute (chip) names by their 'time' annotation.
        attr_set = {}
        for a in self.data.domain.attributes:
            attr_set[a.attributes['time']] = attr_set.get(a.attributes['time'], []) + [a.name]
        scores = FC(self.data, sorted(attr_set.items()))
        # NOTE(review): scores are zipped positionally with self.keys —
        # presumably FC yields genes in the same order; verify.
        return zip(self.keys, map(itemgetter(1), scores))

228 | ||

def spearmanr_filter(data, limit=1000):
    """ Spearman ranks gene filtering """
    from scipy.stats import spearmanr
    time = [a.attributes['time'] for a in data.domain.attributes]
    # Rank rows by the absolute Spearman correlation of their expression
    # profile with the time annotation; keep the `limit` strongest.
    def abs_corr(ex):
        return abs(spearmanr([a.value for a in ex], time)[0])
    ranked = sorted(data, key=abs_corr, reverse=True)
    return [str(ex['gene']) for ex in ranked[:limit]]

235 | ||

236 | ||

def signed_PCA(data):
    """Projection onto the first principal component, with the sign fixed
    so that the projection correlates positively with the class values.

    Returns a callable mapping a data table to a list of floats.  The
    sign is decided by summing, over all example pairs, the product of
    the class ordering and the projection ordering; a negative sum means
    the axis is flipped.
    """
    pca = Orange.projection.linear.PCA(data, standardize=False, max_components=1)

    def _cmp(a, b):
        # Portable replacement for the builtin cmp(), removed in Python 3.
        return (a > b) - (a < b)

    def classifier(X):
        return [x[0].value for x in pca(X)]

    predictions = classifier(data)
    classes = [ex.getclass().value for ex in data]
    n = 0
    for i1, c1 in enumerate(classes):
        for i2, c2 in enumerate(classes[:i1]):
            n += _cmp(c1, c2) * _cmp(predictions[i1], predictions[i2])
    if n < 0:
        def invert(X):
            y = classifier(X)
            return -y if type(y) == float else [-x for x in y]
        return invert
    else:
        return classifier

signed_PCA.name = 'PCA'

255 | ||

256 | ||

def conttime(data, d):
    """Remap every attribute's 'time' annotation through the lookup
    table d, in place."""
    for attr in data.domain.attributes:
        attr.attributes['time'] = d[attr.attributes['time']]

260 | ||

def conv(attr_set, ticks=True):
    """Obtain time points with a unique measure (the lowest among [min,h,d]) present in data"""
    order = ["min", "h", "d"]
    to_base = {"min": {"h": 60, "d": 24 * 60}, "h": {"d": 24}}
    measures = set(t.split(" ")[1] for t in attr_set)
    if len(measures) == 1:
        points = [(t, float(t.split(" ")[0])) for t in attr_set]
    else:
        # Convert every label into the smallest unit present.
        base = min(measures, key=order.index)
        points = [(t, float(t.split(" ")[0])) for t in attr_set
                  if t.split(" ")[1] == base]
        points.extend((t, float(t.split(" ")[0]) * to_base[base][t.split(" ")[1]])
                      for t in attr_set if t.split(" ")[1] != base)
    points.sort(key=itemgetter(1))
    if ticks:
        # Replace the numeric value with the rank (0, 1, 2, ...).
        points = [(label, float(i)) for i, (label, value) in enumerate(points)]
    return dict(points)

277 | ||

278 | ||

def get_projections(data1, data2=None):
    """Project one or two data sets onto the sign-fixed first principal
    component of (transposed) data1.

    Returns (proj1, labels1, proj2, labels2); proj2/labels2 are empty
    when no second data set is given.
    """
    labels1 = [a.attributes['time'] for a in data1.domain.attributes]
    tdata1 = obiGEO.transpose(data1)
    if not data2:
        classifier = signed_PCA(tdata1)
        return classifier(tdata1), labels1, [], []
    labels2 = ['[%s]' % a.attributes['time'] for a in data2.domain.attributes]
    tdata2 = obiGEO.transpose(data2)
    # Restrict both transposed sets to their shared attributes before
    # fitting the projection on data set 1.
    tdata1, tdata2 = common_domain(tdata1, tdata2)
    classifier = signed_PCA(tdata1)
    return classifier(tdata1), labels1, classifier(tdata2), labels2

294 | ||

295 | ||

############

# Demo pipeline; deliberately disabled by the `False and` guard, kept as
# a usage example of the module's normalization/filtering/projection API.
if False and __name__ == '__main__':
    from . import obiGEO
    # Data set 1
    data1 = obiGEO.GDS('GDS2666').getdata(report_genes=True, transpose=False)
    labels1 = list(a.attributes['time'] for a in data1.domain.attributes)
    attr_set = list(set(a.attributes['time'] for a in data1.domain.attributes))
    # Rewrite the 'time' annotations as numeric ticks.
    convd = conv(attr_set)
    conttime(data1, convd)
    # Data set 2
    data2 = obiGEO.GDS('GDS2667').getdata(report_genes=True, transpose=False)
    labels2 = list(a.attributes['time'] for a in data2.domain.attributes)
    attr_set = list(set(a.attributes['time'] for a in data2.domain.attributes))
    convd = conv(attr_set)
    conttime(data2, convd)
    # Normalize data set 1
    ndata1, _ = normalize(data1, type='quantile')
    # Filtering: group chips by time point, score genes with AREA, and
    # keep the 1000 lowest-scoring genes.
    attr_set = {}
    for a in ndata1.domain.attributes:
        attr_set[a.attributes['time']] = attr_set.get(a.attributes['time'], []) + [a.name]
    scores = AREA(ndata1, sorted(attr_set.items()))
    genes = map(itemgetter(0), sorted(scores, key=itemgetter(1))[:1000])
    fdata1 = ndata1.filter(gene=genes)
    # Rescale data set 2
    ttrain, ttest = normalize(fdata1, data2)
    # Model construction and prediction
    # train = obiGEO.transpose(ttrain)
    # test = obiGEO.transpose(ttest)
    # cdtrain, cdtest = common_domain(train, test)
    # classifier = signed_PCA(cdtrain)
    # proj1 = classifier(cdtrain)
    # proj2 = classifier(cdtest)

332 |

**Note:**See TracBrowser for help on using the repository browser.