Changeset 1450:6c2b050c2c2a in orangebioinformatics
 Timestamp:
 07/05/11 16:28:04 (3 years ago)
 Branch:
 default
 Convert:
 ec92f1ec0993ae454fb16bf73b97dcde03cb87a3
 File:

 1 edited
Legend:
 Unmodified
 Added
 Removed

obiDifscale.py
r1448 r1450 10 10 from obiExpression import ExpressionSignificance_Test 11 11 12 13 # Utility functiions 14 15 log2 = lambda x: log(x, 2.) 16 17 def my_ratio(x, y): 18 """ compute the logratio """ 19 return log2(x/y) 20 21 def sign(x): 22 return cmp(x, 0) 23 24 def common_domain(data1, data2): 25 """Use only attributes that are in both data sets""" 26 atts = sorted(set([a.name for a in data1.domain.attributes]).intersection( 27 [a.name for a in data2.domain.attributes])) 28 new1 = Orange.data.Table(Orange.data.Domain(atts + [data1.domain.classVar], data1.domain), data1) 29 new2 = Orange.data.Table(Orange.data.Domain(atts + [data2.domain.classVar], data2.domain), data2) 30 return new1, new2 31 32 def common_genes(data1, data2, att='gene'): 33 common_set = list(set(ex[att].value for ex in data1).intersection(ex[att].value for ex in data2)) 34 print len(set(ex[att].value for ex in data1)) 35 print len(common_set) 36 return data1.filter(**{att: common_set}), data2.filter(**{att: common_set}) 12 37 13 38 # Normalization … … 52 77 return normalize 53 78 54 def medianscalenorm(data, newsamples=None ):79 def medianscalenorm(data, newsamples=None, intersection=True): 55 80 """normalization of microarray data to center the distributions in all chips on the same global median""" 56 n = get_mediannorm(data)57 81 if not newsamples: 82 n = get_mediannorm(data) 58 83 return n(data), None 59 84 else: 60 return n(data), n(newsamples) 85 if intersection: 86 idata, inewsamples = common_genes(data, newsamples) 87 n = get_mediannorm(idata) 88 return n(idata), n(inewsamples) 61 89 62 90 def normalize(data1, data2=None, type='median'): … … 71 99 72 100 # Gene filtering 73 74 log2 = lambda x: log(x, 2.)75 76 def my_ratio(x, y):77 """ compute the logratio """78 return log2(x/y)79 80 def sign(int):81 """ returns the sign of int """82 if(int < 0):83 return 1;84 elif(int > 0):85 return 1;86 else:87 return 0;88 101 89 102 def compute_diff_pairs(at_a, d): … … 218 231 219 232 233 def signed_PCA(data): 234 pca = Orange.projection.pca.Pca(data, standardize=False, max_components=1) 235 classifier = lambda X: [x[0].value for x in pca(X)] 236 predictions = classifier(data) 237 classes = [ex.getclass().value for ex in data] 238 n = 0 239 for i1,c1 in enumerate(classes): 240 for i2,c2 in enumerate(classes[:i1]): 241 n += cmp(c1,c2) * cmp(predictions[i1], predictions[i2]) 242 if n < 0: 243 def invert(X): 244 y = classifier(X) 245 return y if type(y) == float else [x for x in y] 246 return invert 247 else: 248 return classifier 249 250 signed_PCA.name = 'PCA' 251 252 253 def conttime(data, d): 254 for a in data.domain.attributes: 255 a.attributes['time'] = d[a.attributes['time']] 256 257 def conv(attr_set, ticks=True): 258 """Obtain time points with a unique measure (the lowest among [min,h,d]) present in data""" 259 ord_measures = ["min","h","d"] 260 converter = {"min":{"h":60, "d":24*60}, "h": {"d":24}} 261 measures = list(set([t.split(" ")[1] for t in attr_set])) 262 if len(measures) == 1: 263 time_points = [(t, float(t.split(" ")[0])) for t in attr_set] 264 else: 265 first_measure = min([(ord_measures.index(m),m) for m in measures])[1] 266 time_points = [(t, float(t.split(" ")[0])) for t in attr_set if t.split(" ")[1] == first_measure] 267 time_points.extend([(t, float(t.split(" ")[0]) * converter[first_measure][t.split(" ")[1]]) 268 for t in attr_set if t.split(" ")[1] != first_measure]) 269 time_points.sort(key=itemgetter(1)) 270 if ticks: 271 time_points = [(t[0],float(i)) for i,t in enumerate(time_points)] 272 return dict(time_points) 273 274 275 def get_projections(data1, data2=None): 276 labels1 = list(a.attributes['time'] for a in data1.domain.attributes) 277 tdata1 = obiGEO.transpose(data1) 278 if data2: 279 labels2 = list(a.attributes['time'] for a in data2.domain.attributes) 280 tdata2 = obiGEO.transpose(data2) 281 tdata1, tdata2 = common_domain(tdata1, tdata2) 282 classifier = signed_PCA(tdata1) 283 proj1 = classifier(tdata1) 284 proj2 = classifier(tdata2) 285 else: 286 classifier = signed_PCA(tdata1) 287 proj1 = classifier(tdata1) 288 proj2, labels2 = [], [] 289 return proj1, labels1, proj2, labels2 290 291 292 ############ 293 294 if False and __name__ == '__main__': 295 import obiGEO 296 # Data set 1 297 data1 = obiGEO.GDS('GDS2666').getdata(report_genes=True, transpose=False) 298 labels1 = list(a.attributes['time'] for a in data1.domain.attributes) 299 attr_set = list(set(a.attributes['time'] for a in data1.domain.attributes)) 300 convd = conv(attr_set) 301 conttime(data1, convd) 302 # Data set 2 303 data2 = obiGEO.GDS('GDS2667').getdata(report_genes=True, transpose=False) 304 labels2 = list(a.attributes['time'] for a in data2.domain.attributes) 305 attr_set = list(set(a.attributes['time'] for a in data2.domain.attributes)) 306 convd = conv(attr_set) 307 conttime(data2, convd) 308 # Normalize data set 1 309 ndata1, _ = normalize(data1, type='quantile') 310 # Filtering 311 attr_set = {} 312 for a in ndata1.domain.attributes: 313 attr_set[a.attributes['time']] = attr_set.get(a.attributes['time'], []) + [a.name] 314 scores = AREA(ndata1, sorted(attr_set.items())) 315 genes = map(itemgetter(0), sorted(scores, key=itemgetter(1))[:1000]) 316 fdata1 = ndata1.filter(gene=genes) 317 # Rescale data set 2 318 ttrain, ttest = normalize(fdata1, data2) 319 # Model construction and prediction 320 # train = obiGEO.transpose(ttrain) 321 # test = obiGEO.transpose(ttest) 322 # cdtrain, cdtest = common_domain(train, test) 323 # classifier = signed_PCA(cdtrain) 324 # proj1 = classifier(cdtrain) 325 # proj2 = classifier(cdtest) 326 327 328
Note: See TracChangeset
for help on using the changeset viewer.