Changeset 1446:d78fec13157d in orangebioinformatics
 Timestamp:
 07/01/11 13:25:59 (3 years ago)
 Branch:
 default
 Convert:
 f7d5aad0df5db2d5681517cbce3c981cb4f7e406
 File:

 1 edited
Legend:
 Unmodified
 Added
 Removed

obiDifscale.py
r1442 r1446 12 12 13 13 def quantilenorm(data): 14 """ normalization of microarray data to obtain the same distribution in all chips"""14 """normalization of microarray data to obtain the same distribution in all chips""" 15 15 chips = data.domain.attributes 16 16 genes = [str(d["gene"]) for d in data] … … 35 35 36 36 def get_mediannorm(modeldata): 37 """ returns the function for median scale normalization"""37 """returns the function for median scale normalization""" 38 38 globalmedian = numpy.median([float(d[c]) for d in modeldata for c in modeldata.domain.attributes]) 39 39 def normalize(data): … … 41 41 medians = [numpy.median([float(d[c]) for d in data]) for c in chips] 42 42 data_norm = Orange.data.Table(data.domain) 43 data_norm.domain.add_metas(data.domain.get_metas()) 43 44 for i, d in enumerate(data): 44 g = str(d["gene"])45 45 ex = [(d[c.name].value*globalmedian)/medians[k] for k,c in enumerate(chips)] 46 46 data_norm.append(ex) 47 data_norm[i]["gene"] = g 47 for m in data.domain.get_metas(): 48 data_norm[i][m] = data[i][m] 48 49 return data_norm 49 50 return normalize 50 51 51 52 def medianscalenorm(data, newsamples=None): 52 """ normalization of microarray data to obtain the distributions in all chips centered on the same global median"""53 """normalization of microarray data to center the distributions in all chips on the same global median""" 53 54 n = get_mediannorm(data) 54 55 if not newsamples: … … 92 93 return differ 93 94 94 def costruct_series(data, attr_set, f):95 def costruct_series(data, attr_set, differences=False): 95 96 """ Constructs the time series by averaging the replicates of the same time point """ 96 ##print "creating the time series..."97 serie = dict([(str(d["gene"]), [numpy.mean([d[at] for at in at_samples])for at_name, at_samples in attr_set]) for d in data])98 if f == "AREA":99 """ for AREA filtering, store the differences between replicates while creating the time series"""97 serie = dict([(str(d["gene"]), [numpy.mean([d[at] for at in at_samples]) 98 for at_name, at_samples in attr_set]) for d in data]) 99 if differences: 100 """store the differences between replicates while creating the time series""" 100 101 differences = [] 101 r = [[differences.extend(compute_diff_pairs(at_samples, d)) for at_name, at_samples in attr_set] for d in data] 102 r = [[differences.extend(compute_diff_pairs(at_samples, d)) 103 for at_name, at_samples in attr_set] for d in data] 102 104 return serie, differences 103 105 else: … … 116 118 def compute_area(serie, tempi): 117 119 """ Function that calculates the area between 2 time series """ 118 # if the ith point and the next point have the same sign it computes the area by the 119 # integral with the trapezoidal method, else it computes the area of the two triangles found with linear interpolation 120 # if the ith point and the next point have the same sign it computes the 121 # area by the integral with the trapezoidal method, else it computes the 122 # area of the two triangles found with linear interpolation 120 123 area_pieces = [] 121 124 for i in range(len(serie)1): … … 123 126 area_pieces.append(abs(numpy.trapz([serie[i], serie[i+1]], [tempi[i], tempi[i+1]]))) 124 127 else: 125 area_triangle_1 = float(serie[i]*(tempi[i+1]tempi[i]))/(serie[i]serie[i+1])*abs(serie[i])/2 126 area_triangle_2 = float(serie[i+1]*(tempi[i]tempi[i+1]))/(serie[i]serie[i+1])*abs(serie[i+1])/2 128 area_triangle_1 = float(serie[i] * (tempi[i+1]  tempi[i])) / \ 129 (serie[i]  serie[i+1]) * abs(serie[i]) / 2 130 area_triangle_2 = float(serie[i+1] * (tempi[i]  tempi[i+1])) / \ 131 (serie[i]  serie[i+1]) * abs(serie[i+1]) / 2 127 132 area_pieces.append(area_triangle_1 + area_triangle_2) 128 133 … … 139 144 else: 140 145 first_measure = min([(ord_measures.index(m),m) for m in measures])[1] 141 time_points = [float(t.split(" ")[0]) for t,s in attr_set if t.split(" ")[1] == first_measure] 142 time_points.extend([float(t.split(" ")[0])*converter[first_measure][t.split(" ")[1]] for t,s in attr_set if t.split(" ")[1] != first_measure]) 146 time_points = [float(t.split(" ")[0]) for t, s in attr_set 147 if t.split(" ")[1] == first_measure] 148 time_points.extend([float(t.split(" ")[0]) * converter[first_measure][t.split(" ")[1]] 149 for t, s in attr_set if t.split(" ")[1] != first_measure]) 143 150 return time_points 144 151 145 def AREA(data, attr_set, perc, control, scale, limit):152 def AREA(data, attr_set, control='t0', weighted=False, auto=False, perc=99): 146 153 """ AREA (Area Under the Curve) filtering method """ 147 if scale == "weighted":154 if weighted: 148 155 time_points = uniform_time_scale(attr_set) 149 156 else: 150 157 time_points = range(len(attr_set)) 151 158 152 serie, differences = costruct_series(data, attr_set, "AREA") 159 # Monte Carlo approach to create the null distribution of the areas 160 if auto: # null distribution 161 serie, differences = costruct_series(data, attr_set, auto) 162 serie_campionata = {} 163 area = [] 164 for i in range(20000): 165 serie_campionata = random.sample(differences, len(time_points)) 166 area.append(compute_area(serie_campionata, time_points)) 167 area_threshold = prctile(area, perc) 168 else: 169 serie = costruct_series(data, attr_set, auto) 170 153 171 serie_0 = costruct_control_series(serie, len(time_points), control) 154 172 155 """ Monte Carlo approach to create the null distribution of the areas """ 156 ##print "null distribution..." 157 serie_campionata = {} 158 area = [] 159 for i in range(20000): 160 serie_campionata = random.sample(differences, len(time_points)); 161 area.append(compute_area(serie_campionata, time_points)); 162 163 area_soglia = prctile(area,perc); 164 165 166 """ Gene filtering based on the threshold (selected percentile of null distribution of areas) """ 167 ##print "filtering..." 168 diff_expr = [] 173 # Gene filtering 169 174 areas = [] 170 175 for g in serie: 171 176 diff_s = [log2(serie[g][i])  log2(serie_0[g][i]) for i in range(len(time_points))] 172 177 area_diff = compute_area(diff_s, time_points); 173 areas.append((area_diff,g)) 174 if area_diff > area_soglia: 175 diff_expr.append(g) 176 areas.sort(reverse = True) 177 if limit == 0: 178 de_genes = diff_expr 179 else: 180 de_genes = [d[1] for d in areas][:limit] 181 182 ##print "%d genes differentially expressed" %len(de_genes) 183 return de_genes 184 185 def FC(data, attr_set, thr, p_thr, control, limit): 178 if not auto or area_diff > area_threshold: 179 areas.append((g, area_diff)) 180 return areas 181 182 def FC(data, attr_set, control='t0', thr=2, auto=False, p_thr=0.2): 186 183 """ Gene filtering based on the number of FC of all time points with the control series > thr """ 187 serie = costruct_series(data, attr_set, "fc")184 serie = costruct_series(data, attr_set, False) 188 185 num_points = len(serie.values()[0]) 189 186 serie_0 = costruct_control_series(serie, num_points, control) 190 thr_points = round(p_thr*num_points) 191 ##print "filtering..." 192 if limit == 0: 193 de_genes = [gene for gene,s in serie.items() if len([True for i in range(num_points) if abs(my_ratio(s[i], serie_0[gene][i])) >= log2(thr)]) >= thr_points] 194 else: 195 fc = [(len([True for i in range(num_points) if abs(my_ratio(s[i], serie_0[gene][i])) >= log2(thr)]),gene) for gene,s in serie.items()] 196 fc.sort(reverse = True) 197 de_genes = [d[1] for d in fc][:limit] 198 199 ##print "%d genes differentially expressed" %len(de_genes) 200 return de_genes 187 fc = [(g, len([0 for i in range(num_points) if abs(my_ratio(s[i], serie_0[g][i])) >= thr])) 188 for g, s in serie.items()] 189 if auto: 190 thr_points = round(p_thr*num_points) 191 fc = [(g, v) for g, v in fc if v >= thr_points] 192 return fc 201 193 202 194 def spearmanr_filter(data, limit=1000):
Note: See TracChangeset
for help on using the changeset viewer.