Changeset 1446:d78fec13157d in orange-bioinformatics


Ignore:
Timestamp:
07/01/11 13:25:59 (3 years ago)
Author:
lanz <lan.zagar@…>
Branch:
default
Convert:
f7d5aad0df5db2d5681517cbce3c981cb4f7e406
Message:

Changes to methods.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • obiDifscale.py

    r1442 r1446  
    1212 
    1313def quantilenorm(data): 
    14     """ normalization of microarray data to obtain the same distribution in all chips """ 
     14    """normalization of microarray data to obtain the same distribution in all chips""" 
    1515    chips = data.domain.attributes 
    1616    genes = [str(d["gene"]) for d in data] 
     
    3535 
    3636def get_mediannorm(modeldata): 
    37     """ returns the function for median scale normalization """ 
     37    """returns the function for median scale normalization""" 
    3838    globalmedian = numpy.median([float(d[c]) for d in modeldata for c in modeldata.domain.attributes]) 
    3939    def normalize(data): 
     
    4141        medians = [numpy.median([float(d[c]) for d in data]) for c in chips] 
    4242        data_norm = Orange.data.Table(data.domain) 
     43        data_norm.domain.add_metas(data.domain.get_metas()) 
    4344        for i, d in enumerate(data): 
    44             g = str(d["gene"]) 
    4545            ex = [(d[c.name].value*globalmedian)/medians[k] for k,c in enumerate(chips)] 
    4646            data_norm.append(ex) 
    47             data_norm[i]["gene"] = g 
     47            for m in data.domain.get_metas(): 
     48                data_norm[i][m] = data[i][m] 
    4849        return data_norm 
    4950    return normalize 
    5051 
    5152def medianscalenorm(data, newsamples=None): 
    52     """ normalization of microarray data to obtain the distributions in all chips centered on the same global median """ 
     53    """normalization of microarray data to center the distributions in all chips on the same global median""" 
    5354    n = get_mediannorm(data) 
    5455    if not newsamples: 
     
    9293    return differ 
    9394 
    94 def costruct_series(data, attr_set, f): 
     95def costruct_series(data, attr_set, differences=False): 
    9596    """ Constructs the time series by averaging the replicates of the same time point """ 
    96     ##print "creating the time series..." 
    97     serie = dict([(str(d["gene"]), [numpy.mean([d[at] for at in at_samples]) for at_name, at_samples in attr_set]) for d in data]) 
    98     if f == "AREA": 
    99         """ for AREA filtering, store the differences between replicates while creating the time series """ 
     97    serie = dict([(str(d["gene"]), [numpy.mean([d[at] for at in at_samples]) 
     98                    for at_name, at_samples in attr_set]) for d in data]) 
     99    if differences: 
     100        """store the differences between replicates while creating the time series""" 
    100101        differences = [] 
    101         r = [[differences.extend(compute_diff_pairs(at_samples, d)) for at_name, at_samples in attr_set] for d in data] 
     102        r = [[differences.extend(compute_diff_pairs(at_samples, d)) 
     103              for at_name, at_samples in attr_set] for d in data] 
    102104        return serie, differences 
    103105    else: 
     
    116118def compute_area(serie, tempi): 
    117119    """ Function that calculates the area between 2 time series """ 
    118     # if the ith point and the next point have the same sign it computes the area by the 
    119     # integral with the trapezoidal method, else it computes the area of the two triangles found with linear interpolation 
     120    # if the ith point and the next point have the same sign it computes the 
     121    # area by the integral with the trapezoidal method, else it computes the 
     122    # area of the two triangles found with linear interpolation 
    120123    area_pieces = [] 
    121124    for i in range(len(serie)-1): 
     
    123126            area_pieces.append(abs(numpy.trapz([serie[i], serie[i+1]], [tempi[i], tempi[i+1]]))) 
    124127        else: 
    125             area_triangle_1 = float(serie[i]*(tempi[i+1]-tempi[i]))/(serie[i]-serie[i+1])*abs(serie[i])/2 
    126             area_triangle_2 = float(serie[i+1]*(tempi[i]-tempi[i+1]))/(serie[i]-serie[i+1])*abs(serie[i+1])/2 
     128            area_triangle_1 = float(serie[i] * (tempi[i+1] - tempi[i])) / \ 
     129                              (serie[i] - serie[i+1]) * abs(serie[i]) / 2 
     130            area_triangle_2 = float(serie[i+1] * (tempi[i] - tempi[i+1])) / \ 
     131                              (serie[i] - serie[i+1]) * abs(serie[i+1]) / 2 
    127132            area_pieces.append(area_triangle_1 + area_triangle_2) 
    128133 
     
    139144    else: 
    140145        first_measure = min([(ord_measures.index(m),m) for m in measures])[1] 
    141         time_points = [float(t.split(" ")[0]) for t,s in attr_set if t.split(" ")[1] == first_measure] 
    142         time_points.extend([float(t.split(" ")[0])*converter[first_measure][t.split(" ")[1]] for t,s in attr_set if t.split(" ")[1] != first_measure]) 
     146        time_points = [float(t.split(" ")[0]) for t, s in attr_set 
     147                       if t.split(" ")[1] == first_measure] 
     148        time_points.extend([float(t.split(" ")[0]) * converter[first_measure][t.split(" ")[1]] 
     149                            for t, s in attr_set if t.split(" ")[1] != first_measure]) 
    143150    return time_points 
    144151 
    145 def AREA(data, attr_set, perc, control, scale, limit): 
     152def AREA(data, attr_set, control='t0', weighted=False, auto=False, perc=99): 
    146153    """ AREA (Area Under the Curve) filtering method """ 
    147     if scale == "weighted": 
     154    if weighted: 
    148155        time_points = uniform_time_scale(attr_set) 
    149156    else: 
    150157        time_points = range(len(attr_set)) 
    151158 
    152     serie, differences = costruct_series(data, attr_set, "AREA") 
     159    # Monte Carlo approach to create the null distribution of the areas 
     160    if auto: # null distribution 
     161        serie, differences = costruct_series(data, attr_set, auto) 
     162        serie_campionata = {} 
     163        area = [] 
     164        for i in range(20000): 
     165            serie_campionata = random.sample(differences, len(time_points)) 
     166            area.append(compute_area(serie_campionata, time_points)) 
     167        area_threshold = prctile(area, perc) 
     168    else: 
     169        serie = costruct_series(data, attr_set, auto) 
     170 
    153171    serie_0 = costruct_control_series(serie, len(time_points), control) 
    154172 
    155     """ Monte Carlo approach to create the null distribution of the areas """ 
    156     ##print "null distribution..." 
    157     serie_campionata = {} 
    158     area = [] 
    159     for i in range(20000): 
    160         serie_campionata = random.sample(differences, len(time_points)); 
    161         area.append(compute_area(serie_campionata, time_points)); 
    162  
    163     area_soglia = prctile(area,perc); 
    164  
    165  
    166     """ Gene filtering based on the threshold (selected percentile of null distribution of areas) """ 
    167     ##print "filtering..." 
    168     diff_expr = [] 
     173    # Gene filtering 
    169174    areas = [] 
    170175    for g in serie: 
    171176        diff_s = [log2(serie[g][i]) - log2(serie_0[g][i]) for i in range(len(time_points))] 
    172177        area_diff = compute_area(diff_s, time_points); 
    173         areas.append((area_diff,g)) 
    174         if area_diff > area_soglia: 
    175             diff_expr.append(g) 
    176     areas.sort(reverse = True) 
    177     if limit == 0: 
    178         de_genes = diff_expr 
    179     else: 
    180         de_genes = [d[1] for d in areas][:limit] 
    181  
    182     ##print "%d genes differentially expressed" %len(de_genes) 
    183     return de_genes 
    184  
    185 def FC(data, attr_set, thr, p_thr, control, limit): 
     178        if not auto or area_diff > area_threshold: 
     179            areas.append((g, area_diff)) 
     180    return areas 
     181 
     182def FC(data, attr_set, control='t0', thr=2, auto=False, p_thr=0.2): 
    186183    """ Gene filtering based on the number of FC of all time points with the control series > thr """ 
    187     serie = costruct_series(data, attr_set, "fc") 
     184    serie = costruct_series(data, attr_set, False) 
    188185    num_points = len(serie.values()[0]) 
    189186    serie_0 = costruct_control_series(serie, num_points, control) 
    190     thr_points = round(p_thr*num_points) 
    191     ##print "filtering..." 
    192     if limit == 0: 
    193         de_genes = [gene for gene,s in serie.items() if len([True for i in range(num_points) if abs(my_ratio(s[i], serie_0[gene][i])) >= log2(thr)]) >= thr_points] 
    194     else: 
    195         fc = [(len([True for i in range(num_points) if abs(my_ratio(s[i], serie_0[gene][i])) >= log2(thr)]),gene) for gene,s in serie.items()] 
    196         fc.sort(reverse = True) 
    197         de_genes = [d[1] for d in fc][:limit] 
    198  
    199     ##print "%d genes differentially expressed" %len(de_genes) 
    200     return de_genes 
     187    fc = [(g, len([0 for i in range(num_points) if abs(my_ratio(s[i], serie_0[g][i])) >= thr])) 
     188          for g, s in serie.items()] 
     189    if auto: 
     190        thr_points = round(p_thr*num_points) 
     191        fc = [(g, v) for g, v in fc if v >= thr_points] 
     192    return fc 
    201193 
    202194def spearmanr_filter(data, limit=1000): 
Note: See TracChangeset for help on using the changeset viewer.