Changeset 1450:6c2b050c2c2a in orange-bioinformatics


Ignore:
Timestamp:
07/05/11 16:28:04 (3 years ago)
Author:
lanz <lan.zagar@…>
Branch:
default
Convert:
ec92f1ec0993ae454fb16bf73b97dcde03cb87a3
Message:

Updated.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • obiDifscale.py

    r1448 r1450  
    1010from obiExpression import ExpressionSignificance_Test 
    1111 
     12 
     13# Utility functiions 
     14 
     15log2 = lambda x: log(x, 2.) 
     16 
     17def my_ratio(x, y): 
     18    """ compute the log-ratio """ 
     19    return log2(x/y) 
     20 
     21def sign(x): 
     22    return cmp(x, 0) 
     23 
     24def common_domain(data1, data2): 
     25    """Use only attributes that are in both data sets""" 
     26    atts = sorted(set([a.name for a in data1.domain.attributes]).intersection( 
     27               [a.name for a in data2.domain.attributes])) 
     28    new1 = Orange.data.Table(Orange.data.Domain(atts + [data1.domain.classVar], data1.domain), data1) 
     29    new2 = Orange.data.Table(Orange.data.Domain(atts + [data2.domain.classVar], data2.domain), data2) 
     30    return new1, new2 
     31 
     32def common_genes(data1, data2, att='gene'): 
     33    common_set = list(set(ex[att].value for ex in data1).intersection(ex[att].value for ex in data2)) 
     34    print len(set(ex[att].value for ex in data1)) 
     35    print len(common_set) 
     36    return data1.filter(**{att: common_set}), data2.filter(**{att: common_set}) 
    1237 
    1338# Normalization 
     
    5277    return normalize 
    5378 
    54 def medianscalenorm(data, newsamples=None): 
     79def medianscalenorm(data, newsamples=None, intersection=True): 
    5580    """normalization of microarray data to center the distributions in all chips on the same global median""" 
    56     n = get_mediannorm(data) 
    5781    if not newsamples: 
     82        n = get_mediannorm(data) 
    5883        return n(data), None 
    5984    else: 
    60         return n(data), n(newsamples) 
     85        if intersection: 
     86            idata, inewsamples = common_genes(data, newsamples) 
     87        n = get_mediannorm(idata) 
     88        return n(idata), n(inewsamples) 
    6189 
    6290def normalize(data1, data2=None, type='median'): 
     
    7199 
    72100# Gene filtering 
    73  
    74 log2 = lambda x: log(x, 2.) 
    75  
    76 def my_ratio(x, y): 
    77     """ compute the log-ratio """ 
    78     return log2(x/y) 
    79  
    80 def sign(int): 
    81     """ returns the sign of int """ 
    82     if(int < 0): 
    83         return -1; 
    84     elif(int > 0): 
    85         return 1; 
    86     else: 
    87         return 0; 
    88101 
    89102def compute_diff_pairs(at_a, d): 
     
    218231 
    219232 
     233def signed_PCA(data): 
     234    pca = Orange.projection.pca.Pca(data, standardize=False, max_components=1) 
     235    classifier = lambda X: [x[0].value for x in pca(X)] 
     236    predictions = classifier(data) 
     237    classes = [ex.getclass().value for ex in data] 
     238    n = 0 
     239    for i1,c1 in enumerate(classes): 
     240        for i2,c2 in enumerate(classes[:i1]): 
     241            n += cmp(c1,c2) * cmp(predictions[i1], predictions[i2]) 
     242    if n < 0: 
     243        def invert(X): 
     244            y = classifier(X) 
     245            return -y if type(y) == float else [-x for x in y] 
     246        return invert 
     247    else: 
     248        return classifier 
     249 
     250signed_PCA.name = 'PCA' 
     251 
     252 
     253def conttime(data, d): 
     254    for a in data.domain.attributes: 
     255        a.attributes['time'] = d[a.attributes['time']] 
     256 
     257def conv(attr_set, ticks=True): 
     258    """Obtain time points with a unique measure (the lowest among [min,h,d]) present in data""" 
     259    ord_measures = ["min","h","d"] 
     260    converter = {"min":{"h":60, "d":24*60}, "h": {"d":24}} 
     261    measures = list(set([t.split(" ")[1] for t in attr_set])) 
     262    if len(measures) == 1: 
     263        time_points = [(t, float(t.split(" ")[0])) for t in attr_set] 
     264    else: 
     265        first_measure = min([(ord_measures.index(m),m) for m in measures])[1] 
     266        time_points = [(t, float(t.split(" ")[0])) for t in attr_set if t.split(" ")[1] == first_measure] 
     267        time_points.extend([(t, float(t.split(" ")[0]) * converter[first_measure][t.split(" ")[1]]) 
     268                            for t in attr_set if t.split(" ")[1] != first_measure]) 
     269    time_points.sort(key=itemgetter(1)) 
     270    if ticks: 
     271        time_points = [(t[0],float(i)) for i,t in enumerate(time_points)] 
     272    return dict(time_points) 
     273 
     274 
     275def get_projections(data1, data2=None): 
     276    labels1 = list(a.attributes['time'] for a in data1.domain.attributes) 
     277    tdata1 = obiGEO.transpose(data1) 
     278    if data2: 
     279        labels2 = list(a.attributes['time'] for a in data2.domain.attributes) 
     280        tdata2 = obiGEO.transpose(data2) 
     281        tdata1, tdata2 = common_domain(tdata1, tdata2) 
     282        classifier = signed_PCA(tdata1) 
     283        proj1 = classifier(tdata1) 
     284        proj2 = classifier(tdata2) 
     285    else: 
     286        classifier = signed_PCA(tdata1) 
     287        proj1 = classifier(tdata1) 
     288        proj2, labels2 = [], [] 
     289    return proj1, labels1, proj2, labels2 
     290 
     291 
     292############ 
     293 
     294if False and __name__ == '__main__': 
     295    import obiGEO 
     296    # Data set 1 
     297    data1 = obiGEO.GDS('GDS2666').getdata(report_genes=True, transpose=False) 
     298    labels1 = list(a.attributes['time'] for a in data1.domain.attributes) 
     299    attr_set = list(set(a.attributes['time'] for a in data1.domain.attributes)) 
     300    convd = conv(attr_set) 
     301    conttime(data1, convd) 
     302    # Data set 2 
     303    data2 = obiGEO.GDS('GDS2667').getdata(report_genes=True, transpose=False) 
     304    labels2 = list(a.attributes['time'] for a in data2.domain.attributes) 
     305    attr_set = list(set(a.attributes['time'] for a in data2.domain.attributes)) 
     306    convd = conv(attr_set) 
     307    conttime(data2, convd) 
     308    # Normalize data set 1 
     309    ndata1, _ = normalize(data1, type='quantile') 
     310    # Filtering 
     311    attr_set = {} 
     312    for a in ndata1.domain.attributes: 
     313        attr_set[a.attributes['time']] = attr_set.get(a.attributes['time'], []) + [a.name] 
     314    scores = AREA(ndata1, sorted(attr_set.items())) 
     315    genes = map(itemgetter(0), sorted(scores, key=itemgetter(1))[:1000]) 
     316    fdata1 = ndata1.filter(gene=genes) 
     317    # Rescale data set 2 
     318    ttrain, ttest = normalize(fdata1, data2) 
     319    # Model construction and prediction 
     320    # train = obiGEO.transpose(ttrain) 
     321    # test = obiGEO.transpose(ttest) 
     322    # cdtrain, cdtest = common_domain(train, test) 
     323    # classifier = signed_PCA(cdtrain) 
     324    # proj1 = classifier(cdtrain) 
     325    # proj2 = classifier(cdtest) 
     326 
     327 
     328 
Note: See TracChangeset for help on using the changeset viewer.