source: orange-bioinformatics/doc/modules/geo_gds5.py @ 913:9d28add50761

Revision 913:9d28add50761, 1.1 KB checked in by blaz <blaz.zupan@…>, 5 years ago (diff)
Line 
"""
Check all data files from GEO and find those which include at least N
samples in every sample subset of at least one sample type. Useful,
for instance, when filtering out the data sets that could be used for
supervised machine learning.
"""
7
8import obiGEO
9
def valid(info, n=40):
    """Return the set of subset types whose every subset has at least n samples.

    info is a GDS description; the code reads info["subsets"], an iterable
    of dicts that each provide a "type" and a "sample_id" collection.
    A type is excluded as soon as any one of its subsets holds fewer than
    n samples; a subset with exactly n samples still qualifies.
    The result may be an empty set.
    """
    seen = set()
    too_small = set()
    # One pass: record every type, and separately those with a short subset.
    for subset in info["subsets"]:
        stype = subset["type"]
        seen.add(stype)
        if len(subset["sample_id"]) < n:
            too_small.add(stype)
    return seen - too_small
18
def report(stypes, info):
    """Pretty-print each GDS id followed by its valid subset types.

    stypes is an iterable of (gds_id, subset_types) pairs. info maps each
    gds_id to a description whose "subsets" entries provide "type",
    "description" and "sample_id". For every subset type one line is
    printed listing the matching subsets as description/sample-count,
    separated by commas.
    """
    for gds_id, subset_types in stypes:
        print(gds_id)
        for stype in subset_types:
            counts = ", ".join(
                "%s/%d" % (s["description"], len(s["sample_id"]))
                for s in info[gds_id]["subsets"] if s["type"] == stype)
            # NOTE(review): the two-space indent before the type name is
            # assumed -- confirm against the upstream script.
            # A single-argument print(...) emits the same text under the
            # Python 2 print statement and the Python 3 print function.
            print("  %s: %s" % (stype, counts))
28
gdsinfo = obiGEO.GDSInfo()

# Collect (GDS id, valid subset types) for every data set that has at least
# one qualifying subset type. valid() is evaluated once per data set and its
# result reused for both the filter and the stored pair.
valid_subset_types = []
for gds_id, gds in gdsinfo.items():
    subset_types = valid(gds)
    if subset_types:
        valid_subset_types.append((gds_id, subset_types))

report(valid_subset_types, gdsinfo)
Note: See TracBrowser for help on using the repository browser.