source: orange-bioinformatics/_bioinformatics/obiGeneSets.py @ 1795:663192bd4469

Revision 1795:663192bd4469, 13.3 KB checked in by Flashpoint <vid.flashpoint@…>, 11 months ago (diff)

Removed obiCytobands.py and created genesets in obiGeneSets.py. updateCytobands.py saves only genesets now

Line 
1"""
2Maintainer: Marko Toplak
3"""
4
5from __future__ import absolute_import, with_statement
6
7if __name__ == "__main__":
8    __package__ = "Orange.bio"
9
10import cPickle as pickle, os, tempfile, sys
11from collections import defaultdict
12
13import Orange.core as orange
14from Orange.orng import orngServerFiles
15
16from . import obiGO, obiKEGG, obiTaxonomy
17
18sfdomain = "gene_sets"
19
20def nth(l,n):
21    return [ a[n] for a in l]
22
23from Orange.bio.geneset import GeneSet, GeneSets, GenesetRegException
24
25def goGeneSets(org):
26    """Returns gene sets from GO."""
27    ontology = obiGO.Ontology()
28    annotations = obiGO.Annotations(org, ontology=ontology)
29
30    genesets = []
31    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
32    for termn, term in ontology.terms.items():
33        genes = annotations.GetAllGenes(termn)
34        hier = ("GO", term.namespace)
35        if len(genes) > 0:
36            gs = GeneSet(id=termn, name=term.name, genes=genes, hierarchy=hier, organism=org, link=link_fmt % termn)
37            genesets.append(gs)
38
39    return GeneSets(genesets)
40
41def keggGeneSets(org):
42    """
43    Returns gene sets from KEGG pathways.
44    """
45
46    kegg = obiKEGG.KEGGOrganism(org)
47
48    genesets = []
49    for id in kegg.pathways():
50        print id
51        pway = obiKEGG.KEGGPathway(id)
52        hier = ("KEGG","pathways")
53        gs = GeneSet(id=id,
54                                 name=pway.title,
55                                 genes=kegg.get_genes_by_pathway(id),
56                                 hierarchy=hier,
57                                 organism=org,
58                                 link=pway.link)
59        genesets.append(gs)
60
61    return GeneSets(genesets)
62
63def dictyMutantSets():
64    """
65    Return dicty mutant phenotype gene sets from Dictybase
66    """
67    from . import obiDictyMutants
68    link_fmt = "http://dictybase.org/db/cgi-bin/dictyBase/SC/scsearch.pl?searchdb=strains&search_term=%s&column=all&B1=Submit" 
69    #genesets = [GeneSet(id=mutant.name, name=mutant.descriptor, genes=obiDictyMutants.mutant_genes(mutant), hierarchy=("Dictybase", "Mutants"), organism="352472", # 352472 gathered from obiGO.py code_map -> Dicty identifier
70    #                    link=(link_fmt % mutant.name if mutant.name else None)) \
71    #                    for mutant in obiDictyMutants.mutants()]
72 
73    genesets = [GeneSet(id=phenotype, name=phenotype, genes=[obiDictyMutants.mutant_genes(mutant)[0] for mutant in mutants], hierarchy=("Dictybase", "Phenotypes"), organism="352472", # 352472 gathered from obiGO.py code_map -> Dicty identifier
74                        link="") \
75                        for phenotype, mutants in obiDictyMutants.phenotype_mutants().items()]
76
77    return GeneSets(genesets)
78
79def cytobandGeneSets():
80    """
81    Create cytoband gene sets from Stanford Microarray Database
82    """
83    import urllib2
84
85    url = "http://www-stat.stanford.edu/~tibs/GSA/cytobands-stanford.gmt"
86    stream = urllib2.urlopen(url)
87    data = stream.read().splitlines()
88
89    genesets = []
90    for band in data:
91        b = band.split("\t")
92        genesets.append(GeneSet(id=b[0], name=b[1], genes=b[2:] if b[2:] else [], hierarchy=("Cytobands",), organism="9606", link=""))         
93
94    return GeneSets(genesets)
95
96def omimGeneSets():
97    """
98    Return gene sets from OMIM (Online Mendelian Inheritance in Man) diseses
99    """
100    from . import obiOMIM   
101    genesets = [GeneSet(id=disease.id, name=disease.name, genes=obiOMIM.disease_genes(disease), hierarchy=("OMIM",), organism="9606",
102                    link=("http://www.omim.org/entry/%s" % disease.id if disease.id else None)) \
103                    for disease in obiOMIM.diseases()]
104    return GeneSets(genesets)
105
106def miRNAGeneSets(org):
107    """
108    Return gene sets from miRNA targets
109    """
110    from . import obimiRNA
111    org_code = obiKEGG.from_taxid(org)
112    link_fmt = "http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s"
113    mirnas = [(id, obimiRNA.get_info(id)) for id in obimiRNA.ids(org_code)]
114    genesets = [GeneSet(id=mirna.matACC, name=mirna.matID, genes=mirna.targets.split(","), hierarchy=("miRNA", "Targets"),
115                        organism=org, link=link_fmt % mirna.matID) for id, mirna in mirnas]
116    return GeneSets(genesets)
117
118def go_miRNASets(org, ontology=None, enrichment=True, pval=0.05, treshold=0.04):
119    from . import obimiRNA, obiGO
120    mirnas = obimiRNA.ids(int(org))
121    if ontology is None:
122        ontology = obiGO.Ontology()
123
124    annotations = obiGO.Annotations(org, ontology=ontology)
125
126    go_sets = obimiRNA.get_GO(mirnas, annotations, enrichment=enrichment, pval=pval, goSwitch=False)
127    print go_sets
128
129    go_sets = obimiRNA.filter_GO(go_sets, annotations, treshold=treshold)
130
131    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
132    gsets = [GeneSet(id=key, name=ontology[key].name, genes=value, hierarchy=("miRNA", "go_sets",),
133                        organism=org, link=link_fmt % key) for key, value in go_sets.items()]
134    gset = GeneSets(gsets)
135    return gset
136
137def loadGMT(contents, name):
138    """
139    Eech line consists of tab separated elements. First is
140    the geneset name, next is it's description.
141
142    For now the description is skipped.
143    """
144
145    def hline(s):
146        tabs = [tab.strip() for tab in s.split("\t")]
147        return GeneSet(id=tabs[0], description=tabs[1],
148                                   hierarchy=(name,), genes=tabs[2:])
149
150    def handleNELines(s, fn):
151        """
152        Run function on nonempty lines of a string.
153        Return a list of results for each line.
154        """
155        lines = (l.strip() for l in s.splitlines())
156        return [fn(l) for l in lines if l]
157
158    return GeneSets(handleNELines(contents, hline))
159
160"""
161We have multiple paths for gene set data:
162buffer/bigfiles/gene_sets
163and
164buffer/gene_sets_local
165both have available.txt
166"""
167
168def omakedirs(dir):
169    try:
170        os.makedirs(dir)
171    except OSError:
172        pass
173
174def local_path():
175    """ Returns local path for gene sets. Creates it if it does not exists
176    yet. """
177    from Orange.orng import orngEnviron
178    pth = os.path.join(orngEnviron.directoryNames["bufferDir"], "gene_sets_local")
179    omakedirs(pth)
180    return pth
181
182def build_index(dir):
183    """ Returns gene set availability index for some folder. """
184    pass
185
186def filename(hierarchy, organism):
187    """ Obtain a filename for given hierarchy and organism. """
188    return "gs_" + "_._".join(hierarchy + \
189        (organism if organism != None else "",)) + ".pck"
190
191def filename_parse(fn):
192    """ Returns a hierarchy and the organism from the filename."""
193    fn = fn[3:-4]
194    parts = fn.split("_._")
195    hierarchy = tuple(parts[:-1])
196    org = parts[-1] if parts[-1] != "" else None
197    return hierarchy, org
198
199def is_genesets_file(fn):
200    return fn.startswith("gs_") and fn.endswith(".pck")
201
202def list_local():
203    """ Returns available gene sets from the local repository:
204    a list of (hierarchy, organism, on_local) """
205    pth = local_path()
206    gs_files = filter(is_genesets_file, os.listdir(pth))
207    return [ filename_parse(fn) + (True,) for fn in gs_files ]
208
209def list_serverfiles_from_flist(flist):
210    gs_files = filter(is_genesets_file, flist)
211    localfiles = set(orngServerFiles.listfiles(sfdomain))
212    return [ filename_parse(fn) + \
213        ((True,) if fn in localfiles else (False,)) for fn in gs_files ]
214
215def list_serverfiles_conn(serverfiles=None):
216    """ Returns available gene sets from the server files
217    repository: a list of (hierarchy, organism, on_local) """
218    if serverfiles == None:
219        serverfiles = orngServerFiles.ServerFiles()
220    flist = serverfiles.listfiles(sfdomain)
221    return list_serverfiles_from_flist(flist)
222
223def list_serverfiles():
224    fname = orngServerFiles.localpath_download(sfdomain, "index.pck")
225    flist = pickle.load(open(fname, 'r'))
226    return list_serverfiles_from_flist(flist)
227
228def list_all():
229    """
230    return a list of (hier, org, avalable_locally)
231    If something for a specific (hier, org) is not downloaded
232    yet, show it as not-local. """
233    flist = list_local() + list_serverfiles()
234    d = {}
235    for h,o,local in flist:
236        d[h,o] = min(local, d.get((h,o),True))
237    return [ (h,o,local) for (h,o),local in d.items() ]
238
239def update_server_list(serverfiles_upload, serverfiles_list=None):
240    if serverfiles_list == None:
241        serverfiles_list = orngServerFiles.ServerFiles()
242
243    flist = map(lambda x: filename(*x[:2]), list_serverfiles_conn(serverfiles_list))
244
245    tfname = pickle_temp(flist)
246
247    try:
248        fn = "index.pck"
249        title = "Gene sets: index"
250        tags = [ "gene sets", "index", "essential" ]
251        serverfiles_upload.upload(sfdomain, fn, tfname, title, tags)
252        serverfiles_upload.unprotect(sfdomain, fn)
253    except Exception,e:
254        raise e
255    finally:
256        os.remove(tfname)
257
258def _register_local(genesets):
259    """ Registers using the common hierarchy and organism. """
260    pth = local_path()
261
262    org = genesets.common_org()
263    hierarchy = genesets.common_hierarchy()
264    fn = filename(hierarchy, org)
265
266    with open(os.path.join(pth, fn), "w") as f:
267        pickle.dump(genesets, f)
268
269    return fn
270
271def pickle_temp(obj):
272    """ Pickle a file to a temporary file and returns its name """
273    fd,tfname = tempfile.mkstemp()
274    os.close(fd)
275    f = open(tfname, 'wb')
276    pickle.dump(obj, f)
277    f.close()
278    return tfname
279
280def _register_serverfiles(genesets, serverFiles):
281    """ Registers using the common hierarchy and organism. """
282    org = genesets.common_org()
283    hierarchy = genesets.common_hierarchy()
284    fn = filename(hierarchy, org)
285
286    #save to temporary file
287    tfname = pickle_temp(genesets)
288
289    try:
290        taxname = obiTaxonomy.name(org)
291        title = "Gene sets: " + ", ".join(hierarchy) + \
292            ((" (" + taxname + ")") if org != None else "")
293        tags = list(hierarchy) + [ "gene sets", taxname ] + \
294            ([ "essential" ] if org in obiTaxonomy.essential_taxids() else [] )
295        serverFiles.upload(sfdomain, fn, tfname, title, tags)
296        serverFiles.unprotect(sfdomain, fn)
297    finally:
298        os.remove(tfname)
299
300    update_server_list(serverFiles)
301
302def register(genesets, serverFiles=None):
303    """
304    Hierarchy is induced from the gene set names.
305    """
306    if serverFiles == None:
307        _register_local(genesets)
308    else:
309        _register_serverfiles(genesets, serverFiles)
310
311def build_hierarchy_dict(files):
312    hierd = defaultdict(list)
313    for ind,f in enumerate(files):
314        hier, org = f
315        for i in range(len(hier)+1):
316            hierd[(hier[:i], org)].append(ind)
317    return hierd
318
319def load_local(hierarchy, organism):
320    files = map(lambda x: x[:2], list_local())
321    hierd = build_hierarchy_dict(files)
322
323    out = GeneSets()
324    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
325        fname = os.path.join(local_path(), filename(h, o))
326        out.update(pickle.load(open(fname, 'r')))
327    return out
328
329def load_serverfiles(hierarchy, organism):
330    files = map(lambda x: x[:2], list_serverfiles())
331    hierd = build_hierarchy_dict(files)
332    out = GeneSets()
333    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
334        fname = orngServerFiles.localpath_download(sfdomain,
335            filename(h, o))
336        out.update(pickle.load(open(fname, 'r')))
337    return out
338
339def load(hierarchy, organism):
340    """ First try to load from the local registred folder. If the file
341    is not available, load it from the server files. """
342    ret = load_local(hierarchy, organism)
343    if len(ret) == 0:
344        ret.update(load_serverfiles(hierarchy, organism))
345    return ret
346
347def collections(*args):
348    """
349    Input is a list of collections.
350    Collection can either be a tuple (hierarchy, orgranism), where
351    hierarchy is a tuple also.
352    """
353    result = GeneSets()
354
355    for collection in args:
356        try:
357            result.update(collection)
358        except (ValueError, TypeError):
359            if issequencens(collection): #have a hierarchy, organism specification
360                new = load(*collection)
361                result.update(new)
362            else:
363                if collection.lower()[-4:] == ".gmt": #format from webpage
364                    result.update(loadGMT(open(collection,"rt").read(), collection))
365                else:
366                    raise Exception("collection() accepts files in .gmt format only.")
367
368    return result
369
370def issequencens(x):
371    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and it is not an instance of basestring."
372    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
373
374class TException(Exception): pass
375
376def upload_genesets(rsf):
377    """
378    Builds the default gene sets and
379    """
380    orngServerFiles.update_local_files()
381
382    genesetsfn = [ keggGeneSets, goGeneSets, miRNAGeneSets]
383    organisms = obiTaxonomy.common_taxids()
384    for fn in genesetsfn:
385        for org in organisms:
386            try:
387                print "Uploading ORG", org, fn
388                genesets = fn(org).split_by_hierarchy()
389                for gs in genesets:
390                    print "registering", gs.common_hierarchy()
391                    register(gs, rsf) #server files
392                    #register(gs)
393                    print "successful", gs.common_hierarchy()
394            except (obiKEGG.OrganismNotFoundError, GenesetRegException):
395                print "organism not found", org
396
397if __name__ == "__main__":
398    print cytobandGeneSets()
399    exit()
400    rsf = orngServerFiles.ServerFiles(username=sys.argv[1], password=sys.argv[2])
401    upload_genesets(rsf)
402    pass
403
404
Note: See TracBrowser for help on using the repository browser.