source: orange-bioinformatics/_bioinformatics/obiGeneSets.py @ 1782:e6ed6e7bced3

Revision 1782:e6ed6e7bced3, 12.8 KB checked in by Flashpoint <vid.flashpoint@…>, 11 months ago (diff)

Added Phenotypes to DictyMutants hierarchy

Line 
1"""
2Maintainer: Marko Toplak
3"""
4
5from __future__ import absolute_import, with_statement
6
7if __name__ == "__main__":
8    __package__ = "Orange.bio"
9
10import cPickle as pickle, os, tempfile, sys
11from collections import defaultdict
12
13import Orange.core as orange
14from Orange.orng import orngServerFiles
15
16from . import obiGO, obiKEGG, obiTaxonomy
17
18sfdomain = "gene_sets"
19
20def nth(l,n):
21    return [ a[n] for a in l]
22
23from Orange.bio.geneset import GeneSet, GeneSets, GenesetRegException
24
25def goGeneSets(org):
26    """Returns gene sets from GO."""
27    ontology = obiGO.Ontology()
28    annotations = obiGO.Annotations(org, ontology=ontology)
29
30    genesets = []
31    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
32    for termn, term in ontology.terms.items():
33        genes = annotations.GetAllGenes(termn)
34        hier = ("GO", term.namespace)
35        if len(genes) > 0:
36            gs = GeneSet(id=termn, name=term.name, genes=genes, hierarchy=hier, organism=org, link=link_fmt % termn)
37            genesets.append(gs)
38
39    return GeneSets(genesets)
40
41def keggGeneSets(org):
42    """
43    Returns gene sets from KEGG pathways.
44    """
45
46    kegg = obiKEGG.KEGGOrganism(org)
47
48    genesets = []
49    for id in kegg.pathways():
50        print id
51        pway = obiKEGG.KEGGPathway(id)
52        hier = ("KEGG","pathways")
53        gs = GeneSet(id=id,
54                                 name=pway.title,
55                                 genes=kegg.get_genes_by_pathway(id),
56                                 hierarchy=hier,
57                                 organism=org,
58                                 link=pway.link)
59        genesets.append(gs)
60
61    return GeneSets(genesets)
62
63def dictyMutantSets():
64    """
65    Return dicty mutant phenotype gene sets from Dictybase
66    """
67    from . import obiDictyMutants
68    link_fmt = "http://dictybase.org/db/cgi-bin/dictyBase/SC/scsearch.pl?searchdb=strains&search_term=%s&column=all&B1=Submit" 
69    #genesets = [GeneSet(id=mutant.name, name=mutant.descriptor, genes=obiDictyMutants.mutant_genes(mutant), hierarchy=("Dictybase", "Mutants"), organism="352472", # 352472 gathered from obiGO.py code_map -> Dicty identifier
70    #                    link=(link_fmt % mutant.name if mutant.name else None)) \
71    #                    for mutant in obiDictyMutants.mutants()]
72 
73    genesets = [GeneSet(id=phenotype, name=phenotype, genes=[obiDictyMutants.mutant_genes(mutant)[0] for mutant in mutants], hierarchy=("Dictybase", "Phenotypes"), organism="352472", # 352472 gathered from obiGO.py code_map -> Dicty identifier
74                        link="") \
75                        for phenotype, mutants in obiDictyMutants.phenotype_mutants().items()]
76
77    return GeneSets(genesets)
78
79def omimGeneSets():
80    """
81    Return gene sets from OMIM (Online Mendelian Inheritance in Man) diseses
82    """
83    from . import obiOMIM   
84    genesets = [GeneSet(id=disease.id, name=disease.name, genes=obiOMIM.disease_genes(disease), hierarchy=("OMIM",), organism="9606",
85                    link=("http://www.omim.org/entry/%s" % disease.id if disease.id else None)) \
86                    for disease in obiOMIM.diseases()]
87    return GeneSets(genesets)
88
89def miRNAGeneSets(org):
90    """
91    Return gene sets from miRNA targets
92    """
93    from . import obimiRNA
94    org_code = obiKEGG.from_taxid(org)
95    link_fmt = "http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s"
96    mirnas = [(id, obimiRNA.get_info(id)) for id in obimiRNA.ids(org_code)]
97    genesets = [GeneSet(id=mirna.matACC, name=mirna.matID, genes=mirna.targets.split(","), hierarchy=("miRNA", "Targets"),
98                        organism=org, link=link_fmt % mirna.matID) for id, mirna in mirnas]
99    return GeneSets(genesets)
100
101def go_miRNASets(org, ontology=None, enrichment=True, pval=0.05, treshold=0.04):
102    from . import obimiRNA, obiGO
103    mirnas = obimiRNA.ids(int(org))
104    if ontology is None:
105        ontology = obiGO.Ontology()
106
107    annotations = obiGO.Annotations(org, ontology=ontology)
108
109    go_sets = obimiRNA.get_GO(mirnas, annotations, enrichment=enrichment, pval=pval, goSwitch=False)
110    print go_sets
111
112    go_sets = obimiRNA.filter_GO(go_sets, annotations, treshold=treshold)
113
114    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
115    gsets = [GeneSet(id=key, name=ontology[key].name, genes=value, hierarchy=("miRNA", "go_sets",),
116                        organism=org, link=link_fmt % key) for key, value in go_sets.items()]
117    gset = GeneSets(gsets)
118    return gset
119
120def loadGMT(contents, name):
121    """
122    Eech line consists of tab separated elements. First is
123    the geneset name, next is it's description.
124
125    For now the description is skipped.
126    """
127
128    def hline(s):
129        tabs = [tab.strip() for tab in s.split("\t")]
130        return GeneSet(id=tabs[0], description=tabs[1],
131                                   hierarchy=(name,), genes=tabs[2:])
132
133    def handleNELines(s, fn):
134        """
135        Run function on nonempty lines of a string.
136        Return a list of results for each line.
137        """
138        lines = (l.strip() for l in s.splitlines())
139        return [fn(l) for l in lines if l]
140
141    return GeneSets(handleNELines(contents, hline))
142
143"""
144We have multiple paths for gene set data:
145buffer/bigfiles/gene_sets
146and
147buffer/gene_sets_local
148both have available.txt
149"""
150
151def omakedirs(dir):
152    try:
153        os.makedirs(dir)
154    except OSError:
155        pass
156
157def local_path():
158    """ Returns local path for gene sets. Creates it if it does not exists
159    yet. """
160    from Orange.orng import orngEnviron
161    pth = os.path.join(orngEnviron.directoryNames["bufferDir"], "gene_sets_local")
162    omakedirs(pth)
163    return pth
164
165def build_index(dir):
166    """ Returns gene set availability index for some folder. """
167    pass
168
169def filename(hierarchy, organism):
170    """ Obtain a filename for given hierarchy and organism. """
171    return "gs_" + "_._".join(hierarchy + \
172        (organism if organism != None else "",)) + ".pck"
173
174def filename_parse(fn):
175    """ Returns a hierarchy and the organism from the filename."""
176    fn = fn[3:-4]
177    parts = fn.split("_._")
178    hierarchy = tuple(parts[:-1])
179    org = parts[-1] if parts[-1] != "" else None
180    return hierarchy, org
181
182def is_genesets_file(fn):
183    return fn.startswith("gs_") and fn.endswith(".pck")
184
185def list_local():
186    """ Returns available gene sets from the local repository:
187    a list of (hierarchy, organism, on_local) """
188    pth = local_path()
189    gs_files = filter(is_genesets_file, os.listdir(pth))
190    return [ filename_parse(fn) + (True,) for fn in gs_files ]
191
192def list_serverfiles_from_flist(flist):
193    gs_files = filter(is_genesets_file, flist)
194    localfiles = set(orngServerFiles.listfiles(sfdomain))
195    return [ filename_parse(fn) + \
196        ((True,) if fn in localfiles else (False,)) for fn in gs_files ]
197
198def list_serverfiles_conn(serverfiles=None):
199    """ Returns available gene sets from the server files
200    repository: a list of (hierarchy, organism, on_local) """
201    if serverfiles == None:
202        serverfiles = orngServerFiles.ServerFiles()
203    flist = serverfiles.listfiles(sfdomain)
204    return list_serverfiles_from_flist(flist)
205
206def list_serverfiles():
207    fname = orngServerFiles.localpath_download(sfdomain, "index.pck")
208    flist = pickle.load(open(fname, 'r'))
209    return list_serverfiles_from_flist(flist)
210
211def list_all():
212    """
213    return a list of (hier, org, avalable_locally)
214    If something for a specific (hier, org) is not downloaded
215    yet, show it as not-local. """
216    flist = list_local() + list_serverfiles()
217    d = {}
218    for h,o,local in flist:
219        d[h,o] = min(local, d.get((h,o),True))
220    return [ (h,o,local) for (h,o),local in d.items() ]
221
222def update_server_list(serverfiles_upload, serverfiles_list=None):
223    if serverfiles_list == None:
224        serverfiles_list = orngServerFiles.ServerFiles()
225
226    flist = map(lambda x: filename(*x[:2]), list_serverfiles_conn(serverfiles_list))
227
228    tfname = pickle_temp(flist)
229
230    try:
231        fn = "index.pck"
232        title = "Gene sets: index"
233        tags = [ "gene sets", "index", "essential" ]
234        serverfiles_upload.upload(sfdomain, fn, tfname, title, tags)
235        serverfiles_upload.unprotect(sfdomain, fn)
236    except Exception,e:
237        raise e
238    finally:
239        os.remove(tfname)
240
241def _register_local(genesets):
242    """ Registers using the common hierarchy and organism. """
243    pth = local_path()
244
245    org = genesets.common_org()
246    hierarchy = genesets.common_hierarchy()
247    fn = filename(hierarchy, org)
248
249    with open(os.path.join(pth, fn), "w") as f:
250        pickle.dump(genesets, f)
251
252    return fn
253
254def pickle_temp(obj):
255    """ Pickle a file to a temporary file and returns its name """
256    fd,tfname = tempfile.mkstemp()
257    os.close(fd)
258    f = open(tfname, 'wb')
259    pickle.dump(obj, f)
260    f.close()
261    return tfname
262
263def _register_serverfiles(genesets, serverFiles):
264    """ Registers using the common hierarchy and organism. """
265    org = genesets.common_org()
266    hierarchy = genesets.common_hierarchy()
267    fn = filename(hierarchy, org)
268
269    #save to temporary file
270    tfname = pickle_temp(genesets)
271
272    try:
273        taxname = obiTaxonomy.name(org)
274        title = "Gene sets: " + ", ".join(hierarchy) + \
275            ((" (" + taxname + ")") if org != None else "")
276        tags = list(hierarchy) + [ "gene sets", taxname ] + \
277            ([ "essential" ] if org in obiTaxonomy.essential_taxids() else [] )
278        serverFiles.upload(sfdomain, fn, tfname, title, tags)
279        serverFiles.unprotect(sfdomain, fn)
280    finally:
281        os.remove(tfname)
282
283    update_server_list(serverFiles)
284
285def register(genesets, serverFiles=None):
286    """
287    Hierarchy is induced from the gene set names.
288    """
289    if serverFiles == None:
290        _register_local(genesets)
291    else:
292        _register_serverfiles(genesets, serverFiles)
293
294def build_hierarchy_dict(files):
295    hierd = defaultdict(list)
296    for ind,f in enumerate(files):
297        hier, org = f
298        for i in range(len(hier)+1):
299            hierd[(hier[:i], org)].append(ind)
300    return hierd
301
302def load_local(hierarchy, organism):
303    files = map(lambda x: x[:2], list_local())
304    hierd = build_hierarchy_dict(files)
305
306    out = GeneSets()
307    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
308        fname = os.path.join(local_path(), filename(h, o))
309        out.update(pickle.load(open(fname, 'r')))
310    return out
311
312def load_serverfiles(hierarchy, organism):
313    files = map(lambda x: x[:2], list_serverfiles())
314    hierd = build_hierarchy_dict(files)
315    out = GeneSets()
316    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
317        fname = orngServerFiles.localpath_download(sfdomain,
318            filename(h, o))
319        out.update(pickle.load(open(fname, 'r')))
320    return out
321
322def load(hierarchy, organism):
323    """ First try to load from the local registred folder. If the file
324    is not available, load it from the server files. """
325    ret = load_local(hierarchy, organism)
326    if len(ret) == 0:
327        ret.update(load_serverfiles(hierarchy, organism))
328    return ret
329
330def collections(*args):
331    """
332    Input is a list of collections.
333    Collection can either be a tuple (hierarchy, orgranism), where
334    hierarchy is a tuple also.
335    """
336    result = GeneSets()
337
338    for collection in args:
339        try:
340            result.update(collection)
341        except (ValueError, TypeError):
342            if issequencens(collection): #have a hierarchy, organism specification
343                new = load(*collection)
344                result.update(new)
345            else:
346                if collection.lower()[-4:] == ".gmt": #format from webpage
347                    result.update(loadGMT(open(collection,"rt").read(), collection))
348                else:
349                    raise Exception("collection() accepts files in .gmt format only.")
350
351    return result
352
353def issequencens(x):
354    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and it is not an instance of basestring."
355    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
356
357class TException(Exception): pass
358
359def upload_genesets(rsf):
360    """
361    Builds the default gene sets and
362    """
363    orngServerFiles.update_local_files()
364
365    genesetsfn = [ keggGeneSets, goGeneSets, miRNAGeneSets]
366    organisms = obiTaxonomy.common_taxids()
367    for fn in genesetsfn:
368        for org in organisms:
369            try:
370                print "Uploading ORG", org, fn
371                genesets = fn(org).split_by_hierarchy()
372                for gs in genesets:
373                    print "registering", gs.common_hierarchy()
374                    register(gs, rsf) #server files
375                    #register(gs)
376                    print "successful", gs.common_hierarchy()
377            except (obiKEGG.OrganismNotFoundError, GenesetRegException):
378                print "organism not found", org
379
380if __name__ == "__main__":
381    rsf = orngServerFiles.ServerFiles(username=sys.argv[1], password=sys.argv[2])
382    upload_genesets(rsf)
383    pass
384
385
Note: See TracBrowser for help on using the repository browser.