source: orange-bioinformatics/_bioinformatics/obiKEGG2/__init__.py @ 1636:10d234fdadb9

Revision 1636:10d234fdadb9, 10.9 KB checked in by mitar, 2 years ago (diff)

Restructuring because we will not be using namespaces.

Line 
1"""\
2==============================================
3KEGG - Kyoto Encyclopedia of Genes and Genomes
4==============================================
5
This is a Python module for access to `KEGG`_ using its web services. To use this module you need to have
the `SUDS`_ Python library installed (other backends are planned).
8
9.. _`KEGG`: http://www.genome.jp/kegg/
10
11.. _`SUDS`: http://pypi.python.org/pypi/suds/
12
13"""
14from __future__ import absolute_import
15
16
17import os, sys
18from collections import defaultdict
19
20from datetime import datetime
21
22from Orange.utils import lru_cache, serverfiles
23
24from . import databases
25from . import entry
26
27from .brite import BriteEntry, Brite
28
29from . import api
30from . import conf
31from . import pathway
32
# KEGG-prefixed aliases for the database classes defined in ``databases``.
KEGGGenome = databases.Genome
KEGGGenes = databases.Genes
KEGGEnzymes = databases.Enzymes
KEGGReaction = databases.Reactions
KEGGPathways = databases.Pathways

# KEGG-prefixed aliases for the classes imported from ``brite``.
KEGGBrite = Brite
KEGGBriteEntry = BriteEntry

# Single-pathway class (as opposed to the ``KEGGPathways`` database above).
KEGGPathway = pathway.Pathway

# Cache directory read from the package configuration.
DEFAULT_CACHE_DIR = conf.params["cache.path"]
45
46
47from .. import obiProb
48from Orange.utils import deprecated_keywords, deprecated_attribute
49
class Organism(object):
    """Access to the KEGG data of a single organism.

    The organism is identified by its KEGG organism code (``org_code``),
    which is resolved from the name given to the constructor with
    :meth:`organism_name_search`.

    """
    def __init__(self, org, genematcher=None):
        """
        :param org: organism code or a name to search for.
        :param genematcher: optional gene matcher; when ``None`` a default
            one is built lazily on first access of ``genematcher``.

        :raises ValueError: if the organism cannot be found.
        """
        self.org_code = self.organism_name_search(org)
        self.genematcher = genematcher
        self.api = api.CachedKeggApi()

    @property
    def org(self):
        """The organism code (same as ``org_code``)."""
        return self.org_code

    @property
    def genes(self):
        """The ``KEGGGenes`` database for this organism (created on first use)."""
        if not hasattr(self, "_genes"):
            self._genes = KEGGGenes(self.org_code)
        return self._genes

    def gene_aliases(self):
        """Return the gene aliases as reported by the genes database."""
        # ``genes`` is a property, so it must not be called.
        return self.genes.gene_aliases()

    def pathways(self, with_ids=None):
        """Return pathway ids.

        If `with_ids` is given it is passed to the api's
        ``get_pathways_by_genes``; otherwise the entry ids of all pathways
        listed for this organism are returned.

        """
        if with_ids is not None:
            return self.api.get_pathways_by_genes(with_ids)
        return [p.entry_id for p in self.api.list_pathways(self.org_code)]

    def list_pathways(self):
        """Return the ids of all pathways listed for this organism."""
        return self.pathways()

    def get_linked_pathways(self, pathway_id):
        """Return the api's ``get_linked_pathways`` result for `pathway_id`."""
        # The original dropped the result and always returned None.
        return self.api.get_linked_pathways(pathway_id)

    def enzymes(self, genes=None):
        """Not implemented."""
        raise NotImplementedError()

    def get_enriched_pathways(self, genes, reference=None, prob=obiProb.Binomial(), callback=None):
        """ Return a dictionary with enriched pathways ids as keys
        and (list_of_genes, p_value, num_of_reference_genes) tuples
        as items.

        :param genes: sequence of query gene ids (its length is used).
        :param reference: reference gene ids; defaults to all keys of
            ``self.genes``.
        :param prob: object providing ``p_value(k, N, m, n)``.
        :param callback: optional progress callback called with a value
            that advances from 0 towards 100.

        """
        from Orange.orng import orngMisc
        milestones = orngMisc.progressBarMilestones(len(genes), 100)
        pathways_db = KEGGPathways()

        # pathway id -> [query genes in pathway, p-value, reference genes in pathway]
        enriched = defaultdict(lambda: [[], 1.0, []])

        # First half of the progress: look up the pathways of every gene.
        pathways_for_gene = []
        for i, gene in enumerate(genes):
            pathways_for_gene.append(self.pathways([gene]))
            if callback and i in milestones:
                callback(i * 50.0 / len(genes))

        # precache for speed
        pathways_db.pre_cache([pid for pfg in pathways_for_gene for pid in pfg])

        # Second half of the progress: group the query genes by pathway.
        for i, (gene, gene_pathways) in enumerate(zip(genes, pathways_for_gene)):
            for pathway_id in gene_pathways:
                if pathways_db.get_entry(pathway_id).gene:
                    enriched[pathway_id][0].append(gene)
            if callback and i in milestones:
                callback(50.0 + i * 50.0 / len(genes))

        reference = set(reference if reference is not None else self.genes.keys())

        for pathway_id, stats in enriched.items():
            pathway_entry = pathways_db.get_entry(pathway_id)
            stats[2].extend(reference.intersection(pathway_entry.gene or []))
            stats[1] = prob.p_value(len(stats[0]), len(reference), len(stats[2]), len(genes))
        return dict((pid, (matched, p, len(ref)))
                    for pid, (matched, p, ref) in enriched.items())

    def get_genes_by_enzyme(self, enzyme):
        """Return this organism's genes listed in the entry of `enzyme`
        (an empty list if there are none).
        """
        # The original referenced an undefined name ``Enzymes``; the module
        # level alias for the enzymes database is ``KEGGEnzymes``.
        enzyme = KEGGEnzymes().get_entry(enzyme)
        return enzyme.genes.get(self.org_code, []) if enzyme.genes else []

    def get_genes_by_pathway(self, pathway_id):
        """Return ``KEGGPathway(pathway_id).genes()``."""
        return KEGGPathway(pathway_id).genes()

    def get_enzymes_by_pathway(self, pathway_id):
        """Return ``KEGGPathway(pathway_id).enzymes()``."""
        return KEGGPathway(pathway_id).enzymes()

    def get_compounds_by_pathway(self, pathway_id):
        """Return ``KEGGPathway(pathway_id).compounds()``."""
        return KEGGPathway(pathway_id).compounds()

    def get_pathways_by_genes(self, gene_ids):
        """Return the api's ``get_pathways_by_genes`` result for `gene_ids`."""
        # An older, unreachable implementation used to follow this return.
        return self.api.get_pathways_by_genes(gene_ids)

    def get_pathways_by_enzymes(self, enzyme_ids):
        """Return ids of the pathways that contain all of `enzyme_ids`."""
        enzyme_ids = set(enzyme_ids)
        enzymes_db = KEGGEnzymes()
        # Union of the pathways of every enzyme. The original assigned this
        # union to a misspelt name and then iterated the un-merged list.
        candidates = set()
        for enzyme_id in enzyme_ids:
            candidates.update(enzymes_db[enzyme_id].pathway or [])
        return [pathway_id for pathway_id in candidates
                if enzyme_ids.issubset(KEGGPathway(pathway_id).enzymes())]

    def get_pathways_by_compounds(self, compound_ids):
        """Return ids of the pathways that contain all of `compound_ids`."""
        compound_ids = set(compound_ids)
        # NOTE(review): ``KEGGCompounds`` is not defined in the visible part
        # of this module -- confirm where it should come from.
        compounds_db = KEGGCompounds()
        # Union of the pathways of every compound (same misspelt-name fix
        # as in get_pathways_by_enzymes).
        candidates = set()
        for compound_id in compound_ids:
            candidates.update(compounds_db[compound_id].pathway or [])
        return [pathway_id for pathway_id in candidates
                if compound_ids.issubset(KEGGPathway(pathway_id).compounds())]

    def get_enzymes_by_compound(self, compound_id):
        """Return the ``enzyme`` field of the entry for `compound_id`."""
        # NOTE(review): ``KEGGCompound`` is not defined in the visible part
        # of this module -- confirm where it should come from.
        return KEGGCompound()[compound_id].enzyme

    def get_enzymes_by_gene(self, gene_id):
        """Return the ``enzymes`` attribute of the entry for `gene_id`."""
        return self.genes[gene_id].enzymes

    def get_compounds_by_enzyme(self, enzyme_id):
        """Look up `enzyme_id` in ``self._enzymes_to_compounds``."""
        # NOTE(review): ``_enzymes_to_compounds`` is never assigned in this
        # class -- confirm who is expected to set it.
        return self._enzymes_to_compounds.get(enzyme_id)

    @deprecated_keywords({"caseSensitive": "case_sensitive"})
    def get_unique_gene_ids(self, genes, case_sensitive=True):
        """Return a tuple with three elements. The first is a dictionary mapping from unique gene
        ids to gene names in genes, the second is a list of conflicting gene names and the third is a list
        of unknown genes.

        `case_sensitive` is accepted but not used by this method.
        """
        unique, conflicting, unknown = {}, [], []
        for gene in genes:
            names = self.genematcher.match(gene)
            if len(names) == 1:
                unique[names[0]] = gene
            elif len(names) == 0:
                unknown.append(gene)
            else:
                conflicting.append(gene)
        return unique, conflicting, unknown

    def get_genes(self):
        """Return the genes database (same as the ``genes`` property)."""
        return self.genes

    @classmethod
    def organism_name_search(cls, name):
        """Resolve `name` to the ``entry_key`` of a KEGG genome entry.

        If `name` is not a key of the genome database, the database is
        searched for it, and if that finds nothing ``obiTaxonomy.search``
        is tried; the first hit is used.

        :raises ValueError: if no matching genome entry exists.
        """
        genome = KEGGGenome()
        if name not in genome:
            ids = genome.search(name)
            if not ids:
                from .. import obiTaxonomy
                ids = obiTaxonomy.search(name)
                # Keep only the taxonomy hits the genome database knows.
                ids = [tax_id for tax_id in ids if genome.search(tax_id)]
            name = ids.pop(0) if ids else name

        try:
            return genome[name].entry_key
        except KeyError:
            raise ValueError("Organism with name='%s' not found in KEGG." % name)

    @classmethod
    def organism_version(cls, name):
        """Return the ``release`` reported by the api's ``binfo`` for the
        organism resolved from `name`.
        """
        name = cls.organism_name_search(name)
        genome = KEGGGenome()
        info = genome.api.binfo(name)
        return info.release

    def _set_genematcher(self, genematcher):
        setattr(self, "_genematcher", genematcher)

    def _get_genematcher(self):
        # Build the default matcher lazily, the first time it is needed.
        if getattr(self, "_genematcher", None) is None:
            from .. import obiGene
            if self.org_code == "ddi":
                self._genematcher = obiGene.matcher([obiGene.GMKEGG(self.org_code), obiGene.GMDicty(),
                                                     [obiGene.GMKEGG(self.org_code), obiGene.GMDicty()]])
            else:
                self._genematcher = obiGene.matcher([obiGene.GMKEGG(self.org_code)])
            self._genematcher.set_targets(self.genes.keys())
        return self._genematcher

    genematcher = property(_get_genematcher, _set_genematcher)
221   
# KEGG-prefixed alias for the Organism class.
KEGGOrganism = Organism
223   
def organism_name_search(name):
    """Module-level shortcut for ``KEGGOrganism.organism_name_search``."""
    resolved = KEGGOrganism.organism_name_search(name)
    return resolved
226
def pathways(org):
    """Return the result of ``KEGGPathway.list`` for `org`."""
    listing = KEGGPathway.list(org)
    return listing
229
def organisms():
    """Return the result of ``KEGGOrganism.organisms()``."""
    # NOTE(review): the Organism class in this module does not define an
    # ``organisms`` attribute, so this call looks like it cannot succeed.
    listing = KEGGOrganism.organisms()
    return listing
232
def from_taxid(taxid):
    """Return the KEGG organism code for `taxid`, or ``None`` if no genome
    entry found by searching for `taxid` has a matching ``taxid`` field.

    An entry matches if its ``taxid`` equals either `taxid` itself or the
    value `taxid` maps to in ``genome.TAXID_MAP``.

    """
    genome = KEGGGenome()
    # The original printed ``taxid`` and the search result here (leftover
    # debug output); that has been removed.
    accepted = (taxid, genome.TAXID_MAP.get(taxid, taxid))
    for key in genome.search(taxid):
        genome_entry = genome[key]
        if genome_entry.taxid in accepted:
            return genome_entry.org_code()

    return None
244
def to_taxid(name):
    """Return the ``taxid`` of the genome entry for `name`.

    `name` is used directly when it is a key of the genome database;
    otherwise the first search hit is used. ``None`` is returned when the
    search finds nothing.

    """
    genome = KEGGGenome()
    key = name
    if key not in genome:
        hits = genome.search(name)
        if not hits:
            return None
        key = hits[0]
    return genome[key].taxid
255
def create_gene_sets():
    """Placeholder; does nothing and returns ``None``."""
    return None
258
259from .. import obiGene
260from Orange.utils import ConsoleProgressBar
261
class MatcherAliasesKEGG(obiGene.MatcherAliasesPickled):
    """Gene alias matcher backed by KEGG gene ids.

    The aliases are loaded from a pickled server file when one is listed
    for the organism; otherwise they are built from the organism's KEGG
    genes database.

    """
    DOMAIN = "KEGG"
    VERSION = "v3.0"

    def _aliases_filename(self):
        # Name of the pickled alias file for this organism.
        return "kegg_gene_id_aliases_" + self.organism + ".pickle"

    def create_aliases(self):
        """Return a list of alias sets, one per gene."""
        import cPickle
        files = set(serverfiles.ServerFiles().listfiles(self.DOMAIN))
        ids_filename = self._aliases_filename()
        if ids_filename in files:
            filename = serverfiles.localpath_download(self.DOMAIN, ids_filename)

            # NOTE(review): this unpickles a downloaded file; unpickling can
            # execute arbitrary code, so the server has to be trusted.
            with open(filename, "rb") as alias_file:
                aliases = cPickle.load(alias_file)
        else:
            pb = ConsoleProgressBar("Retriving KEGG ids:")
            kegg_org = KEGGOrganism(self.organism)
            genes = kegg_org.genes
            genes.pre_cache(progress_callback=pb.set_state)
            aliases = [set([key]) | set(gene_entry.alt_names)
                       for key, gene_entry in genes.iteritems()]
            # NOTE(review): this branch runs when the file is NOT listed on
            # the server, yet the path is obtained with localpath_download
            # -- confirm that this is intended (aliases_path uses localpath).
            filename = serverfiles.localpath_download(self.DOMAIN, ids_filename)
            with open(filename, "wb") as alias_file:
                cPickle.dump(aliases, alias_file)

        return aliases

    def filename(self):
        return "kegg3_" + self.organism

    def aliases_path(self):
        """Return the local path of the pickled alias file."""
        return serverfiles.localpath(self.DOMAIN, self._aliases_filename())

    def create_aliases_version(self):
        """Return the version of the alias data.

        This is the ``datetime`` recorded in the info of the alias file
        when ``serverfiles.listfiles`` lists it, and the release of the
        organism's genes database otherwise.

        """
        files = set(serverfiles.listfiles(self.DOMAIN))
        ids_filename = self._aliases_filename()
        if ids_filename in files:
            version = serverfiles.info(self.DOMAIN, ids_filename)["datetime"]
        else:
            kegg_org = KEGGOrganism(self.organism)
            genes = kegg_org.genes
            version = genes.info.release
        return version

    def __init__(self, organism, **kwargs):
        """
        :param organism: organism identifier used to build the alias file
            name and to construct the ``KEGGOrganism``.
        :param kwargs: passed on to ``obiGene.MatcherAliasesPickled``.
        """
        self.organism = organism
        sf = serverfiles.ServerFiles()
        files = set(sf.listfiles(self.DOMAIN))
        ids_filename = self._aliases_filename()
        # Refresh the local copy when the server provides the alias file.
        if ids_filename in files:
            serverfiles.update(self.DOMAIN, ids_filename)

        obiGene.MatcherAliasesPickled.__init__(self, **kwargs)
313
def main():
    """Run the module's doctests.

    Returns 1 if any doctest failed and 0 otherwise, so that
    ``sys.exit(main())`` reports failures through the exit status.

    """
    KEGGGenome()
    import doctest
    # The original used a bare ``KeggApi`` name that is not defined in this
    # module. NOTE(review): it is assumed to live in the ``api`` submodule
    # next to ``CachedKeggApi`` -- TODO confirm.
    extraglobs = {"api": api.KeggApi()}
    results = doctest.testmod(optionflags=doctest.ELLIPSIS, extraglobs=extraglobs)
    # testmod returns (failure_count, test_count).
    return 1 if results[0] else 0
319
320if __name__ == "__main__":
321    sys.exit(main())
Note: See TracBrowser for help on using the repository browser.