source: orange-bioinformatics/obiKEGG2/__init__.py @ 1540:90209537a0e1

Revision 1540:90209537a0e1, 10.4 KB checked in by ales_erjavec, 2 years ago (diff)

Small speedup of get_enriched_pathways.

Line 
1"""\
2==============================================
3KEGG - Kyoto Encyclopedia of Genes and Genomes
4==============================================
5
6This is a python module for access to `KEGG`_ using its web services. To use this module you need to have
7`SUDS`_ python library installed (other backends are planed).
8
9.. _`KEGG`: http://www.genome.jp/kegg/
10
11"""
12from __future__ import absolute_import
13
14import urllib2
15
16import os, sys
17from functools import wraps, partial
18from collections import defaultdict
19
20import re
21from datetime import datetime
22
23from Orange.misc import lru_cache, serverfiles
24
25from . import databases
26from . import entry
27
28from .brite import BriteEntry, Brite
29
30from . import api
31from . import conf
32from . import pathway
33
34KEGGGenome = databases.Genome
35KEGGGenes = databases.Genes
36KEGGEnzymes = databases.Enzymes
37KEGGReaction = databases.Reactions
38KEGGPathways = databases.Pathways
39
40KEGGBrite = Brite
41KEGGBriteEntry = BriteEntry
42
43KEGGPathway = pathway.Pathway
44
45DEFAULT_CACHE_DIR = conf.params["cache.path"]
46
47
48import obiProb
49from Orange.misc import deprecated_keywords, deprecated_attribute
50
51class Organism(object):
52    def __init__(self, org, genematcher=None):
53        self.org_code = self.organism_name_search(org)
54        self.genematcher = genematcher
55        self.api = api.CachedKeggApi()
56       
57    @property
58    def org(self):
59        return self.org_code
60   
61    @property
62    def genes(self):
63        if not hasattr(self, "_genes"):
64            genes = KEGGGenes(self.org_code)
65            self._genes = genes
66        return self._genes
67   
68    def gene_aliases(self):
69        return self.genes().gene_aliases()
70   
71    def pathways(self, with_ids=None):
72        if with_ids is not None:
73            return self.api.get_pathways_by_genes(with_ids)
74        else:
75            return [p.entry_id for p in self.api.list_pathways(self.org_code)]
76   
77    def list_pathways(self):
78        return self.pathways()
79   
80    def get_linked_pathways(self, pathway_id):
81        self.api.get_linked_pathways(pathway_id)
82       
83    def enzymes(self, genes=None):
84        raise NotImplementedError()
85   
86    def get_enriched_pathways(self, genes, reference=None, prob=obiProb.Binomial(), callback=None):
87        """ Return a dictionary with enriched pathways ids as keys
88        and (list_of_genes, p_value, num_of_reference_genes) tuples
89        as items.
90       
91        """
92        allPathways = defaultdict(lambda :[[], 1.0, []])
93        import orngMisc
94        milestones = orngMisc.progressBarMilestones(len(genes), 100)
95        pathways_db = KEGGPathways()
96       
97        pathways_for_gene = []
98        for i, gene in enumerate(genes):
99            pathways_for_gene.append(self.pathways([gene]))
100            if callback and i in milestones:
101                callback(i*50.0/len(genes))
102               
103        # precache for speed
104        pathways_db.pre_cache([pid for pfg in pathways_for_gene for pid in pfg]) 
105        for i, (gene, pathways) in enumerate(zip(genes, pathways_for_gene)):
106            for pathway in pathways:
107                if pathways_db.get_entry(pathway).gene: 
108                    allPathways[pathway][0].append(gene)
109            if callback and i in milestones:
110                callback(50.0 + i*50.0/len(genes))
111        reference = set(reference if reference is not None else self.genes.keys())
112       
113        pItems = allPathways.items()
114       
115        for i, (p_id, entry) in enumerate(pItems):
116            pathway = pathways_db.get_entry(p_id)
117            entry[2].extend(reference.intersection(pathway.gene or []))
118            entry[1] = prob.p_value(len(entry[0]), len(reference), len(entry[2]), len(genes))
119        return dict([(pid, (genes, p, len(ref))) for pid, (genes, p, ref) in allPathways.items()])
120       
121    def get_genes_by_enzyme(self, enzyme):
122        enzyme = Enzymes().get_entry(enzyme)
123        return enzyme.genes.get(self.org_code, []) if enzyme.genes else []
124   
125    def get_genes_by_pathway(self, pathway_id):
126        return Pathway(pathway_id).genes()
127   
128    def get_enzymes_by_pathway(self, pathway_id):
129        return Pathway(pathway_id).enzymes()
130   
131    def get_compounds_by_pathway(self, pathway_id):
132        return Pathway(pathway_id).compounds()
133   
134    def get_pathways_by_genes(self, gene_ids):
135        return self.api.get_pathways_by_genes(gene_ids)
136        gene_ids = set(gene_ids)
137        pathways = [self.genes[id].pathway for id in gene_ids if self.genes[id].pathway]
138        pathways = reduce(set.union, pathways, set())
139        return [id for id in pathways if gene_ids.issubset(KEGGPathway(id).genes())] 
140   
141    def get_pathways_by_enzymes(self, enzyme_ids):
142        enzyme_ids = set(enzyme_ids)
143        pathways = [KEGGEnzymes()[id].pathway for id in enzyme_ids]
144        pathwats = reduce(set.union, pathways, set())
145        return [id for id in pathways if enzyme_ids.issubset(KEGGPathway(id).enzymes())]
146   
147    def get_pathways_by_compounds(self, compound_ids):
148        compound_ids = set(compound_ids)
149        pathways = [KEGGCompounds()[id].pathway for id in compound_ids]
150        pathwats = reduce(set.union, pathways, set())
151        return [id for id in pathways if compound_ids.issubset(KEGGPathway(id).compounds())]
152   
153    def get_enzymes_by_compound(self, compound_id):
154        return KEGGCompound()[compound_id].enzyme
155   
156    def get_enzymes_by_gene(self, gene_id):
157        return self.genes[gene_id].enzymes
158   
159    def get_compounds_by_enzyme(self, enzyme_id):
160        return self._enzymes_to_compounds.get(enzyme_id)
161   
162    @deprecated_keywords({"caseSensitive": "case_sensitive"})
163    def get_unique_gene_ids(self, genes, case_sensitive=True):
164        """Return a tuple with three elements. The first is a dictionary mapping from unique gene
165        ids to gene names in genes, the second is a list of conflicting gene names and the third is a list
166        of unknown genes.
167        """
168        unique, conflicting, unknown = {}, [], []
169        for gene in genes:
170            names = self.genematcher.match(gene)
171            if len(names) == 1:
172                unique[names[0]] = gene
173            elif len(names) == 0:
174                unknown.append(gene)
175            else:
176                conflicting.append(gene)
177        return unique, conflicting, unknown
178   
179    def get_genes(self):
180        return self.genes
181   
182    @classmethod
183    def organism_name_search(cls, name):
184        genome = KEGGGenome()
185        if name not in genome:
186            ids = genome.search(name)
187            if not ids:
188                import obiTaxonomy
189                ids = obiTaxonomy.search(name)
190                ids = [id for id in ids if genome.search(id)]
191            name = ids.pop(0) if ids else name
192           
193        try:
194            return genome[name].entry_key
195        except KeyError:
196            raise ValueError("Organism with name='%s' not found in KEGG." % name)
197       
198    @classmethod
199    def organism_version(cls, name):
200        name = cls.organism_name_search(name)
201        genome = KEGGGenome()
202        info = genome.api.binfo(name)
203        return info.release
204#        orngServerFiles.localpath_download("KEGG", "kegg_genes_%s.tar.gz" % name)
205#        return orngServerFiles.info("KEGG", "kegg_genes_%s.tar.gz" % name)["datetime"]
206   
207    def _set_genematcher(self, genematcher):
208        setattr(self, "_genematcher", genematcher)
209       
210    def _get_genematcher(self):
211        if getattr(self, "_genematcher", None) == None:
212            import obiGene
213            if self.org_code == "ddi":
214                self._genematcher = obiGene.matcher([obiGene.GMKEGG(self.org_code), obiGene.GMDicty(),
215                                                     [obiGene.GMKEGG(self.org_code), obiGene.GMDicty()]])
216            else:
217                self._genematcher = obiGene.matcher([obiGene.GMKEGG(self.org_code)])
218            self._genematcher.set_targets(self.genes.keys())
219        return self._genematcher
220   
221    genematcher = property(_get_genematcher, _set_genematcher)
222   
223KEGGOrganism = Organism
224   
225def organism_name_search(name):
226    return KEGGOrganism.organism_name_search(name)
227
228def pathways(org):
229    return KEGGPathway.list(org)
230
231def organisms():
232    return KEGGOrganism.organisms()
233
234def from_taxid(taxid):
235    genome = KEGGGenome()
236    res = genome.search(taxid)
237    for r in res:
238        e = genome[r]
239        if e.taxid == taxid:
240            return e.org_code()
241       
242    return None
243
244def to_taxid(name):
245    genome = KEGGGenome()
246    keys = genome.search(name)
247    if keys:
248        return genome[keys[0]].taxid
249    else:
250        return None
251
252def create_gene_sets():
253    pass
254
255import obiGene
256from Orange.misc import serverfiles
257from Orange.misc import ConsoleProgressBar
258
259class MatcherAliasesKEGG(obiGene.MatcherAliasesPickled):
260    DOMAIN = "KEGG"
261    def create_aliases(self):
262        files = set(serverfiles.listfiles(self.DOMAIN))
263        ids_filename = "gene_matcher_kegg_ids_" + self.organism + ".pickle"
264        if ids_filename in files:
265            filename = serverfiles.localpath_download(self.DOMAIN, ids_filename)
266            import cPickle
267            aliases = cPickle.load(open(filename, "rb"))
268        else:
269            pb = ConsoleProgressBar("Retriving KEGG ids:")
270            kegg_org = KEGGOrganism(self.organism)
271            genes = kegg_org.genes
272            genes.pre_cache(progress_callback=pb.set_state)
273            aliases = []
274            for key, entry in genes.iteritems():
275                aliases.append(set([key]) | set(entry.alt_names))
276        return aliases
277   
278    def filename(self):
279        return "kegg3_" + self.organism
280   
281    def create_aliases_version(self):
282        files = set(serverfiles.listfiles(self.DOMAIN))
283        ids_filename = "gene_matcher_kegg_ids_" + self.organism + ".pickle"
284        if ids_filename in files:
285            version = serverfiles.info(self.DOMAIN, ids_filename)
286        else:
287            kegg_org = KEGGOrganism(self.organism)
288            genes = kegg_org.genes
289            version = genes.info.release
290        return version
291       
292    def __init__(self, organism, **kwargs):
293        self.organism = organism
294        sf = serverfiles.ServerFiles()
295        files = set(sf.listfiles(self.DOMAIN))
296        ids_filename = "gene_matcher_kegg_ids_" + self.organism + ".pickle"
297        if ids_filename in files:
298            serverfiles.update(self.DOMAIN, ids_filename)
299           
300        obiGene.MatcherAliasesPickled.__init__(self, **kwargs)
301
302def main():
303    KEGGGenome()
304    import doctest
305    extraglobs = {"api": KeggApi()}
306    doctest.testmod(optionflags=doctest.ELLIPSIS, extraglobs=extraglobs)
307
308if __name__ == "__main__":
309    sys.exit(main())
Note: See TracBrowser for help on using the repository browser.