source: orange-bioinformatics/_bioinformatics/obiKEGG/__init__.py @ 1716:4c4266f7c5a5

Revision 1716:4c4266f7c5a5, 9.3 KB checked in by markotoplak, 20 months ago (diff)

Removed the old version of obiKEGG. Renamed obiKEGG2 -> obiKEGG.

Line 
1"""\
2==============================================
3KEGG - Kyoto Encyclopedia of Genes and Genomes
4==============================================
5
6This is a python module for access to `KEGG`_ using its web services. To use this module you need to have
7`SUDS`_ python library installed (other backends are planed).
8
9.. _`KEGG`: http://www.genome.jp/kegg/
10
11.. _`SUDS`: http://pypi.python.org/pypi/suds/
12
13"""
14from __future__ import absolute_import
15
16import urllib2
17import os, sys
18from collections import defaultdict
19
20from datetime import datetime
21
22from Orange.utils import lru_cache
23
24from . import databases
25from . import entry
26
27from .brite import BriteEntry, Brite
28
29from . import api
30from . import conf
31from . import pathway
32
33KEGGGenome = databases.Genome
34KEGGGenes = databases.Genes
35KEGGEnzymes = databases.Enzymes
36KEGGReaction = databases.Reactions
37KEGGPathways = databases.Pathways
38
39KEGGBrite = Brite
40KEGGBriteEntry = BriteEntry
41
42KEGGPathway = pathway.Pathway
43
44DEFAULT_CACHE_DIR = conf.params["cache.path"]
45
46
47from .. import obiProb
48from Orange.utils import deprecated_keywords, deprecated_attribute
49
50class OrganismNotFoundError(Exception): pass
51
52class Organism(object):
53    def __init__(self, org, genematcher=None):
54        self.org_code = self.organism_name_search(org)
55        self.genematcher = genematcher
56        self.api = api.CachedKeggApi()
57       
58    @property
59    def org(self):
60        return self.org_code
61   
62    @property
63    def genes(self):
64        if not hasattr(self, "_genes"):
65            genes = KEGGGenes(self.org_code)
66            self._genes = genes
67        return self._genes
68   
69    def gene_aliases(self):
70        return self.genes().gene_aliases()
71   
72    def pathways(self, with_ids=None):
73        if with_ids is not None:
74            return self.api.get_pathways_by_genes(with_ids)
75        else:
76            return [p.entry_id for p in self.api.list_pathways(self.org_code)]
77   
78    def list_pathways(self):
79        return self.pathways()
80   
81    def get_linked_pathways(self, pathway_id):
82        self.api.get_linked_pathways(pathway_id)
83       
84    def enzymes(self, genes=None):
85        raise NotImplementedError()
86   
87    def _gm_gene_aliases(self):
88        """
89        Return a list of sets of equal genes. This is a hack for
90        gene matchers to work faster until the whole implementations
91        transitions to REST. Does not include links to DBs.
92        """
93        s1 = urllib2.urlopen("http://rest.kegg.jp/list/%s" % self.org_code).read()
94        out = []
95        for l in s1.split('\n'):
96            if l:
97                tabs = l.split("\t")
98                cset = set([tabs[0]])
99                try:
100                    rest = tabs[1].split(";")[0]
101                    cset |= set(rest.split(", "))
102                except:
103                    pass #do not crash if a line does not conform
104                out.append(cset)
105        return out
106
107    def get_enriched_pathways(self, genes, reference=None, prob=obiProb.Binomial(), callback=None):
108        """ Return a dictionary with enriched pathways ids as keys
109        and (list_of_genes, p_value, num_of_reference_genes) tuples
110        as items.
111       
112        """
113        allPathways = defaultdict(lambda :[[], 1.0, []])
114        from Orange.orng import orngMisc
115        milestones = orngMisc.progressBarMilestones(len(genes), 100)
116        pathways_db = KEGGPathways()
117       
118        pathways_for_gene = []
119        for i, gene in enumerate(genes):
120            pathways_for_gene.append(self.pathways([gene]))
121            if callback and i in milestones:
122                callback(i*50.0/len(genes))
123               
124        # precache for speed
125        pathways_db.pre_cache([pid for pfg in pathways_for_gene for pid in pfg]) 
126        for i, (gene, pathways) in enumerate(zip(genes, pathways_for_gene)):
127            for pathway in pathways:
128                if pathways_db.get_entry(pathway).gene: 
129                    allPathways[pathway][0].append(gene)
130            if callback and i in milestones:
131                callback(50.0 + i*50.0/len(genes))
132        reference = set(reference if reference is not None else self.genes.keys())
133       
134        pItems = allPathways.items()
135       
136        for i, (p_id, entry) in enumerate(pItems):
137            pathway = pathways_db.get_entry(p_id)
138            entry[2].extend(reference.intersection(pathway.gene or []))
139            entry[1] = prob.p_value(len(entry[0]), len(reference), len(entry[2]), len(genes))
140        return dict([(pid, (genes, p, len(ref))) for pid, (genes, p, ref) in allPathways.items()])
141       
142    def get_genes_by_enzyme(self, enzyme):
143        enzyme = Enzymes().get_entry(enzyme)
144        return enzyme.genes.get(self.org_code, []) if enzyme.genes else []
145   
146    def get_genes_by_pathway(self, pathway_id):
147        return KEGGPathway(pathway_id).genes()
148   
149    def get_enzymes_by_pathway(self, pathway_id):
150        return KEGGPathway(pathway_id).enzymes()
151   
152    def get_compounds_by_pathway(self, pathway_id):
153        return KEGGPathway(pathway_id).compounds()
154   
155    def get_pathways_by_genes(self, gene_ids):
156        return self.api.get_pathways_by_genes(gene_ids)
157        gene_ids = set(gene_ids)
158        pathways = [self.genes[id].pathway for id in gene_ids if self.genes[id].pathway]
159        pathways = reduce(set.union, pathways, set())
160        return [id for id in pathways if gene_ids.issubset(KEGGPathway(id).genes())] 
161   
162    def get_pathways_by_enzymes(self, enzyme_ids):
163        enzyme_ids = set(enzyme_ids)
164        pathways = [KEGGEnzymes()[id].pathway for id in enzyme_ids]
165        pathwats = reduce(set.union, pathways, set())
166        return [id for id in pathways if enzyme_ids.issubset(KEGGPathway(id).enzymes())]
167   
168    def get_pathways_by_compounds(self, compound_ids):
169        compound_ids = set(compound_ids)
170        pathways = [KEGGCompounds()[id].pathway for id in compound_ids]
171        pathwats = reduce(set.union, pathways, set())
172        return [id for id in pathways if compound_ids.issubset(KEGGPathway(id).compounds())]
173   
174    def get_enzymes_by_compound(self, compound_id):
175        return KEGGCompound()[compound_id].enzyme
176   
177    def get_enzymes_by_gene(self, gene_id):
178        return self.genes[gene_id].enzymes
179   
180    def get_compounds_by_enzyme(self, enzyme_id):
181        return self._enzymes_to_compounds.get(enzyme_id)
182   
183    @deprecated_keywords({"caseSensitive": "case_sensitive"})
184    def get_unique_gene_ids(self, genes, case_sensitive=True):
185        """Return a tuple with three elements. The first is a dictionary mapping from unique gene
186        ids to gene names in genes, the second is a list of conflicting gene names and the third is a list
187        of unknown genes.
188        """
189        unique, conflicting, unknown = {}, [], []
190        for gene in genes:
191            names = self.genematcher.match(gene)
192            if len(names) == 1:
193                unique[names[0]] = gene
194            elif len(names) == 0:
195                unknown.append(gene)
196            else:
197                conflicting.append(gene)
198        return unique, conflicting, unknown
199   
200    def get_genes(self):
201        return self.genes
202   
203    @classmethod
204    def organism_name_search(cls, name):
205        genome = KEGGGenome()
206        if name not in genome:
207            ids = genome.search(name)
208            if not ids:
209                from .. import obiTaxonomy
210                ids = obiTaxonomy.search(name)
211                ids = [id for id in ids if genome.search(id)]
212            name = ids.pop(0) if ids else name
213           
214        try:
215            return genome[name].entry_key
216        except KeyError:
217            raise OrganismNotFoundError(name)
218       
219    @classmethod
220    def organism_version(cls, name):
221        name = cls.organism_name_search(name)
222        genome = KEGGGenome()
223        info = genome.api.binfo(name)
224        return info.release
225   
226    def _set_genematcher(self, genematcher):
227        setattr(self, "_genematcher", genematcher)
228       
229    def _get_genematcher(self):
230        if getattr(self, "_genematcher", None) == None:
231            from .. import obiGene
232            if self.org_code == "ddi":
233                self._genematcher = obiGene.matcher([obiGene.GMKEGG(self.org_code), obiGene.GMDicty(),
234                                                     [obiGene.GMKEGG(self.org_code), obiGene.GMDicty()]])
235            else:
236                self._genematcher = obiGene.matcher([obiGene.GMKEGG(self.org_code)])
237            self._genematcher.set_targets(self.genes.keys())
238        return self._genematcher
239   
240    genematcher = property(_get_genematcher, _set_genematcher)
241   
242KEGGOrganism = Organism
243   
244def organism_name_search(name):
245    return KEGGOrganism.organism_name_search(name)
246
247def pathways(org):
248    return KEGGPathway.list(org)
249
250def organisms():
251    return KEGGOrganism.organisms()
252
253def from_taxid(taxid):
254    genome = KEGGGenome()
255    res = genome.search(taxid)
256    for r in res:
257        e = genome[r]
258       
259        if e.taxid in [taxid,  genome.TAXID_MAP.get(taxid, taxid)]:
260            return e.org_code()
261       
262    return None
263
264def to_taxid(name):
265    genome = KEGGGenome()
266    if name in genome:
267        return genome[name].taxid
268   
269    keys = genome.search(name)
270    if keys:
271        return genome[keys[0]].taxid
272    else:
273        return None
274
275def create_gene_sets():
276    pass
277
278def main():
279    KEGGGenome()
280    import doctest
281    extraglobs = {"api": KeggApi()}
282    doctest.testmod(optionflags=doctest.ELLIPSIS, extraglobs=extraglobs)
283
284if __name__ == "__main__":
285    sys.exit(main())
Note: See TracBrowser for help on using the repository browser.