source: orange-bioinformatics/_bioinformatics/obiKEGG/__init__.py @ 1733:548d1187a29f

Revision 1733:548d1187a29f, 10.4 KB checked in by Ales Erjavec <ales.erjavec@…>, 14 months ago (diff)

Porting obiKEGG to use the new REST KEGG API.

Line 
1"""\
2==============================================
3KEGG - Kyoto Encyclopedia of Genes and Genomes
4==============================================
5
6This is a python module for access to `KEGG`_ using its web services.
7
8To use this module you need to have `slumber` and `requests` package
9installed.
10
11.. _`KEGG`: http://www.genome.jp/kegg/
12
13
14"""
15from __future__ import absolute_import
16
17import os
18import sys
19import urllib2
20
21from collections import defaultdict
22
23from datetime import datetime
24
25from Orange.utils import lru_cache
26from Orange.utils import progress_bar_milestones
27from Orange.utils import deprecated_keywords, deprecated_attribute, \
28                         deprecated_function_name
29
30from .. import obiProb
31
32from . import databases
33from . import entry
34
35from .brite import BriteEntry, Brite
36
37from . import api
38from . import conf
39from . import pathway
40
41KEGGGenome = databases.Genome
42KEGGGenes = databases.Genes
43KEGGEnzyme = databases.Enzyme
44KEGGReaction = databases.Reaction
45KEGGPathways = databases.Pathway
46KEGGCompound = databases.Compound
47
48KEGGBrite = Brite
49KEGGBriteEntry = BriteEntry
50
51KEGGPathway = pathway.Pathway
52
53DEFAULT_CACHE_DIR = conf.params["cache.path"]
54
55
56class OrganismNotFoundError(Exception):
57    pass
58
59
60class Organism(object):
61    """
62    A convenience class for retrieving information regarding an
63    organism in the KEGG Genes database.
64
65    :param org: KEGGG organism code (e.g. "hsa", "sce")
66    :type org: str
67
68    """
69    def __init__(self, org, genematcher=None):
70        self.org_code = self.organism_name_search(org)
71        self.genematcher = genematcher
72        self.api = api.CachedKeggApi()
73
74    @property
75    def org(self):
76        """
77        KEGG organism code.
78        """
79        return self.org_code
80
81    @property
82    def genes(self):
83        """
84        An :class:`Genes` database instance for this organism.
85        """
86        # TODO: This should not be a property but a method.
87        # I think it was only put here as back compatibility with old obiKEGG.
88        if not hasattr(self, "_genes"):
89            genes = KEGGGenes(self.org_code)
90            self._genes = genes
91        return self._genes
92
93    def gene_aliases(self):
94        """
95        Return known gene aliases (synonyms in other databases).
96        """
97        return self.genes.gene_aliases()
98
99    def pathways(self, with_ids=None):
100        """
101        Return a list of all pathways for this organism.
102        """
103        if with_ids is not None:
104            return self.api.get_pathways_by_genes(with_ids)
105        else:
106            return [p.entry_id for p in self.api.list_pathways(self.org_code)]
107   
108    def list_pathways(self):
109        """
110        List all pathways.
111        """
112        # NOTE: remove/deprecate and use pathways()
113        return self.pathways()
114   
115    def get_linked_pathways(self, pathway_id):
116        self.api.get_linked_pathways(pathway_id)
117       
118    def enzymes(self, genes=None):
119        raise NotImplementedError()
120   
121    def _gm_gene_aliases(self):
122        """
123        Return a list of sets of equal genes. This is a hack for
124        gene matchers to work faster until the whole implementations
125        transitions to REST. Does not include links to DBs.
126        """
127        s1 = urllib2.urlopen("http://rest.kegg.jp/list/%s" % self.org_code).read()
128        out = []
129        for l in s1.split('\n'):
130            if l:
131                tabs = l.split("\t")
132                cset = set([tabs[0]])
133
134                if ":" in tabs[0]:
135                    # also add 'identifier' from 'org_code:identifier'
136                    cset.add(tabs[0].split(":", 1)[-1])
137
138                try:
139                    rest = tabs[1].split(";")[0]
140                    cset |= set(rest.split(", "))
141                except:
142                    pass  # do not crash if a line does not conform
143                out.append(cset)
144        return out
145
146    def get_enriched_pathways(self, genes, reference=None,
147                              prob=obiProb.Binomial(), callback=None):
148        """
149        Return a dictionary with enriched pathways ids as keys
150        and (list_of_genes, p_value, num_of_reference_genes) tuples
151        as items.
152
153        """
154        if reference is None:
155            reference = self.genes.keys()
156        reference = set(reference)
157
158        allPathways = defaultdict(lambda: [[], 1.0, []])
159        milestones = progress_bar_milestones(len(genes), 100)
160        pathways_db = KEGGPathways()
161
162        pathways_for_gene = []
163        for i, gene in enumerate(genes):
164            pathways_for_gene.append(self.pathways([gene]))
165            if callback and i in milestones:
166                callback(i * 50.0 / len(genes))
167
168        # pre-cache for speed
169        pathways_db.pre_cache([pid for pfg in pathways_for_gene
170                               for pid in pfg])
171        for i, (gene, pathways) in enumerate(zip(genes, pathways_for_gene)):
172            for pathway in pathways:
173                if pathways_db.get_entry(pathway).gene:
174                    allPathways[pathway][0].append(gene)
175            if callback and i in milestones:
176                callback(50.0 + i * 50.0 / len(genes))
177
178        pItems = allPathways.items()
179
180        for i, (p_id, entry) in enumerate(pItems):
181            pathway = pathways_db.get_entry(p_id)
182            entry[2].extend(reference.intersection(pathway.gene or []))
183            entry[1] = prob.p_value(len(entry[0]), len(reference),
184                                    len(entry[2]), len(genes))
185        return dict([(pid, (genes, p, len(ref)))
186                     for pid, (genes, p, ref) in allPathways.items()])
187
188    def get_genes_by_enzyme(self, enzyme):
189        enzyme = KEGGEnzyme().get_entry(enzyme)
190        return enzyme.genes.get(self.org_code, []) if enzyme.genes else []
191
192    def get_genes_by_pathway(self, pathway_id):
193        return KEGGPathway(pathway_id).genes()
194
195    def get_enzymes_by_pathway(self, pathway_id):
196        return KEGGPathway(pathway_id).enzymes()
197   
198    def get_compounds_by_pathway(self, pathway_id):
199        return KEGGPathway(pathway_id).compounds()
200   
201    def get_pathways_by_genes(self, gene_ids):
202        return self.api.get_pathways_by_genes(gene_ids)
203        gene_ids = set(gene_ids)
204        pathways = [self.genes[id].pathway for id in gene_ids
205                    if self.genes[id].pathway]
206        pathways = reduce(set.union, pathways, set())
207        return [id for id in pathways
208                if gene_ids.issubset(KEGGPathway(id).genes())]
209
210    def get_pathways_by_enzymes(self, enzyme_ids):
211        enzyme_ids = set(enzyme_ids)
212        pathways = [KEGGEnzyme()[id].pathway for id in enzyme_ids]
213        pathways = reduce(set.union, pathways, set())
214        return [id for id in pathways
215                if enzyme_ids.issubset(KEGGPathway(id).enzymes())]
216
217    def get_pathways_by_compounds(self, compound_ids):
218        compound_ids = set(compound_ids)
219        pathways = [KEGGCompound()[id].pathway for id in compound_ids]
220        pathways = reduce(set.union, pathways, set())
221        return [id for id in pathways
222                if compound_ids.issubset(KEGGPathway(id).compounds())]
223
224    def get_enzymes_by_compound(self, compound_id):
225        return KEGGCompound()[compound_id].enzyme
226
227    def get_enzymes_by_gene(self, gene_id):
228        return self.genes[gene_id].enzymes
229
230    def get_compounds_by_enzyme(self, enzyme_id):
231        return self._enzymes_to_compounds.get(enzyme_id)
232   
233    @deprecated_keywords({"caseSensitive": "case_sensitive"})
234    def get_unique_gene_ids(self, genes, case_sensitive=True):
235        """
236        Return a tuple with three elements. The first is a dictionary
237        mapping from unique geneids to gene names in genes, the second
238        is a list of conflicting gene names and the third is a list of
239        unknown genes.
240
241        """
242        unique, conflicting, unknown = {}, [], []
243        for gene in genes:
244            names = self.genematcher.match(gene)
245            if len(names) == 1:
246                unique[names[0]] = gene
247            elif len(names) == 0:
248                unknown.append(gene)
249            else:
250                conflicting.append(gene)
251        return unique, conflicting, unknown
252
253    @deprecated_function_name
254    def get_genes(self):
255        return self.genes
256
257    @classmethod
258    def organism_name_search(cls, name):
259        genome = KEGGGenome()
260        if name not in genome:
261            ids = genome.search(name)
262            if not ids:
263                from .. import obiTaxonomy
264                ids = obiTaxonomy.search(name)
265                ids = [id for id in ids if genome.search(id)]
266            name = ids.pop(0) if ids else name
267           
268        try:
269            return genome[name].organism_code
270        except KeyError:
271            raise OrganismNotFoundError(name)
272       
273    @classmethod
274    def organism_version(cls, name):
275        name = cls.organism_name_search(name)
276        genome = KEGGGenome()
277        info = genome.api.info(name)
278        return info.release
279
280    def _set_genematcher(self, genematcher):
281        setattr(self, "_genematcher", genematcher)
282       
283    def _get_genematcher(self):
284        if getattr(self, "_genematcher", None) == None:
285            from .. import obiGene
286            if self.org_code == "ddi":
287                self._genematcher = obiGene.matcher(
288                    [obiGene.GMKEGG(self.org_code), obiGene.GMDicty(),
289                    [obiGene.GMKEGG(self.org_code), obiGene.GMDicty()]]
290                )
291            else:
292                self._genematcher = obiGene.matcher(
293                    [obiGene.GMKEGG(self.org_code)])
294
295            self._genematcher.set_targets(self.genes.keys())
296        return self._genematcher
297   
298    genematcher = property(_get_genematcher, _set_genematcher)
299
300
301KEGGOrganism = Organism
302
303
304def organism_name_search(name):
305    return KEGGOrganism.organism_name_search(name)
306
307def pathways(org):
308    return KEGGPathway.list(org)
309
310def organisms():
311    return KEGGOrganism.organisms()
312
313def from_taxid(taxid):
314    genome = KEGGGenome()
315    res = genome.search(taxid)
316    for r in res:
317        e = genome[r]
318       
319        if e.taxid in [taxid,  genome.TAXID_MAP.get(taxid, taxid)]:
320            return e.org_code()
321
322    return None
323
324def to_taxid(name):
325    genome = KEGGGenome()
326    if name in genome:
327        return genome[name].taxid
328
329    keys = genome.search(name)
330    if keys:
331        return genome[keys[0]].taxid
332    else:
333        return None
334
335def create_gene_sets():
336    pass
337
338
339def main():
340    KEGGGenome()
341    import doctest
342    extraglobs = {"api": KeggApi()}
343    doctest.testmod(optionflags=doctest.ELLIPSIS, extraglobs=extraglobs)
344
345
346if __name__ == "__main__":
347    sys.exit(main())
Note: See TracBrowser for help on using the repository browser.