source: orange-bioinformatics/_bioinformatics/obiKEGG/__init__.py @ 1734:91d14dd2cf0e

Revision 1734:91d14dd2cf0e, 10.4 KB checked in by Ales Erjavec <ales.erjavec@…>, 14 months ago (diff)

obiKEGG code style fixes.

Line 
1"""\
2==============================================
3KEGG - Kyoto Encyclopedia of Genes and Genomes
4==============================================
5
6This is a python module for access to `KEGG`_ using its web services.
7
8To use this module you need to have `slumber` and `requests` package
9installed.
10
11.. _`KEGG`: http://www.genome.jp/kegg/
12
13
14"""
15from __future__ import absolute_import
16
17import os
18import sys
19import urllib2
20
21from collections import defaultdict
22
23from datetime import datetime
24
25from Orange.utils import lru_cache
26from Orange.utils import progress_bar_milestones
27from Orange.utils import (
28    deprecated_keywords, deprecated_attribute, deprecated_function_name
29)
30
31from .. import obiProb
32
33from . import databases
34from . import entry
35
36from .brite import BriteEntry, Brite
37
38from . import api
39from . import conf
40from . import pathway
41
42KEGGGenome = databases.Genome
43KEGGGenes = databases.Genes
44KEGGEnzyme = databases.Enzyme
45KEGGReaction = databases.Reaction
46KEGGPathways = databases.Pathway
47KEGGCompound = databases.Compound
48
49KEGGBrite = Brite
50KEGGBriteEntry = BriteEntry
51
52KEGGPathway = pathway.Pathway
53
54DEFAULT_CACHE_DIR = conf.params["cache.path"]
55
56
57class OrganismNotFoundError(Exception):
58    pass
59
60
61class Organism(object):
62    """
63    A convenience class for retrieving information regarding an
64    organism in the KEGG Genes database.
65
66    :param org: KEGGG organism code (e.g. "hsa", "sce")
67    :type org: str
68
69    """
70    def __init__(self, org, genematcher=None):
71        self.org_code = self.organism_name_search(org)
72        self.genematcher = genematcher
73        self.api = api.CachedKeggApi()
74
75    @property
76    def org(self):
77        """
78        KEGG organism code.
79        """
80        return self.org_code
81
82    @property
83    def genes(self):
84        """
85        An :class:`Genes` database instance for this organism.
86        """
87        # TODO: This should not be a property but a method.
88        # I think it was only put here as back compatibility with old obiKEGG.
89        if not hasattr(self, "_genes"):
90            genes = KEGGGenes(self.org_code)
91            self._genes = genes
92        return self._genes
93
94    def gene_aliases(self):
95        """
96        Return known gene aliases (synonyms in other databases).
97        """
98        return self.genes.gene_aliases()
99
100    def pathways(self, with_ids=None):
101        """
102        Return a list of all pathways for this organism.
103        """
104        if with_ids is not None:
105            return self.api.get_pathways_by_genes(with_ids)
106        else:
107            return [p.entry_id for p in self.api.list_pathways(self.org_code)]
108
109    def list_pathways(self):
110        """
111        List all pathways.
112        """
113        # NOTE: remove/deprecate and use pathways()
114        return self.pathways()
115
116    def get_linked_pathways(self, pathway_id):
117        self.api.get_linked_pathways(pathway_id)
118
119    def enzymes(self, genes=None):
120        raise NotImplementedError()
121
122    def _gm_gene_aliases(self):
123        """
124        Return a list of sets of equal genes. This is a hack for
125        gene matchers to work faster until the whole implementations
126        transitions to REST. Does not include links to DBs.
127        """
128        s1 = urllib2.urlopen("http://rest.kegg.jp/list/%s" % self.org_code).read()
129        out = []
130        for l in s1.split('\n'):
131            if l:
132                tabs = l.split("\t")
133                cset = set([tabs[0]])
134
135                if ":" in tabs[0]:
136                    # also add 'identifier' from 'org_code:identifier'
137                    cset.add(tabs[0].split(":", 1)[-1])
138
139                try:
140                    rest = tabs[1].split(";")[0]
141                    cset |= set(rest.split(", "))
142                except:
143                    pass  # do not crash if a line does not conform
144                out.append(cset)
145        return out
146
147    def get_enriched_pathways(self, genes, reference=None,
148                              prob=obiProb.Binomial(), callback=None):
149        """
150        Return a dictionary with enriched pathways ids as keys
151        and (list_of_genes, p_value, num_of_reference_genes) tuples
152        as items.
153
154        """
155        if reference is None:
156            reference = self.genes.keys()
157        reference = set(reference)
158
159        allPathways = defaultdict(lambda: [[], 1.0, []])
160        milestones = progress_bar_milestones(len(genes), 100)
161        pathways_db = KEGGPathways()
162
163        pathways_for_gene = []
164        for i, gene in enumerate(genes):
165            pathways_for_gene.append(self.pathways([gene]))
166            if callback and i in milestones:
167                callback(i * 50.0 / len(genes))
168
169        # pre-cache for speed
170        pathways_db.pre_cache([pid for pfg in pathways_for_gene
171                               for pid in pfg])
172        for i, (gene, pathways) in enumerate(zip(genes, pathways_for_gene)):
173            for pathway in pathways:
174                if pathways_db.get_entry(pathway).gene:
175                    allPathways[pathway][0].append(gene)
176            if callback and i in milestones:
177                callback(50.0 + i * 50.0 / len(genes))
178
179        pItems = allPathways.items()
180
181        for i, (p_id, entry) in enumerate(pItems):
182            pathway = pathways_db.get_entry(p_id)
183            entry[2].extend(reference.intersection(pathway.gene or []))
184            entry[1] = prob.p_value(len(entry[0]), len(reference),
185                                    len(entry[2]), len(genes))
186        return dict([(pid, (genes, p, len(ref)))
187                     for pid, (genes, p, ref) in allPathways.items()])
188
189    def get_genes_by_enzyme(self, enzyme):
190        enzyme = KEGGEnzyme().get_entry(enzyme)
191        return enzyme.genes.get(self.org_code, []) if enzyme.genes else []
192
193    def get_genes_by_pathway(self, pathway_id):
194        return KEGGPathway(pathway_id).genes()
195
196    def get_enzymes_by_pathway(self, pathway_id):
197        return KEGGPathway(pathway_id).enzymes()
198
199    def get_compounds_by_pathway(self, pathway_id):
200        return KEGGPathway(pathway_id).compounds()
201
202    def get_pathways_by_genes(self, gene_ids):
203        return self.api.get_pathways_by_genes(gene_ids)
204        gene_ids = set(gene_ids)
205        pathways = [self.genes[id].pathway for id in gene_ids
206                    if self.genes[id].pathway]
207        pathways = reduce(set.union, pathways, set())
208        return [id for id in pathways
209                if gene_ids.issubset(KEGGPathway(id).genes())]
210
211    def get_pathways_by_enzymes(self, enzyme_ids):
212        enzyme_ids = set(enzyme_ids)
213        pathways = [KEGGEnzyme()[id].pathway for id in enzyme_ids]
214        pathways = reduce(set.union, pathways, set())
215        return [id for id in pathways
216                if enzyme_ids.issubset(KEGGPathway(id).enzymes())]
217
218    def get_pathways_by_compounds(self, compound_ids):
219        compound_ids = set(compound_ids)
220        pathways = [KEGGCompound()[id].pathway for id in compound_ids]
221        pathways = reduce(set.union, pathways, set())
222        return [id for id in pathways
223                if compound_ids.issubset(KEGGPathway(id).compounds())]
224
225    def get_enzymes_by_compound(self, compound_id):
226        return KEGGCompound()[compound_id].enzyme
227
228    def get_enzymes_by_gene(self, gene_id):
229        return self.genes[gene_id].enzymes
230
231    def get_compounds_by_enzyme(self, enzyme_id):
232        return self._enzymes_to_compounds.get(enzyme_id)
233
234    @deprecated_keywords({"caseSensitive": "case_sensitive"})
235    def get_unique_gene_ids(self, genes, case_sensitive=True):
236        """
237        Return a tuple with three elements. The first is a dictionary
238        mapping from unique geneids to gene names in genes, the second
239        is a list of conflicting gene names and the third is a list of
240        unknown genes.
241
242        """
243        unique, conflicting, unknown = {}, [], []
244        for gene in genes:
245            names = self.genematcher.match(gene)
246            if len(names) == 1:
247                unique[names[0]] = gene
248            elif len(names) == 0:
249                unknown.append(gene)
250            else:
251                conflicting.append(gene)
252        return unique, conflicting, unknown
253
254    @deprecated_function_name
255    def get_genes(self):
256        return self.genes
257
258    @classmethod
259    def organism_name_search(cls, name):
260        genome = KEGGGenome()
261        if name not in genome:
262            ids = genome.search(name)
263            if not ids:
264                from .. import obiTaxonomy
265                ids = obiTaxonomy.search(name)
266                ids = [id for id in ids if genome.search(id)]
267            name = ids.pop(0) if ids else name
268
269        try:
270            return genome[name].organism_code
271        except KeyError:
272            raise OrganismNotFoundError(name)
273
274    @classmethod
275    def organism_version(cls, name):
276        name = cls.organism_name_search(name)
277        genome = KEGGGenome()
278        info = genome.api.info(name)
279        return info.release
280
281    def _set_genematcher(self, genematcher):
282        setattr(self, "_genematcher", genematcher)
283
284    def _get_genematcher(self):
285        if getattr(self, "_genematcher", None) is None:
286            from .. import obiGene
287            if self.org_code == "ddi":
288                self._genematcher = obiGene.matcher(
289                    [obiGene.GMKEGG(self.org_code), obiGene.GMDicty(),
290                     [obiGene.GMKEGG(self.org_code), obiGene.GMDicty()]]
291                )
292            else:
293                self._genematcher = obiGene.matcher(
294                    [obiGene.GMKEGG(self.org_code)])
295
296            self._genematcher.set_targets(self.genes.keys())
297        return self._genematcher
298
299    genematcher = property(_get_genematcher, _set_genematcher)
300
301
302KEGGOrganism = Organism
303
304
305def organism_name_search(name):
306    return KEGGOrganism.organism_name_search(name)
307
308
309def pathways(org):
310    return KEGGPathway.list(org)
311
312
313def organisms():
314    return KEGGOrganism.organisms()
315
316
317def from_taxid(taxid):
318    genome = KEGGGenome()
319    res = genome.search(taxid)
320    for r in res:
321        e = genome[r]
322
323        if e.taxid in [taxid, genome.TAXID_MAP.get(taxid, taxid)]:
324            return e.org_code()
325
326    return None
327
328
329def to_taxid(name):
330    genome = KEGGGenome()
331    if name in genome:
332        return genome[name].taxid
333
334    keys = genome.search(name)
335    if keys:
336        return genome[keys[0]].taxid
337    else:
338        return None
339
340
341def create_gene_sets():
342    pass
343
344
345def main():
346    KEGGGenome()
347    import doctest
348    extraglobs = {"api": KeggApi()}
349    doctest.testmod(optionflags=doctest.ELLIPSIS, extraglobs=extraglobs)
350
351
352if __name__ == "__main__":
353    sys.exit(main())
Note: See TracBrowser for help on using the repository browser.