source: orange-bioinformatics/_bioinformatics/obiKEGG/__init__.py @ 1734:91d14dd2cf0e

Revision 1734:91d14dd2cf0e, 10.4 KB checked in by Ales Erjavec <ales.erjavec@…>, 14 months ago (diff)

obiKEGG code style fixes.

RevLine 
[1532]1"""\
2==============================================
3KEGG - Kyoto Encyclopedia of Genes and Genomes
4==============================================
5
[1733]6This is a python module for access to `KEGG`_ using its web services.
7
8To use this module you need to have `slumber` and `requests` package
9installed.
[1532]10
11.. _`KEGG`: http://www.genome.jp/kegg/
12
[1603]13
[1532]14"""
15from __future__ import absolute_import
16
[1733]17import os
18import sys
[1715]19import urllib2
[1733]20
[1532]21from collections import defaultdict
22
23from datetime import datetime
24
[1716]25from Orange.utils import lru_cache
[1733]26from Orange.utils import progress_bar_milestones
[1734]27from Orange.utils import (
28    deprecated_keywords, deprecated_attribute, deprecated_function_name
29)
[1733]30
31from .. import obiProb
[1532]32
33from . import databases
34from . import entry
35
36from .brite import BriteEntry, Brite
37
38from . import api
39from . import conf
40from . import pathway
41
42KEGGGenome = databases.Genome
43KEGGGenes = databases.Genes
[1733]44KEGGEnzyme = databases.Enzyme
45KEGGReaction = databases.Reaction
46KEGGPathways = databases.Pathway
47KEGGCompound = databases.Compound
[1532]48
49KEGGBrite = Brite
50KEGGBriteEntry = BriteEntry
51
52KEGGPathway = pathway.Pathway
53
54DEFAULT_CACHE_DIR = conf.params["cache.path"]
55
56
[1733]57class OrganismNotFoundError(Exception):
58    pass
[1532]59
[1712]60
[1532]61class Organism(object):
[1733]62    """
63    A convenience class for retrieving information regarding an
64    organism in the KEGG Genes database.
65
66    :param org: KEGGG organism code (e.g. "hsa", "sce")
67    :type org: str
68
69    """
[1532]70    def __init__(self, org, genematcher=None):
71        self.org_code = self.organism_name_search(org)
72        self.genematcher = genematcher
73        self.api = api.CachedKeggApi()
[1733]74
[1532]75    @property
76    def org(self):
[1733]77        """
78        KEGG organism code.
79        """
[1532]80        return self.org_code
[1733]81
[1532]82    @property
83    def genes(self):
[1733]84        """
85        An :class:`Genes` database instance for this organism.
86        """
87        # TODO: This should not be a property but a method.
88        # I think it was only put here as back compatibility with old obiKEGG.
[1532]89        if not hasattr(self, "_genes"):
90            genes = KEGGGenes(self.org_code)
91            self._genes = genes
92        return self._genes
[1733]93
[1532]94    def gene_aliases(self):
[1733]95        """
96        Return known gene aliases (synonyms in other databases).
97        """
98        return self.genes.gene_aliases()
99
[1532]100    def pathways(self, with_ids=None):
[1733]101        """
102        Return a list of all pathways for this organism.
103        """
[1532]104        if with_ids is not None:
105            return self.api.get_pathways_by_genes(with_ids)
106        else:
107            return [p.entry_id for p in self.api.list_pathways(self.org_code)]
[1734]108
[1532]109    def list_pathways(self):
[1733]110        """
111        List all pathways.
112        """
113        # NOTE: remove/deprecate and use pathways()
[1532]114        return self.pathways()
[1734]115
[1532]116    def get_linked_pathways(self, pathway_id):
117        self.api.get_linked_pathways(pathway_id)
[1734]118
[1532]119    def enzymes(self, genes=None):
120        raise NotImplementedError()
[1734]121
[1715]122    def _gm_gene_aliases(self):
123        """
124        Return a list of sets of equal genes. This is a hack for
125        gene matchers to work faster until the whole implementations
126        transitions to REST. Does not include links to DBs.
127        """
128        s1 = urllib2.urlopen("http://rest.kegg.jp/list/%s" % self.org_code).read()
129        out = []
130        for l in s1.split('\n'):
131            if l:
132                tabs = l.split("\t")
133                cset = set([tabs[0]])
[1733]134
135                if ":" in tabs[0]:
136                    # also add 'identifier' from 'org_code:identifier'
137                    cset.add(tabs[0].split(":", 1)[-1])
138
[1715]139                try:
140                    rest = tabs[1].split(";")[0]
141                    cset |= set(rest.split(", "))
142                except:
[1733]143                    pass  # do not crash if a line does not conform
[1715]144                out.append(cset)
145        return out
146
[1733]147    def get_enriched_pathways(self, genes, reference=None,
148                              prob=obiProb.Binomial(), callback=None):
149        """
150        Return a dictionary with enriched pathways ids as keys
151        and (list_of_genes, p_value, num_of_reference_genes) tuples
[1540]152        as items.
[1733]153
[1540]154        """
[1733]155        if reference is None:
156            reference = self.genes.keys()
157        reference = set(reference)
158
159        allPathways = defaultdict(lambda: [[], 1.0, []])
160        milestones = progress_bar_milestones(len(genes), 100)
[1532]161        pathways_db = KEGGPathways()
[1733]162
[1540]163        pathways_for_gene = []
[1532]164        for i, gene in enumerate(genes):
[1540]165            pathways_for_gene.append(self.pathways([gene]))
166            if callback and i in milestones:
[1733]167                callback(i * 50.0 / len(genes))
168
169        # pre-cache for speed
170        pathways_db.pre_cache([pid for pfg in pathways_for_gene
171                               for pid in pfg])
[1540]172        for i, (gene, pathways) in enumerate(zip(genes, pathways_for_gene)):
[1532]173            for pathway in pathways:
[1733]174                if pathways_db.get_entry(pathway).gene:
[1532]175                    allPathways[pathway][0].append(gene)
176            if callback and i in milestones:
[1733]177                callback(50.0 + i * 50.0 / len(genes))
178
[1532]179        pItems = allPathways.items()
[1733]180
[1532]181        for i, (p_id, entry) in enumerate(pItems):
182            pathway = pathways_db.get_entry(p_id)
183            entry[2].extend(reference.intersection(pathway.gene or []))
[1733]184            entry[1] = prob.p_value(len(entry[0]), len(reference),
185                                    len(entry[2]), len(genes))
186        return dict([(pid, (genes, p, len(ref)))
187                     for pid, (genes, p, ref) in allPathways.items()])
188
[1532]189    def get_genes_by_enzyme(self, enzyme):
[1733]190        enzyme = KEGGEnzyme().get_entry(enzyme)
[1532]191        return enzyme.genes.get(self.org_code, []) if enzyme.genes else []
[1733]192
[1532]193    def get_genes_by_pathway(self, pathway_id):
[1550]194        return KEGGPathway(pathway_id).genes()
[1733]195
[1532]196    def get_enzymes_by_pathway(self, pathway_id):
[1550]197        return KEGGPathway(pathway_id).enzymes()
[1734]198
[1532]199    def get_compounds_by_pathway(self, pathway_id):
[1550]200        return KEGGPathway(pathway_id).compounds()
[1734]201
[1532]202    def get_pathways_by_genes(self, gene_ids):
203        return self.api.get_pathways_by_genes(gene_ids)
204        gene_ids = set(gene_ids)
[1733]205        pathways = [self.genes[id].pathway for id in gene_ids
206                    if self.genes[id].pathway]
[1532]207        pathways = reduce(set.union, pathways, set())
[1733]208        return [id for id in pathways
209                if gene_ids.issubset(KEGGPathway(id).genes())]
210
[1532]211    def get_pathways_by_enzymes(self, enzyme_ids):
212        enzyme_ids = set(enzyme_ids)
[1733]213        pathways = [KEGGEnzyme()[id].pathway for id in enzyme_ids]
214        pathways = reduce(set.union, pathways, set())
215        return [id for id in pathways
216                if enzyme_ids.issubset(KEGGPathway(id).enzymes())]
217
[1532]218    def get_pathways_by_compounds(self, compound_ids):
219        compound_ids = set(compound_ids)
[1733]220        pathways = [KEGGCompound()[id].pathway for id in compound_ids]
221        pathways = reduce(set.union, pathways, set())
222        return [id for id in pathways
223                if compound_ids.issubset(KEGGPathway(id).compounds())]
224
[1532]225    def get_enzymes_by_compound(self, compound_id):
226        return KEGGCompound()[compound_id].enzyme
[1733]227
[1532]228    def get_enzymes_by_gene(self, gene_id):
229        return self.genes[gene_id].enzymes
[1733]230
[1532]231    def get_compounds_by_enzyme(self, enzyme_id):
232        return self._enzymes_to_compounds.get(enzyme_id)
[1734]233
[1532]234    @deprecated_keywords({"caseSensitive": "case_sensitive"})
235    def get_unique_gene_ids(self, genes, case_sensitive=True):
[1733]236        """
237        Return a tuple with three elements. The first is a dictionary
238        mapping from unique geneids to gene names in genes, the second
239        is a list of conflicting gene names and the third is a list of
240        unknown genes.
241
[1532]242        """
243        unique, conflicting, unknown = {}, [], []
244        for gene in genes:
245            names = self.genematcher.match(gene)
246            if len(names) == 1:
247                unique[names[0]] = gene
248            elif len(names) == 0:
249                unknown.append(gene)
250            else:
251                conflicting.append(gene)
252        return unique, conflicting, unknown
[1733]253
254    @deprecated_function_name
[1532]255    def get_genes(self):
256        return self.genes
[1733]257
[1532]258    @classmethod
259    def organism_name_search(cls, name):
260        genome = KEGGGenome()
261        if name not in genome:
262            ids = genome.search(name)
263            if not ids:
[1632]264                from .. import obiTaxonomy
[1532]265                ids = obiTaxonomy.search(name)
266                ids = [id for id in ids if genome.search(id)]
267            name = ids.pop(0) if ids else name
[1734]268
[1532]269        try:
[1733]270            return genome[name].organism_code
[1532]271        except KeyError:
[1712]272            raise OrganismNotFoundError(name)
[1734]273
[1532]274    @classmethod
275    def organism_version(cls, name):
276        name = cls.organism_name_search(name)
277        genome = KEGGGenome()
[1733]278        info = genome.api.info(name)
[1532]279        return info.release
[1733]280
[1532]281    def _set_genematcher(self, genematcher):
282        setattr(self, "_genematcher", genematcher)
[1734]283
[1532]284    def _get_genematcher(self):
[1734]285        if getattr(self, "_genematcher", None) is None:
[1632]286            from .. import obiGene
[1532]287            if self.org_code == "ddi":
[1733]288                self._genematcher = obiGene.matcher(
289                    [obiGene.GMKEGG(self.org_code), obiGene.GMDicty(),
[1734]290                     [obiGene.GMKEGG(self.org_code), obiGene.GMDicty()]]
[1733]291                )
[1532]292            else:
[1733]293                self._genematcher = obiGene.matcher(
294                    [obiGene.GMKEGG(self.org_code)])
295
[1532]296            self._genematcher.set_targets(self.genes.keys())
297        return self._genematcher
[1734]298
[1532]299    genematcher = property(_get_genematcher, _set_genematcher)
[1733]300
301
[1532]302KEGGOrganism = Organism
[1733]303
304
[1532]305def organism_name_search(name):
306    return KEGGOrganism.organism_name_search(name)
307
[1734]308
[1532]309def pathways(org):
310    return KEGGPathway.list(org)
311
[1734]312
[1532]313def organisms():
314    return KEGGOrganism.organisms()
315
[1734]316
[1532]317def from_taxid(taxid):
318    genome = KEGGGenome()
319    res = genome.search(taxid)
320    for r in res:
321        e = genome[r]
[1734]322
323        if e.taxid in [taxid, genome.TAXID_MAP.get(taxid, taxid)]:
[1532]324            return e.org_code()
[1733]325
[1532]326    return None
327
[1734]328
[1532]329def to_taxid(name):
330    genome = KEGGGenome()
[1552]331    if name in genome:
332        return genome[name].taxid
[1733]333
[1532]334    keys = genome.search(name)
335    if keys:
336        return genome[keys[0]].taxid
337    else:
338        return None
339
[1734]340
[1532]341def create_gene_sets():
342    pass
343
[1733]344
[1532]345def main():
346    KEGGGenome()
347    import doctest
348    extraglobs = {"api": KeggApi()}
349    doctest.testmod(optionflags=doctest.ELLIPSIS, extraglobs=extraglobs)
350
[1733]351
[1532]352if __name__ == "__main__":
[1601]353    sys.exit(main())
Note: See TracBrowser for help on using the repository browser.