source: orange-bioinformatics/_bioinformatics/obiKEGG/__init__.py @ 1735:50499d1dc55a

Revision 1735:50499d1dc55a, 10.5 KB checked in by Ales Erjavec <ales.erjavec@…>, 13 months ago (diff)

Changed the Organism.gene_aliases method.

RevLine 
[1532]1"""\
2==============================================
3KEGG - Kyoto Encyclopedia of Genes and Genomes
4==============================================
5
[1733]6This is a python module for access to `KEGG`_ using its web services.
7
8To use this module you need to have `slumber` and `requests` package
9installed.
[1532]10
11.. _`KEGG`: http://www.genome.jp/kegg/
12
[1603]13
[1532]14"""
15from __future__ import absolute_import
16
[1733]17import os
18import sys
[1715]19import urllib2
[1733]20
[1532]21from collections import defaultdict
[1735]22from itertools import chain
[1532]23from datetime import datetime
24
[1716]25from Orange.utils import lru_cache
[1733]26from Orange.utils import progress_bar_milestones
[1734]27from Orange.utils import (
28    deprecated_keywords, deprecated_attribute, deprecated_function_name
29)
[1733]30
31from .. import obiProb
[1532]32
33from . import databases
34from . import entry
35
36from .brite import BriteEntry, Brite
37
38from . import api
39from . import conf
40from . import pathway
41
42KEGGGenome = databases.Genome
43KEGGGenes = databases.Genes
[1733]44KEGGEnzyme = databases.Enzyme
45KEGGReaction = databases.Reaction
46KEGGPathways = databases.Pathway
47KEGGCompound = databases.Compound
[1532]48
49KEGGBrite = Brite
50KEGGBriteEntry = BriteEntry
51
52KEGGPathway = pathway.Pathway
53
54DEFAULT_CACHE_DIR = conf.params["cache.path"]
55
56
[1733]57class OrganismNotFoundError(Exception):
58    pass
[1532]59
[1712]60
[1532]61class Organism(object):
[1733]62    """
63    A convenience class for retrieving information regarding an
64    organism in the KEGG Genes database.
65
66    :param org: KEGGG organism code (e.g. "hsa", "sce")
67    :type org: str
68
69    """
[1532]70    def __init__(self, org, genematcher=None):
71        self.org_code = self.organism_name_search(org)
72        self.genematcher = genematcher
73        self.api = api.CachedKeggApi()
[1733]74
[1532]75    @property
76    def org(self):
[1733]77        """
78        KEGG organism code.
79        """
[1532]80        return self.org_code
[1733]81
[1532]82    @property
83    def genes(self):
[1733]84        """
85        An :class:`Genes` database instance for this organism.
86        """
87        # TODO: This should not be a property but a method.
88        # I think it was only put here as back compatibility with old obiKEGG.
[1532]89        if not hasattr(self, "_genes"):
90            genes = KEGGGenes(self.org_code)
91            self._genes = genes
92        return self._genes
[1733]93
[1532]94    def gene_aliases(self):
[1733]95        """
[1735]96        Return a list of sets of equal genes (synonyms) in KEGG for
97        this organism.
98
99        .. note::
100
101            This only includes 'ncbi-geneid' and 'ncbi-gi' records
102            from the KEGG Genes DBLINKS entries.
103
[1733]104        """
[1735]105        definitions = self.api.list(self.org_code)
106        ncbi_geneid = self.api.conv(self.org_code, "ncbi-geneid")
107        ncbi_gi = self.api.conv(self.org_code, "ncbi-gi")
108
109        aliases = defaultdict(set)
110
111        for entry_id, definition in definitions:
112            # genes entry id without the organism code
113            aliases[entry_id].add(entry_id.split(":", 1)[1])
114            # all names in the NAME field (KEGG API list returns
115            # 'NAME; DEFINITION') fields for genes
116            names = definition.split(";")[0].split(",")
117            aliases[entry_id].update([name.strip() for name in names])
118
119        for source_id, target_id in chain(ncbi_geneid, ncbi_gi):
120            aliases[target_id].add(source_id.split(":", 1)[1])
121
122        return [set([entry_id]).union(names)
123                for entry_id, names in aliases.iteritems()]
[1733]124
[1532]125    def pathways(self, with_ids=None):
[1733]126        """
127        Return a list of all pathways for this organism.
128        """
[1532]129        if with_ids is not None:
130            return self.api.get_pathways_by_genes(with_ids)
131        else:
132            return [p.entry_id for p in self.api.list_pathways(self.org_code)]
[1734]133
[1532]134    def list_pathways(self):
[1733]135        """
136        List all pathways.
137        """
138        # NOTE: remove/deprecate and use pathways()
[1532]139        return self.pathways()
[1734]140
[1532]141    def get_linked_pathways(self, pathway_id):
142        self.api.get_linked_pathways(pathway_id)
[1734]143
[1532]144    def enzymes(self, genes=None):
145        raise NotImplementedError()
[1734]146
[1733]147    def get_enriched_pathways(self, genes, reference=None,
148                              prob=obiProb.Binomial(), callback=None):
149        """
150        Return a dictionary with enriched pathways ids as keys
151        and (list_of_genes, p_value, num_of_reference_genes) tuples
[1540]152        as items.
[1733]153
[1540]154        """
[1733]155        if reference is None:
156            reference = self.genes.keys()
157        reference = set(reference)
158
159        allPathways = defaultdict(lambda: [[], 1.0, []])
160        milestones = progress_bar_milestones(len(genes), 100)
[1532]161        pathways_db = KEGGPathways()
[1733]162
[1540]163        pathways_for_gene = []
[1532]164        for i, gene in enumerate(genes):
[1540]165            pathways_for_gene.append(self.pathways([gene]))
166            if callback and i in milestones:
[1733]167                callback(i * 50.0 / len(genes))
168
169        # pre-cache for speed
170        pathways_db.pre_cache([pid for pfg in pathways_for_gene
171                               for pid in pfg])
[1540]172        for i, (gene, pathways) in enumerate(zip(genes, pathways_for_gene)):
[1532]173            for pathway in pathways:
[1733]174                if pathways_db.get_entry(pathway).gene:
[1532]175                    allPathways[pathway][0].append(gene)
176            if callback and i in milestones:
[1733]177                callback(50.0 + i * 50.0 / len(genes))
178
[1532]179        pItems = allPathways.items()
[1733]180
[1532]181        for i, (p_id, entry) in enumerate(pItems):
182            pathway = pathways_db.get_entry(p_id)
183            entry[2].extend(reference.intersection(pathway.gene or []))
[1733]184            entry[1] = prob.p_value(len(entry[0]), len(reference),
185                                    len(entry[2]), len(genes))
186        return dict([(pid, (genes, p, len(ref)))
187                     for pid, (genes, p, ref) in allPathways.items()])
188
[1532]189    def get_genes_by_enzyme(self, enzyme):
[1733]190        enzyme = KEGGEnzyme().get_entry(enzyme)
[1532]191        return enzyme.genes.get(self.org_code, []) if enzyme.genes else []
[1733]192
[1532]193    def get_genes_by_pathway(self, pathway_id):
[1550]194        return KEGGPathway(pathway_id).genes()
[1733]195
[1532]196    def get_enzymes_by_pathway(self, pathway_id):
[1550]197        return KEGGPathway(pathway_id).enzymes()
[1734]198
[1532]199    def get_compounds_by_pathway(self, pathway_id):
[1550]200        return KEGGPathway(pathway_id).compounds()
[1734]201
[1532]202    def get_pathways_by_genes(self, gene_ids):
203        return self.api.get_pathways_by_genes(gene_ids)
204        gene_ids = set(gene_ids)
[1733]205        pathways = [self.genes[id].pathway for id in gene_ids
206                    if self.genes[id].pathway]
[1532]207        pathways = reduce(set.union, pathways, set())
[1733]208        return [id for id in pathways
209                if gene_ids.issubset(KEGGPathway(id).genes())]
210
[1532]211    def get_pathways_by_enzymes(self, enzyme_ids):
212        enzyme_ids = set(enzyme_ids)
[1733]213        pathways = [KEGGEnzyme()[id].pathway for id in enzyme_ids]
214        pathways = reduce(set.union, pathways, set())
215        return [id for id in pathways
216                if enzyme_ids.issubset(KEGGPathway(id).enzymes())]
217
[1532]218    def get_pathways_by_compounds(self, compound_ids):
219        compound_ids = set(compound_ids)
[1733]220        pathways = [KEGGCompound()[id].pathway for id in compound_ids]
221        pathways = reduce(set.union, pathways, set())
222        return [id for id in pathways
223                if compound_ids.issubset(KEGGPathway(id).compounds())]
224
[1532]225    def get_enzymes_by_compound(self, compound_id):
226        return KEGGCompound()[compound_id].enzyme
[1733]227
[1532]228    def get_enzymes_by_gene(self, gene_id):
229        return self.genes[gene_id].enzymes
[1733]230
[1532]231    def get_compounds_by_enzyme(self, enzyme_id):
232        return self._enzymes_to_compounds.get(enzyme_id)
[1734]233
[1532]234    @deprecated_keywords({"caseSensitive": "case_sensitive"})
235    def get_unique_gene_ids(self, genes, case_sensitive=True):
[1733]236        """
237        Return a tuple with three elements. The first is a dictionary
238        mapping from unique geneids to gene names in genes, the second
239        is a list of conflicting gene names and the third is a list of
240        unknown genes.
241
[1532]242        """
243        unique, conflicting, unknown = {}, [], []
244        for gene in genes:
245            names = self.genematcher.match(gene)
246            if len(names) == 1:
247                unique[names[0]] = gene
248            elif len(names) == 0:
249                unknown.append(gene)
250            else:
251                conflicting.append(gene)
252        return unique, conflicting, unknown
[1733]253
254    @deprecated_function_name
[1532]255    def get_genes(self):
256        return self.genes
[1733]257
[1532]258    @classmethod
259    def organism_name_search(cls, name):
260        genome = KEGGGenome()
261        if name not in genome:
262            ids = genome.search(name)
263            if not ids:
[1632]264                from .. import obiTaxonomy
[1532]265                ids = obiTaxonomy.search(name)
266                ids = [id for id in ids if genome.search(id)]
267            name = ids.pop(0) if ids else name
[1734]268
[1532]269        try:
[1733]270            return genome[name].organism_code
[1532]271        except KeyError:
[1712]272            raise OrganismNotFoundError(name)
[1734]273
[1532]274    @classmethod
275    def organism_version(cls, name):
276        name = cls.organism_name_search(name)
277        genome = KEGGGenome()
[1733]278        info = genome.api.info(name)
[1532]279        return info.release
[1733]280
[1532]281    def _set_genematcher(self, genematcher):
282        setattr(self, "_genematcher", genematcher)
[1734]283
[1532]284    def _get_genematcher(self):
[1734]285        if getattr(self, "_genematcher", None) is None:
[1632]286            from .. import obiGene
[1532]287            if self.org_code == "ddi":
[1733]288                self._genematcher = obiGene.matcher(
289                    [obiGene.GMKEGG(self.org_code), obiGene.GMDicty(),
[1734]290                     [obiGene.GMKEGG(self.org_code), obiGene.GMDicty()]]
[1733]291                )
[1532]292            else:
[1733]293                self._genematcher = obiGene.matcher(
294                    [obiGene.GMKEGG(self.org_code)])
295
[1532]296            self._genematcher.set_targets(self.genes.keys())
297        return self._genematcher
[1734]298
[1532]299    genematcher = property(_get_genematcher, _set_genematcher)
[1733]300
301
[1532]302KEGGOrganism = Organism
[1733]303
304
[1532]305def organism_name_search(name):
306    return KEGGOrganism.organism_name_search(name)
307
[1734]308
[1532]309def pathways(org):
310    return KEGGPathway.list(org)
311
[1734]312
[1532]313def organisms():
314    return KEGGOrganism.organisms()
315
[1734]316
[1532]317def from_taxid(taxid):
318    genome = KEGGGenome()
319    res = genome.search(taxid)
320    for r in res:
321        e = genome[r]
[1734]322
323        if e.taxid in [taxid, genome.TAXID_MAP.get(taxid, taxid)]:
[1532]324            return e.org_code()
[1733]325
[1532]326    return None
327
[1734]328
[1532]329def to_taxid(name):
330    genome = KEGGGenome()
[1552]331    if name in genome:
332        return genome[name].taxid
[1733]333
[1532]334    keys = genome.search(name)
335    if keys:
336        return genome[keys[0]].taxid
337    else:
338        return None
339
[1734]340
[1532]341def create_gene_sets():
342    pass
343
[1733]344
[1532]345def main():
346    KEGGGenome()
347    import doctest
348    extraglobs = {"api": KeggApi()}
349    doctest.testmod(optionflags=doctest.ELLIPSIS, extraglobs=extraglobs)
350
[1733]351
[1532]352if __name__ == "__main__":
[1601]353    sys.exit(main())
Note: See TracBrowser for help on using the repository browser.