source: orange-bioinformatics/_bioinformatics/obiKEGG/__init__.py @ 1747:fee3c71716ef

Revision 1747:fee3c71716ef, 12.4 KB checked in by markotoplak, 12 months ago (diff)

KEGG organism name finding fixes.

RevLine 
[1532]1"""\
2==============================================
3KEGG - Kyoto Encyclopedia of Genes and Genomes
4==============================================
5
[1736]6:mod:`obiKEGG` is a python module for accessing `KEGG (Kyoto Encyclopedia
7of Genes and Genomes) <http://www.genome.jp/kegg/>`_ using its web services.
[1733]8
[1736]9.. note:: To use this module you need to have `slumber`_ and `requests`_
10          package installed.
[1532]11
[1736]12.. _`slumber`: https://pypi.python.org/pypi/slumber/
[1532]13
[1736]14.. _`requests`: https://pypi.python.org/pypi/requests
15
16
17>>> # Create a KEGG Genes database interface
18>>> genome = KEGGGenome()
19>>> # List all available entry ids
20>>> keys = genome.keys()
21>>> print keys[0]
22T01001
23>>> # Retrieve the entry for the key.
24>>> entry = genome[keys[0]]
25>>> print entry.entry_key
26T01001
27>>> print entry.definition
28Homo sapiens (human)
29>>> print str(entry)
30ENTRY       T01001            Complete  Genome
31NAME        hsa, HUMAN, 9606
32DEFINITION  Homo sapiens (human)
33...
34
35The :class:`Organism` class can be used as a convenient starting point
36for organism specific databases.
37
38>>> organism = Organism("Homo sapiens")  # searches for the organism by name
39>>> print organism.org_code  # prints the KEGG organism code
40hsa
41>>> genes = organism.genes  # get the genes database for the organism
42>>> gene_ids = genes.keys() # KEGG gene identifiers
43>>> entry = genes["hsa:672"]
44>>> print entry.definition
45breast cancer 1, early onset
46>>> print entry  # print the entry in DBGET database format.
47ENTRY       672               CDS       T01001
48NAME        BRCA1, BRCAI, BRCC1, BROVCA1, IRIS, PNCA4, PPP1R53, PSCP, RNF53
49DEFINITION  breast cancer 1, early onset
50...
[1603]51
[1532]52"""
53from __future__ import absolute_import
54
[1733]55import os
56import sys
[1715]57import urllib2
[1733]58
[1532]59from collections import defaultdict
[1735]60from itertools import chain
[1532]61from datetime import datetime
62
[1716]63from Orange.utils import lru_cache
[1733]64from Orange.utils import progress_bar_milestones
[1734]65from Orange.utils import (
66    deprecated_keywords, deprecated_attribute, deprecated_function_name
67)
[1733]68
69from .. import obiProb
[1532]70
71from . import databases
72from . import entry
73
74from .brite import BriteEntry, Brite
75
76from . import api
77from . import conf
78from . import pathway
79
80KEGGGenome = databases.Genome
81KEGGGenes = databases.Genes
[1733]82KEGGEnzyme = databases.Enzyme
83KEGGReaction = databases.Reaction
84KEGGPathways = databases.Pathway
85KEGGCompound = databases.Compound
[1532]86
87KEGGBrite = Brite
88KEGGBriteEntry = BriteEntry
89
90KEGGPathway = pathway.Pathway
91
92DEFAULT_CACHE_DIR = conf.params["cache.path"]
93
94
[1733]95class OrganismNotFoundError(Exception):
96    pass
[1532]97
[1712]98
[1532]99class Organism(object):
[1733]100    """
101    A convenience class for retrieving information regarding an
102    organism in the KEGG Genes database.
103
[1736]104    :param org: KEGG organism code (e.g. "hsa", "sce"). Can also be a
105        descriptive name (e.g. 'yeast', "homo sapiens") in which case the
106        organism code will be searched for by using KEGG `find` api.
[1733]107    :type org: str
108
[1736]109    .. seealso::
110
111        :func:`organism_name_search`
112            Search KEGG for an organism code
113
[1733]114    """
[1532]115    def __init__(self, org, genematcher=None):
116        self.org_code = self.organism_name_search(org)
117        self.genematcher = genematcher
118        self.api = api.CachedKeggApi()
[1733]119
[1532]120    @property
121    def org(self):
[1733]122        """
123        KEGG organism code.
124        """
[1532]125        return self.org_code
[1733]126
[1532]127    @property
128    def genes(self):
[1733]129        """
[1736]130        An :class:`~.databases.Genes` database instance for this organism.
[1733]131        """
132        # TODO: This should not be a property but a method.
133        # I think it was only put here as back compatibility with old obiKEGG.
[1532]134        if not hasattr(self, "_genes"):
135            genes = KEGGGenes(self.org_code)
136            self._genes = genes
137        return self._genes
[1733]138
[1532]139    def gene_aliases(self):
[1733]140        """
[1735]141        Return a list of sets of equal genes (synonyms) in KEGG for
142        this organism.
143
144        .. note::
145
146            This only includes 'ncbi-geneid' and 'ncbi-gi' records
147            from the KEGG Genes DBLINKS entries.
148
[1733]149        """
[1735]150        definitions = self.api.list(self.org_code)
151        ncbi_geneid = self.api.conv(self.org_code, "ncbi-geneid")
152        ncbi_gi = self.api.conv(self.org_code, "ncbi-gi")
153
154        aliases = defaultdict(set)
155
156        for entry_id, definition in definitions:
157            # genes entry id without the organism code
158            aliases[entry_id].add(entry_id.split(":", 1)[1])
159            # all names in the NAME field (KEGG API list returns
160            # 'NAME; DEFINITION') fields for genes
161            names = definition.split(";")[0].split(",")
162            aliases[entry_id].update([name.strip() for name in names])
163
164        for source_id, target_id in chain(ncbi_geneid, ncbi_gi):
165            aliases[target_id].add(source_id.split(":", 1)[1])
166
167        return [set([entry_id]).union(names)
168                for entry_id, names in aliases.iteritems()]
[1733]169
[1532]170    def pathways(self, with_ids=None):
[1733]171        """
172        Return a list of all pathways for this organism.
173        """
[1532]174        if with_ids is not None:
175            return self.api.get_pathways_by_genes(with_ids)
176        else:
177            return [p.entry_id for p in self.api.list_pathways(self.org_code)]
[1734]178
[1532]179    def list_pathways(self):
[1733]180        """
[1736]181        List all pathways for this organism.
182
183        .. deprecated: 2.5
184            Use :func:`pathways` instead.
185
[1733]186        """
187        # NOTE: remove/deprecate and use pathways()
[1532]188        return self.pathways()
[1734]189
[1532]190    def get_linked_pathways(self, pathway_id):
191        self.api.get_linked_pathways(pathway_id)
[1734]192
[1532]193    def enzymes(self, genes=None):
194        raise NotImplementedError()
[1734]195
[1733]196    def get_enriched_pathways(self, genes, reference=None,
197                              prob=obiProb.Binomial(), callback=None):
198        """
199        Return a dictionary with enriched pathways ids as keys
200        and (list_of_genes, p_value, num_of_reference_genes) tuples
[1540]201        as items.
[1733]202
[1540]203        """
[1733]204        if reference is None:
205            reference = self.genes.keys()
206        reference = set(reference)
207
208        allPathways = defaultdict(lambda: [[], 1.0, []])
209        milestones = progress_bar_milestones(len(genes), 100)
[1532]210        pathways_db = KEGGPathways()
[1733]211
[1540]212        pathways_for_gene = []
[1532]213        for i, gene in enumerate(genes):
[1540]214            pathways_for_gene.append(self.pathways([gene]))
215            if callback and i in milestones:
[1733]216                callback(i * 50.0 / len(genes))
217
218        # pre-cache for speed
219        pathways_db.pre_cache([pid for pfg in pathways_for_gene
220                               for pid in pfg])
[1540]221        for i, (gene, pathways) in enumerate(zip(genes, pathways_for_gene)):
[1532]222            for pathway in pathways:
[1733]223                if pathways_db.get_entry(pathway).gene:
[1532]224                    allPathways[pathway][0].append(gene)
225            if callback and i in milestones:
[1733]226                callback(50.0 + i * 50.0 / len(genes))
227
[1532]228        pItems = allPathways.items()
[1733]229
[1532]230        for i, (p_id, entry) in enumerate(pItems):
231            pathway = pathways_db.get_entry(p_id)
232            entry[2].extend(reference.intersection(pathway.gene or []))
[1733]233            entry[1] = prob.p_value(len(entry[0]), len(reference),
234                                    len(entry[2]), len(genes))
235        return dict([(pid, (genes, p, len(ref)))
236                     for pid, (genes, p, ref) in allPathways.items()])
237
[1532]238    def get_genes_by_enzyme(self, enzyme):
[1733]239        enzyme = KEGGEnzyme().get_entry(enzyme)
[1532]240        return enzyme.genes.get(self.org_code, []) if enzyme.genes else []
[1733]241
[1532]242    def get_genes_by_pathway(self, pathway_id):
[1550]243        return KEGGPathway(pathway_id).genes()
[1733]244
[1532]245    def get_enzymes_by_pathway(self, pathway_id):
[1550]246        return KEGGPathway(pathway_id).enzymes()
[1734]247
[1532]248    def get_compounds_by_pathway(self, pathway_id):
[1550]249        return KEGGPathway(pathway_id).compounds()
[1734]250
[1532]251    def get_pathways_by_genes(self, gene_ids):
252        return self.api.get_pathways_by_genes(gene_ids)
253        gene_ids = set(gene_ids)
[1733]254        pathways = [self.genes[id].pathway for id in gene_ids
255                    if self.genes[id].pathway]
[1532]256        pathways = reduce(set.union, pathways, set())
[1733]257        return [id for id in pathways
258                if gene_ids.issubset(KEGGPathway(id).genes())]
259
[1532]260    def get_pathways_by_enzymes(self, enzyme_ids):
261        enzyme_ids = set(enzyme_ids)
[1733]262        pathways = [KEGGEnzyme()[id].pathway for id in enzyme_ids]
263        pathways = reduce(set.union, pathways, set())
264        return [id for id in pathways
265                if enzyme_ids.issubset(KEGGPathway(id).enzymes())]
266
[1532]267    def get_pathways_by_compounds(self, compound_ids):
268        compound_ids = set(compound_ids)
[1733]269        pathways = [KEGGCompound()[id].pathway for id in compound_ids]
270        pathways = reduce(set.union, pathways, set())
271        return [id for id in pathways
272                if compound_ids.issubset(KEGGPathway(id).compounds())]
273
[1532]274    def get_enzymes_by_compound(self, compound_id):
275        return KEGGCompound()[compound_id].enzyme
[1733]276
[1532]277    def get_enzymes_by_gene(self, gene_id):
278        return self.genes[gene_id].enzymes
[1733]279
[1532]280    def get_compounds_by_enzyme(self, enzyme_id):
281        return self._enzymes_to_compounds.get(enzyme_id)
[1734]282
[1532]283    @deprecated_keywords({"caseSensitive": "case_sensitive"})
284    def get_unique_gene_ids(self, genes, case_sensitive=True):
[1733]285        """
286        Return a tuple with three elements. The first is a dictionary
287        mapping from unique geneids to gene names in genes, the second
288        is a list of conflicting gene names and the third is a list of
289        unknown genes.
290
[1532]291        """
292        unique, conflicting, unknown = {}, [], []
293        for gene in genes:
294            names = self.genematcher.match(gene)
295            if len(names) == 1:
296                unique[names[0]] = gene
297            elif len(names) == 0:
298                unknown.append(gene)
299            else:
300                conflicting.append(gene)
301        return unique, conflicting, unknown
[1733]302
303    @deprecated_function_name
[1532]304    def get_genes(self):
305        return self.genes
[1733]306
[1532]307    @classmethod
308    def organism_name_search(cls, name):
309        genome = KEGGGenome()
[1738]310        try:
311            name = genome.org_code_to_entry_key(name)
[1745]312        except ValueError:
[1738]313            pass
314
[1532]315        if name not in genome:
316            ids = genome.search(name)
[1747]317            if ids:
318                name = ids.pop(0) if ids else name
319            else:
320                raise OrganismNotFoundError(name)
[1734]321
[1532]322        try:
[1733]323            return genome[name].organism_code
[1532]324        except KeyError:
[1712]325            raise OrganismNotFoundError(name)
[1734]326
[1532]327    @classmethod
328    def organism_version(cls, name):
329        name = cls.organism_name_search(name)
330        genome = KEGGGenome()
[1733]331        info = genome.api.info(name)
[1532]332        return info.release
[1733]333
[1532]334    def _set_genematcher(self, genematcher):
335        setattr(self, "_genematcher", genematcher)
[1734]336
[1532]337    def _get_genematcher(self):
[1734]338        if getattr(self, "_genematcher", None) is None:
[1632]339            from .. import obiGene
[1532]340            if self.org_code == "ddi":
[1733]341                self._genematcher = obiGene.matcher(
342                    [obiGene.GMKEGG(self.org_code), obiGene.GMDicty(),
[1734]343                     [obiGene.GMKEGG(self.org_code), obiGene.GMDicty()]]
[1733]344                )
[1532]345            else:
[1733]346                self._genematcher = obiGene.matcher(
347                    [obiGene.GMKEGG(self.org_code)])
348
[1532]349            self._genematcher.set_targets(self.genes.keys())
350        return self._genematcher
[1734]351
[1532]352    genematcher = property(_get_genematcher, _set_genematcher)
[1733]353
354
[1532]355KEGGOrganism = Organism
[1733]356
357
[1532]358def organism_name_search(name):
[1736]359    """
360    Search and organism by `name` and return an KEGG organism code.
361    """
[1532]362    return KEGGOrganism.organism_name_search(name)
363
[1734]364
[1532]365def pathways(org):
[1736]366    """
367    Return a list of all KEGG pathways for an KEGG organism code `org`.
368    """
[1532]369    return KEGGPathway.list(org)
370
[1734]371
[1532]372def organisms():
[1736]373    """
374    Return a list of all KEGG organisms.
375    """
[1532]376    return KEGGOrganism.organisms()
377
[1734]378
[1532]379def from_taxid(taxid):
[1736]380    """
381    Return a KEGG organism code for a an NCBI Taxonomy id string `taxid`.
382    """
[1532]383    genome = KEGGGenome()
384    res = genome.search(taxid)
385    for r in res:
386        e = genome[r]
[1734]387
388        if e.taxid in [taxid, genome.TAXID_MAP.get(taxid, taxid)]:
[1532]389            return e.org_code()
[1733]390
[1532]391    return None
392
[1734]393
[1532]394def to_taxid(name):
[1736]395    """
396    Return a NCBI Taxonomy id for a given KEGG Organism name
397    """
[1532]398    genome = KEGGGenome()
[1552]399    if name in genome:
400        return genome[name].taxid
[1733]401
[1532]402    keys = genome.search(name)
403    if keys:
404        return genome[keys[0]].taxid
405    else:
406        return None
407
[1734]408
[1532]409def main():
410    KEGGGenome()
411    import doctest
412    extraglobs = {"api": KeggApi()}
413    doctest.testmod(optionflags=doctest.ELLIPSIS, extraglobs=extraglobs)
414
[1733]415
[1532]416if __name__ == "__main__":
[1601]417    sys.exit(main())
Note: See TracBrowser for help on using the repository browser.