source: orange-bioinformatics/_bioinformatics/obiKEGG/__init__.py @ 1834:91f840200ae3

Revision 1834:91f840200ae3, 13.0 KB checked in by Flashpoint <vid.flashpoint@…>, 10 months ago (diff)

Changed the exception value from KEGG.OrganismNotFound to obiTaxonomy.UnknownSpeciesIdentifier

RevLine 
[1750]1"""
[1532]2==============================================
3KEGG - Kyoto Encyclopedia of Genes and Genomes
4==============================================
5
[1736]6:mod:`obiKEGG` is a python module for accessing `KEGG (Kyoto Encyclopedia
7of Genes and Genomes) <http://www.genome.jp/kegg/>`_ using its web services.
[1733]8
[1736]9.. note:: To use this module you need to have `slumber`_ and `requests`_
10          package installed.
[1532]11
[1736]12.. _`slumber`: https://pypi.python.org/pypi/slumber/
[1532]13
[1736]14.. _`requests`: https://pypi.python.org/pypi/requests
15
16
17>>> # Create a KEGG Genes database interface
18>>> genome = KEGGGenome()
19>>> # List all available entry ids
20>>> keys = genome.keys()
21>>> print keys[0]
22T01001
23>>> # Retrieve the entry for the key.
24>>> entry = genome[keys[0]]
25>>> print entry.entry_key
26T01001
27>>> print entry.definition
28Homo sapiens (human)
29>>> print str(entry)
30ENTRY       T01001            Complete  Genome
31NAME        hsa, HUMAN, 9606
32DEFINITION  Homo sapiens (human)
33...
34
35The :class:`Organism` class can be used as a convenient starting point
36for organism specific databases.
37
38>>> organism = Organism("Homo sapiens")  # searches for the organism by name
39>>> print organism.org_code  # prints the KEGG organism code
40hsa
41>>> genes = organism.genes  # get the genes database for the organism
42>>> gene_ids = genes.keys() # KEGG gene identifiers
43>>> entry = genes["hsa:672"]
44>>> print entry.definition
45breast cancer 1, early onset
46>>> print entry  # print the entry in DBGET database format.
47ENTRY       672               CDS       T01001
48NAME        BRCA1, BRCAI, BRCC1, BROVCA1, IRIS, PNCA4, PPP1R53, PSCP, RNF53
49DEFINITION  breast cancer 1, early onset
50...
[1603]51
[1532]52"""
53from __future__ import absolute_import
54
[1733]55import os
56import sys
[1715]57import urllib2
[1750]58import threading
[1733]59
[1532]60from collections import defaultdict
[1735]61from itertools import chain
[1532]62from datetime import datetime
[1750]63from contextlib import contextmanager
[1532]64
[1716]65from Orange.utils import lru_cache
[1733]66from Orange.utils import progress_bar_milestones
[1734]67from Orange.utils import (
68    deprecated_keywords, deprecated_attribute, deprecated_function_name
69)
[1733]70
[1834]71from .. import obiProb, obiTaxonomy
[1532]72
73from . import databases
74from . import entry
75
76from .brite import BriteEntry, Brite
77
78from . import api
79from . import conf
80from . import pathway
81
82KEGGGenome = databases.Genome
83KEGGGenes = databases.Genes
[1733]84KEGGEnzyme = databases.Enzyme
85KEGGReaction = databases.Reaction
86KEGGPathways = databases.Pathway
87KEGGCompound = databases.Compound
[1532]88
89KEGGBrite = Brite
90KEGGBriteEntry = BriteEntry
91
92KEGGPathway = pathway.Pathway
93
94DEFAULT_CACHE_DIR = conf.params["cache.path"]
95
96
97class Organism(object):
[1733]98    """
99    A convenience class for retrieving information regarding an
100    organism in the KEGG Genes database.
101
[1736]102    :param org: KEGG organism code (e.g. "hsa", "sce"). Can also be a
103        descriptive name (e.g. 'yeast', "homo sapiens") in which case the
104        organism code will be searched for by using KEGG `find` api.
[1733]105    :type org: str
106
[1736]107    .. seealso::
108
109        :func:`organism_name_search`
110            Search KEGG for an organism code
111
[1733]112    """
[1532]113    def __init__(self, org, genematcher=None):
114        self.org_code = self.organism_name_search(org)
115        self.genematcher = genematcher
116        self.api = api.CachedKeggApi()
[1733]117
[1532]118    @property
119    def org(self):
[1733]120        """
121        KEGG organism code.
122        """
[1532]123        return self.org_code
[1733]124
[1532]125    @property
126    def genes(self):
[1733]127        """
[1736]128        An :class:`~.databases.Genes` database instance for this organism.
[1733]129        """
130        # TODO: This should not be a property but a method.
131        # I think it was only put here as back compatibility with old obiKEGG.
[1532]132        if not hasattr(self, "_genes"):
133            genes = KEGGGenes(self.org_code)
134            self._genes = genes
135        return self._genes
[1733]136
[1532]137    def gene_aliases(self):
[1733]138        """
[1735]139        Return a list of sets of equal genes (synonyms) in KEGG for
140        this organism.
141
142        .. note::
143
144            This only includes 'ncbi-geneid' and 'ncbi-gi' records
145            from the KEGG Genes DBLINKS entries.
146
[1733]147        """
[1735]148        definitions = self.api.list(self.org_code)
149        ncbi_geneid = self.api.conv(self.org_code, "ncbi-geneid")
150        ncbi_gi = self.api.conv(self.org_code, "ncbi-gi")
151
152        aliases = defaultdict(set)
153
154        for entry_id, definition in definitions:
155            # genes entry id without the organism code
156            aliases[entry_id].add(entry_id.split(":", 1)[1])
157            # all names in the NAME field (KEGG API list returns
158            # 'NAME; DEFINITION') fields for genes
159            names = definition.split(";")[0].split(",")
160            aliases[entry_id].update([name.strip() for name in names])
161
162        for source_id, target_id in chain(ncbi_geneid, ncbi_gi):
163            aliases[target_id].add(source_id.split(":", 1)[1])
164
165        return [set([entry_id]).union(names)
166                for entry_id, names in aliases.iteritems()]
[1733]167
[1532]168    def pathways(self, with_ids=None):
[1733]169        """
170        Return a list of all pathways for this organism.
171        """
[1532]172        if with_ids is not None:
173            return self.api.get_pathways_by_genes(with_ids)
174        else:
175            return [p.entry_id for p in self.api.list_pathways(self.org_code)]
[1734]176
[1532]177    def list_pathways(self):
[1733]178        """
[1736]179        List all pathways for this organism.
180
181        .. deprecated: 2.5
182            Use :func:`pathways` instead.
183
[1733]184        """
185        # NOTE: remove/deprecate and use pathways()
[1532]186        return self.pathways()
[1734]187
[1532]188    def get_linked_pathways(self, pathway_id):
189        self.api.get_linked_pathways(pathway_id)
[1734]190
[1532]191    def enzymes(self, genes=None):
192        raise NotImplementedError()
[1734]193
[1733]194    def get_enriched_pathways(self, genes, reference=None,
195                              prob=obiProb.Binomial(), callback=None):
196        """
197        Return a dictionary with enriched pathways ids as keys
198        and (list_of_genes, p_value, num_of_reference_genes) tuples
[1540]199        as items.
[1733]200
[1540]201        """
[1733]202        if reference is None:
203            reference = self.genes.keys()
204        reference = set(reference)
205
206        allPathways = defaultdict(lambda: [[], 1.0, []])
207        milestones = progress_bar_milestones(len(genes), 100)
[1532]208        pathways_db = KEGGPathways()
[1733]209
[1540]210        pathways_for_gene = []
[1532]211        for i, gene in enumerate(genes):
[1540]212            pathways_for_gene.append(self.pathways([gene]))
213            if callback and i in milestones:
[1733]214                callback(i * 50.0 / len(genes))
215
216        # pre-cache for speed
217        pathways_db.pre_cache([pid for pfg in pathways_for_gene
218                               for pid in pfg])
[1540]219        for i, (gene, pathways) in enumerate(zip(genes, pathways_for_gene)):
[1532]220            for pathway in pathways:
[1733]221                if pathways_db.get_entry(pathway).gene:
[1532]222                    allPathways[pathway][0].append(gene)
223            if callback and i in milestones:
[1733]224                callback(50.0 + i * 50.0 / len(genes))
225
[1532]226        pItems = allPathways.items()
[1733]227
[1532]228        for i, (p_id, entry) in enumerate(pItems):
229            pathway = pathways_db.get_entry(p_id)
230            entry[2].extend(reference.intersection(pathway.gene or []))
[1733]231            entry[1] = prob.p_value(len(entry[0]), len(reference),
232                                    len(entry[2]), len(genes))
233        return dict([(pid, (genes, p, len(ref)))
234                     for pid, (genes, p, ref) in allPathways.items()])
235
[1532]236    def get_genes_by_enzyme(self, enzyme):
[1733]237        enzyme = KEGGEnzyme().get_entry(enzyme)
[1532]238        return enzyme.genes.get(self.org_code, []) if enzyme.genes else []
[1733]239
[1532]240    def get_genes_by_pathway(self, pathway_id):
[1550]241        return KEGGPathway(pathway_id).genes()
[1733]242
[1532]243    def get_enzymes_by_pathway(self, pathway_id):
[1550]244        return KEGGPathway(pathway_id).enzymes()
[1734]245
[1532]246    def get_compounds_by_pathway(self, pathway_id):
[1550]247        return KEGGPathway(pathway_id).compounds()
[1734]248
[1532]249    def get_pathways_by_genes(self, gene_ids):
250        return self.api.get_pathways_by_genes(gene_ids)
251        gene_ids = set(gene_ids)
[1733]252        pathways = [self.genes[id].pathway for id in gene_ids
253                    if self.genes[id].pathway]
[1532]254        pathways = reduce(set.union, pathways, set())
[1733]255        return [id for id in pathways
256                if gene_ids.issubset(KEGGPathway(id).genes())]
257
[1532]258    def get_pathways_by_enzymes(self, enzyme_ids):
259        enzyme_ids = set(enzyme_ids)
[1733]260        pathways = [KEGGEnzyme()[id].pathway for id in enzyme_ids]
261        pathways = reduce(set.union, pathways, set())
262        return [id for id in pathways
263                if enzyme_ids.issubset(KEGGPathway(id).enzymes())]
264
[1532]265    def get_pathways_by_compounds(self, compound_ids):
266        compound_ids = set(compound_ids)
[1733]267        pathways = [KEGGCompound()[id].pathway for id in compound_ids]
268        pathways = reduce(set.union, pathways, set())
269        return [id for id in pathways
270                if compound_ids.issubset(KEGGPathway(id).compounds())]
271
[1532]272    def get_enzymes_by_compound(self, compound_id):
273        return KEGGCompound()[compound_id].enzyme
[1733]274
[1532]275    def get_enzymes_by_gene(self, gene_id):
276        return self.genes[gene_id].enzymes
[1733]277
[1532]278    def get_compounds_by_enzyme(self, enzyme_id):
279        return self._enzymes_to_compounds.get(enzyme_id)
[1734]280
[1532]281    @deprecated_keywords({"caseSensitive": "case_sensitive"})
282    def get_unique_gene_ids(self, genes, case_sensitive=True):
[1733]283        """
284        Return a tuple with three elements. The first is a dictionary
285        mapping from unique geneids to gene names in genes, the second
286        is a list of conflicting gene names and the third is a list of
287        unknown genes.
288
[1532]289        """
290        unique, conflicting, unknown = {}, [], []
291        for gene in genes:
292            names = self.genematcher.match(gene)
293            if len(names) == 1:
294                unique[names[0]] = gene
295            elif len(names) == 0:
296                unknown.append(gene)
297            else:
298                conflicting.append(gene)
299        return unique, conflicting, unknown
[1733]300
301    @deprecated_function_name
[1532]302    def get_genes(self):
303        return self.genes
[1733]304
[1532]305    @classmethod
306    def organism_name_search(cls, name):
[1750]307        return organism_name_search(name)
[1734]308
[1532]309    @classmethod
310    def organism_version(cls, name):
311        name = cls.organism_name_search(name)
[1750]312        with _global_genome_instance() as genome:
313            info = genome.api.info(name)
314            return info.release
[1733]315
[1532]316    def _set_genematcher(self, genematcher):
317        setattr(self, "_genematcher", genematcher)
[1734]318
[1532]319    def _get_genematcher(self):
[1734]320        if getattr(self, "_genematcher", None) is None:
[1632]321            from .. import obiGene
[1532]322            if self.org_code == "ddi":
[1733]323                self._genematcher = obiGene.matcher(
324                    [obiGene.GMKEGG(self.org_code), obiGene.GMDicty(),
[1734]325                     [obiGene.GMKEGG(self.org_code), obiGene.GMDicty()]]
[1733]326                )
[1532]327            else:
[1733]328                self._genematcher = obiGene.matcher(
329                    [obiGene.GMKEGG(self.org_code)])
330
[1532]331            self._genematcher.set_targets(self.genes.keys())
332        return self._genematcher
[1734]333
[1532]334    genematcher = property(_get_genematcher, _set_genematcher)
[1733]335
336
[1532]337KEGGOrganism = Organism
[1733]338
339
[1532]340def organism_name_search(name):
[1736]341    """
[1750]342    Search for a organism by `name` and return it's KEGG organism code.
[1736]343    """
[1750]344    with _global_genome_instance() as genome:
345        try:
346            name = genome.org_code_to_entry_key(name)
347        except ValueError:
348            pass
349
350        if name not in genome:
351            ids = genome.search(name)
352            if ids:
353                name = ids.pop(0) if ids else name
354            else:
[1834]355                raise obiTaxonomy.UnknownSpeciesIdentifier(name)
[1750]356
357        try:
358            return genome[name].organism_code
359        except KeyError:
[1834]360            raise obiTaxonomy.UnknownSpeciesIdentifier(name)
[1532]361
[1734]362
[1532]363def pathways(org):
[1736]364    """
365    Return a list of all KEGG pathways for an KEGG organism code `org`.
366    """
[1532]367    return KEGGPathway.list(org)
368
[1734]369
[1532]370def from_taxid(taxid):
[1736]371    """
372    Return a KEGG organism code for a an NCBI Taxonomy id string `taxid`.
373    """
[1750]374    with _global_genome_instance() as genome:
375        res = genome.search(taxid)
376        for r in res:
377            e = genome[r]
[1734]378
[1750]379            if e.taxid in [taxid, genome.TAXID_MAP.get(taxid, taxid)]:
380                return e.org_code()
[1733]381
[1750]382        return None
[1532]383
[1734]384
[1532]385def to_taxid(name):
[1736]386    """
387    Return a NCBI Taxonomy id for a given KEGG Organism name
388    """
[1750]389    with _global_genome_instance() as genome:
390        if name in genome:
391            return genome[name].taxid
[1733]392
[1750]393        keys = genome.search(name)
394        if keys:
395            return genome[keys[0]].taxid
396        else:
397            return None
398
399
400def _global_genome_instance():
401    genome = getattr(_global_genome_instance, "_genome", None)
402    if genome is None:
403        genome = KEGGGenome()
404        genome._lock = threading.RLock()
405        _global_genome_instance._genome = genome
406
407    @contextmanager
408    def instance_locked(instance, lock):
409        lock.acquire()
410        try:
411            yield instance
412        finally:
413            lock.release()
414
415    return instance_locked(genome, genome._lock)
[1532]416
[1734]417
[1532]418def main():
419    KEGGGenome()
420    import doctest
421    extraglobs = {"api": KeggApi()}
422    doctest.testmod(optionflags=doctest.ELLIPSIS, extraglobs=extraglobs)
423
[1733]424
[1532]425if __name__ == "__main__":
[1601]426    sys.exit(main())
Note: See TracBrowser for help on using the repository browser.