source: orange-bioinformatics/_bioinformatics/obiKEGG/__init__.py @ 1750:8a7cd2cf4ea2

Revision 1750:8a7cd2cf4ea2, 13.0 KB checked in by Ales Erjavec <ales.erjavec@…>, 12 months ago (diff)

Using a single global Genome instance in obiKEGG module level functions.

RevLine 
[1750]1"""
[1532]2==============================================
3KEGG - Kyoto Encyclopedia of Genes and Genomes
4==============================================
5
[1736]6:mod:`obiKEGG` is a python module for accessing `KEGG (Kyoto Encyclopedia
7of Genes and Genomes) <http://www.genome.jp/kegg/>`_ using its web services.
[1733]8
[1736]9.. note:: To use this module you need to have `slumber`_ and `requests`_
10          package installed.
[1532]11
[1736]12.. _`slumber`: https://pypi.python.org/pypi/slumber/
[1532]13
[1736]14.. _`requests`: https://pypi.python.org/pypi/requests
15
16
17>>> # Create a KEGG Genes database interface
18>>> genome = KEGGGenome()
19>>> # List all available entry ids
20>>> keys = genome.keys()
21>>> print keys[0]
22T01001
23>>> # Retrieve the entry for the key.
24>>> entry = genome[keys[0]]
25>>> print entry.entry_key
26T01001
27>>> print entry.definition
28Homo sapiens (human)
29>>> print str(entry)
30ENTRY       T01001            Complete  Genome
31NAME        hsa, HUMAN, 9606
32DEFINITION  Homo sapiens (human)
33...
34
35The :class:`Organism` class can be used as a convenient starting point
36for organism specific databases.
37
38>>> organism = Organism("Homo sapiens")  # searches for the organism by name
39>>> print organism.org_code  # prints the KEGG organism code
40hsa
41>>> genes = organism.genes  # get the genes database for the organism
42>>> gene_ids = genes.keys() # KEGG gene identifiers
43>>> entry = genes["hsa:672"]
44>>> print entry.definition
45breast cancer 1, early onset
46>>> print entry  # print the entry in DBGET database format.
47ENTRY       672               CDS       T01001
48NAME        BRCA1, BRCAI, BRCC1, BROVCA1, IRIS, PNCA4, PPP1R53, PSCP, RNF53
49DEFINITION  breast cancer 1, early onset
50...
[1603]51
[1532]52"""
53from __future__ import absolute_import
54
[1733]55import os
56import sys
[1715]57import urllib2
[1750]58import threading
[1733]59
[1532]60from collections import defaultdict
[1735]61from itertools import chain
[1532]62from datetime import datetime
[1750]63from contextlib import contextmanager
[1532]64
[1716]65from Orange.utils import lru_cache
[1733]66from Orange.utils import progress_bar_milestones
[1734]67from Orange.utils import (
68    deprecated_keywords, deprecated_attribute, deprecated_function_name
69)
[1733]70
71from .. import obiProb
[1532]72
73from . import databases
74from . import entry
75
76from .brite import BriteEntry, Brite
77
78from . import api
79from . import conf
80from . import pathway
81
82KEGGGenome = databases.Genome
83KEGGGenes = databases.Genes
[1733]84KEGGEnzyme = databases.Enzyme
85KEGGReaction = databases.Reaction
86KEGGPathways = databases.Pathway
87KEGGCompound = databases.Compound
[1532]88
89KEGGBrite = Brite
90KEGGBriteEntry = BriteEntry
91
92KEGGPathway = pathway.Pathway
93
94DEFAULT_CACHE_DIR = conf.params["cache.path"]
95
96
[1733]97class OrganismNotFoundError(Exception):
98    pass
[1532]99
[1712]100
[1532]101class Organism(object):
[1733]102    """
103    A convenience class for retrieving information regarding an
104    organism in the KEGG Genes database.
105
[1736]106    :param org: KEGG organism code (e.g. "hsa", "sce"). Can also be a
107        descriptive name (e.g. 'yeast', "homo sapiens") in which case the
108        organism code will be searched for by using KEGG `find` api.
[1733]109    :type org: str
110
[1736]111    .. seealso::
112
113        :func:`organism_name_search`
114            Search KEGG for an organism code
115
[1733]116    """
[1532]117    def __init__(self, org, genematcher=None):
118        self.org_code = self.organism_name_search(org)
119        self.genematcher = genematcher
120        self.api = api.CachedKeggApi()
[1733]121
[1532]122    @property
123    def org(self):
[1733]124        """
125        KEGG organism code.
126        """
[1532]127        return self.org_code
[1733]128
[1532]129    @property
130    def genes(self):
[1733]131        """
[1736]132        An :class:`~.databases.Genes` database instance for this organism.
[1733]133        """
134        # TODO: This should not be a property but a method.
135        # I think it was only put here as back compatibility with old obiKEGG.
[1532]136        if not hasattr(self, "_genes"):
137            genes = KEGGGenes(self.org_code)
138            self._genes = genes
139        return self._genes
[1733]140
[1532]141    def gene_aliases(self):
[1733]142        """
[1735]143        Return a list of sets of equal genes (synonyms) in KEGG for
144        this organism.
145
146        .. note::
147
148            This only includes 'ncbi-geneid' and 'ncbi-gi' records
149            from the KEGG Genes DBLINKS entries.
150
[1733]151        """
[1735]152        definitions = self.api.list(self.org_code)
153        ncbi_geneid = self.api.conv(self.org_code, "ncbi-geneid")
154        ncbi_gi = self.api.conv(self.org_code, "ncbi-gi")
155
156        aliases = defaultdict(set)
157
158        for entry_id, definition in definitions:
159            # genes entry id without the organism code
160            aliases[entry_id].add(entry_id.split(":", 1)[1])
161            # all names in the NAME field (KEGG API list returns
162            # 'NAME; DEFINITION') fields for genes
163            names = definition.split(";")[0].split(",")
164            aliases[entry_id].update([name.strip() for name in names])
165
166        for source_id, target_id in chain(ncbi_geneid, ncbi_gi):
167            aliases[target_id].add(source_id.split(":", 1)[1])
168
169        return [set([entry_id]).union(names)
170                for entry_id, names in aliases.iteritems()]
[1733]171
[1532]172    def pathways(self, with_ids=None):
[1733]173        """
174        Return a list of all pathways for this organism.
175        """
[1532]176        if with_ids is not None:
177            return self.api.get_pathways_by_genes(with_ids)
178        else:
179            return [p.entry_id for p in self.api.list_pathways(self.org_code)]
[1734]180
[1532]181    def list_pathways(self):
[1733]182        """
[1736]183        List all pathways for this organism.
184
185        .. deprecated: 2.5
186            Use :func:`pathways` instead.
187
[1733]188        """
189        # NOTE: remove/deprecate and use pathways()
[1532]190        return self.pathways()
[1734]191
[1532]192    def get_linked_pathways(self, pathway_id):
193        self.api.get_linked_pathways(pathway_id)
[1734]194
[1532]195    def enzymes(self, genes=None):
196        raise NotImplementedError()
[1734]197
[1733]198    def get_enriched_pathways(self, genes, reference=None,
199                              prob=obiProb.Binomial(), callback=None):
200        """
201        Return a dictionary with enriched pathways ids as keys
202        and (list_of_genes, p_value, num_of_reference_genes) tuples
[1540]203        as items.
[1733]204
[1540]205        """
[1733]206        if reference is None:
207            reference = self.genes.keys()
208        reference = set(reference)
209
210        allPathways = defaultdict(lambda: [[], 1.0, []])
211        milestones = progress_bar_milestones(len(genes), 100)
[1532]212        pathways_db = KEGGPathways()
[1733]213
[1540]214        pathways_for_gene = []
[1532]215        for i, gene in enumerate(genes):
[1540]216            pathways_for_gene.append(self.pathways([gene]))
217            if callback and i in milestones:
[1733]218                callback(i * 50.0 / len(genes))
219
220        # pre-cache for speed
221        pathways_db.pre_cache([pid for pfg in pathways_for_gene
222                               for pid in pfg])
[1540]223        for i, (gene, pathways) in enumerate(zip(genes, pathways_for_gene)):
[1532]224            for pathway in pathways:
[1733]225                if pathways_db.get_entry(pathway).gene:
[1532]226                    allPathways[pathway][0].append(gene)
227            if callback and i in milestones:
[1733]228                callback(50.0 + i * 50.0 / len(genes))
229
[1532]230        pItems = allPathways.items()
[1733]231
[1532]232        for i, (p_id, entry) in enumerate(pItems):
233            pathway = pathways_db.get_entry(p_id)
234            entry[2].extend(reference.intersection(pathway.gene or []))
[1733]235            entry[1] = prob.p_value(len(entry[0]), len(reference),
236                                    len(entry[2]), len(genes))
237        return dict([(pid, (genes, p, len(ref)))
238                     for pid, (genes, p, ref) in allPathways.items()])
239
[1532]240    def get_genes_by_enzyme(self, enzyme):
[1733]241        enzyme = KEGGEnzyme().get_entry(enzyme)
[1532]242        return enzyme.genes.get(self.org_code, []) if enzyme.genes else []
[1733]243
[1532]244    def get_genes_by_pathway(self, pathway_id):
[1550]245        return KEGGPathway(pathway_id).genes()
[1733]246
[1532]247    def get_enzymes_by_pathway(self, pathway_id):
[1550]248        return KEGGPathway(pathway_id).enzymes()
[1734]249
[1532]250    def get_compounds_by_pathway(self, pathway_id):
[1550]251        return KEGGPathway(pathway_id).compounds()
[1734]252
[1532]253    def get_pathways_by_genes(self, gene_ids):
254        return self.api.get_pathways_by_genes(gene_ids)
255        gene_ids = set(gene_ids)
[1733]256        pathways = [self.genes[id].pathway for id in gene_ids
257                    if self.genes[id].pathway]
[1532]258        pathways = reduce(set.union, pathways, set())
[1733]259        return [id for id in pathways
260                if gene_ids.issubset(KEGGPathway(id).genes())]
261
[1532]262    def get_pathways_by_enzymes(self, enzyme_ids):
263        enzyme_ids = set(enzyme_ids)
[1733]264        pathways = [KEGGEnzyme()[id].pathway for id in enzyme_ids]
265        pathways = reduce(set.union, pathways, set())
266        return [id for id in pathways
267                if enzyme_ids.issubset(KEGGPathway(id).enzymes())]
268
[1532]269    def get_pathways_by_compounds(self, compound_ids):
270        compound_ids = set(compound_ids)
[1733]271        pathways = [KEGGCompound()[id].pathway for id in compound_ids]
272        pathways = reduce(set.union, pathways, set())
273        return [id for id in pathways
274                if compound_ids.issubset(KEGGPathway(id).compounds())]
275
[1532]276    def get_enzymes_by_compound(self, compound_id):
277        return KEGGCompound()[compound_id].enzyme
[1733]278
[1532]279    def get_enzymes_by_gene(self, gene_id):
280        return self.genes[gene_id].enzymes
[1733]281
[1532]282    def get_compounds_by_enzyme(self, enzyme_id):
283        return self._enzymes_to_compounds.get(enzyme_id)
[1734]284
[1532]285    @deprecated_keywords({"caseSensitive": "case_sensitive"})
286    def get_unique_gene_ids(self, genes, case_sensitive=True):
[1733]287        """
288        Return a tuple with three elements. The first is a dictionary
289        mapping from unique geneids to gene names in genes, the second
290        is a list of conflicting gene names and the third is a list of
291        unknown genes.
292
[1532]293        """
294        unique, conflicting, unknown = {}, [], []
295        for gene in genes:
296            names = self.genematcher.match(gene)
297            if len(names) == 1:
298                unique[names[0]] = gene
299            elif len(names) == 0:
300                unknown.append(gene)
301            else:
302                conflicting.append(gene)
303        return unique, conflicting, unknown
[1733]304
305    @deprecated_function_name
[1532]306    def get_genes(self):
307        return self.genes
[1733]308
[1532]309    @classmethod
310    def organism_name_search(cls, name):
[1750]311        return organism_name_search(name)
[1734]312
[1532]313    @classmethod
314    def organism_version(cls, name):
315        name = cls.organism_name_search(name)
[1750]316        with _global_genome_instance() as genome:
317            info = genome.api.info(name)
318            return info.release
[1733]319
[1532]320    def _set_genematcher(self, genematcher):
321        setattr(self, "_genematcher", genematcher)
[1734]322
[1532]323    def _get_genematcher(self):
[1734]324        if getattr(self, "_genematcher", None) is None:
[1632]325            from .. import obiGene
[1532]326            if self.org_code == "ddi":
[1733]327                self._genematcher = obiGene.matcher(
328                    [obiGene.GMKEGG(self.org_code), obiGene.GMDicty(),
[1734]329                     [obiGene.GMKEGG(self.org_code), obiGene.GMDicty()]]
[1733]330                )
[1532]331            else:
[1733]332                self._genematcher = obiGene.matcher(
333                    [obiGene.GMKEGG(self.org_code)])
334
[1532]335            self._genematcher.set_targets(self.genes.keys())
336        return self._genematcher
[1734]337
[1532]338    genematcher = property(_get_genematcher, _set_genematcher)
[1733]339
340
[1532]341KEGGOrganism = Organism
[1733]342
343
[1532]344def organism_name_search(name):
[1736]345    """
[1750]346    Search for a organism by `name` and return it's KEGG organism code.
[1736]347    """
[1750]348    with _global_genome_instance() as genome:
349        try:
350            name = genome.org_code_to_entry_key(name)
351        except ValueError:
352            pass
353
354        if name not in genome:
355            ids = genome.search(name)
356            if ids:
357                name = ids.pop(0) if ids else name
358            else:
359                raise OrganismNotFoundError(name)
360
361        try:
362            return genome[name].organism_code
363        except KeyError:
364            raise OrganismNotFoundError(name)
[1532]365
[1734]366
[1532]367def pathways(org):
[1736]368    """
369    Return a list of all KEGG pathways for an KEGG organism code `org`.
370    """
[1532]371    return KEGGPathway.list(org)
372
[1734]373
[1532]374def from_taxid(taxid):
[1736]375    """
376    Return a KEGG organism code for a an NCBI Taxonomy id string `taxid`.
377    """
[1750]378    with _global_genome_instance() as genome:
379        res = genome.search(taxid)
380        for r in res:
381            e = genome[r]
[1734]382
[1750]383            if e.taxid in [taxid, genome.TAXID_MAP.get(taxid, taxid)]:
384                return e.org_code()
[1733]385
[1750]386        return None
[1532]387
[1734]388
[1532]389def to_taxid(name):
[1736]390    """
391    Return a NCBI Taxonomy id for a given KEGG Organism name
392    """
[1750]393    with _global_genome_instance() as genome:
394        if name in genome:
395            return genome[name].taxid
[1733]396
[1750]397        keys = genome.search(name)
398        if keys:
399            return genome[keys[0]].taxid
400        else:
401            return None
402
403
404def _global_genome_instance():
405    genome = getattr(_global_genome_instance, "_genome", None)
406    if genome is None:
407        genome = KEGGGenome()
408        genome._lock = threading.RLock()
409        _global_genome_instance._genome = genome
410
411    @contextmanager
412    def instance_locked(instance, lock):
413        lock.acquire()
414        try:
415            yield instance
416        finally:
417            lock.release()
418
419    return instance_locked(genome, genome._lock)
[1532]420
[1734]421
[1532]422def main():
423    KEGGGenome()
424    import doctest
425    extraglobs = {"api": KeggApi()}
426    doctest.testmod(optionflags=doctest.ELLIPSIS, extraglobs=extraglobs)
427
[1733]428
[1532]429if __name__ == "__main__":
[1601]430    sys.exit(main())
Note: See TracBrowser for help on using the repository browser.