source: orange-bioinformatics/_bioinformatics/obiKEGG/__init__.py @ 1750:8a7cd2cf4ea2

Revision 1750:8a7cd2cf4ea2, 13.0 KB checked in by Ales Erjavec <ales.erjavec@…>, 12 months ago (diff)

Using a single global Genome instance in obiKEGG module level functions.

Line 
1"""
2==============================================
3KEGG - Kyoto Encyclopedia of Genes and Genomes
4==============================================
5
6:mod:`obiKEGG` is a python module for accessing `KEGG (Kyoto Encyclopedia
7of Genes and Genomes) <http://www.genome.jp/kegg/>`_ using its web services.
8
9.. note:: To use this module you need to have `slumber`_ and `requests`_
10          package installed.
11
12.. _`slumber`: https://pypi.python.org/pypi/slumber/
13
14.. _`requests`: https://pypi.python.org/pypi/requests
15
16
17>>> # Create a KEGG Genes database interface
18>>> genome = KEGGGenome()
19>>> # List all available entry ids
20>>> keys = genome.keys()
21>>> print keys[0]
22T01001
23>>> # Retrieve the entry for the key.
24>>> entry = genome[keys[0]]
25>>> print entry.entry_key
26T01001
27>>> print entry.definition
28Homo sapiens (human)
29>>> print str(entry)
30ENTRY       T01001            Complete  Genome
31NAME        hsa, HUMAN, 9606
32DEFINITION  Homo sapiens (human)
33...
34
35The :class:`Organism` class can be used as a convenient starting point
36for organism specific databases.
37
38>>> organism = Organism("Homo sapiens")  # searches for the organism by name
39>>> print organism.org_code  # prints the KEGG organism code
40hsa
41>>> genes = organism.genes  # get the genes database for the organism
42>>> gene_ids = genes.keys() # KEGG gene identifiers
43>>> entry = genes["hsa:672"]
44>>> print entry.definition
45breast cancer 1, early onset
46>>> print entry  # print the entry in DBGET database format.
47ENTRY       672               CDS       T01001
48NAME        BRCA1, BRCAI, BRCC1, BROVCA1, IRIS, PNCA4, PPP1R53, PSCP, RNF53
49DEFINITION  breast cancer 1, early onset
50...
51
52"""
53from __future__ import absolute_import
54
55import os
56import sys
57import urllib2
58import threading
59
60from collections import defaultdict
61from itertools import chain
62from datetime import datetime
63from contextlib import contextmanager
64
65from Orange.utils import lru_cache
66from Orange.utils import progress_bar_milestones
67from Orange.utils import (
68    deprecated_keywords, deprecated_attribute, deprecated_function_name
69)
70
71from .. import obiProb
72
73from . import databases
74from . import entry
75
76from .brite import BriteEntry, Brite
77
78from . import api
79from . import conf
80from . import pathway
81
82KEGGGenome = databases.Genome
83KEGGGenes = databases.Genes
84KEGGEnzyme = databases.Enzyme
85KEGGReaction = databases.Reaction
86KEGGPathways = databases.Pathway
87KEGGCompound = databases.Compound
88
89KEGGBrite = Brite
90KEGGBriteEntry = BriteEntry
91
92KEGGPathway = pathway.Pathway
93
94DEFAULT_CACHE_DIR = conf.params["cache.path"]
95
96
97class OrganismNotFoundError(Exception):
98    pass
99
100
101class Organism(object):
102    """
103    A convenience class for retrieving information regarding an
104    organism in the KEGG Genes database.
105
106    :param org: KEGG organism code (e.g. "hsa", "sce"). Can also be a
107        descriptive name (e.g. 'yeast', "homo sapiens") in which case the
108        organism code will be searched for by using KEGG `find` api.
109    :type org: str
110
111    .. seealso::
112
113        :func:`organism_name_search`
114            Search KEGG for an organism code
115
116    """
117    def __init__(self, org, genematcher=None):
118        self.org_code = self.organism_name_search(org)
119        self.genematcher = genematcher
120        self.api = api.CachedKeggApi()
121
122    @property
123    def org(self):
124        """
125        KEGG organism code.
126        """
127        return self.org_code
128
129    @property
130    def genes(self):
131        """
132        An :class:`~.databases.Genes` database instance for this organism.
133        """
134        # TODO: This should not be a property but a method.
135        # I think it was only put here as back compatibility with old obiKEGG.
136        if not hasattr(self, "_genes"):
137            genes = KEGGGenes(self.org_code)
138            self._genes = genes
139        return self._genes
140
141    def gene_aliases(self):
142        """
143        Return a list of sets of equal genes (synonyms) in KEGG for
144        this organism.
145
146        .. note::
147
148            This only includes 'ncbi-geneid' and 'ncbi-gi' records
149            from the KEGG Genes DBLINKS entries.
150
151        """
152        definitions = self.api.list(self.org_code)
153        ncbi_geneid = self.api.conv(self.org_code, "ncbi-geneid")
154        ncbi_gi = self.api.conv(self.org_code, "ncbi-gi")
155
156        aliases = defaultdict(set)
157
158        for entry_id, definition in definitions:
159            # genes entry id without the organism code
160            aliases[entry_id].add(entry_id.split(":", 1)[1])
161            # all names in the NAME field (KEGG API list returns
162            # 'NAME; DEFINITION') fields for genes
163            names = definition.split(";")[0].split(",")
164            aliases[entry_id].update([name.strip() for name in names])
165
166        for source_id, target_id in chain(ncbi_geneid, ncbi_gi):
167            aliases[target_id].add(source_id.split(":", 1)[1])
168
169        return [set([entry_id]).union(names)
170                for entry_id, names in aliases.iteritems()]
171
172    def pathways(self, with_ids=None):
173        """
174        Return a list of all pathways for this organism.
175        """
176        if with_ids is not None:
177            return self.api.get_pathways_by_genes(with_ids)
178        else:
179            return [p.entry_id for p in self.api.list_pathways(self.org_code)]
180
181    def list_pathways(self):
182        """
183        List all pathways for this organism.
184
185        .. deprecated: 2.5
186            Use :func:`pathways` instead.
187
188        """
189        # NOTE: remove/deprecate and use pathways()
190        return self.pathways()
191
192    def get_linked_pathways(self, pathway_id):
193        self.api.get_linked_pathways(pathway_id)
194
195    def enzymes(self, genes=None):
196        raise NotImplementedError()
197
198    def get_enriched_pathways(self, genes, reference=None,
199                              prob=obiProb.Binomial(), callback=None):
200        """
201        Return a dictionary with enriched pathways ids as keys
202        and (list_of_genes, p_value, num_of_reference_genes) tuples
203        as items.
204
205        """
206        if reference is None:
207            reference = self.genes.keys()
208        reference = set(reference)
209
210        allPathways = defaultdict(lambda: [[], 1.0, []])
211        milestones = progress_bar_milestones(len(genes), 100)
212        pathways_db = KEGGPathways()
213
214        pathways_for_gene = []
215        for i, gene in enumerate(genes):
216            pathways_for_gene.append(self.pathways([gene]))
217            if callback and i in milestones:
218                callback(i * 50.0 / len(genes))
219
220        # pre-cache for speed
221        pathways_db.pre_cache([pid for pfg in pathways_for_gene
222                               for pid in pfg])
223        for i, (gene, pathways) in enumerate(zip(genes, pathways_for_gene)):
224            for pathway in pathways:
225                if pathways_db.get_entry(pathway).gene:
226                    allPathways[pathway][0].append(gene)
227            if callback and i in milestones:
228                callback(50.0 + i * 50.0 / len(genes))
229
230        pItems = allPathways.items()
231
232        for i, (p_id, entry) in enumerate(pItems):
233            pathway = pathways_db.get_entry(p_id)
234            entry[2].extend(reference.intersection(pathway.gene or []))
235            entry[1] = prob.p_value(len(entry[0]), len(reference),
236                                    len(entry[2]), len(genes))
237        return dict([(pid, (genes, p, len(ref)))
238                     for pid, (genes, p, ref) in allPathways.items()])
239
240    def get_genes_by_enzyme(self, enzyme):
241        enzyme = KEGGEnzyme().get_entry(enzyme)
242        return enzyme.genes.get(self.org_code, []) if enzyme.genes else []
243
244    def get_genes_by_pathway(self, pathway_id):
245        return KEGGPathway(pathway_id).genes()
246
247    def get_enzymes_by_pathway(self, pathway_id):
248        return KEGGPathway(pathway_id).enzymes()
249
250    def get_compounds_by_pathway(self, pathway_id):
251        return KEGGPathway(pathway_id).compounds()
252
253    def get_pathways_by_genes(self, gene_ids):
254        return self.api.get_pathways_by_genes(gene_ids)
255        gene_ids = set(gene_ids)
256        pathways = [self.genes[id].pathway for id in gene_ids
257                    if self.genes[id].pathway]
258        pathways = reduce(set.union, pathways, set())
259        return [id for id in pathways
260                if gene_ids.issubset(KEGGPathway(id).genes())]
261
262    def get_pathways_by_enzymes(self, enzyme_ids):
263        enzyme_ids = set(enzyme_ids)
264        pathways = [KEGGEnzyme()[id].pathway for id in enzyme_ids]
265        pathways = reduce(set.union, pathways, set())
266        return [id for id in pathways
267                if enzyme_ids.issubset(KEGGPathway(id).enzymes())]
268
269    def get_pathways_by_compounds(self, compound_ids):
270        compound_ids = set(compound_ids)
271        pathways = [KEGGCompound()[id].pathway for id in compound_ids]
272        pathways = reduce(set.union, pathways, set())
273        return [id for id in pathways
274                if compound_ids.issubset(KEGGPathway(id).compounds())]
275
276    def get_enzymes_by_compound(self, compound_id):
277        return KEGGCompound()[compound_id].enzyme
278
279    def get_enzymes_by_gene(self, gene_id):
280        return self.genes[gene_id].enzymes
281
282    def get_compounds_by_enzyme(self, enzyme_id):
283        return self._enzymes_to_compounds.get(enzyme_id)
284
285    @deprecated_keywords({"caseSensitive": "case_sensitive"})
286    def get_unique_gene_ids(self, genes, case_sensitive=True):
287        """
288        Return a tuple with three elements. The first is a dictionary
289        mapping from unique geneids to gene names in genes, the second
290        is a list of conflicting gene names and the third is a list of
291        unknown genes.
292
293        """
294        unique, conflicting, unknown = {}, [], []
295        for gene in genes:
296            names = self.genematcher.match(gene)
297            if len(names) == 1:
298                unique[names[0]] = gene
299            elif len(names) == 0:
300                unknown.append(gene)
301            else:
302                conflicting.append(gene)
303        return unique, conflicting, unknown
304
305    @deprecated_function_name
306    def get_genes(self):
307        return self.genes
308
309    @classmethod
310    def organism_name_search(cls, name):
311        return organism_name_search(name)
312
313    @classmethod
314    def organism_version(cls, name):
315        name = cls.organism_name_search(name)
316        with _global_genome_instance() as genome:
317            info = genome.api.info(name)
318            return info.release
319
320    def _set_genematcher(self, genematcher):
321        setattr(self, "_genematcher", genematcher)
322
323    def _get_genematcher(self):
324        if getattr(self, "_genematcher", None) is None:
325            from .. import obiGene
326            if self.org_code == "ddi":
327                self._genematcher = obiGene.matcher(
328                    [obiGene.GMKEGG(self.org_code), obiGene.GMDicty(),
329                     [obiGene.GMKEGG(self.org_code), obiGene.GMDicty()]]
330                )
331            else:
332                self._genematcher = obiGene.matcher(
333                    [obiGene.GMKEGG(self.org_code)])
334
335            self._genematcher.set_targets(self.genes.keys())
336        return self._genematcher
337
338    genematcher = property(_get_genematcher, _set_genematcher)
339
340
341KEGGOrganism = Organism
342
343
344def organism_name_search(name):
345    """
346    Search for a organism by `name` and return it's KEGG organism code.
347    """
348    with _global_genome_instance() as genome:
349        try:
350            name = genome.org_code_to_entry_key(name)
351        except ValueError:
352            pass
353
354        if name not in genome:
355            ids = genome.search(name)
356            if ids:
357                name = ids.pop(0) if ids else name
358            else:
359                raise OrganismNotFoundError(name)
360
361        try:
362            return genome[name].organism_code
363        except KeyError:
364            raise OrganismNotFoundError(name)
365
366
367def pathways(org):
368    """
369    Return a list of all KEGG pathways for an KEGG organism code `org`.
370    """
371    return KEGGPathway.list(org)
372
373
374def from_taxid(taxid):
375    """
376    Return a KEGG organism code for a an NCBI Taxonomy id string `taxid`.
377    """
378    with _global_genome_instance() as genome:
379        res = genome.search(taxid)
380        for r in res:
381            e = genome[r]
382
383            if e.taxid in [taxid, genome.TAXID_MAP.get(taxid, taxid)]:
384                return e.org_code()
385
386        return None
387
388
389def to_taxid(name):
390    """
391    Return a NCBI Taxonomy id for a given KEGG Organism name
392    """
393    with _global_genome_instance() as genome:
394        if name in genome:
395            return genome[name].taxid
396
397        keys = genome.search(name)
398        if keys:
399            return genome[keys[0]].taxid
400        else:
401            return None
402
403
404def _global_genome_instance():
405    genome = getattr(_global_genome_instance, "_genome", None)
406    if genome is None:
407        genome = KEGGGenome()
408        genome._lock = threading.RLock()
409        _global_genome_instance._genome = genome
410
411    @contextmanager
412    def instance_locked(instance, lock):
413        lock.acquire()
414        try:
415            yield instance
416        finally:
417            lock.release()
418
419    return instance_locked(genome, genome._lock)
420
421
422def main():
423    KEGGGenome()
424    import doctest
425    extraglobs = {"api": KeggApi()}
426    doctest.testmod(optionflags=doctest.ELLIPSIS, extraglobs=extraglobs)
427
428
429if __name__ == "__main__":
430    sys.exit(main())
Note: See TracBrowser for help on using the repository browser.