source: orange-bioinformatics/orangecontrib/bio/obiKEGG/__init__.py @ 1873:0810c5708cc5

Revision 1873:0810c5708cc5, 13.0 KB checked in by Ales Erjavec <ales.erjavec@…>, 6 months ago (diff)

Moved '_bioinformatics' into orangecontrib namespace.

Line 
1"""
2==============================================
3KEGG - Kyoto Encyclopedia of Genes and Genomes
4==============================================
5
6:mod:`obiKEGG` is a python module for accessing `KEGG (Kyoto Encyclopedia
7of Genes and Genomes) <http://www.genome.jp/kegg/>`_ using its web services.
8
9.. note:: To use this module you need to have `slumber`_ and `requests`_
10          package installed.
11
12.. _`slumber`: https://pypi.python.org/pypi/slumber/
13
14.. _`requests`: https://pypi.python.org/pypi/requests
15
16
17>>> # Create a KEGG Genes database interface
18>>> genome = KEGGGenome()
19>>> # List all available entry ids
20>>> keys = genome.keys()
21>>> print keys[0]
22T01001
23>>> # Retrieve the entry for the key.
24>>> entry = genome[keys[0]]
25>>> print entry.entry_key
26T01001
27>>> print entry.definition
28Homo sapiens (human)
29>>> print str(entry)
30ENTRY       T01001            Complete  Genome
31NAME        hsa, HUMAN, 9606
32DEFINITION  Homo sapiens (human)
33...
34
35The :class:`Organism` class can be used as a convenient starting point
36for organism specific databases.
37
38>>> organism = Organism("Homo sapiens")  # searches for the organism by name
39>>> print organism.org_code  # prints the KEGG organism code
40hsa
41>>> genes = organism.genes  # get the genes database for the organism
42>>> gene_ids = genes.keys() # KEGG gene identifiers
43>>> entry = genes["hsa:672"]
44>>> print entry.definition
45breast cancer 1, early onset
46>>> print entry  # print the entry in DBGET database format.
47ENTRY       672               CDS       T01001
48NAME        BRCA1, BRCAI, BRCC1, BROVCA1, IRIS, PNCA4, PPP1R53, PSCP, RNF53
49DEFINITION  breast cancer 1, early onset
50...
51
52"""
53from __future__ import absolute_import
54
55import os
56import sys
57import urllib2
58import threading
59
60from collections import defaultdict
61from itertools import chain
62from datetime import datetime
63from contextlib import contextmanager
64
65from Orange.utils import lru_cache
66from Orange.utils import progress_bar_milestones
67from Orange.utils import (
68    deprecated_keywords, deprecated_attribute, deprecated_function_name
69)
70
71from .. import obiProb, obiTaxonomy
72
73from . import databases
74from . import entry
75
76from .brite import BriteEntry, Brite
77
78from . import api
79from . import conf
80from . import pathway
81
82KEGGGenome = databases.Genome
83KEGGGenes = databases.Genes
84KEGGEnzyme = databases.Enzyme
85KEGGReaction = databases.Reaction
86KEGGPathways = databases.Pathway
87KEGGCompound = databases.Compound
88
89KEGGBrite = Brite
90KEGGBriteEntry = BriteEntry
91
92KEGGPathway = pathway.Pathway
93
94DEFAULT_CACHE_DIR = conf.params["cache.path"]
95
96
97class Organism(object):
98    """
99    A convenience class for retrieving information regarding an
100    organism in the KEGG Genes database.
101
102    :param org: KEGG organism code (e.g. "hsa", "sce"). Can also be a
103        descriptive name (e.g. 'yeast', "homo sapiens") in which case the
104        organism code will be searched for by using KEGG `find` api.
105    :type org: str
106
107    .. seealso::
108
109        :func:`organism_name_search`
110            Search KEGG for an organism code
111
112    """
113    def __init__(self, org, genematcher=None):
114        self.org_code = self.organism_name_search(org)
115        self.genematcher = genematcher
116        self.api = api.CachedKeggApi()
117
118    @property
119    def org(self):
120        """
121        KEGG organism code.
122        """
123        return self.org_code
124
125    @property
126    def genes(self):
127        """
128        An :class:`~.databases.Genes` database instance for this organism.
129        """
130        # TODO: This should not be a property but a method.
131        # I think it was only put here as back compatibility with old obiKEGG.
132        if not hasattr(self, "_genes"):
133            genes = KEGGGenes(self.org_code)
134            self._genes = genes
135        return self._genes
136
137    def gene_aliases(self):
138        """
139        Return a list of sets of equal genes (synonyms) in KEGG for
140        this organism.
141
142        .. note::
143
144            This only includes 'ncbi-geneid' and 'ncbi-gi' records
145            from the KEGG Genes DBLINKS entries.
146
147        """
148        definitions = self.api.list(self.org_code)
149        ncbi_geneid = self.api.conv(self.org_code, "ncbi-geneid")
150        ncbi_gi = self.api.conv(self.org_code, "ncbi-gi")
151
152        aliases = defaultdict(set)
153
154        for entry_id, definition in definitions:
155            # genes entry id without the organism code
156            aliases[entry_id].add(entry_id.split(":", 1)[1])
157            # all names in the NAME field (KEGG API list returns
158            # 'NAME; DEFINITION') fields for genes
159            names = definition.split(";")[0].split(",")
160            aliases[entry_id].update([name.strip() for name in names])
161
162        for source_id, target_id in chain(ncbi_geneid, ncbi_gi):
163            aliases[target_id].add(source_id.split(":", 1)[1])
164
165        return [set([entry_id]).union(names)
166                for entry_id, names in aliases.iteritems()]
167
168    def pathways(self, with_ids=None):
169        """
170        Return a list of all pathways for this organism.
171        """
172        if with_ids is not None:
173            return self.api.get_pathways_by_genes(with_ids)
174        else:
175            return [p.entry_id for p in self.api.list_pathways(self.org_code)]
176
177    def list_pathways(self):
178        """
179        List all pathways for this organism.
180
181        .. deprecated: 2.5
182            Use :func:`pathways` instead.
183
184        """
185        # NOTE: remove/deprecate and use pathways()
186        return self.pathways()
187
188    def get_linked_pathways(self, pathway_id):
189        self.api.get_linked_pathways(pathway_id)
190
191    def enzymes(self, genes=None):
192        raise NotImplementedError()
193
194    def get_enriched_pathways(self, genes, reference=None,
195                              prob=obiProb.Binomial(), callback=None):
196        """
197        Return a dictionary with enriched pathways ids as keys
198        and (list_of_genes, p_value, num_of_reference_genes) tuples
199        as items.
200
201        """
202        if reference is None:
203            reference = self.genes.keys()
204        reference = set(reference)
205
206        allPathways = defaultdict(lambda: [[], 1.0, []])
207        milestones = progress_bar_milestones(len(genes), 100)
208        pathways_db = KEGGPathways()
209
210        pathways_for_gene = []
211        for i, gene in enumerate(genes):
212            pathways_for_gene.append(self.pathways([gene]))
213            if callback and i in milestones:
214                callback(i * 50.0 / len(genes))
215
216        # pre-cache for speed
217        pathways_db.pre_cache([pid for pfg in pathways_for_gene
218                               for pid in pfg])
219        for i, (gene, pathways) in enumerate(zip(genes, pathways_for_gene)):
220            for pathway in pathways:
221                if pathways_db.get_entry(pathway).gene:
222                    allPathways[pathway][0].append(gene)
223            if callback and i in milestones:
224                callback(50.0 + i * 50.0 / len(genes))
225
226        pItems = allPathways.items()
227
228        for i, (p_id, entry) in enumerate(pItems):
229            pathway = pathways_db.get_entry(p_id)
230            entry[2].extend(reference.intersection(pathway.gene or []))
231            entry[1] = prob.p_value(len(entry[0]), len(reference),
232                                    len(entry[2]), len(genes))
233        return dict([(pid, (genes, p, len(ref)))
234                     for pid, (genes, p, ref) in allPathways.items()])
235
236    def get_genes_by_enzyme(self, enzyme):
237        enzyme = KEGGEnzyme().get_entry(enzyme)
238        return enzyme.genes.get(self.org_code, []) if enzyme.genes else []
239
240    def get_genes_by_pathway(self, pathway_id):
241        return KEGGPathway(pathway_id).genes()
242
243    def get_enzymes_by_pathway(self, pathway_id):
244        return KEGGPathway(pathway_id).enzymes()
245
246    def get_compounds_by_pathway(self, pathway_id):
247        return KEGGPathway(pathway_id).compounds()
248
249    def get_pathways_by_genes(self, gene_ids):
250        return self.api.get_pathways_by_genes(gene_ids)
251        gene_ids = set(gene_ids)
252        pathways = [self.genes[id].pathway for id in gene_ids
253                    if self.genes[id].pathway]
254        pathways = reduce(set.union, pathways, set())
255        return [id for id in pathways
256                if gene_ids.issubset(KEGGPathway(id).genes())]
257
258    def get_pathways_by_enzymes(self, enzyme_ids):
259        enzyme_ids = set(enzyme_ids)
260        pathways = [KEGGEnzyme()[id].pathway for id in enzyme_ids]
261        pathways = reduce(set.union, pathways, set())
262        return [id for id in pathways
263                if enzyme_ids.issubset(KEGGPathway(id).enzymes())]
264
265    def get_pathways_by_compounds(self, compound_ids):
266        compound_ids = set(compound_ids)
267        pathways = [KEGGCompound()[id].pathway for id in compound_ids]
268        pathways = reduce(set.union, pathways, set())
269        return [id for id in pathways
270                if compound_ids.issubset(KEGGPathway(id).compounds())]
271
272    def get_enzymes_by_compound(self, compound_id):
273        return KEGGCompound()[compound_id].enzyme
274
275    def get_enzymes_by_gene(self, gene_id):
276        return self.genes[gene_id].enzymes
277
278    def get_compounds_by_enzyme(self, enzyme_id):
279        return self._enzymes_to_compounds.get(enzyme_id)
280
281    @deprecated_keywords({"caseSensitive": "case_sensitive"})
282    def get_unique_gene_ids(self, genes, case_sensitive=True):
283        """
284        Return a tuple with three elements. The first is a dictionary
285        mapping from unique geneids to gene names in genes, the second
286        is a list of conflicting gene names and the third is a list of
287        unknown genes.
288
289        """
290        unique, conflicting, unknown = {}, [], []
291        for gene in genes:
292            names = self.genematcher.match(gene)
293            if len(names) == 1:
294                unique[names[0]] = gene
295            elif len(names) == 0:
296                unknown.append(gene)
297            else:
298                conflicting.append(gene)
299        return unique, conflicting, unknown
300
301    @deprecated_function_name
302    def get_genes(self):
303        return self.genes
304
305    @classmethod
306    def organism_name_search(cls, name):
307        return organism_name_search(name)
308
309    @classmethod
310    def organism_version(cls, name):
311        name = cls.organism_name_search(name)
312        with _global_genome_instance() as genome:
313            info = genome.api.info(name)
314            return info.release
315
316    def _set_genematcher(self, genematcher):
317        setattr(self, "_genematcher", genematcher)
318
319    def _get_genematcher(self):
320        if getattr(self, "_genematcher", None) is None:
321            from .. import obiGene
322            if self.org_code == "ddi":
323                self._genematcher = obiGene.matcher(
324                    [obiGene.GMKEGG(self.org_code), obiGene.GMDicty(),
325                     [obiGene.GMKEGG(self.org_code), obiGene.GMDicty()]]
326                )
327            else:
328                self._genematcher = obiGene.matcher(
329                    [obiGene.GMKEGG(self.org_code)])
330
331            self._genematcher.set_targets(self.genes.keys())
332        return self._genematcher
333
334    genematcher = property(_get_genematcher, _set_genematcher)
335
336
337KEGGOrganism = Organism
338
339
340def organism_name_search(name):
341    """
342    Search for a organism by `name` and return it's KEGG organism code.
343    """
344    with _global_genome_instance() as genome:
345        try:
346            name = genome.org_code_to_entry_key(name)
347        except ValueError:
348            pass
349
350        if name not in genome:
351            ids = genome.search(name)
352            if ids:
353                name = ids.pop(0) if ids else name
354            else:
355                raise obiTaxonomy.UnknownSpeciesIdentifier(name)
356
357        try:
358            return genome[name].organism_code
359        except KeyError:
360            raise obiTaxonomy.UnknownSpeciesIdentifier(name)
361
362
363def pathways(org):
364    """
365    Return a list of all KEGG pathways for an KEGG organism code `org`.
366    """
367    return KEGGPathway.list(org)
368
369
370def from_taxid(taxid):
371    """
372    Return a KEGG organism code for a an NCBI Taxonomy id string `taxid`.
373    """
374    with _global_genome_instance() as genome:
375        res = genome.search(taxid)
376        for r in res:
377            e = genome[r]
378
379            if e.taxid in [taxid, genome.TAXID_MAP.get(taxid, taxid)]:
380                return e.org_code()
381
382        return None
383
384
385def to_taxid(name):
386    """
387    Return a NCBI Taxonomy id for a given KEGG Organism name
388    """
389    with _global_genome_instance() as genome:
390        if name in genome:
391            return genome[name].taxid
392
393        keys = genome.search(name)
394        if keys:
395            return genome[keys[0]].taxid
396        else:
397            return None
398
399
400def _global_genome_instance():
401    genome = getattr(_global_genome_instance, "_genome", None)
402    if genome is None:
403        genome = KEGGGenome()
404        genome._lock = threading.RLock()
405        _global_genome_instance._genome = genome
406
407    @contextmanager
408    def instance_locked(instance, lock):
409        lock.acquire()
410        try:
411            yield instance
412        finally:
413            lock.release()
414
415    return instance_locked(genome, genome._lock)
416
417
418def main():
419    KEGGGenome()
420    import doctest
421    extraglobs = {"api": KeggApi()}
422    doctest.testmod(optionflags=doctest.ELLIPSIS, extraglobs=extraglobs)
423
424
425if __name__ == "__main__":
426    sys.exit(main())
Note: See TracBrowser for help on using the repository browser.