source: orange-bioinformatics/_bioinformatics/obiKEGG/__init__.py @ 1741:1c2feb7cd8d2

Revision 1741:1c2feb7cd8d2, 12.5 KB checked in by Ales Erjavec <ales.erjavec@…>, 13 months ago (diff)

Added 'prefix' parameter for KEGG Pathway database interface.

Line 
1"""\
2==============================================
3KEGG - Kyoto Encyclopedia of Genes and Genomes
4==============================================
5
6:mod:`obiKEGG` is a python module for accessing `KEGG (Kyoto Encyclopedia
7of Genes and Genomes) <http://www.genome.jp/kegg/>`_ using its web services.
8
9.. note:: To use this module you need to have `slumber`_ and `requests`_
10          package installed.
11
12.. _`slumber`: https://pypi.python.org/pypi/slumber/
13
14.. _`requests`: https://pypi.python.org/pypi/requests
15
16
17>>> # Create a KEGG Genes database interface
18>>> genome = KEGGGenome()
19>>> # List all available entry ids
20>>> keys = genome.keys()
21>>> print keys[0]
22T01001
23>>> # Retrieve the entry for the key.
24>>> entry = genome[keys[0]]
25>>> print entry.entry_key
26T01001
27>>> print entry.definition
28Homo sapiens (human)
29>>> print str(entry)
30ENTRY       T01001            Complete  Genome
31NAME        hsa, HUMAN, 9606
32DEFINITION  Homo sapiens (human)
33...
34
35The :class:`Organism` class can be used as a convenient starting point
36for organism specific databases.
37
38>>> organism = Organism("Homo sapiens")  # searches for the organism by name
39>>> print organism.org_code  # prints the KEGG organism code
40hsa
41>>> genes = organism.genes  # get the genes database for the organism
42>>> gene_ids = genes.keys() # KEGG gene identifiers
43>>> entry = genes["hsa:672"]
44>>> print entry.definition
45breast cancer 1, early onset
46>>> print entry  # print the entry in DBGET database format.
47ENTRY       672               CDS       T01001
48NAME        BRCA1, BRCAI, BRCC1, BROVCA1, IRIS, PNCA4, PPP1R53, PSCP, RNF53
49DEFINITION  breast cancer 1, early onset
50...
51
52"""
53from __future__ import absolute_import
54
55import os
56import sys
57import urllib2
58
59from collections import defaultdict
60from itertools import chain
61from datetime import datetime
62
63from Orange.utils import lru_cache
64from Orange.utils import progress_bar_milestones
65from Orange.utils import (
66    deprecated_keywords, deprecated_attribute, deprecated_function_name
67)
68
69from .. import obiProb
70
71from . import databases
72from . import entry
73
74from .brite import BriteEntry, Brite
75
76from . import api
77from . import conf
78from . import pathway
79
80KEGGGenome = databases.Genome
81KEGGGenes = databases.Genes
82KEGGEnzyme = databases.Enzyme
83KEGGReaction = databases.Reaction
84KEGGPathways = databases.Pathway
85KEGGCompound = databases.Compound
86
87KEGGBrite = Brite
88KEGGBriteEntry = BriteEntry
89
90KEGGPathway = pathway.Pathway
91
92DEFAULT_CACHE_DIR = conf.params["cache.path"]
93
94
95class OrganismNotFoundError(Exception):
96    pass
97
98
99class Organism(object):
100    """
101    A convenience class for retrieving information regarding an
102    organism in the KEGG Genes database.
103
104    :param org: KEGG organism code (e.g. "hsa", "sce"). Can also be a
105        descriptive name (e.g. 'yeast', "homo sapiens") in which case the
106        organism code will be searched for by using KEGG `find` api.
107    :type org: str
108
109    .. seealso::
110
111        :func:`organism_name_search`
112            Search KEGG for an organism code
113
114    """
115    def __init__(self, org, genematcher=None):
116        self.org_code = self.organism_name_search(org)
117        self.genematcher = genematcher
118        self.api = api.CachedKeggApi()
119
120    @property
121    def org(self):
122        """
123        KEGG organism code.
124        """
125        return self.org_code
126
127    @property
128    def genes(self):
129        """
130        An :class:`~.databases.Genes` database instance for this organism.
131        """
132        # TODO: This should not be a property but a method.
133        # I think it was only put here as back compatibility with old obiKEGG.
134        if not hasattr(self, "_genes"):
135            genes = KEGGGenes(self.org_code)
136            self._genes = genes
137        return self._genes
138
139    def gene_aliases(self):
140        """
141        Return a list of sets of equal genes (synonyms) in KEGG for
142        this organism.
143
144        .. note::
145
146            This only includes 'ncbi-geneid' and 'ncbi-gi' records
147            from the KEGG Genes DBLINKS entries.
148
149        """
150        definitions = self.api.list(self.org_code)
151        ncbi_geneid = self.api.conv(self.org_code, "ncbi-geneid")
152        ncbi_gi = self.api.conv(self.org_code, "ncbi-gi")
153
154        aliases = defaultdict(set)
155
156        for entry_id, definition in definitions:
157            # genes entry id without the organism code
158            aliases[entry_id].add(entry_id.split(":", 1)[1])
159            # all names in the NAME field (KEGG API list returns
160            # 'NAME; DEFINITION') fields for genes
161            names = definition.split(";")[0].split(",")
162            aliases[entry_id].update([name.strip() for name in names])
163
164        for source_id, target_id in chain(ncbi_geneid, ncbi_gi):
165            aliases[target_id].add(source_id.split(":", 1)[1])
166
167        return [set([entry_id]).union(names)
168                for entry_id, names in aliases.iteritems()]
169
170    def pathways(self, with_ids=None):
171        """
172        Return a list of all pathways for this organism.
173        """
174        if with_ids is not None:
175            return self.api.get_pathways_by_genes(with_ids)
176        else:
177            return [p.entry_id for p in self.api.list_pathways(self.org_code)]
178
179    def list_pathways(self):
180        """
181        List all pathways for this organism.
182
183        .. deprecated: 2.5
184            Use :func:`pathways` instead.
185
186        """
187        # NOTE: remove/deprecate and use pathways()
188        return self.pathways()
189
190    def get_linked_pathways(self, pathway_id):
191        self.api.get_linked_pathways(pathway_id)
192
193    def enzymes(self, genes=None):
194        raise NotImplementedError()
195
196    def get_enriched_pathways(self, genes, reference=None,
197                              prob=obiProb.Binomial(), callback=None):
198        """
199        Return a dictionary with enriched pathways ids as keys
200        and (list_of_genes, p_value, num_of_reference_genes) tuples
201        as items.
202
203        """
204        if reference is None:
205            reference = self.genes.keys()
206        reference = set(reference)
207
208        allPathways = defaultdict(lambda: [[], 1.0, []])
209        milestones = progress_bar_milestones(len(genes), 100)
210        pathways_db = KEGGPathways()
211
212        pathways_for_gene = []
213        for i, gene in enumerate(genes):
214            pathways_for_gene.append(self.pathways([gene]))
215            if callback and i in milestones:
216                callback(i * 50.0 / len(genes))
217
218        # pre-cache for speed
219        pathways_db.pre_cache([pid for pfg in pathways_for_gene
220                               for pid in pfg])
221        for i, (gene, pathways) in enumerate(zip(genes, pathways_for_gene)):
222            for pathway in pathways:
223                if pathways_db.get_entry(pathway).gene:
224                    allPathways[pathway][0].append(gene)
225            if callback and i in milestones:
226                callback(50.0 + i * 50.0 / len(genes))
227
228        pItems = allPathways.items()
229
230        for i, (p_id, entry) in enumerate(pItems):
231            pathway = pathways_db.get_entry(p_id)
232            entry[2].extend(reference.intersection(pathway.gene or []))
233            entry[1] = prob.p_value(len(entry[0]), len(reference),
234                                    len(entry[2]), len(genes))
235        return dict([(pid, (genes, p, len(ref)))
236                     for pid, (genes, p, ref) in allPathways.items()])
237
238    def get_genes_by_enzyme(self, enzyme):
239        enzyme = KEGGEnzyme().get_entry(enzyme)
240        return enzyme.genes.get(self.org_code, []) if enzyme.genes else []
241
242    def get_genes_by_pathway(self, pathway_id):
243        return KEGGPathway(pathway_id).genes()
244
245    def get_enzymes_by_pathway(self, pathway_id):
246        return KEGGPathway(pathway_id).enzymes()
247
248    def get_compounds_by_pathway(self, pathway_id):
249        return KEGGPathway(pathway_id).compounds()
250
251    def get_pathways_by_genes(self, gene_ids):
252        return self.api.get_pathways_by_genes(gene_ids)
253        gene_ids = set(gene_ids)
254        pathways = [self.genes[id].pathway for id in gene_ids
255                    if self.genes[id].pathway]
256        pathways = reduce(set.union, pathways, set())
257        return [id for id in pathways
258                if gene_ids.issubset(KEGGPathway(id).genes())]
259
260    def get_pathways_by_enzymes(self, enzyme_ids):
261        enzyme_ids = set(enzyme_ids)
262        pathways = [KEGGEnzyme()[id].pathway for id in enzyme_ids]
263        pathways = reduce(set.union, pathways, set())
264        return [id for id in pathways
265                if enzyme_ids.issubset(KEGGPathway(id).enzymes())]
266
267    def get_pathways_by_compounds(self, compound_ids):
268        compound_ids = set(compound_ids)
269        pathways = [KEGGCompound()[id].pathway for id in compound_ids]
270        pathways = reduce(set.union, pathways, set())
271        return [id for id in pathways
272                if compound_ids.issubset(KEGGPathway(id).compounds())]
273
274    def get_enzymes_by_compound(self, compound_id):
275        return KEGGCompound()[compound_id].enzyme
276
277    def get_enzymes_by_gene(self, gene_id):
278        return self.genes[gene_id].enzymes
279
280    def get_compounds_by_enzyme(self, enzyme_id):
281        return self._enzymes_to_compounds.get(enzyme_id)
282
283    @deprecated_keywords({"caseSensitive": "case_sensitive"})
284    def get_unique_gene_ids(self, genes, case_sensitive=True):
285        """
286        Return a tuple with three elements. The first is a dictionary
287        mapping from unique geneids to gene names in genes, the second
288        is a list of conflicting gene names and the third is a list of
289        unknown genes.
290
291        """
292        unique, conflicting, unknown = {}, [], []
293        for gene in genes:
294            names = self.genematcher.match(gene)
295            if len(names) == 1:
296                unique[names[0]] = gene
297            elif len(names) == 0:
298                unknown.append(gene)
299            else:
300                conflicting.append(gene)
301        return unique, conflicting, unknown
302
303    @deprecated_function_name
304    def get_genes(self):
305        return self.genes
306
307    @classmethod
308    def organism_name_search(cls, name):
309        genome = KEGGGenome()
310        try:
311            name = genome.org_code_to_entry_key(name)
312        except KeyError:
313            pass
314
315        if name not in genome:
316            ids = genome.search(name)
317            if not ids:
318                from .. import obiTaxonomy
319                ids = obiTaxonomy.search(name)
320                ids = [id for id in ids if genome.search(id)]
321            name = ids.pop(0) if ids else name
322
323        try:
324            return genome[name].organism_code
325        except KeyError:
326            raise OrganismNotFoundError(name)
327
328    @classmethod
329    def organism_version(cls, name):
330        name = cls.organism_name_search(name)
331        genome = KEGGGenome()
332        info = genome.api.info(name)
333        return info.release
334
335    def _set_genematcher(self, genematcher):
336        setattr(self, "_genematcher", genematcher)
337
338    def _get_genematcher(self):
339        if getattr(self, "_genematcher", None) is None:
340            from .. import obiGene
341            if self.org_code == "ddi":
342                self._genematcher = obiGene.matcher(
343                    [obiGene.GMKEGG(self.org_code), obiGene.GMDicty(),
344                     [obiGene.GMKEGG(self.org_code), obiGene.GMDicty()]]
345                )
346            else:
347                self._genematcher = obiGene.matcher(
348                    [obiGene.GMKEGG(self.org_code)])
349
350            self._genematcher.set_targets(self.genes.keys())
351        return self._genematcher
352
353    genematcher = property(_get_genematcher, _set_genematcher)
354
355
356KEGGOrganism = Organism
357
358
359def organism_name_search(name):
360    """
361    Search and organism by `name` and return an KEGG organism code.
362    """
363    return KEGGOrganism.organism_name_search(name)
364
365
366def pathways(org):
367    """
368    Return a list of all KEGG pathways for an KEGG organism code `org`.
369    """
370    return KEGGPathway.list(org)
371
372
373def organisms():
374    """
375    Return a list of all KEGG organisms.
376    """
377    return KEGGOrganism.organisms()
378
379
380def from_taxid(taxid):
381    """
382    Return a KEGG organism code for a an NCBI Taxonomy id string `taxid`.
383    """
384    genome = KEGGGenome()
385    res = genome.search(taxid)
386    for r in res:
387        e = genome[r]
388
389        if e.taxid in [taxid, genome.TAXID_MAP.get(taxid, taxid)]:
390            return e.org_code()
391
392    return None
393
394
395def to_taxid(name):
396    """
397    Return a NCBI Taxonomy id for a given KEGG Organism name
398    """
399    genome = KEGGGenome()
400    if name in genome:
401        return genome[name].taxid
402
403    keys = genome.search(name)
404    if keys:
405        return genome[keys[0]].taxid
406    else:
407        return None
408
409
410def main():
411    KEGGGenome()
412    import doctest
413    extraglobs = {"api": KeggApi()}
414    doctest.testmod(optionflags=doctest.ELLIPSIS, extraglobs=extraglobs)
415
416
417if __name__ == "__main__":
418    sys.exit(main())
Note: See TracBrowser for help on using the repository browser.