source: orange-bioinformatics/_bioinformatics/obiKEGG/__init__.py @ 1736:b3b664abcfdd

Revision 1736:b3b664abcfdd, 12.5 KB checked in by Ales Erjavec <ales.erjavec@…>, 13 months ago (diff)

Added sphinx rst documentation for obiKEGG.

Line 
1"""\
2==============================================
3KEGG - Kyoto Encyclopedia of Genes and Genomes
4==============================================
5
6:mod:`obiKEGG` is a python module for accessing `KEGG (Kyoto Encyclopedia
7of Genes and Genomes) <http://www.genome.jp/kegg/>`_ using its web services.
8
9.. note:: To use this module you need to have `slumber`_ and `requests`_
10          package installed.
11
12.. _`slumber`: https://pypi.python.org/pypi/slumber/
13
14.. _`requests`: https://pypi.python.org/pypi/requests
15
16
17>>> # Create a KEGG Genes database interface
18>>> genome = KEGGGenome()
19>>> # List all available entry ids
20>>> keys = genome.keys()
21>>> print keys[0]
22T01001
23>>> # Retrieve the entry for the key.
24>>> entry = genome[keys[0]]
25>>> print entry.entry_key
26T01001
27>>> print entry.definition
28Homo sapiens (human)
29>>> print str(entry)
30ENTRY       T01001            Complete  Genome
31NAME        hsa, HUMAN, 9606
32DEFINITION  Homo sapiens (human)
33...
34
35The :class:`Organism` class can be used as a convenient starting point
36for organism specific databases.
37
38>>> organism = Organism("Homo sapiens")  # searches for the organism by name
39>>> print organism.org_code  # prints the KEGG organism code
40hsa
41>>> genes = organism.genes  # get the genes database for the organism
42>>> gene_ids = genes.keys() # KEGG gene identifiers
43>>> entry = genes["hsa:672"]
44>>> print entry.definition
45breast cancer 1, early onset
46>>> print entry  # print the entry in DBGET database format.
47ENTRY       672               CDS       T01001
48NAME        BRCA1, BRCAI, BRCC1, BROVCA1, IRIS, PNCA4, PPP1R53, PSCP, RNF53
49DEFINITION  breast cancer 1, early onset
50...
51
52"""
53from __future__ import absolute_import
54
55import os
56import sys
57import urllib2
58
59from collections import defaultdict
60from itertools import chain
61from datetime import datetime
62
63from Orange.utils import lru_cache
64from Orange.utils import progress_bar_milestones
65from Orange.utils import (
66    deprecated_keywords, deprecated_attribute, deprecated_function_name
67)
68
69from .. import obiProb
70
71from . import databases
72from . import entry
73
74from .brite import BriteEntry, Brite
75
76from . import api
77from . import conf
78from . import pathway
79
80KEGGGenome = databases.Genome
81KEGGGenes = databases.Genes
82KEGGEnzyme = databases.Enzyme
83KEGGReaction = databases.Reaction
84KEGGPathways = databases.Pathway
85KEGGCompound = databases.Compound
86
87KEGGBrite = Brite
88KEGGBriteEntry = BriteEntry
89
90KEGGPathway = pathway.Pathway
91
92DEFAULT_CACHE_DIR = conf.params["cache.path"]
93
94
95class OrganismNotFoundError(Exception):
96    pass
97
98
99class Organism(object):
100    """
101    A convenience class for retrieving information regarding an
102    organism in the KEGG Genes database.
103
104    :param org: KEGG organism code (e.g. "hsa", "sce"). Can also be a
105        descriptive name (e.g. 'yeast', "homo sapiens") in which case the
106        organism code will be searched for by using KEGG `find` api.
107    :type org: str
108
109    .. seealso::
110
111        :func:`organism_name_search`
112            Search KEGG for an organism code
113
114    """
115    def __init__(self, org, genematcher=None):
116        self.org_code = self.organism_name_search(org)
117        self.genematcher = genematcher
118        self.api = api.CachedKeggApi()
119
120    @property
121    def org(self):
122        """
123        KEGG organism code.
124        """
125        return self.org_code
126
127    @property
128    def genes(self):
129        """
130        An :class:`~.databases.Genes` database instance for this organism.
131        """
132        # TODO: This should not be a property but a method.
133        # I think it was only put here as back compatibility with old obiKEGG.
134        if not hasattr(self, "_genes"):
135            genes = KEGGGenes(self.org_code)
136            self._genes = genes
137        return self._genes
138
139    def gene_aliases(self):
140        """
141        Return a list of sets of equal genes (synonyms) in KEGG for
142        this organism.
143
144        .. note::
145
146            This only includes 'ncbi-geneid' and 'ncbi-gi' records
147            from the KEGG Genes DBLINKS entries.
148
149        """
150        definitions = self.api.list(self.org_code)
151        ncbi_geneid = self.api.conv(self.org_code, "ncbi-geneid")
152        ncbi_gi = self.api.conv(self.org_code, "ncbi-gi")
153
154        aliases = defaultdict(set)
155
156        for entry_id, definition in definitions:
157            # genes entry id without the organism code
158            aliases[entry_id].add(entry_id.split(":", 1)[1])
159            # all names in the NAME field (KEGG API list returns
160            # 'NAME; DEFINITION') fields for genes
161            names = definition.split(";")[0].split(",")
162            aliases[entry_id].update([name.strip() for name in names])
163
164        for source_id, target_id in chain(ncbi_geneid, ncbi_gi):
165            aliases[target_id].add(source_id.split(":", 1)[1])
166
167        return [set([entry_id]).union(names)
168                for entry_id, names in aliases.iteritems()]
169
170    def pathways(self, with_ids=None):
171        """
172        Return a list of all pathways for this organism.
173        """
174        if with_ids is not None:
175            return self.api.get_pathways_by_genes(with_ids)
176        else:
177            return [p.entry_id for p in self.api.list_pathways(self.org_code)]
178
179    def list_pathways(self):
180        """
181        List all pathways for this organism.
182
183        .. deprecated: 2.5
184            Use :func:`pathways` instead.
185
186        """
187        # NOTE: remove/deprecate and use pathways()
188        return self.pathways()
189
190    def get_linked_pathways(self, pathway_id):
191        self.api.get_linked_pathways(pathway_id)
192
193    def enzymes(self, genes=None):
194        raise NotImplementedError()
195
196    def get_enriched_pathways(self, genes, reference=None,
197                              prob=obiProb.Binomial(), callback=None):
198        """
199        Return a dictionary with enriched pathways ids as keys
200        and (list_of_genes, p_value, num_of_reference_genes) tuples
201        as items.
202
203        """
204        if reference is None:
205            reference = self.genes.keys()
206        reference = set(reference)
207
208        allPathways = defaultdict(lambda: [[], 1.0, []])
209        milestones = progress_bar_milestones(len(genes), 100)
210        pathways_db = KEGGPathways()
211
212        pathways_for_gene = []
213        for i, gene in enumerate(genes):
214            pathways_for_gene.append(self.pathways([gene]))
215            if callback and i in milestones:
216                callback(i * 50.0 / len(genes))
217
218        # pre-cache for speed
219        pathways_db.pre_cache([pid for pfg in pathways_for_gene
220                               for pid in pfg])
221        for i, (gene, pathways) in enumerate(zip(genes, pathways_for_gene)):
222            for pathway in pathways:
223                if pathways_db.get_entry(pathway).gene:
224                    allPathways[pathway][0].append(gene)
225            if callback and i in milestones:
226                callback(50.0 + i * 50.0 / len(genes))
227
228        pItems = allPathways.items()
229
230        for i, (p_id, entry) in enumerate(pItems):
231            pathway = pathways_db.get_entry(p_id)
232            entry[2].extend(reference.intersection(pathway.gene or []))
233            entry[1] = prob.p_value(len(entry[0]), len(reference),
234                                    len(entry[2]), len(genes))
235        return dict([(pid, (genes, p, len(ref)))
236                     for pid, (genes, p, ref) in allPathways.items()])
237
238    def get_genes_by_enzyme(self, enzyme):
239        enzyme = KEGGEnzyme().get_entry(enzyme)
240        return enzyme.genes.get(self.org_code, []) if enzyme.genes else []
241
242    def get_genes_by_pathway(self, pathway_id):
243        return KEGGPathway(pathway_id).genes()
244
245    def get_enzymes_by_pathway(self, pathway_id):
246        return KEGGPathway(pathway_id).enzymes()
247
248    def get_compounds_by_pathway(self, pathway_id):
249        return KEGGPathway(pathway_id).compounds()
250
251    def get_pathways_by_genes(self, gene_ids):
252        return self.api.get_pathways_by_genes(gene_ids)
253        gene_ids = set(gene_ids)
254        pathways = [self.genes[id].pathway for id in gene_ids
255                    if self.genes[id].pathway]
256        pathways = reduce(set.union, pathways, set())
257        return [id for id in pathways
258                if gene_ids.issubset(KEGGPathway(id).genes())]
259
260    def get_pathways_by_enzymes(self, enzyme_ids):
261        enzyme_ids = set(enzyme_ids)
262        pathways = [KEGGEnzyme()[id].pathway for id in enzyme_ids]
263        pathways = reduce(set.union, pathways, set())
264        return [id for id in pathways
265                if enzyme_ids.issubset(KEGGPathway(id).enzymes())]
266
267    def get_pathways_by_compounds(self, compound_ids):
268        compound_ids = set(compound_ids)
269        pathways = [KEGGCompound()[id].pathway for id in compound_ids]
270        pathways = reduce(set.union, pathways, set())
271        return [id for id in pathways
272                if compound_ids.issubset(KEGGPathway(id).compounds())]
273
274    def get_enzymes_by_compound(self, compound_id):
275        return KEGGCompound()[compound_id].enzyme
276
277    def get_enzymes_by_gene(self, gene_id):
278        return self.genes[gene_id].enzymes
279
280    def get_compounds_by_enzyme(self, enzyme_id):
281        return self._enzymes_to_compounds.get(enzyme_id)
282
283    @deprecated_keywords({"caseSensitive": "case_sensitive"})
284    def get_unique_gene_ids(self, genes, case_sensitive=True):
285        """
286        Return a tuple with three elements. The first is a dictionary
287        mapping from unique geneids to gene names in genes, the second
288        is a list of conflicting gene names and the third is a list of
289        unknown genes.
290
291        """
292        unique, conflicting, unknown = {}, [], []
293        for gene in genes:
294            names = self.genematcher.match(gene)
295            if len(names) == 1:
296                unique[names[0]] = gene
297            elif len(names) == 0:
298                unknown.append(gene)
299            else:
300                conflicting.append(gene)
301        return unique, conflicting, unknown
302
303    @deprecated_function_name
304    def get_genes(self):
305        return self.genes
306
307    @classmethod
308    def organism_name_search(cls, name):
309        genome = KEGGGenome()
310        if name not in genome:
311            ids = genome.search(name)
312            if not ids:
313                from .. import obiTaxonomy
314                ids = obiTaxonomy.search(name)
315                ids = [id for id in ids if genome.search(id)]
316            name = ids.pop(0) if ids else name
317
318        try:
319            return genome[name].organism_code
320        except KeyError:
321            raise OrganismNotFoundError(name)
322
323    @classmethod
324    def organism_version(cls, name):
325        name = cls.organism_name_search(name)
326        genome = KEGGGenome()
327        info = genome.api.info(name)
328        return info.release
329
330    def _set_genematcher(self, genematcher):
331        setattr(self, "_genematcher", genematcher)
332
333    def _get_genematcher(self):
334        if getattr(self, "_genematcher", None) is None:
335            from .. import obiGene
336            if self.org_code == "ddi":
337                self._genematcher = obiGene.matcher(
338                    [obiGene.GMKEGG(self.org_code), obiGene.GMDicty(),
339                     [obiGene.GMKEGG(self.org_code), obiGene.GMDicty()]]
340                )
341            else:
342                self._genematcher = obiGene.matcher(
343                    [obiGene.GMKEGG(self.org_code)])
344
345            self._genematcher.set_targets(self.genes.keys())
346        return self._genematcher
347
348    genematcher = property(_get_genematcher, _set_genematcher)
349
350
351KEGGOrganism = Organism
352
353
354def organism_name_search(name):
355    """
356    Search and organism by `name` and return an KEGG organism code.
357    """
358    return KEGGOrganism.organism_name_search(name)
359
360
361def pathways(org):
362    """
363    Return a list of all KEGG pathways for an KEGG organism code `org`.
364    """
365    return KEGGPathway.list(org)
366
367
368def organisms():
369    """
370    Return a list of all KEGG organisms.
371    """
372    return KEGGOrganism.organisms()
373
374
375def from_taxid(taxid):
376    """
377    Return a KEGG organism code for a an NCBI Taxonomy id string `taxid`.
378    """
379    genome = KEGGGenome()
380    res = genome.search(taxid)
381    for r in res:
382        e = genome[r]
383
384        if e.taxid in [taxid, genome.TAXID_MAP.get(taxid, taxid)]:
385            return e.org_code()
386
387    return None
388
389
390def to_taxid(name):
391    """
392    Return a NCBI Taxonomy id for a given KEGG Organism name
393    """
394    genome = KEGGGenome()
395    if name in genome:
396        return genome[name].taxid
397
398    keys = genome.search(name)
399    if keys:
400        return genome[keys[0]].taxid
401    else:
402        return None
403
404
405def create_gene_sets():
406    pass
407
408
409def main():
410    KEGGGenome()
411    import doctest
412    extraglobs = {"api": KeggApi()}
413    doctest.testmod(optionflags=doctest.ELLIPSIS, extraglobs=extraglobs)
414
415
416if __name__ == "__main__":
417    sys.exit(main())
Note: See TracBrowser for help on using the repository browser.