source: orange-bioinformatics/_bioinformatics/obiKEGG/__init__.py @ 1738:1f7725e12b25

Revision 1738:1f7725e12b25, 12.6 KB checked in by Ales Erjavec <ales.erjavec@…>, 13 months ago (diff)

Fixed 'organism_name_search' for KEGG Genome ids.

RevLine 
[1532]1"""\
2==============================================
3KEGG - Kyoto Encyclopedia of Genes and Genomes
4==============================================
5
[1736]6:mod:`obiKEGG` is a python module for accessing `KEGG (Kyoto Encyclopedia
7of Genes and Genomes) <http://www.genome.jp/kegg/>`_ using its web services.
[1733]8
[1736]9.. note:: To use this module you need to have `slumber`_ and `requests`_
10          package installed.
[1532]11
[1736]12.. _`slumber`: https://pypi.python.org/pypi/slumber/
[1532]13
[1736]14.. _`requests`: https://pypi.python.org/pypi/requests
15
16
17>>> # Create a KEGG Genes database interface
18>>> genome = KEGGGenome()
19>>> # List all available entry ids
20>>> keys = genome.keys()
21>>> print keys[0]
22T01001
23>>> # Retrieve the entry for the key.
24>>> entry = genome[keys[0]]
25>>> print entry.entry_key
26T01001
27>>> print entry.definition
28Homo sapiens (human)
29>>> print str(entry)
30ENTRY       T01001            Complete  Genome
31NAME        hsa, HUMAN, 9606
32DEFINITION  Homo sapiens (human)
33...
34
35The :class:`Organism` class can be used as a convenient starting point
36for organism specific databases.
37
38>>> organism = Organism("Homo sapiens")  # searches for the organism by name
39>>> print organism.org_code  # prints the KEGG organism code
40hsa
41>>> genes = organism.genes  # get the genes database for the organism
42>>> gene_ids = genes.keys() # KEGG gene identifiers
43>>> entry = genes["hsa:672"]
44>>> print entry.definition
45breast cancer 1, early onset
46>>> print entry  # print the entry in DBGET database format.
47ENTRY       672               CDS       T01001
48NAME        BRCA1, BRCAI, BRCC1, BROVCA1, IRIS, PNCA4, PPP1R53, PSCP, RNF53
49DEFINITION  breast cancer 1, early onset
50...
[1603]51
[1532]52"""
53from __future__ import absolute_import
54
[1733]55import os
56import sys
[1715]57import urllib2
[1733]58
[1532]59from collections import defaultdict
[1735]60from itertools import chain
[1532]61from datetime import datetime
62
[1716]63from Orange.utils import lru_cache
[1733]64from Orange.utils import progress_bar_milestones
[1734]65from Orange.utils import (
66    deprecated_keywords, deprecated_attribute, deprecated_function_name
67)
[1733]68
69from .. import obiProb
[1532]70
71from . import databases
72from . import entry
73
74from .brite import BriteEntry, Brite
75
76from . import api
77from . import conf
78from . import pathway
79
80KEGGGenome = databases.Genome
81KEGGGenes = databases.Genes
[1733]82KEGGEnzyme = databases.Enzyme
83KEGGReaction = databases.Reaction
84KEGGPathways = databases.Pathway
85KEGGCompound = databases.Compound
[1532]86
87KEGGBrite = Brite
88KEGGBriteEntry = BriteEntry
89
90KEGGPathway = pathway.Pathway
91
92DEFAULT_CACHE_DIR = conf.params["cache.path"]
93
94
[1733]95class OrganismNotFoundError(Exception):
96    pass
[1532]97
[1712]98
[1532]99class Organism(object):
[1733]100    """
101    A convenience class for retrieving information regarding an
102    organism in the KEGG Genes database.
103
[1736]104    :param org: KEGG organism code (e.g. "hsa", "sce"). Can also be a
105        descriptive name (e.g. 'yeast', "homo sapiens") in which case the
106        organism code will be searched for by using KEGG `find` api.
[1733]107    :type org: str
108
[1736]109    .. seealso::
110
111        :func:`organism_name_search`
112            Search KEGG for an organism code
113
[1733]114    """
[1532]115    def __init__(self, org, genematcher=None):
116        self.org_code = self.organism_name_search(org)
117        self.genematcher = genematcher
118        self.api = api.CachedKeggApi()
[1733]119
[1532]120    @property
121    def org(self):
[1733]122        """
123        KEGG organism code.
124        """
[1532]125        return self.org_code
[1733]126
[1532]127    @property
128    def genes(self):
[1733]129        """
[1736]130        An :class:`~.databases.Genes` database instance for this organism.
[1733]131        """
132        # TODO: This should not be a property but a method.
133        # I think it was only put here as back compatibility with old obiKEGG.
[1532]134        if not hasattr(self, "_genes"):
135            genes = KEGGGenes(self.org_code)
136            self._genes = genes
137        return self._genes
[1733]138
[1532]139    def gene_aliases(self):
[1733]140        """
[1735]141        Return a list of sets of equal genes (synonyms) in KEGG for
142        this organism.
143
144        .. note::
145
146            This only includes 'ncbi-geneid' and 'ncbi-gi' records
147            from the KEGG Genes DBLINKS entries.
148
[1733]149        """
[1735]150        definitions = self.api.list(self.org_code)
151        ncbi_geneid = self.api.conv(self.org_code, "ncbi-geneid")
152        ncbi_gi = self.api.conv(self.org_code, "ncbi-gi")
153
154        aliases = defaultdict(set)
155
156        for entry_id, definition in definitions:
157            # genes entry id without the organism code
158            aliases[entry_id].add(entry_id.split(":", 1)[1])
159            # all names in the NAME field (KEGG API list returns
160            # 'NAME; DEFINITION') fields for genes
161            names = definition.split(";")[0].split(",")
162            aliases[entry_id].update([name.strip() for name in names])
163
164        for source_id, target_id in chain(ncbi_geneid, ncbi_gi):
165            aliases[target_id].add(source_id.split(":", 1)[1])
166
167        return [set([entry_id]).union(names)
168                for entry_id, names in aliases.iteritems()]
[1733]169
[1532]170    def pathways(self, with_ids=None):
[1733]171        """
172        Return a list of all pathways for this organism.
173        """
[1532]174        if with_ids is not None:
175            return self.api.get_pathways_by_genes(with_ids)
176        else:
177            return [p.entry_id for p in self.api.list_pathways(self.org_code)]
[1734]178
[1532]179    def list_pathways(self):
[1733]180        """
[1736]181        List all pathways for this organism.
182
183        .. deprecated: 2.5
184            Use :func:`pathways` instead.
185
[1733]186        """
187        # NOTE: remove/deprecate and use pathways()
[1532]188        return self.pathways()
[1734]189
[1532]190    def get_linked_pathways(self, pathway_id):
191        self.api.get_linked_pathways(pathway_id)
[1734]192
[1532]193    def enzymes(self, genes=None):
194        raise NotImplementedError()
[1734]195
[1733]196    def get_enriched_pathways(self, genes, reference=None,
197                              prob=obiProb.Binomial(), callback=None):
198        """
199        Return a dictionary with enriched pathways ids as keys
200        and (list_of_genes, p_value, num_of_reference_genes) tuples
[1540]201        as items.
[1733]202
[1540]203        """
[1733]204        if reference is None:
205            reference = self.genes.keys()
206        reference = set(reference)
207
208        allPathways = defaultdict(lambda: [[], 1.0, []])
209        milestones = progress_bar_milestones(len(genes), 100)
[1532]210        pathways_db = KEGGPathways()
[1733]211
[1540]212        pathways_for_gene = []
[1532]213        for i, gene in enumerate(genes):
[1540]214            pathways_for_gene.append(self.pathways([gene]))
215            if callback and i in milestones:
[1733]216                callback(i * 50.0 / len(genes))
217
218        # pre-cache for speed
219        pathways_db.pre_cache([pid for pfg in pathways_for_gene
220                               for pid in pfg])
[1540]221        for i, (gene, pathways) in enumerate(zip(genes, pathways_for_gene)):
[1532]222            for pathway in pathways:
[1733]223                if pathways_db.get_entry(pathway).gene:
[1532]224                    allPathways[pathway][0].append(gene)
225            if callback and i in milestones:
[1733]226                callback(50.0 + i * 50.0 / len(genes))
227
[1532]228        pItems = allPathways.items()
[1733]229
[1532]230        for i, (p_id, entry) in enumerate(pItems):
231            pathway = pathways_db.get_entry(p_id)
232            entry[2].extend(reference.intersection(pathway.gene or []))
[1733]233            entry[1] = prob.p_value(len(entry[0]), len(reference),
234                                    len(entry[2]), len(genes))
235        return dict([(pid, (genes, p, len(ref)))
236                     for pid, (genes, p, ref) in allPathways.items()])
237
[1532]238    def get_genes_by_enzyme(self, enzyme):
[1733]239        enzyme = KEGGEnzyme().get_entry(enzyme)
[1532]240        return enzyme.genes.get(self.org_code, []) if enzyme.genes else []
[1733]241
[1532]242    def get_genes_by_pathway(self, pathway_id):
[1550]243        return KEGGPathway(pathway_id).genes()
[1733]244
[1532]245    def get_enzymes_by_pathway(self, pathway_id):
[1550]246        return KEGGPathway(pathway_id).enzymes()
[1734]247
[1532]248    def get_compounds_by_pathway(self, pathway_id):
[1550]249        return KEGGPathway(pathway_id).compounds()
[1734]250
[1532]251    def get_pathways_by_genes(self, gene_ids):
252        return self.api.get_pathways_by_genes(gene_ids)
253        gene_ids = set(gene_ids)
[1733]254        pathways = [self.genes[id].pathway for id in gene_ids
255                    if self.genes[id].pathway]
[1532]256        pathways = reduce(set.union, pathways, set())
[1733]257        return [id for id in pathways
258                if gene_ids.issubset(KEGGPathway(id).genes())]
259
[1532]260    def get_pathways_by_enzymes(self, enzyme_ids):
261        enzyme_ids = set(enzyme_ids)
[1733]262        pathways = [KEGGEnzyme()[id].pathway for id in enzyme_ids]
263        pathways = reduce(set.union, pathways, set())
264        return [id for id in pathways
265                if enzyme_ids.issubset(KEGGPathway(id).enzymes())]
266
[1532]267    def get_pathways_by_compounds(self, compound_ids):
268        compound_ids = set(compound_ids)
[1733]269        pathways = [KEGGCompound()[id].pathway for id in compound_ids]
270        pathways = reduce(set.union, pathways, set())
271        return [id for id in pathways
272                if compound_ids.issubset(KEGGPathway(id).compounds())]
273
[1532]274    def get_enzymes_by_compound(self, compound_id):
275        return KEGGCompound()[compound_id].enzyme
[1733]276
[1532]277    def get_enzymes_by_gene(self, gene_id):
278        return self.genes[gene_id].enzymes
[1733]279
[1532]280    def get_compounds_by_enzyme(self, enzyme_id):
281        return self._enzymes_to_compounds.get(enzyme_id)
[1734]282
[1532]283    @deprecated_keywords({"caseSensitive": "case_sensitive"})
284    def get_unique_gene_ids(self, genes, case_sensitive=True):
[1733]285        """
286        Return a tuple with three elements. The first is a dictionary
287        mapping from unique geneids to gene names in genes, the second
288        is a list of conflicting gene names and the third is a list of
289        unknown genes.
290
[1532]291        """
292        unique, conflicting, unknown = {}, [], []
293        for gene in genes:
294            names = self.genematcher.match(gene)
295            if len(names) == 1:
296                unique[names[0]] = gene
297            elif len(names) == 0:
298                unknown.append(gene)
299            else:
300                conflicting.append(gene)
301        return unique, conflicting, unknown
[1733]302
303    @deprecated_function_name
[1532]304    def get_genes(self):
305        return self.genes
[1733]306
[1532]307    @classmethod
308    def organism_name_search(cls, name):
309        genome = KEGGGenome()
[1738]310        try:
311            name = genome.org_code_to_entry_key(name)
312        except KeyError:
313            pass
314
[1532]315        if name not in genome:
316            ids = genome.search(name)
317            if not ids:
[1632]318                from .. import obiTaxonomy
[1532]319                ids = obiTaxonomy.search(name)
320                ids = [id for id in ids if genome.search(id)]
321            name = ids.pop(0) if ids else name
[1734]322
[1532]323        try:
[1733]324            return genome[name].organism_code
[1532]325        except KeyError:
[1712]326            raise OrganismNotFoundError(name)
[1734]327
[1532]328    @classmethod
329    def organism_version(cls, name):
330        name = cls.organism_name_search(name)
331        genome = KEGGGenome()
[1733]332        info = genome.api.info(name)
[1532]333        return info.release
[1733]334
[1532]335    def _set_genematcher(self, genematcher):
336        setattr(self, "_genematcher", genematcher)
[1734]337
[1532]338    def _get_genematcher(self):
[1734]339        if getattr(self, "_genematcher", None) is None:
[1632]340            from .. import obiGene
[1532]341            if self.org_code == "ddi":
[1733]342                self._genematcher = obiGene.matcher(
343                    [obiGene.GMKEGG(self.org_code), obiGene.GMDicty(),
[1734]344                     [obiGene.GMKEGG(self.org_code), obiGene.GMDicty()]]
[1733]345                )
[1532]346            else:
[1733]347                self._genematcher = obiGene.matcher(
348                    [obiGene.GMKEGG(self.org_code)])
349
[1532]350            self._genematcher.set_targets(self.genes.keys())
351        return self._genematcher
[1734]352
[1532]353    genematcher = property(_get_genematcher, _set_genematcher)
[1733]354
355
[1532]356KEGGOrganism = Organism
[1733]357
358
[1532]359def organism_name_search(name):
[1736]360    """
361    Search and organism by `name` and return an KEGG organism code.
362    """
[1532]363    return KEGGOrganism.organism_name_search(name)
364
[1734]365
[1532]366def pathways(org):
[1736]367    """
368    Return a list of all KEGG pathways for an KEGG organism code `org`.
369    """
[1532]370    return KEGGPathway.list(org)
371
[1734]372
[1532]373def organisms():
[1736]374    """
375    Return a list of all KEGG organisms.
376    """
[1532]377    return KEGGOrganism.organisms()
378
[1734]379
[1532]380def from_taxid(taxid):
[1736]381    """
382    Return a KEGG organism code for a an NCBI Taxonomy id string `taxid`.
383    """
[1532]384    genome = KEGGGenome()
385    res = genome.search(taxid)
386    for r in res:
387        e = genome[r]
[1734]388
389        if e.taxid in [taxid, genome.TAXID_MAP.get(taxid, taxid)]:
[1532]390            return e.org_code()
[1733]391
[1532]392    return None
393
[1734]394
[1532]395def to_taxid(name):
[1736]396    """
397    Return a NCBI Taxonomy id for a given KEGG Organism name
398    """
[1532]399    genome = KEGGGenome()
[1552]400    if name in genome:
401        return genome[name].taxid
[1733]402
[1532]403    keys = genome.search(name)
404    if keys:
405        return genome[keys[0]].taxid
406    else:
407        return None
408
[1734]409
[1532]410def create_gene_sets():
411    pass
412
[1733]413
[1532]414def main():
415    KEGGGenome()
416    import doctest
417    extraglobs = {"api": KeggApi()}
418    doctest.testmod(optionflags=doctest.ELLIPSIS, extraglobs=extraglobs)
419
[1733]420
[1532]421if __name__ == "__main__":
[1601]422    sys.exit(main())
Note: See TracBrowser for help on using the repository browser.