source: orange-bioinformatics/_bioinformatics/obiKEGG/__init__.py @ 1735:50499d1dc55a

Revision 1735:50499d1dc55a, 10.5 KB checked in by Ales Erjavec <ales.erjavec@…>, 13 months ago (diff)

Changed the Organism.gene_aliases method.

Line 
1"""\
2==============================================
3KEGG - Kyoto Encyclopedia of Genes and Genomes
4==============================================
5
6This is a python module for access to `KEGG`_ using its web services.
7
8To use this module you need to have `slumber` and `requests` package
9installed.
10
11.. _`KEGG`: http://www.genome.jp/kegg/
12
13
14"""
15from __future__ import absolute_import
16
17import os
18import sys
19import urllib2
20
21from collections import defaultdict
22from itertools import chain
23from datetime import datetime
24
25from Orange.utils import lru_cache
26from Orange.utils import progress_bar_milestones
27from Orange.utils import (
28    deprecated_keywords, deprecated_attribute, deprecated_function_name
29)
30
31from .. import obiProb
32
33from . import databases
34from . import entry
35
36from .brite import BriteEntry, Brite
37
38from . import api
39from . import conf
40from . import pathway
41
42KEGGGenome = databases.Genome
43KEGGGenes = databases.Genes
44KEGGEnzyme = databases.Enzyme
45KEGGReaction = databases.Reaction
46KEGGPathways = databases.Pathway
47KEGGCompound = databases.Compound
48
49KEGGBrite = Brite
50KEGGBriteEntry = BriteEntry
51
52KEGGPathway = pathway.Pathway
53
54DEFAULT_CACHE_DIR = conf.params["cache.path"]
55
56
57class OrganismNotFoundError(Exception):
58    pass
59
60
61class Organism(object):
62    """
63    A convenience class for retrieving information regarding an
64    organism in the KEGG Genes database.
65
66    :param org: KEGGG organism code (e.g. "hsa", "sce")
67    :type org: str
68
69    """
70    def __init__(self, org, genematcher=None):
71        self.org_code = self.organism_name_search(org)
72        self.genematcher = genematcher
73        self.api = api.CachedKeggApi()
74
75    @property
76    def org(self):
77        """
78        KEGG organism code.
79        """
80        return self.org_code
81
82    @property
83    def genes(self):
84        """
85        An :class:`Genes` database instance for this organism.
86        """
87        # TODO: This should not be a property but a method.
88        # I think it was only put here as back compatibility with old obiKEGG.
89        if not hasattr(self, "_genes"):
90            genes = KEGGGenes(self.org_code)
91            self._genes = genes
92        return self._genes
93
94    def gene_aliases(self):
95        """
96        Return a list of sets of equal genes (synonyms) in KEGG for
97        this organism.
98
99        .. note::
100
101            This only includes 'ncbi-geneid' and 'ncbi-gi' records
102            from the KEGG Genes DBLINKS entries.
103
104        """
105        definitions = self.api.list(self.org_code)
106        ncbi_geneid = self.api.conv(self.org_code, "ncbi-geneid")
107        ncbi_gi = self.api.conv(self.org_code, "ncbi-gi")
108
109        aliases = defaultdict(set)
110
111        for entry_id, definition in definitions:
112            # genes entry id without the organism code
113            aliases[entry_id].add(entry_id.split(":", 1)[1])
114            # all names in the NAME field (KEGG API list returns
115            # 'NAME; DEFINITION') fields for genes
116            names = definition.split(";")[0].split(",")
117            aliases[entry_id].update([name.strip() for name in names])
118
119        for source_id, target_id in chain(ncbi_geneid, ncbi_gi):
120            aliases[target_id].add(source_id.split(":", 1)[1])
121
122        return [set([entry_id]).union(names)
123                for entry_id, names in aliases.iteritems()]
124
125    def pathways(self, with_ids=None):
126        """
127        Return a list of all pathways for this organism.
128        """
129        if with_ids is not None:
130            return self.api.get_pathways_by_genes(with_ids)
131        else:
132            return [p.entry_id for p in self.api.list_pathways(self.org_code)]
133
134    def list_pathways(self):
135        """
136        List all pathways.
137        """
138        # NOTE: remove/deprecate and use pathways()
139        return self.pathways()
140
141    def get_linked_pathways(self, pathway_id):
142        self.api.get_linked_pathways(pathway_id)
143
144    def enzymes(self, genes=None):
145        raise NotImplementedError()
146
147    def get_enriched_pathways(self, genes, reference=None,
148                              prob=obiProb.Binomial(), callback=None):
149        """
150        Return a dictionary with enriched pathways ids as keys
151        and (list_of_genes, p_value, num_of_reference_genes) tuples
152        as items.
153
154        """
155        if reference is None:
156            reference = self.genes.keys()
157        reference = set(reference)
158
159        allPathways = defaultdict(lambda: [[], 1.0, []])
160        milestones = progress_bar_milestones(len(genes), 100)
161        pathways_db = KEGGPathways()
162
163        pathways_for_gene = []
164        for i, gene in enumerate(genes):
165            pathways_for_gene.append(self.pathways([gene]))
166            if callback and i in milestones:
167                callback(i * 50.0 / len(genes))
168
169        # pre-cache for speed
170        pathways_db.pre_cache([pid for pfg in pathways_for_gene
171                               for pid in pfg])
172        for i, (gene, pathways) in enumerate(zip(genes, pathways_for_gene)):
173            for pathway in pathways:
174                if pathways_db.get_entry(pathway).gene:
175                    allPathways[pathway][0].append(gene)
176            if callback and i in milestones:
177                callback(50.0 + i * 50.0 / len(genes))
178
179        pItems = allPathways.items()
180
181        for i, (p_id, entry) in enumerate(pItems):
182            pathway = pathways_db.get_entry(p_id)
183            entry[2].extend(reference.intersection(pathway.gene or []))
184            entry[1] = prob.p_value(len(entry[0]), len(reference),
185                                    len(entry[2]), len(genes))
186        return dict([(pid, (genes, p, len(ref)))
187                     for pid, (genes, p, ref) in allPathways.items()])
188
189    def get_genes_by_enzyme(self, enzyme):
190        enzyme = KEGGEnzyme().get_entry(enzyme)
191        return enzyme.genes.get(self.org_code, []) if enzyme.genes else []
192
193    def get_genes_by_pathway(self, pathway_id):
194        return KEGGPathway(pathway_id).genes()
195
196    def get_enzymes_by_pathway(self, pathway_id):
197        return KEGGPathway(pathway_id).enzymes()
198
199    def get_compounds_by_pathway(self, pathway_id):
200        return KEGGPathway(pathway_id).compounds()
201
202    def get_pathways_by_genes(self, gene_ids):
203        return self.api.get_pathways_by_genes(gene_ids)
204        gene_ids = set(gene_ids)
205        pathways = [self.genes[id].pathway for id in gene_ids
206                    if self.genes[id].pathway]
207        pathways = reduce(set.union, pathways, set())
208        return [id for id in pathways
209                if gene_ids.issubset(KEGGPathway(id).genes())]
210
211    def get_pathways_by_enzymes(self, enzyme_ids):
212        enzyme_ids = set(enzyme_ids)
213        pathways = [KEGGEnzyme()[id].pathway for id in enzyme_ids]
214        pathways = reduce(set.union, pathways, set())
215        return [id for id in pathways
216                if enzyme_ids.issubset(KEGGPathway(id).enzymes())]
217
218    def get_pathways_by_compounds(self, compound_ids):
219        compound_ids = set(compound_ids)
220        pathways = [KEGGCompound()[id].pathway for id in compound_ids]
221        pathways = reduce(set.union, pathways, set())
222        return [id for id in pathways
223                if compound_ids.issubset(KEGGPathway(id).compounds())]
224
225    def get_enzymes_by_compound(self, compound_id):
226        return KEGGCompound()[compound_id].enzyme
227
228    def get_enzymes_by_gene(self, gene_id):
229        return self.genes[gene_id].enzymes
230
231    def get_compounds_by_enzyme(self, enzyme_id):
232        return self._enzymes_to_compounds.get(enzyme_id)
233
234    @deprecated_keywords({"caseSensitive": "case_sensitive"})
235    def get_unique_gene_ids(self, genes, case_sensitive=True):
236        """
237        Return a tuple with three elements. The first is a dictionary
238        mapping from unique geneids to gene names in genes, the second
239        is a list of conflicting gene names and the third is a list of
240        unknown genes.
241
242        """
243        unique, conflicting, unknown = {}, [], []
244        for gene in genes:
245            names = self.genematcher.match(gene)
246            if len(names) == 1:
247                unique[names[0]] = gene
248            elif len(names) == 0:
249                unknown.append(gene)
250            else:
251                conflicting.append(gene)
252        return unique, conflicting, unknown
253
254    @deprecated_function_name
255    def get_genes(self):
256        return self.genes
257
258    @classmethod
259    def organism_name_search(cls, name):
260        genome = KEGGGenome()
261        if name not in genome:
262            ids = genome.search(name)
263            if not ids:
264                from .. import obiTaxonomy
265                ids = obiTaxonomy.search(name)
266                ids = [id for id in ids if genome.search(id)]
267            name = ids.pop(0) if ids else name
268
269        try:
270            return genome[name].organism_code
271        except KeyError:
272            raise OrganismNotFoundError(name)
273
274    @classmethod
275    def organism_version(cls, name):
276        name = cls.organism_name_search(name)
277        genome = KEGGGenome()
278        info = genome.api.info(name)
279        return info.release
280
281    def _set_genematcher(self, genematcher):
282        setattr(self, "_genematcher", genematcher)
283
284    def _get_genematcher(self):
285        if getattr(self, "_genematcher", None) is None:
286            from .. import obiGene
287            if self.org_code == "ddi":
288                self._genematcher = obiGene.matcher(
289                    [obiGene.GMKEGG(self.org_code), obiGene.GMDicty(),
290                     [obiGene.GMKEGG(self.org_code), obiGene.GMDicty()]]
291                )
292            else:
293                self._genematcher = obiGene.matcher(
294                    [obiGene.GMKEGG(self.org_code)])
295
296            self._genematcher.set_targets(self.genes.keys())
297        return self._genematcher
298
299    genematcher = property(_get_genematcher, _set_genematcher)
300
301
302KEGGOrganism = Organism
303
304
305def organism_name_search(name):
306    return KEGGOrganism.organism_name_search(name)
307
308
309def pathways(org):
310    return KEGGPathway.list(org)
311
312
313def organisms():
314    return KEGGOrganism.organisms()
315
316
317def from_taxid(taxid):
318    genome = KEGGGenome()
319    res = genome.search(taxid)
320    for r in res:
321        e = genome[r]
322
323        if e.taxid in [taxid, genome.TAXID_MAP.get(taxid, taxid)]:
324            return e.org_code()
325
326    return None
327
328
329def to_taxid(name):
330    genome = KEGGGenome()
331    if name in genome:
332        return genome[name].taxid
333
334    keys = genome.search(name)
335    if keys:
336        return genome[keys[0]].taxid
337    else:
338        return None
339
340
341def create_gene_sets():
342    pass
343
344
345def main():
346    KEGGGenome()
347    import doctest
348    extraglobs = {"api": KeggApi()}
349    doctest.testmod(optionflags=doctest.ELLIPSIS, extraglobs=extraglobs)
350
351
352if __name__ == "__main__":
353    sys.exit(main())
Note: See TracBrowser for help on using the repository browser.