source: orange-bioinformatics/_bioinformatics/obiKEGG2/__init__.py @ 1713:7113883f4437

Revision 1713:7113883f4437, 10.9 KB checked in by markotoplak, 20 months ago (diff)

Fixes for obiGeneSets. Compatible files were generated and uploaded to SF.

Line 
1"""\
2==============================================
3KEGG - Kyoto Encyclopedia of Genes and Genomes
4==============================================
5
6This is a Python module for access to `KEGG`_ using its web services. To use this module you need to have
7the `SUDS`_ Python library installed (other backends are planned).
8
9.. _`KEGG`: http://www.genome.jp/kegg/
10
11.. _`SUDS`: http://pypi.python.org/pypi/suds/
12
13"""
14from __future__ import absolute_import
15
16
17import os, sys
18from collections import defaultdict
19
20from datetime import datetime
21
22from Orange.utils import lru_cache, serverfiles
23
24from . import databases
25from . import entry
26
27from .brite import BriteEntry, Brite
28
29from . import api
30from . import conf
31from . import pathway
32
# Database classes re-exported under KEGG-prefixed names.
KEGGGenome = databases.Genome
KEGGGenes = databases.Genes
KEGGEnzymes = databases.Enzymes
KEGGReaction = databases.Reactions
KEGGPathways = databases.Pathways

# A single pathway (as opposed to the pathways database above).
KEGGPathway = pathway.Pathway

# BRITE classes.
KEGGBrite = Brite
KEGGBriteEntry = BriteEntry

# Cache location, read from the package configuration.
DEFAULT_CACHE_DIR = conf.params["cache.path"]
45
46
47from .. import obiProb
48from Orange.utils import deprecated_keywords, deprecated_attribute
49
class OrganismNotFoundError(Exception):
    """Raised when an organism name cannot be resolved to a KEGG genome entry."""
51
class Organism(object):
    """A single KEGG organism and the queries that can be run against it.

    ``org`` is resolved to a KEGG organism code with
    :meth:`organism_name_search` (which raises
    :class:`OrganismNotFoundError` when nothing matches).  ``genematcher``
    is optional; when it is ``None`` a default matcher is built on first
    access of :attr:`genematcher`.
    """

    def __init__(self, org, genematcher=None):
        self.org_code = self.organism_name_search(org)
        # Goes through the ``genematcher`` property setter defined below.
        self.genematcher = genematcher
        self.api = api.CachedKeggApi()

    @property
    def org(self):
        """The resolved organism code (same value as ``org_code``)."""
        return self.org_code

    @property
    def genes(self):
        """The ``KEGGGenes`` database of this organism, created on first use."""
        if not hasattr(self, "_genes"):
            self._genes = KEGGGenes(self.org_code)
        return self._genes

    def gene_aliases(self):
        """Return ``gene_aliases()`` of this organism's genes database."""
        # ``genes`` is a property, so it must not be called.
        return self.genes.gene_aliases()

    def pathways(self, with_ids=None):
        """Return pathway ids.

        When ``with_ids`` is given the result of
        ``api.get_pathways_by_genes(with_ids)`` is returned; otherwise the
        ``entry_id`` of every pathway the api lists for this organism.
        """
        if with_ids is not None:
            return self.api.get_pathways_by_genes(with_ids)
        return [p.entry_id for p in self.api.list_pathways(self.org_code)]

    def list_pathways(self):
        """Return the ids of all pathways of this organism."""
        return self.pathways()

    def get_linked_pathways(self, pathway_id):
        """Return the result of ``api.get_linked_pathways(pathway_id)``."""
        return self.api.get_linked_pathways(pathway_id)

    def enzymes(self, genes=None):
        """Not implemented; always raises ``NotImplementedError``."""
        raise NotImplementedError()

    def get_enriched_pathways(self, genes, reference=None,
                              prob=obiProb.Binomial(), callback=None):
        """Return a dictionary with enriched pathway ids as keys
        and (list_of_genes, p_value, num_of_reference_genes) tuples
        as items.

        ``genes`` is the query gene list.  ``reference`` defaults to all
        gene ids of this organism.  ``prob`` must provide a ``p_value``
        method; ``callback``, when given, is called with a progress value
        (0-50 during pathway lookup, 50-100 during gene assignment).
        """
        from Orange.orng import orngMisc

        n_genes = len(genes)
        milestones = orngMisc.progressBarMilestones(n_genes, 100)
        pathways_db = KEGGPathways()

        # First half of the progress range: look up pathways per gene.
        pathways_for_gene = []
        for i, gene in enumerate(genes):
            pathways_for_gene.append(self.pathways([gene]))
            if callback and i in milestones:
                callback(i * 50.0 / n_genes)

        # precache for speed
        pathways_db.pre_cache(
            [pid for pfg in pathways_for_gene for pid in pfg])

        # pathway id -> [query genes, p-value, reference genes]
        all_pathways = defaultdict(lambda: [[], 1.0, []])
        for i, (gene, gene_pathways) in enumerate(
                zip(genes, pathways_for_gene)):
            for pathway_id in gene_pathways:
                # Only pathways whose entry has a non-empty ``gene`` field
                # are counted.
                if pathways_db.get_entry(pathway_id).gene:
                    all_pathways[pathway_id][0].append(gene)
            if callback and i in milestones:
                callback(50.0 + i * 50.0 / n_genes)

        reference = set(reference if reference is not None
                        else self.genes.keys())

        for pathway_id, record in all_pathways.items():
            pathway_entry = pathways_db.get_entry(pathway_id)
            record[2].extend(
                reference.intersection(pathway_entry.gene or []))
            record[1] = prob.p_value(len(record[0]), len(reference),
                                     len(record[2]), n_genes)

        return dict((pid, (matched, p_value, len(ref)))
                    for pid, (matched, p_value, ref) in all_pathways.items())

    def get_genes_by_enzyme(self, enzyme):
        """Return the genes of this organism listed in the enzyme entry.

        An empty list is returned when the entry has no ``genes`` or none
        for this organism code.
        """
        # ``KEGGEnzymes`` is the alias this module defines for
        # ``databases.Enzymes``; a bare ``Enzymes`` name is not in scope.
        enzyme = KEGGEnzymes().get_entry(enzyme)
        return enzyme.genes.get(self.org_code, []) if enzyme.genes else []

    def get_genes_by_pathway(self, pathway_id):
        """Return ``KEGGPathway(pathway_id).genes()``."""
        return KEGGPathway(pathway_id).genes()

    def get_enzymes_by_pathway(self, pathway_id):
        """Return ``KEGGPathway(pathway_id).enzymes()``."""
        return KEGGPathway(pathway_id).enzymes()

    def get_compounds_by_pathway(self, pathway_id):
        """Return ``KEGGPathway(pathway_id).compounds()``."""
        return KEGGPathway(pathway_id).compounds()

    def get_pathways_by_genes(self, gene_ids):
        """Return the result of ``api.get_pathways_by_genes(gene_ids)``."""
        return self.api.get_pathways_by_genes(gene_ids)

    def get_pathways_by_enzymes(self, enzyme_ids):
        """Return ids of pathways that contain every enzyme in ``enzyme_ids``."""
        enzyme_ids = set(enzyme_ids)
        enzymes_db = KEGGEnzymes()
        per_enzyme = [enzymes_db[eid].pathway for eid in enzyme_ids]
        # Merge the per-enzyme pathway collections into one candidate set;
        # entries without pathway information are skipped.
        candidates = set().union(*[p for p in per_enzyme if p])
        return [pid for pid in candidates
                if enzyme_ids.issubset(KEGGPathway(pid).enzymes())]

    def get_pathways_by_compounds(self, compound_ids):
        """Return ids of pathways that contain every compound in ``compound_ids``."""
        compound_ids = set(compound_ids)
        # NOTE(review): ``KEGGCompounds`` is not defined or imported in this
        # module as far as can be seen here; add an alias before using this.
        compounds_db = KEGGCompounds()
        per_compound = [compounds_db[cid].pathway for cid in compound_ids]
        # Merge the per-compound pathway collections into one candidate
        # set; entries without pathway information are skipped.
        candidates = set().union(*[p for p in per_compound if p])
        return [pid for pid in candidates
                if compound_ids.issubset(KEGGPathway(pid).compounds())]

    def get_enzymes_by_compound(self, compound_id):
        """Return the ``enzyme`` field of the compound entry."""
        # NOTE(review): ``KEGGCompound`` is not defined or imported in this
        # module as far as can be seen here.
        return KEGGCompound()[compound_id].enzyme

    def get_enzymes_by_gene(self, gene_id):
        """Return the ``enzymes`` attribute of the gene entry ``gene_id``."""
        return self.genes[gene_id].enzymes

    def get_compounds_by_enzyme(self, enzyme_id):
        """Return the compounds recorded for ``enzyme_id``."""
        # NOTE(review): ``_enzymes_to_compounds`` is never assigned in this
        # class; confirm where it is expected to come from.
        return self._enzymes_to_compounds.get(enzyme_id)

    @deprecated_keywords({"caseSensitive": "case_sensitive"})
    def get_unique_gene_ids(self, genes, case_sensitive=True):
        """Return a tuple with three elements. The first is a dictionary
        mapping from unique gene ids to gene names in genes, the second is
        a list of conflicting gene names and the third is a list of unknown
        genes.

        ``case_sensitive`` is accepted for compatibility but is not used
        by this implementation.
        """
        unique, conflicting, unknown = {}, [], []
        for gene in genes:
            names = self.genematcher.match(gene)
            if len(names) == 1:
                unique[names[0]] = gene
            elif len(names) == 0:
                unknown.append(gene)
            else:
                conflicting.append(gene)
        return unique, conflicting, unknown

    def get_genes(self):
        """Return the genes database (same object as :attr:`genes`)."""
        return self.genes

    @classmethod
    def organism_name_search(cls, name):
        """Resolve ``name`` to the ``entry_key`` of a KEGG genome entry.

        Lookup order: ``name`` as a genome key, then ``genome.search(name)``,
        then ``obiTaxonomy.search(name)`` restricted to ids the genome
        search knows.  Raises :class:`OrganismNotFoundError` when the final
        key is not in the genome database.
        """
        genome = KEGGGenome()
        if name not in genome:
            ids = genome.search(name)
            if not ids:
                from .. import obiTaxonomy
                ids = [tax_id for tax_id in obiTaxonomy.search(name)
                       if genome.search(tax_id)]
            if ids:
                name = ids[0]

        try:
            return genome[name].entry_key
        except KeyError:
            raise OrganismNotFoundError(name)

    @classmethod
    def organism_version(cls, name):
        """Return the ``release`` reported by ``genome.api.binfo`` for ``name``."""
        name = cls.organism_name_search(name)
        genome = KEGGGenome()
        info = genome.api.binfo(name)
        return info.release

    def _set_genematcher(self, genematcher):
        """Store ``genematcher``; ``None`` means "build the default lazily"."""
        self._genematcher = genematcher

    def _get_genematcher(self):
        """Return the gene matcher, building the default one if none is set."""
        if getattr(self, "_genematcher", None) is None:
            from .. import obiGene
            if self.org_code == "ddi":
                # For "ddi" the KEGG and Dicty matchers are used separately
                # and then in combination.
                self._genematcher = obiGene.matcher(
                    [obiGene.GMKEGG(self.org_code), obiGene.GMDicty(),
                     [obiGene.GMKEGG(self.org_code), obiGene.GMDicty()]])
            else:
                self._genematcher = obiGene.matcher(
                    [obiGene.GMKEGG(self.org_code)])
            self._genematcher.set_targets(self.genes.keys())
        return self._genematcher

    genematcher = property(_get_genematcher, _set_genematcher)
223   
# Backward-compatible, KEGG-prefixed name for the Organism class.
KEGGOrganism = Organism


def organism_name_search(name):
    """Module-level shortcut for ``KEGGOrganism.organism_name_search``."""
    resolved = KEGGOrganism.organism_name_search(name)
    return resolved
228
def pathways(org):
    """Return ``KEGGPathway.list(org)`` for the organism ``org``."""
    listing = KEGGPathway.list(org)
    return listing
231
def organisms():
    """Return ``KEGGOrganism.organisms()``."""
    # NOTE(review): the Organism class in this module does not appear to
    # define an ``organisms`` attribute; confirm this call can succeed.
    listing = KEGGOrganism.organisms()
    return listing
234
def from_taxid(taxid):
    """Return the organism code of the genome entry matching ``taxid``.

    The genome database is searched for ``taxid``; the first hit whose
    ``taxid`` equals either the given id or its ``genome.TAXID_MAP``
    translation wins.  ``None`` is returned when no hit matches.
    """
    genome = KEGGGenome()
    for key in genome.search(taxid):
        candidate = genome[key]
        if candidate.taxid in (taxid, genome.TAXID_MAP.get(taxid, taxid)):
            return candidate.org_code()
    return None
245
def to_taxid(name):
    """Return the ``taxid`` of the genome entry for ``name``.

    ``name`` is first tried as a genome key; failing that, the first
    result of ``genome.search(name)`` is used.  Returns ``None`` when the
    search finds nothing.
    """
    genome = KEGGGenome()
    if name in genome:
        return genome[name].taxid

    matches = genome.search(name)
    if not matches:
        return None
    return genome[matches[0]].taxid
256
def create_gene_sets():
    """Placeholder; does nothing and returns ``None``."""
    return None
259
260from .. import obiGene
261from Orange.utils import ConsoleProgressBar
262
class MatcherAliasesKEGG(obiGene.MatcherAliasesPickled):
    """Pickled-aliases gene matcher backed by KEGG gene ids.

    The alias sets are read from the server file
    ``kegg_gene_id_aliases_<organism>.pickle`` in the ``KEGG`` domain when
    the server lists it, and are otherwise built from the organism's KEGG
    genes database.
    """

    DOMAIN = "KEGG"
    VERSION = "v3.0"

    def __init__(self, organism, **kwargs):
        self.organism = organism
        sf = serverfiles.ServerFiles()
        ids_filename = self._ids_filename()
        # Refresh the local copy when the server provides the alias file.
        if ids_filename in set(sf.listfiles(self.DOMAIN)):
            serverfiles.update(self.DOMAIN, ids_filename)

        obiGene.MatcherAliasesPickled.__init__(self, **kwargs)

    def _ids_filename(self):
        """Name of the pickled alias file for this organism."""
        return "kegg_gene_id_aliases_" + self.organism + ".pickle"

    def create_aliases(self):
        """Return a list of alias sets, one per gene.

        Loaded from the pickled server file when the server lists it;
        otherwise computed from the organism's genes (each set holds the
        gene key and the entry's ``alt_names``) and pickled to disk.
        """
        import cPickle
        ids_filename = self._ids_filename()
        server_files = set(serverfiles.ServerFiles().listfiles(self.DOMAIN))
        if ids_filename in server_files:
            filename = serverfiles.localpath_download(self.DOMAIN,
                                                      ids_filename)
            # NOTE: unpickling can execute arbitrary code; this relies on
            # the downloaded server file being trusted.
            with open(filename, "rb") as stream:
                aliases = cPickle.load(stream)
        else:
            pb = ConsoleProgressBar("Retriving KEGG ids:")
            genes = KEGGOrganism(self.organism).genes
            genes.pre_cache(progress_callback=pb.set_state)
            aliases = [set([key]) | set(gene_entry.alt_names)
                       for key, gene_entry in genes.iteritems()]
            # NOTE(review): this branch runs when the server does not list
            # the file, yet still asks for a download path; confirm that
            # ``localpath_download`` copes with that.
            filename = serverfiles.localpath_download(self.DOMAIN,
                                                      ids_filename)
            # Close (and thereby flush) the file deterministically.
            with open(filename, "wb") as stream:
                cPickle.dump(aliases, stream)

        return aliases

    def filename(self):
        """Return the base file name, ``"kegg3_" + organism``."""
        return "kegg3_" + self.organism

    def aliases_path(self):
        """Return the local path of the pickled alias file."""
        return serverfiles.localpath(self.DOMAIN, self._ids_filename())

    def create_aliases_version(self):
        """Return a version marker for the alias data.

        This is the ``"datetime"`` field of the file's ``serverfiles.info``
        when ``serverfiles.listfiles`` lists it, otherwise the ``release``
        of the organism's genes database info.
        """
        ids_filename = self._ids_filename()
        if ids_filename in set(serverfiles.listfiles(self.DOMAIN)):
            return serverfiles.info(self.DOMAIN, ids_filename)["datetime"]
        genes = KEGGOrganism(self.organism).genes
        return genes.info.release
314
def main():
    """Run this module's doctests with a KEGG api object bound to ``api``."""
    # Instantiate the genome database before the doctests run.
    KEGGGenome()
    import doctest
    # ``KeggApi`` is not a name in this module, so it is referenced through
    # the imported ``api`` module.
    # NOTE(review): assumes ``api`` exposes ``KeggApi`` alongside
    # ``CachedKeggApi`` -- confirm.
    extraglobs = {"api": api.KeggApi()}
    doctest.testmod(optionflags=doctest.ELLIPSIS, extraglobs=extraglobs)


if __name__ == "__main__":
    sys.exit(main())
Note: See TracBrowser for help on using the repository browser.