source: orange-bioinformatics/_bioinformatics/obiKEGG/api.py @ 1733:548d1187a29f

Revision 1733:548d1187a29f, 20.0 KB checked in by Ales Erjavec <ales.erjavec@…>, 14 months ago (diff)

Porting obiKEGG to use the new REST KEGG API.

Line 
1"""
2KEGG api interface.
3
4"""
5from __future__ import absolute_import
6
7from datetime import datetime
8from contextlib import closing
9from operator import itemgetter
10import warnings
11
12from .service import web_service
13from .types import OrganismSummary, Definition, BInfo, Link
14
15
# Table of known KEGG databases, one 4-tuple per database.
# NOTE(review): the columns look like (display title, database name,
# abbreviated name / id prefix, entry-number letter code), with None
# where a database has no such value -- confirm against the KEGG docs.
DATABASES = [
    ("KEGG Pathway", "pathway", "path", None),
    ("KEGG Brite", "brite", "br", None),
    ("KEGG Module", "module", "md", "M"),
    ("KEGG Disease", "disease", "ds", "H"),
    ("KEGG Drug", "drug", "dr", "D"),
    ("KEGG Orthology", "orthology", "ko", "K"),
    ("KEGG Genome", "genome", "genome", "T"),
    ("KEGG Genomes", "genomes", "gn", "T"),
    ("KEGG Genes", "genes", None, None),
    ("KEGG Ligand", "ligand", "ligand", None),
    ("KEGG Compound", "compound", "cpd", "C"),
    ("KEGG Glycan", "glycan", "gl", "G"),
    ("KEGG Reaction", "reaction", "rn", "R"),
    ("KEGG RPair", "rpair", "rp", "RP"),
    ("KEGG RClass", "rclass", "rc", "RC"),
    ("KEGG Enzyme", "enzyme", "ec", "E"),
]
34
35
36def _link_targets(links):
37    return sorted(set(map(itemgetter(1), links)))
38
39
class KeggApi(object):
    """
    An abstraction of the KEGG REST api.

    All requests are issued through ``self.service``, the object returned
    by ``web_service()``. Operations that have no counterpart in the REST
    api are kept for interface compatibility and raise
    ``NotImplementedError``.
    """

    def __init__(self):
        self.service = web_service()

    @staticmethod
    def _join_ids(ids):
        """
        Return `ids` as a single "+"-separated query string.

        A plain string is returned unchanged (joining it would put a "+"
        between its characters); any other iterable of strings is joined.
        ``(str, type(u""))`` covers both ``str`` and ``unicode`` on
        Python 2.
        """
        if isinstance(ids, (str, type(u""))):
            return ids
        return "+".join(ids)

    def list_organisms(self):
        """
        Return a list of all available organisms.

        One ``OrganismSummary`` is built from each line of the
        ``list/organism`` response.
        """
        listing = self.service.list.organism.get()
        return [OrganismSummary.from_str(line)
                for line in listing.splitlines()]

    def list_pathways(self, organism):
        """
        Return a list of all available pathways for `organism`.

        One ``Definition`` is built from each line of the response.

        >>> api.list_pathways("hsa")  # doctest: +SKIP
        [Definition(entry_id=...

        """
        listing = self.service.list.pathway(organism).get()
        return [Definition.from_str(line) for line in listing.splitlines()]

    def list(self, db):
        """
        Return a list of all available entries in database `db`.

        One ``Definition`` is built from each line of the response.
        """
        listing = self.service.list(db).get()
        return [Definition.from_str(line) for line in listing.splitlines()]

    #######
    # DBGET
    #######

    def info(self, db):
        """
        Return info for database `db` as a ``BInfo`` record.

        >>> print api.info("pathway")  # doctest: +SKIP
        BInfo(entry_id='path', definition='KEGG Pathway Database', ...

        """
        result = self.service.info(db).get()
        return BInfo.from_text(str(result))

    def find(self, db, keywords):
        """
        Search database `db` for `keywords`.

        `keywords` is either a single string or a sequence of strings;
        a sequence is joined with "+". The raw response is returned.
        """
        return self.service.find(db)(self._join_ids(keywords)).get()

    def get(self, ids):
        """
        Retrieve database entries for `ids`.

        `ids` is either a single id string or a sequence of ids; a
        sequence is joined with "+". The raw response is returned.
        """
        return self.service.get(self._join_ids(ids)).get()

    def conv(self, ids):
        raise NotImplementedError()

    def link(self, target_db, source_db=None, ids=None):
        """
        Return the links from `source_db` or from `ids` into `target_db`.

        Exactly one of `source_db` and `ids` must be given. `ids` is
        either a single id string or a sequence of ids.

        Returns a list with one ``Link`` per response line, built from
        the whitespace separated fields of that line.

        Raises ValueError if neither or both of `source_db` and `ids`
        are supplied.
        """
        if not (source_db or ids):
            raise ValueError("One of 'source_db' or 'ids' must be supplied")
        if source_db and ids:
            raise ValueError("Only one 'source_db' or 'ids' must be supplied")

        if source_db:
            query = source_db
        else:
            # A bare string must not be split into characters by the join.
            query = self._join_ids(ids)

        result = self.service.link(target_db)(query).get()

        # line.split() (rather than str.split) also accepts unicode text.
        return [Link._make(line.split()) for line in result.splitlines()]

    # Each helper below runs a single `link` query for one id and returns
    # the sorted, distinct second fields of the resulting links.

    def get_genes_by_enzyme(self, enzyme_id, org):
        return _link_targets(self.link(org, ids=[enzyme_id]))

    def get_enzymes_by_gene(self, gene_id):
        return _link_targets(self.link("ec", ids=[gene_id]))

    def get_enzymes_by_compound(self, compound_id):
        return _link_targets(self.link("ec", ids=[compound_id]))

    def get_enzymes_by_glycan(self, glycan_id):
        return _link_targets(self.link("ec", ids=[glycan_id]))

    def get_enzymes_by_reaction(self, reaction_id):
        return _link_targets(self.link("ec", ids=[reaction_id]))

    def get_compounds_by_enzyme(self, enzyme_id):
        return _link_targets(self.link("compound", ids=[enzyme_id]))

    def get_compounds_by_reaction(self, reaction_id):
        return _link_targets(self.link("compound", ids=[reaction_id]))

    def get_glycans_by_enzyme(self, enzyme_id):
        return _link_targets(self.link("gl", ids=[enzyme_id]))

    def get_glycans_by_reaction(self, reaction_id):
        return _link_targets(self.link("gl", ids=[reaction_id]))

    def get_reactions_by_enzyme(self, enzyme_id):
        return _link_targets(self.link("rn", ids=[enzyme_id]))

    def get_reactions_by_compound(self, compound_id):
        return _link_targets(self.link("rn", ids=[compound_id]))

    def get_reactions_by_glycan(self, glycan_id):
        return _link_targets(self.link("rn", ids=[glycan_id]))

    ######
    # SSDB
    ######

    # No replacement api in the KEGG REST api.
    def get_best_best_neighbors_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    def get_best_neighbors_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    def get_reverse_best_neighbors_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    def get_paralogs_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    #######
    # Motif
    #######

    # No replacement api in KEGG REST api
    def get_motifs_by_gene(self, genes_id, db):
        raise NotImplementedError

    def get_genes_by_motifs(self, motif_id_list, offset, limit):
        raise NotImplementedError

    ####
    # KO
    ####

    def get_ko_by_gene(self, genes_id):
        raise NotImplementedError

    def get_ko_by_ko_class(self, ko_class_id):
        raise NotImplementedError

    def get_genes_by_ko_class(self, ko_class_id, org, offset, limit):
        raise NotImplementedError

    def get_genes_by_ko(self, ko_id, org):
        raise NotImplementedError

    #########
    # Pathway
    #########

    def mark_pathway_by_objects(self, pathway_id, object_id_list):
        raise NotImplementedError

    def color_pathway_by_objects(self, pathway_id, object_id_list,
                                 fg_color_list, bg_color_list):
        raise NotImplementedError

    def color_pathway_by_elements(self, pathway_id, element_id_list,
                                  fg_color_list, bg_color_list):
        raise NotImplementedError

    def get_html_of_marked_pathway_by_objects(self, pathway_id,
                                              object_id_list):
        raise NotImplementedError

    def get_html_of_colored_pathway_by_objects(self, pathway_id,
                                               object_id_list, fg_color_list,
                                               bg_color_list):
        raise NotImplementedError

    def get_html_of_colored_pathway_by_elements(self, pathway_id,
                                                element_id_list, fg_color_list,
                                                bg_color_list):
        raise NotImplementedError

    # NOTE(review): the next two methods forward to same-named operations
    # on the service object, unlike every other query in this class --
    # confirm the REST service actually provides them.
    def get_references_by_pathway(self, pathway_id):
        return self.service.get_references_by_pathway(pathway_id)

    def get_element_relations_by_pathway(self, pathway_id):
        return self.service.get_element_relations_by_pathway(pathway_id)

    def get_genes_by_organism(self, organism, offset=None, limit=None):
        """
        Return the gene ids listed for `organism`.

        The id is the first whitespace separated field of each line of
        the ``list/<organism>`` response.

        Raises NotImplementedError if `offset` or `limit` is given; both
        are accepted only for interface compatibility.
        """
        if offset is not None:
            raise NotImplementedError("offset is no longer supported")
        if limit is not None:
            raise NotImplementedError("limit is no longer supported.")

        res = self.service.list(organism).get().splitlines()
        return [r.split(None, 1)[0] for r in res]

    def get_number_of_genes_by_organism(self, organism):
        raise NotImplementedError

    ####################
    # Objects by pathway
    ####################

    def get_elements_by_pathway(self, pathway_id):
        raise NotImplementedError

    def get_genes_by_pathway(self, pathway_id):
        return _link_targets(self.link("genes", ids=[pathway_id]))

    def get_enzymes_by_pathway(self, pathway_id):
        return _link_targets(self.link("ec", ids=[pathway_id]))

    def get_compounds_by_pathway(self, pathway_id):
        return _link_targets(self.link("compound", ids=[pathway_id]))

    def get_drugs_by_pathway(self, pathway_id):
        return _link_targets(self.link("drug", ids=[pathway_id]))

    def get_glycans_by_pathway(self, pathway_id):
        return _link_targets(self.link("gl", ids=[pathway_id]))

    def get_reactions_by_pathway(self, pathway_id):
        return _link_targets(self.link("rn", ids=[pathway_id]))

    def get_kos_by_pathway(self, pathway_id):
        return _link_targets(self.link("ko", ids=[pathway_id]))

    #####################
    # Pathways by objects
    #####################

    # These functions returned results intersections.
    def get_pathways_by_genes(self, gene_list):
        raise NotImplementedError

    def get_pathways_by_enzymes(self, enzyme_list):
        raise NotImplementedError

    def get_pathways_by_compounds(self, compound_list):
        raise NotImplementedError

    def get_pathways_by_drugs(self, drug_list):
        raise NotImplementedError

    def get_pathways_by_glycans(self, glycan_list):
        raise NotImplementedError

    def get_pathways_by_reactions(self, reaction_list):
        raise NotImplementedError

    def get_pathways_by_kos(self, ko_list):
        raise NotImplementedError

    ##########################
    # Relations among pathways
    ##########################

    def get_linked_pathways(self, pathway_id):
        """
        Return the sorted pathway ids linked from `pathway_id`.

        A missing "path:" prefix is added before querying.
        """
        if not pathway_id.startswith("path:"):
            pathway_id = "path:" + pathway_id
        return _link_targets(self.link("pathway", ids=[pathway_id]))
316
317
318"""
319KEGG api with caching
320"""
321
322import os
323
324from . import caching
325from .caching import cached_method, cache_entry, touch_dir
326
327try:
328    from functools import lru_cache
329except ImportError:
330    # TODO: move a copy of lru_cache in .caching if distributing this as a
331    # standalone package
332    from Orange.utils import lru_cache
333
334   
class CachedKeggApi(KeggApi):
    """
    A ``KeggApi`` whose query methods are wrapped in caching decorators.

    Most methods simply delegate to the ``KeggApi`` implementation and
    are decorated with ``cached_method``; ``info`` uses an in-memory
    ``lru_cache`` instead. ``cache_store`` and ``last_modified`` are the
    hooks the caching decorator needs.
    """

    def __init__(self, store=None):
        KeggApi.__init__(self)
        # Keep a caller supplied store; previously it was silently
        # dropped and `self.store` was left undefined in that case.
        self.store = {} if store is None else store

    # Needed API for cached decorator.
    def cache_store(self):
        """
        Return a new ``Sqlite3Store`` located in the configured cache dir.

        The directory is taken from ``conf.params["cache.path"]`` and is
        passed through ``touch_dir`` first.
        """
        from . import conf
        path = conf.params["cache.path"]
        touch_dir(path)
        return caching.Sqlite3Store(os.path.join(path,
                                                 "kegg_api_cache_1.sqlite3"))

    def last_modified(self, args, kwargs=None):
        """
        Return the release set by `set_default_release`, or "" if unset.

        `args` and `kwargs` are ignored.
        """
        return getattr(self, "default_release", "")

    def set_default_release(self, release):
        self.default_release = release

    @cached_method
    def list_organisms(self):
        return KeggApi.list_organisms(self)

    @cached_method
    def list_pathways(self, organism):
        return KeggApi.list_pathways(self, organism)

    @cached_method
    def list(self, db):
        return KeggApi.list(self, db)

    @lru_cache()  # not persistently cached
    def info(self, db):
        return KeggApi.info(self, db)

    @cached_method
    def find(self, db, keywords):
        return KeggApi.find(self, db, keywords)

    @cached_method
    def get(self, ids):
        """
        Retrieve database entries for `ids`.

        A single id string is fetched directly; any other value is
        treated as a sequence of ids and handled by `_batch_get`.
        """
        if not isinstance(ids, basestring):
            return self._batch_get(ids)
        else:
            return KeggApi.get(self, ids)

    def _batch_get(self, ids):
        """
        Fetch several entries at once, caching each one under its own id.

        Ids missing from the cache store are requested in a single call,
        the response is split on the "///" entry terminator and every
        entry is stored under the key of its id. The result is the
        concatenation of ``self.get(id)`` over `ids`, skipping ``None``.

        Raises ValueError if more than 10 ids are given. Emits a warning
        naming the ids for which no returned entry could be matched.
        """
        if len(ids) > 10:
            raise ValueError("Can batch at most 10 ids at a time.")

        get = self.get
        uncached = []

        with closing(get.cache_store()) as store:
            # Which ids are already cached
            # TODO: Invalidate entries by release string.
            for entry_id in ids:
                key = get.key_from_args((entry_id,))
                if key not in store:
                    uncached.append(entry_id)

        if uncached:
            # in case there are duplicate ids
            uncached = sorted(set(uncached))
            rval = KeggApi.get(self, uncached)

            entries = rval.split("///\n") if rval is not None else []

            if entries and not entries[-1].strip():
                # Delete the last single newline entry if present
                del entries[-1]

            if len(entries) != len(uncached):
                # Positional pairing is unreliable; pair by entry id.
                new_uncached, entries = match_by_ids(uncached, entries)
                unmatched = set(uncached) - set(new_uncached)
                uncached = new_uncached
                # Only warn when some id really was left without an entry
                # (the counts also differ when extra entries come back).
                if unmatched:
                    warnings.warn("Unable to match entries for keys: %s." %
                                  ", ".join(map(repr, sorted(unmatched))))

            with closing(get.cache_store()) as store:
                for entry_id, entry in zip(uncached, entries):
                    key = get.key_from_args((entry_id,))
                    if entry is not None:
                        # Restore the terminator removed by the split.
                        entry = entry + "///\n"
                    store[key] = cache_entry(entry, mtime=datetime.now())

        # Finally join all the results, but drop all None objects
        fetched = [get(entry_id) for entry_id in ids]
        return "".join(entry for entry in fetched if entry is not None)

    @cached_method
    def conv(self, ids):
        return KeggApi.conv(self, ids)

    ########
    # LinkDB
    ########

    @cached_method
    def get_genes_by_enzyme(self, enzyme_id, org):
        return KeggApi.get_genes_by_enzyme(self, enzyme_id, org)

    @cached_method
    def get_enzymes_by_gene(self, genes_id):
        return KeggApi.get_enzymes_by_gene(self, genes_id)

    @cached_method
    def get_enzymes_by_compound(self, compound_id):
        return KeggApi.get_enzymes_by_compound(self, compound_id)

    @cached_method
    def get_enzymes_by_glycan(self, glycan_id):
        return KeggApi.get_enzymes_by_glycan(self, glycan_id)

    @cached_method
    def get_enzymes_by_reaction(self, reaction_id):
        return KeggApi.get_enzymes_by_reaction(self, reaction_id)

    @cached_method
    def get_compounds_by_enzyme(self, enzyme_id):
        return KeggApi.get_compounds_by_enzyme(self, enzyme_id)

    @cached_method
    def get_compounds_by_reaction(self, reaction_id):
        return KeggApi.get_compounds_by_reaction(self, reaction_id)

    @cached_method
    def get_glycans_by_enzyme(self, enzyme_id):
        return KeggApi.get_glycans_by_enzyme(self, enzyme_id)

    @cached_method
    def get_glycans_by_reaction(self, reaction_id):
        return KeggApi.get_glycans_by_reaction(self, reaction_id)

    @cached_method
    def get_reactions_by_enzyme(self, enzyme_id):
        return KeggApi.get_reactions_by_enzyme(self, enzyme_id)

    @cached_method
    def get_reactions_by_compound(self, compound_id):
        return KeggApi.get_reactions_by_compound(self, compound_id)

    @cached_method
    def get_reactions_by_glycan(self, glycan_id):
        return KeggApi.get_reactions_by_glycan(self, glycan_id)

    ######
    # SSDB
    ######

    @cached_method
    def get_best_best_neighbors_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_best_best_neighbors_by_gene(self, genes_id,
                                                       offset, limit)

    @cached_method
    def get_best_neighbors_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_best_neighbors_by_gene(self, genes_id,
                                                  offset, limit)

    @cached_method
    def get_reverse_best_neighbors_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_reverse_best_neighbors_by_gene(self, genes_id,
                                                          offset, limit)

    @cached_method
    def get_paralogs_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_paralogs_by_gene(self, genes_id, offset, limit)

    #######
    # Motif
    #######

    @cached_method
    def get_motifs_by_gene(self, genes_id, db):
        return KeggApi.get_motifs_by_gene(self, genes_id, db)

    @cached_method
    def get_genes_by_motifs(self, motif_id_list, offset, limit):
        return KeggApi.get_genes_by_motifs(self, motif_id_list, offset, limit)

    ####
    # KO
    ####

    @cached_method
    def get_ko_by_gene(self, genes_id):
        return KeggApi.get_ko_by_gene(self, genes_id)

    @cached_method
    def get_ko_by_ko_class(self, ko_class_id):
        # Delegate to the base class method like every other wrapper;
        # `KeggApi.service` does not exist (service is set per instance).
        return KeggApi.get_ko_by_ko_class(self, ko_class_id)

    @cached_method
    def get_genes_by_ko_class(self, ko_class_id, org, offset, limit):
        return KeggApi.get_genes_by_ko_class(self, ko_class_id, org,
                                             offset, limit)

    @cached_method
    def get_genes_by_ko(self, ko_id, org):
        return KeggApi.get_genes_by_ko(self, ko_id, org)

    #########
    # Pathway
    #########

    @cached_method
    def get_genes_by_organism(self, organism, offset=None, limit=None):
        return KeggApi.get_genes_by_organism(self, organism,
                                             offset=offset, limit=limit)

    @cached_method
    def get_number_of_genes_by_organism(self, organism):
        return KeggApi.get_number_of_genes_by_organism(self, organism)

    @cached_method
    def get_pathways_by_genes(self, gene_list):
        return KeggApi.get_pathways_by_genes(self, gene_list)

    @cached_method
    def get_pathways_by_enzymes(self, enzyme_list):
        return KeggApi.get_pathways_by_enzymes(self, enzyme_list)

    @cached_method
    def get_pathways_by_compounds(self, compound_list):
        return KeggApi.get_pathways_by_compounds(self, compound_list)

    @cached_method
    def get_pathways_by_drugs(self, drug_list):
        return KeggApi.get_pathways_by_drugs(self, drug_list)

    @cached_method
    def get_pathways_by_glycans(self, glycan_list):
        return KeggApi.get_pathways_by_glycans(self, glycan_list)

    @cached_method
    def get_pathways_by_reactions(self, reaction_list):
        return KeggApi.get_pathways_by_reactions(self, reaction_list)

    @cached_method
    def get_pathways_by_kos(self, ko_list):
        return KeggApi.get_pathways_by_kos(self, ko_list)

    @cached_method
    def get_elements_by_pathway(self, pathway_id):
        return KeggApi.get_elements_by_pathway(self, pathway_id)

    @cached_method
    def get_genes_by_pathway(self, pathway_id):
        return KeggApi.get_genes_by_pathway(self, pathway_id)

    @cached_method
    def get_enzymes_by_pathway(self, pathway_id):
        return KeggApi.get_enzymes_by_pathway(self, pathway_id)

    @cached_method
    def get_compounds_by_pathway(self, pathway_id):
        return KeggApi.get_compounds_by_pathway(self, pathway_id)

    @cached_method
    def get_drugs_by_pathway(self, pathway_id):
        return KeggApi.get_drugs_by_pathway(self, pathway_id)

    @cached_method
    def get_glycans_by_pathway(self, pathway_id):
        return KeggApi.get_glycans_by_pathway(self, pathway_id)

    @cached_method
    def get_reactions_by_pathway(self, pathway_id):
        return KeggApi.get_reactions_by_pathway(self, pathway_id)

    @cached_method
    def get_kos_by_pathway(self, pathway_id):
        return KeggApi.get_kos_by_pathway(self, pathway_id)
611
612
def match_by_ids(ids, entries):
    """
    Pair the requested `ids` with the entry texts that describe them.

    The id of an entry is the second whitespace separated field of its
    first line. Matching is done in two passes:

    1. a search id that equals an entry id is paired with that entry;
    2. a remaining search id of the form ``db:identifier`` is paired
       with the entry whose id equals ``identifier``.

    Each entry is used at most once and duplicate search ids are paired
    only once. Entries whose first line has no second field cannot be
    matched and are ignored.

    Returns ``(matched_ids, matched_entries)``, two parallel lists.
    Exact matches come first, then prefix-stripped matches, each group
    in the order of `ids`.
    """
    def header_id(entry_text):
        # Only the first line is inspected; an entry consisting of a
        # single line (no newline at all) is handled as well.
        fields = entry_text.split("\n", 1)[0].split(None, 2)
        return fields[1] if len(fields) > 1 else None

    entries_by_id = {}
    for entry in entries:
        entry_id = header_id(entry)
        if entry_id is not None:
            entries_by_id[entry_id] = entry

    matched_ids = []
    matched_entries = []
    done = set()

    def match_add(search_id, entry):
        # Record the pair and make sure the id is not matched again.
        matched_ids.append(search_id)
        matched_entries.append(entry)
        done.add(search_id)

    # First match full search ids
    for search_id in ids:
        if search_id not in done and search_id in entries_by_id:
            match_add(search_id, entries_by_id.pop(search_id))

    # Second pass, split the search ids by ':' to db and identifier part,
    # match by identifier
    for search_id in ids:
        if search_id in done or ":" not in search_id:
            continue
        identifier = search_id.split(":", 1)[1]
        if identifier in entries_by_id:
            match_add(search_id, entries_by_id.pop(identifier))

    return matched_ids, matched_entries
Note: See TracBrowser for help on using the repository browser.