source: orange-bioinformatics/_bioinformatics/obiKEGG/api.py @ 1757:1aee9d34655b

Revision 1757:1aee9d34655b, 20.5 KB checked in by Ales Erjavec <ales.erjavec@…>, 12 months ago (diff)

Disabled automatic cache invalidation.

Line 
1"""
2KEGG api interface.
3
4"""
5from __future__ import absolute_import
6
7from datetime import datetime
8from contextlib import closing
9from operator import itemgetter
10import warnings
11
12from .service import web_service
13from .types import OrganismSummary, Definition, BInfo, Link
14
15
# A list of all databases with names, abbreviations.
#
# Each row is a 4-tuple:
#   (display name, database name, abbreviation, fourth field)
# The abbreviation and the fourth field are None where the table has no
# value for them.
# NOTE(review): the fourth field ("M", "H", "D", "K", ...) looks like the
# entry identifier prefix used by that database -- confirm before relying
# on it.
DATABASES = [
    ("KEGG Pathway", "pathway", "path", None),
    ("KEGG Brite", "brite", "br", None),
    ("KEGG Module", "module", "md", "M"),
    ("KEGG Disease", "disease", "ds", "H"),
    ("KEGG Drug", "drug", "dr", "D"),
    ("KEGG Orthology", "orthology", "ko", "K"),
    ("KEGG Genome", "genome", "genome", "T"),
    ("KEGG Genomes", "genomes", "gn", "T"),
    ("KEGG Genes", "genes", None, None),
    ("KEGG Ligand", "ligand", "ligand", None),
    ("KEGG Compound", "compound", "cpd", "C"),
    ("KEGG Glycan", "glycan", "gl", "G"),
    ("KEGG Reaction", "reaction", "rn", "R"),
    ("KEGG RPair", "rpair", "rp", "RP"),
    ("KEGG RClass", "rclass", "rc", "RC"),
    ("KEGG Enzyme", "enzyme", "ec", "E")
]
35
36
37def _link_targets(links):
38    return sorted(set(map(itemgetter(1), links)))
39
40
class KeggApi(object):
    """
    An abstraction of a rest KEGG API.

    All requests go through `self.service`, the object returned by
    `web_service()`.  Methods for which no REST replacement was
    implemented raise NotImplementedError.
    """

    def __init__(self):
        self.service = web_service()

    def list_organisms(self):
        """
        Return a list of all available organisms

        >>> api.list_organisms()
        [OrganismSummary(entry_id=T0..

        """
        text = self.service.list.organism.get()
        return [OrganismSummary.from_str(line) for line in text.splitlines()]

    def list_pathways(self, organism):
        """
        Return a list of all available pathways for `organism`

        >>> api.list_pathways("hsa")
        [Definition(entry_id=',...

        """
        text = self.service.list.pathway(organism).get()
        return [Definition.from_str(line) for line in text.splitlines()]

    def list(self, db):
        """
        Return a list of all available entries in database `db`.
        """
        text = self.service.list(db).get()
        return [Definition.from_str(line) for line in text.splitlines()]

    #######
    # DBGET
    #######

    def info(self, db):
        """
        Return info for database `db`

        >>> print api.info("pathway")
        BInfo(entry_id='path', definition='KEGG Pathway Database', ...

        """
        result = self.service.info(db).get()
        return BInfo.from_text(str(result))

    def find(self, db, keywords):
        """
        Search database 'db' for keywords.

        `keywords` is a single string or a sequence of strings; a
        sequence is joined with "+" into one query.  The raw service
        response is returned.
        """
        if isinstance(keywords, basestring):
            keywords = [keywords]

        return self.service.find(db)("+".join(keywords)).get()

    def get(self, ids):
        """
        Retrieve database entries for `ids` list.

        `ids` is a single id string or a sequence of id strings; a
        sequence is joined with "+" into one query.  The raw service
        response is returned.
        """
        if not isinstance(ids, basestring):
            # Sequence of ids
            ids = "+".join(ids)

        return self.service.get(ids).get()

    def conv(self, target_db, source):
        """
        Return a mapping from source to target_db ids as a list of two
        tuples [(source_id, target_id), ...].

        `source` is a single string or a sequence of strings (joined
        with "+").  Every response line is split on tab characters.
        """
        if not isinstance(source, basestring):
            source = "+".join(source)

        res = self.service.conv(target_db)(source).get()
        return [tuple(line.split("\t")) for line in res.splitlines()]

    def link(self, target_db, source_db=None, ids=None):
        """
        Return a list of `Link` records from the link operation.

        Exactly one of `source_db` (a database name) or `ids` (a single
        id string or a sequence of id strings) must be supplied.

        Raises ValueError if neither or both are given.
        """
        if not (source_db or ids):
            raise ValueError("One of 'source_db' or 'ids' must be supplied")
        if source_db and ids:
            raise ValueError("Only one 'source_db' or 'ids' must be supplied")

        if source_db:
            query = source_db
        elif isinstance(ids, basestring):
            # A single id; joining a plain string would interleave "+"
            # between its characters.
            query = ids
        else:
            query = "+".join(ids)

        result = self.service.link(target_db)(query).get()
        # Split through the instance method so both str and unicode
        # response lines are accepted.
        return [Link._make(line.split()) for line in result.splitlines()]

    def get_genes_by_enzyme(self, enzyme_id, org):
        return _link_targets(self.link(org, ids=[enzyme_id]))

    def get_enzymes_by_gene(self, gene_id):
        return _link_targets(self.link("ec", ids=[gene_id]))

    def get_enzymes_by_compound(self, compound_id):
        return _link_targets(self.link("ec", ids=[compound_id]))

    def get_enzymes_by_glycan(self, glycan_id):
        return _link_targets(self.link("ec", ids=[glycan_id]))

    def get_enzymes_by_reaction(self, reaction_id):
        return _link_targets(self.link("ec", ids=[reaction_id]))

    def get_compounds_by_enzyme(self, enzyme_id):
        return _link_targets(self.link("compound", ids=[enzyme_id]))

    def get_compounds_by_reaction(self, reaction_id):
        return _link_targets(self.link("compound", ids=[reaction_id]))

    def get_glycans_by_enzyme(self, enzyme_id):
        return _link_targets(self.link("gl", ids=[enzyme_id]))

    def get_glycans_by_reaction(self, reaction_id):
        return _link_targets(self.link("gl", ids=[reaction_id]))

    def get_reactions_by_enzyme(self, enzyme_id):
        return _link_targets(self.link("rn", ids=[enzyme_id]))

    def get_reactions_by_compound(self, compound_id):
        return _link_targets(self.link("rn", ids=[compound_id]))

    def get_reactions_by_glycan(self, glycan_id):
        return _link_targets(self.link("rn", ids=[glycan_id]))

    ######
    # SSDB
    ######

    # No replacement api in the KEGG REST api.
    def get_best_best_neighbors_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    def get_best_neighbors_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    def get_reverse_best_neighbors_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    def get_paralogs_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    #######
    # Motif
    #######

    # No replacement api in KEGG REST api
    def get_motifs_by_gene(self, genes_id, db):
        raise NotImplementedError

    def get_genes_by_motifs(self, motif_id_list, offset, limit):
        raise NotImplementedError

    ####
    # KO
    ####

    def get_ko_by_gene(self, genes_id):
        raise NotImplementedError

    def get_ko_by_ko_class(self, ko_class_id):
        raise NotImplementedError

    def get_genes_by_ko_class(self, ko_class_id, org, offset, limit):
        raise NotImplementedError

    def get_genes_by_ko(self, ko_id, org):
        raise NotImplementedError

    #########
    # Pathway
    #########

    def mark_pathway_by_objects(self, pathway_id, object_id_list):
        raise NotImplementedError

    def color_pathway_by_objects(self, pathway_id, object_id_list,
                                 fg_color_list, bg_color_list):
        raise NotImplementedError

    def color_pathway_by_elements(self, pathway_id, element_id_list,
                                  fg_color_list, bg_color_list):
        raise NotImplementedError

    def get_html_of_marked_pathway_by_objects(self, pathway_id,
                                              object_id_list):
        raise NotImplementedError

    def get_html_of_colored_pathway_by_objects(self, pathway_id,
                                               object_id_list, fg_color_list,
                                               bg_color_list):
        raise NotImplementedError

    def get_html_of_colored_pathway_by_elements(self, pathway_id,
                                                element_id_list, fg_color_list,
                                                bg_color_list):
        raise NotImplementedError

    # NOTE(review): the two methods below call the service object directly
    # instead of following the `resource(...).get()` pattern used by the
    # rest of this class -- confirm the service actually provides them.
    def get_references_by_pathway(self, pathway_id):
        return self.service.get_references_by_pathway(pathway_id)

    def get_element_relations_by_pathway(self, pathway_id):
        return self.service.get_element_relations_by_pathway(pathway_id)

    def get_genes_by_organism(self, organism, offset=None, limit=None):
        """
        Return the first whitespace-delimited field of every line of the
        `list` response for `organism`.

        `offset` and `limit` are kept only for signature compatibility;
        passing either raises NotImplementedError.
        """
        if offset is not None:
            raise NotImplementedError("offset is no longer supported")
        if limit is not None:
            raise NotImplementedError("limit is no longer supported.")

        lines = self.service.list(organism).get().splitlines()
        # Blank lines carry no id; skip them instead of failing on [0].
        return [line.split(None, 1)[0] for line in lines if line.strip()]

    def get_number_of_genes_by_organism(self, organism):
        raise NotImplementedError

    ####################
    # Objects by pathway
    ####################

    def get_elements_by_pathway(self, pathway_id):
        raise NotImplementedError

    def get_genes_by_pathway(self, pathway_id):
        return _link_targets(self.link("genes", ids=[pathway_id]))

    def get_enzymes_by_pathway(self, pathway_id):
        return _link_targets(self.link("ec", ids=[pathway_id]))

    def get_compounds_by_pathway(self, pathway_id):
        return _link_targets(self.link("compound", ids=[pathway_id]))

    def get_drugs_by_pathway(self, pathway_id):
        return _link_targets(self.link("drug", ids=[pathway_id]))

    def get_glycans_by_pathway(self, pathway_id):
        return _link_targets(self.link("gl", ids=[pathway_id]))

    def get_reactions_by_pathway(self, pathway_id):
        return _link_targets(self.link("rn", ids=[pathway_id]))

    def get_kos_by_pathway(self, pathway_id):
        return _link_targets(self.link("ko", ids=[pathway_id]))

    #####################
    # Pathways by objects
    #####################

    # These functions returned results intersections.
    def get_pathways_by_genes(self, gene_list):
        raise NotImplementedError

    def get_pathways_by_enzymes(self, enzyme_list):
        raise NotImplementedError

    def get_pathways_by_compounds(self, compound_list):
        raise NotImplementedError

    def get_pathways_by_drugs(self, drug_list):
        raise NotImplementedError

    def get_pathways_by_glycans(self, glycan_list):
        raise NotImplementedError

    def get_pathways_by_reactions(self, reaction_list):
        raise NotImplementedError

    def get_pathways_by_kos(self, ko_list):
        raise NotImplementedError

    ##########################
    # Relations among pathways
    ##########################

    def get_linked_pathways(self, pathway_id):
        """
        Return the sorted pathway ids linked to `pathway_id`.

        A missing "path:" prefix is added before the query is made.
        """
        if not pathway_id.startswith("path:"):
            pathway_id = "path:" + pathway_id
        return _link_targets(self.link("pathway", ids=[pathway_id]))
326
327
328"""
329KEGG api with caching
330"""
331
332import os
333
334from . import caching
335from .caching import cached_method, cache_entry, touch_dir
336
337try:
338    from functools import lru_cache
339except ImportError:
340    # TODO: move a copy of lru_cache in .caching if distributing this as a
341    # standalone package
342    from Orange.utils import lru_cache
343
344
class CachedKeggApi(KeggApi):
    """
    A `KeggApi` whose query methods are wrapped in caching decorators.

    Most methods use `cached_method`, which obtains its backing store
    from `cache_store()` and its freshness token from `last_modified()`;
    `info` is only memoized in memory with `lru_cache`.
    """

    def __init__(self, store=None):
        """
        Initialize the api; `store` is kept on `self.store`, with an
        empty dict used when it is omitted.
        """
        KeggApi.__init__(self)
        # Keep a caller supplied store as well; previously the attribute
        # was only assigned when `store` was None.
        self.store = {} if store is None else store

    # Needed API for cached decorator.
    def cache_store(self):
        """
        Return a new `caching.Sqlite3Store` located under the configured
        "cache.path" directory.
        """
        from . import conf
        path = conf.params["cache.path"]
        touch_dir(path)
        return caching.Sqlite3Store(os.path.join(path,
                                                 "kegg_api_cache_1.sqlite3"))

    def last_modified(self, args, kwargs=None):
        """
        Return the release set with `set_default_release`, or "" when
        none was set.  `args` and `kwargs` are ignored.
        """
        return getattr(self, "default_release", "")

    def set_default_release(self, release):
        self.default_release = release

    @cached_method
    def list_organisms(self):
        return KeggApi.list_organisms(self)

    @cached_method
    def list_pathways(self, organism):
        return KeggApi.list_pathways(self, organism)

    @cached_method
    def list(self, db):
        return KeggApi.list(self, db)

    # NOTE(review): lru_cache on a method includes `self` in the cache key
    # and therefore keeps instances alive for the lifetime of the cache.
    @lru_cache()  # not persistently cached
    def info(self, db):
        return KeggApi.info(self, db)

    @cached_method
    def find(self, db, keywords):
        return KeggApi.find(self, db, keywords)

    @cached_method
    def get(self, ids):
        """
        Retrieve entries for `ids`; a non-string `ids` is fetched through
        `_batch_get`.
        """
        if not isinstance(ids, basestring):
            return self._batch_get(ids)
        else:
            return KeggApi.get(self, ids)

    def _batch_get(self, ids):
        """
        Retrieve the entries for up to 10 `ids`, querying the service only
        for ids without a valid cache entry.

        Freshly retrieved entries are stored one per id, so later single
        id `get` calls hit the cache.  Returns the concatenated entry
        texts in the order of `ids`; ids whose lookup yields None are
        dropped.

        Raises ValueError if more than 10 ids are given.
        """
        if len(ids) > 10:
            raise ValueError("Can batch at most 10 ids at a time.")

        get = self.get
        uncached = []
        unmatched = set()

        with closing(get.cache_store()) as store:
            # Which ids are already cached
            # TODO: Invalidate entries by release string.
            for entry_id in ids:
                key = get.key_from_args((entry_id,))
                if not get.key_has_valid_cache(key, store):
                    uncached.append(entry_id)

        if uncached:
            # in case there are duplicate ids
            uncached = sorted(set(uncached))

            rval = KeggApi.get(self, uncached)

            if rval is not None:
                entries = rval.split("///\n")
            else:
                entries = []

            if entries and not entries[-1].strip():
                # Delete the last single newline entry if present
                del entries[-1]

            if len(entries) != len(uncached):
                # The response cannot be paired with the ids by position;
                # pair by the id found in each entry instead.
                new_uncached, entries = match_by_ids(uncached, entries)
                unmatched = set(uncached) - set(new_uncached)
                uncached = new_uncached
                warnings.warn("Unable to match entries for keys: %s." %
                              ", ".join(map(repr, unmatched)))

            with closing(get.cache_store()) as store:
                for entry_id, entry in zip(uncached, entries):
                    key = get.key_from_args((entry_id,))
                    if entry is not None:
                        # Restore the terminator removed by the split.
                        entry = entry + "///\n"
                    store[key] = cache_entry(entry, mtime=datetime.now())

        # Finally join all the results, but drop all None objects
        results = [get(entry_id) for entry_id in ids]
        return "".join([text for text in results if text is not None])

    @cached_method
    def conv(self, target_db, source):
        return KeggApi.conv(self, target_db, source)

    ########
    # LinkDB
    ########

    @cached_method
    def get_genes_by_enzyme(self, enzyme_id, org):
        return KeggApi.get_genes_by_enzyme(self, enzyme_id, org)

    @cached_method
    def get_enzymes_by_gene(self, genes_id):
        return KeggApi.get_enzymes_by_gene(self, genes_id)

    @cached_method
    def get_enzymes_by_compound(self, compound_id):
        return KeggApi.get_enzymes_by_compound(self, compound_id)

    @cached_method
    def get_enzymes_by_glycan(self, glycan_id):
        return KeggApi.get_enzymes_by_glycan(self, glycan_id)

    @cached_method
    def get_enzymes_by_reaction(self, reaction_id):
        return KeggApi.get_enzymes_by_reaction(self, reaction_id)

    @cached_method
    def get_compounds_by_enzyme(self, enzyme_id):
        return KeggApi.get_compounds_by_enzyme(self, enzyme_id)

    @cached_method
    def get_compounds_by_reaction(self, reaction_id):
        return KeggApi.get_compounds_by_reaction(self, reaction_id)

    @cached_method
    def get_glycans_by_enzyme(self, enzyme_id):
        return KeggApi.get_glycans_by_enzyme(self, enzyme_id)

    @cached_method
    def get_glycans_by_reaction(self, reaction_id):
        return KeggApi.get_glycans_by_reaction(self, reaction_id)

    @cached_method
    def get_reactions_by_enzyme(self, enzyme_id):
        return KeggApi.get_reactions_by_enzyme(self, enzyme_id)

    @cached_method
    def get_reactions_by_compound(self, compound_id):
        return KeggApi.get_reactions_by_compound(self, compound_id)

    @cached_method
    def get_reactions_by_glycan(self, glycan_id):
        return KeggApi.get_reactions_by_glycan(self, glycan_id)

    ######
    # SSDB
    ######

    @cached_method
    def get_best_best_neighbors_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_best_best_neighbors_by_gene(self, genes_id, offset,
                                                       limit)

    @cached_method
    def get_best_neighbors_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_best_neighbors_by_gene(self, genes_id, offset,
                                                  limit)

    @cached_method
    def get_reverse_best_neighbors_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_reverse_best_neighbors_by_gene(self, genes_id,
                                                          offset, limit)

    @cached_method
    def get_paralogs_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_paralogs_by_gene(self, genes_id, offset, limit)

    #######
    # Motif
    #######

    @cached_method
    def get_motifs_by_gene(self, genes_id, db):
        return KeggApi.get_motifs_by_gene(self, genes_id, db)

    @cached_method
    def get_genes_by_motifs(self, motif_id_list, offset, limit):
        return KeggApi.get_genes_by_motifs(self, motif_id_list, offset, limit)

    ####
    # KO
    ####

    @cached_method
    def get_ko_by_gene(self, genes_id):
        return KeggApi.get_ko_by_gene(self, genes_id)

    @cached_method
    def get_ko_by_ko_class(self, ko_class_id):
        # Delegate to the base class method like the sibling wrappers;
        # `KeggApi.service` does not exist on the class itself.
        return KeggApi.get_ko_by_ko_class(self, ko_class_id)

    @cached_method
    def get_genes_by_ko_class(self, ko_class_id, org, offset, limit):
        return KeggApi.get_genes_by_ko_class(self, ko_class_id, org, offset,
                                             limit)

    @cached_method
    def get_genes_by_ko(self, ko_id, org):
        return KeggApi.get_genes_by_ko(self, ko_id, org)

    #########
    # Pathway
    #########

    @cached_method
    def get_genes_by_organism(self, organism, offset=None, limit=None):
        return KeggApi.get_genes_by_organism(self, organism, offset=offset,
                                             limit=limit)

    @cached_method
    def get_number_of_genes_by_organism(self, organism):
        return KeggApi.get_number_of_genes_by_organism(self, organism)

    @cached_method
    def get_pathways_by_genes(self, gene_list):
        return KeggApi.get_pathways_by_genes(self, gene_list)

    @cached_method
    def get_pathways_by_enzymes(self, enzyme_list):
        return KeggApi.get_pathways_by_enzymes(self, enzyme_list)

    @cached_method
    def get_pathways_by_compounds(self, compound_list):
        return KeggApi.get_pathways_by_compounds(self, compound_list)

    @cached_method
    def get_pathways_by_drugs(self, drug_list):
        return KeggApi.get_pathways_by_drugs(self, drug_list)

    @cached_method
    def get_pathways_by_glycans(self, glycan_list):
        return KeggApi.get_pathways_by_glycans(self, glycan_list)

    @cached_method
    def get_pathways_by_reactions(self, reaction_list):
        return KeggApi.get_pathways_by_reactions(self, reaction_list)

    @cached_method
    def get_pathways_by_kos(self, ko_list):
        return KeggApi.get_pathways_by_kos(self, ko_list)

    @cached_method
    def get_elements_by_pathway(self, pathway_id):
        return KeggApi.get_elements_by_pathway(self, pathway_id)

    @cached_method
    def get_genes_by_pathway(self, pathway_id):
        return KeggApi.get_genes_by_pathway(self, pathway_id)

    @cached_method
    def get_enzymes_by_pathway(self, pathway_id):
        return KeggApi.get_enzymes_by_pathway(self, pathway_id)

    @cached_method
    def get_compounds_by_pathway(self, pathway_id):
        return KeggApi.get_compounds_by_pathway(self, pathway_id)

    @cached_method
    def get_drugs_by_pathway(self, pathway_id):
        return KeggApi.get_drugs_by_pathway(self, pathway_id)

    @cached_method
    def get_glycans_by_pathway(self, pathway_id):
        return KeggApi.get_glycans_by_pathway(self, pathway_id)

    @cached_method
    def get_reactions_by_pathway(self, pathway_id):
        return KeggApi.get_reactions_by_pathway(self, pathway_id)

    @cached_method
    def get_kos_by_pathway(self, pathway_id):
        return KeggApi.get_kos_by_pathway(self, pathway_id)
627
628
def match_by_ids(ids, entries):
    """
    Pair search `ids` with the entry texts they refer to.

    The identifier of an entry is the second whitespace-delimited field
    of its first line.  Matching is done in two passes:

    1. ids equal to an entry identifier;
    2. for the remaining ids containing ':', the part after the first
       ':' equal to an entry identifier.

    Each entry is matched at most once and duplicate ids are considered
    only once.  Entries whose first line has fewer than two fields are
    ignored.

    Returns a tuple `(matched_ids, matched_entries)` of two lists of
    equal length, aligned by position.  Within each pass the matches
    follow the order of `ids`.
    """
    # Index the entries by the identifier found on their first line.
    entries_by_id = {}
    for entry in entries:
        first_line = entry.split("\n", 1)[0]
        fields = first_line.split(None, 2)
        if len(fields) < 2:
            # No identifier to match against; skip rather than fail.
            continue
        entries_by_id[fields[1]] = entry

    matched_ids = []
    matched_entries = []

    # First match full search ids
    remaining = []
    seen = set()
    for search_id in ids:
        if search_id in seen:
            continue
        seen.add(search_id)
        if search_id in entries_by_id:
            matched_ids.append(search_id)
            matched_entries.append(entries_by_id.pop(search_id))
        else:
            remaining.append(search_id)

    # Second pass, split the search ids by ':' to db and identifier part,
    # match by identifier
    for search_id in remaining:
        if ":" not in search_id:
            continue
        identifier = search_id.split(":", 1)[1]
        if identifier in entries_by_id:
            matched_ids.append(search_id)
            matched_entries.append(entries_by_id.pop(identifier))

    return matched_ids, matched_entries
Note: See TracBrowser for help on using the repository browser.