source: orange-bioinformatics/_bioinformatics/obiKEGG/api.py @ 1757:1aee9d34655b

Revision 1757:1aee9d34655b, 20.5 KB checked in by Ales Erjavec <ales.erjavec@…>, 12 months ago (diff)

Disabled automatic cache invalidation.

RevLine 
[1532]1"""
2KEGG api interface.
3
4"""
5from __future__ import absolute_import
6
[1733]7from datetime import datetime
[1532]8from contextlib import closing
[1733]9from operator import itemgetter
10import warnings
[1532]11
12from .service import web_service
[1733]13from .types import OrganismSummary, Definition, BInfo, Link
14
15
# A list of all KEGG databases, one 4-tuple per database:
#   (display name, database name, abbreviation, fourth field)
# NOTE(review): the fourth field looks like the letter prefix of the entry
# identifiers in that database (None where there is no such prefix) --
# confirm against the KEGG documentation before relying on it.
DATABASES = [
    ("KEGG Pathway", "pathway", "path", None),
    ("KEGG Brite", "brite", "br", None),
    ("KEGG Module", "module", "md", "M"),
    ("KEGG Disease", "disease", "ds", "H"),
    ("KEGG Drug", "drug", "dr", "D"),
    ("KEGG Orthology", "orthology", "ko", "K"),
    ("KEGG Genome", "genome", "genome", "T"),
    ("KEGG Genomes", "genomes", "gn", "T"),
    ("KEGG Genes", "genes", None, None),
    ("KEGG Ligand", "ligand", "ligand", None),
    ("KEGG Compound", "compound", "cpd", "C"),
    ("KEGG Glycan", "glycan", "gl", "G"),
    ("KEGG Reaction", "reaction", "rn", "R"),
    ("KEGG RPair", "rpair", "rp", "RP"),
    ("KEGG RClass", "rclass", "rc", "RC"),
    ("KEGG Enzyme", "enzyme", "ec", "E"),
]
35
36
37def _link_targets(links):
38    return sorted(set(map(itemgetter(1), links)))
39
[1532]40
class KeggApi(object):
    """
    An abstraction of the KEGG REST API.

    All requests go through the resource object returned by
    `web_service()`, which is stored as `self.service`.
    """

    def __init__(self):
        self.service = web_service()

    def list_organisms(self):
        """
        Return a list of all available organisms

        >>> api.list_organisms()
        [OrganismSummary(entry_id=T0..

        """
        lines = self.service.list.organism.get().splitlines()
        return [OrganismSummary.from_str(line) for line in lines]

    def list_pathways(self, organism):
        """
        Return a list of all available pathways for `organism`

        >>> api.list_pathways("hsa")
        [Definition(entry_id=',...

        """
        lines = self.service.list.pathway(organism).get().splitlines()
        return [Definition.from_str(line) for line in lines]

    def list(self, db):
        """
        Return a list of all available entries in database `db`.
        """
        lines = self.service.list(db).get().splitlines()
        return [Definition.from_str(line) for line in lines]

    #######
    # DBGET
    #######

    def info(self, db):
        """
        Return info for database `db`

        >>> print api.info("pathway")
        BInfo(entry_id='path', definition='KEGG Pathway Database', ...

        """
        result = self.service.info(db).get()
        return BInfo.from_text(str(result))

    def find(self, db, keywords):
        """
        Search database `db` for keywords.

        `keywords` is a single string or a sequence of strings; a
        sequence is joined with '+' to form the query.
        """
        if isinstance(keywords, basestring):
            keywords = [keywords]

        return self.service.find(db)("+".join(keywords)).get()

    def get(self, ids):
        """
        Retrieve database entries for `ids` (a single id string or a
        sequence of ids, which is joined with '+').
        """
        if not isinstance(ids, basestring):
            # Sequence of ids
            ids = "+".join(ids)

        return self.service.get(ids).get()

    def conv(self, target_db, source):
        """
        Return a mapping from source to target_db ids as a list of two
        tuples [(source_id, target_id), ...].

        `source` is a single string or a sequence of strings (joined
        with '+').
        """
        if not isinstance(source, basestring):
            source = "+".join(source)

        res = self.service.conv(target_db)(source).get()
        return [tuple(line.split("\t")) for line in res.splitlines()]

    def link(self, target_db, source_db=None, ids=None):
        """
        Return a list of `Link` tuples, one per line of the service's
        link response for `target_db`.

        Exactly one of `source_db` or `ids` must be supplied. `ids` is
        a single id string or a sequence of ids (joined with '+').

        Raises ValueError if neither or both of them are supplied.
        """
        if not (source_db or ids):
            raise ValueError("One of 'source_db' or 'ids' must be supplied")
        if source_db and ids:
            raise ValueError("Only one 'source_db' or 'ids' must be supplied")

        if source_db:
            query = source_db
        elif isinstance(ids, basestring):
            # A single id: joining a string would put a '+' between each
            # of its characters.
            query = ids
        else:
            query = "+".join(ids)

        result = self.service.link(target_db)(query).get()
        # line.split() works for both str and unicode lines, unlike the
        # unbound str.split.
        return [Link._make(line.split()) for line in result.splitlines()]

    def get_genes_by_enzyme(self, enzyme_id, org):
        return _link_targets(self.link(org, ids=[enzyme_id]))

    def get_enzymes_by_gene(self, gene_id):
        return _link_targets(self.link("ec", ids=[gene_id]))

    def get_enzymes_by_compound(self, compound_id):
        return _link_targets(self.link("ec", ids=[compound_id]))

    def get_enzymes_by_glycan(self, glycan_id):
        return _link_targets(self.link("ec", ids=[glycan_id]))

    def get_enzymes_by_reaction(self, reaction_id):
        return _link_targets(self.link("ec", ids=[reaction_id]))

    def get_compounds_by_enzyme(self, enzyme_id):
        return _link_targets(self.link("compound", ids=[enzyme_id]))

    def get_compounds_by_reaction(self, reaction_id):
        return _link_targets(self.link("compound", ids=[reaction_id]))

    def get_glycans_by_enzyme(self, enzyme_id):
        return _link_targets(self.link("gl", ids=[enzyme_id]))

    def get_glycans_by_reaction(self, reaction_id):
        return _link_targets(self.link("gl", ids=[reaction_id]))

    def get_reactions_by_enzyme(self, enzyme_id):
        return _link_targets(self.link("rn", ids=[enzyme_id]))

    def get_reactions_by_compound(self, compound_id):
        return _link_targets(self.link("rn", ids=[compound_id]))

    def get_reactions_by_glycan(self, glycan_id):
        return _link_targets(self.link("rn", ids=[glycan_id]))

    ######
    # SSDB
    ######

    # No replacement api in the KEGG REST api.
    def get_best_best_neighbors_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    def get_best_neighbors_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    def get_reverse_best_neighbors_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    def get_paralogs_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    #######
    # Motif
    #######

    # No replacement api in KEGG REST api
    def get_motifs_by_gene(self, genes_id, db):
        raise NotImplementedError

    def get_genes_by_motifs(self, motif_id_list, offset, limit):
        raise NotImplementedError

    ####
    # KO
    ####

    def get_ko_by_gene(self, genes_id):
        raise NotImplementedError

    def get_ko_by_ko_class(self, ko_class_id):
        raise NotImplementedError

    def get_genes_by_ko_class(self, ko_class_id, org, offset, limit):
        raise NotImplementedError

    def get_genes_by_ko(self, ko_id, org):
        raise NotImplementedError

    #########
    # Pathway
    #########

    def mark_pathway_by_objects(self, pathway_id, object_id_list):
        raise NotImplementedError

    def color_pathway_by_objects(self, pathway_id, object_id_list,
                                 fg_color_list, bg_color_list):
        raise NotImplementedError

    def color_pathway_by_elements(self, pathway_id, element_id_list,
                                  fg_color_list, bg_color_list):
        raise NotImplementedError

    def get_html_of_marked_pathway_by_objects(self, pathway_id,
                                              object_id_list):
        raise NotImplementedError

    def get_html_of_colored_pathway_by_objects(self, pathway_id,
                                               object_id_list, fg_color_list,
                                               bg_color_list):
        raise NotImplementedError

    def get_html_of_colored_pathway_by_elements(self, pathway_id,
                                                element_id_list, fg_color_list,
                                                bg_color_list):
        raise NotImplementedError

    # NOTE(review): unlike the other methods, the next two forward the
    # call to the service object under the same name and do not call
    # `.get()` on the result -- confirm this still does what is intended.
    def get_references_by_pathway(self, pathway_id):
        return self.service.get_references_by_pathway(pathway_id)

    def get_element_relations_by_pathway(self, pathway_id):
        return self.service.get_element_relations_by_pathway(pathway_id)

    def get_genes_by_organism(self, organism, offset=None, limit=None):
        """
        Return the first whitespace separated field of every line listed
        by the service for `organism`.

        Raises NotImplementedError if `offset` or `limit` is given.
        """
        if offset is not None:
            raise NotImplementedError("offset is no longer supported")
        if limit is not None:
            raise NotImplementedError("limit is no longer supported.")

        res = self.service.list(organism).get().splitlines()
        return [r.split(None, 1)[0] for r in res]

    def get_number_of_genes_by_organism(self, organism):
        raise NotImplementedError

    ####################
    # Objects by pathway
    ####################

    def get_elements_by_pathway(self, pathway_id):
        raise NotImplementedError

    def get_genes_by_pathway(self, pathway_id):
        return _link_targets(self.link("genes", ids=[pathway_id]))

    def get_enzymes_by_pathway(self, pathway_id):
        return _link_targets(self.link("ec", ids=[pathway_id]))

    def get_compounds_by_pathway(self, pathway_id):
        return _link_targets(self.link("compound", ids=[pathway_id]))

    def get_drugs_by_pathway(self, pathway_id):
        return _link_targets(self.link("drug", ids=[pathway_id]))

    def get_glycans_by_pathway(self, pathway_id):
        return _link_targets(self.link("gl", ids=[pathway_id]))

    def get_reactions_by_pathway(self, pathway_id):
        return _link_targets(self.link("rn", ids=[pathway_id]))

    def get_kos_by_pathway(self, pathway_id):
        return _link_targets(self.link("ko", ids=[pathway_id]))

    #####################
    # Pathways by objects
    #####################

    # These functions returned results intersections.
    def get_pathways_by_genes(self, gene_list):
        raise NotImplementedError

    def get_pathways_by_enzymes(self, enzyme_list):
        raise NotImplementedError

    def get_pathways_by_compounds(self, compound_list):
        raise NotImplementedError

    def get_pathways_by_drugs(self, drug_list):
        raise NotImplementedError

    def get_pathways_by_glycans(self, glycan_list):
        raise NotImplementedError

    def get_pathways_by_reactions(self, reaction_list):
        raise NotImplementedError

    def get_pathways_by_kos(self, ko_list):
        raise NotImplementedError

    ##########################
    # Relations among pathways
    ##########################

    def get_linked_pathways(self, pathway_id):
        """
        Return the sorted pathway ids linked to `pathway_id`; the
        'path:' prefix is added to `pathway_id` when missing.
        """
        if not pathway_id.startswith("path:"):
            pathway_id = "path:" + pathway_id
        return _link_targets(self.link("pathway", ids=[pathway_id]))
326
327
[1532]328"""
329KEGG api with caching
330"""
331
332import os
333
334from . import caching
335from .caching import cached_method, cache_entry, touch_dir
336
337try:
338    from functools import lru_cache
339except ImportError:
340    # TODO: move a copy of lru_cache in .caching if distributing this as a
341    # standalone package
[1601]342    from Orange.utils import lru_cache
[1532]343
[1734]344
class CachedKeggApi(KeggApi):
    """
    A `KeggApi` whose query methods are wrapped with the caching
    decorators (`cached_method`, or `lru_cache` for `info`).
    """

    def __init__(self, store=None):
        """
        `store` is kept as `self.store`; a new empty dict is used when it
        is None.
        """
        KeggApi.__init__(self)
        # Keep a caller supplied store instead of silently dropping it.
        self.store = {} if store is None else store

    # Needed API for cached decorator.
    def cache_store(self):
        """
        Return a `caching.Sqlite3Store` for the file
        "kegg_api_cache_1.sqlite3" inside the configured "cache.path".
        """
        from . import conf
        path = conf.params["cache.path"]
        touch_dir(path)
        return caching.Sqlite3Store(os.path.join(path,
                                                 "kegg_api_cache_1.sqlite3"))

    def last_modified(self, args, kwargs=None):
        """
        Return the release set with `set_default_release`, or "" if none
        was set. `args` and `kwargs` are not used.
        """
        return getattr(self, "default_release", "")

    def set_default_release(self, release):
        self.default_release = release

    @cached_method
    def list_organisms(self):
        return KeggApi.list_organisms(self)

    @cached_method
    def list_pathways(self, organism):
        return KeggApi.list_pathways(self, organism)

    @cached_method
    def list(self, db):
        return KeggApi.list(self, db)

    @lru_cache()  # not persistently cached
    def info(self, db):
        return KeggApi.info(self, db)

    @cached_method
    def find(self, db, keywords):
        return KeggApi.find(self, db, keywords)

    @cached_method
    def get(self, ids):
        """
        Retrieve entries for `ids`; a sequence of ids is fetched with
        `_batch_get`, a single string id with `KeggApi.get`.
        """
        if not isinstance(ids, basestring):
            return self._batch_get(ids)
        else:
            return KeggApi.get(self, ids)

    def _batch_get(self, ids):
        """
        Retrieve the entries for a sequence of at most 10 `ids` with one
        request for those not yet cached, store each retrieved entry in
        the cache under its own id and return all entries joined into a
        single string.

        Raises ValueError if more than 10 ids are given.
        """
        if len(ids) > 10:
            raise ValueError("Can batch at most 10 ids at a time.")

        get = self.get
        uncached = []
        unmatched = set()

        with closing(get.cache_store()) as store:
            # Which ids are already cached
            # TODO: Invalidate entries by release string.
            for entry_id in ids:
                key = get.key_from_args((entry_id,))
                if not get.key_has_valid_cache(key, store):
                    uncached.append(entry_id)

        if uncached:
            # in case there are duplicate ids
            uncached = sorted(set(uncached))

            rval = KeggApi.get(self, uncached)

            if rval is not None:
                entries = rval.split("///\n")
            else:
                entries = []

            if entries and not entries[-1].strip():
                # Delete the last single newline entry if present
                del entries[-1]

            if len(entries) != len(uncached):
                # Some ids returned nothing; pair up what did come back.
                new_uncached, entries = match_by_ids(uncached, entries)
                unmatched = set(uncached) - set(new_uncached)
                uncached = new_uncached
                warnings.warn("Unable to match entries for keys: %s." %
                              ", ".join(map(repr, unmatched)))

            with closing(get.cache_store()) as store:
                for entry_id, entry in zip(uncached, entries):
                    key = get.key_from_args((entry_id,))
                    if entry is not None:
                        # Restore the terminator removed by the split
                        entry = entry + "///\n"
                    store[key] = cache_entry(entry, mtime=datetime.now())

        # Finally join all the results, but drop all None objects
        entries = [entry for entry in map(get, ids) if entry is not None]

        rval = "".join(entries)
        return rval

    @cached_method
    def conv(self, target_db, source):
        return KeggApi.conv(self, target_db, source)

    ########
    # LinkDB
    ########

    @cached_method
    def get_genes_by_enzyme(self, enzyme_id, org):
        return KeggApi.get_genes_by_enzyme(self, enzyme_id, org)

    @cached_method
    def get_enzymes_by_gene(self, genes_id):
        return KeggApi.get_enzymes_by_gene(self, genes_id)

    @cached_method
    def get_enzymes_by_compound(self, compound_id):
        return KeggApi.get_enzymes_by_compound(self, compound_id)

    @cached_method
    def get_enzymes_by_glycan(self, glycan_id):
        return KeggApi.get_enzymes_by_glycan(self, glycan_id)

    @cached_method
    def get_enzymes_by_reaction(self, reaction_id):
        return KeggApi.get_enzymes_by_reaction(self, reaction_id)

    @cached_method
    def get_compounds_by_enzyme(self, enzyme_id):
        return KeggApi.get_compounds_by_enzyme(self, enzyme_id)

    @cached_method
    def get_compounds_by_reaction(self, reaction_id):
        return KeggApi.get_compounds_by_reaction(self, reaction_id)

    @cached_method
    def get_glycans_by_enzyme(self, enzyme_id):
        return KeggApi.get_glycans_by_enzyme(self, enzyme_id)

    @cached_method
    def get_glycans_by_reaction(self, reaction_id):
        return KeggApi.get_glycans_by_reaction(self, reaction_id)

    @cached_method
    def get_reactions_by_enzyme(self, enzyme_id):
        return KeggApi.get_reactions_by_enzyme(self, enzyme_id)

    @cached_method
    def get_reactions_by_compound(self, compound_id):
        return KeggApi.get_reactions_by_compound(self, compound_id)

    @cached_method
    def get_reactions_by_glycan(self, glycan_id):
        return KeggApi.get_reactions_by_glycan(self, glycan_id)

    ######
    # SSDB
    ######

    @cached_method
    def get_best_best_neighbors_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_best_best_neighbors_by_gene(self, genes_id, offset,
                                                       limit)

    @cached_method
    def get_best_neighbors_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_best_neighbors_by_gene(self, genes_id, offset,
                                                  limit)

    @cached_method
    def get_reverse_best_neighbors_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_reverse_best_neighbors_by_gene(self, genes_id,
                                                          offset, limit)

    @cached_method
    def get_paralogs_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_paralogs_by_gene(self, genes_id, offset, limit)

    #######
    # Motif
    #######

    @cached_method
    def get_motifs_by_gene(self, genes_id, db):
        return KeggApi.get_motifs_by_gene(self, genes_id, db)

    @cached_method
    def get_genes_by_motifs(self, motif_id_list, offset, limit):
        return KeggApi.get_genes_by_motifs(self, motif_id_list, offset, limit)

    ####
    # KO
    ####

    @cached_method
    def get_ko_by_gene(self, genes_id):
        return KeggApi.get_ko_by_gene(self, genes_id)

    @cached_method
    def get_ko_by_ko_class(self, ko_class_id):
        # Delegate to the base class method like every other wrapper;
        # `service` is an instance attribute, so `KeggApi.service` does
        # not exist.
        return KeggApi.get_ko_by_ko_class(self, ko_class_id)

    @cached_method
    def get_genes_by_ko_class(self, ko_class_id, org, offset, limit):
        return KeggApi.get_genes_by_ko_class(self, ko_class_id, org, offset,
                                             limit)

    @cached_method
    def get_genes_by_ko(self, ko_id, org):
        return KeggApi.get_genes_by_ko(self, ko_id, org)

    #########
    # Pathway
    #########

    @cached_method
    def get_genes_by_organism(self, organism, offset=None, limit=None):
        return KeggApi.get_genes_by_organism(self, organism, offset=offset,
                                             limit=limit)

    @cached_method
    def get_number_of_genes_by_organism(self, organism):
        return KeggApi.get_number_of_genes_by_organism(self, organism)

    @cached_method
    def get_pathways_by_genes(self, gene_list):
        return KeggApi.get_pathways_by_genes(self, gene_list)

    @cached_method
    def get_pathways_by_enzymes(self, enzyme_list):
        return KeggApi.get_pathways_by_enzymes(self, enzyme_list)

    @cached_method
    def get_pathways_by_compounds(self, compound_list):
        return KeggApi.get_pathways_by_compounds(self, compound_list)

    @cached_method
    def get_pathways_by_drugs(self, drug_list):
        return KeggApi.get_pathways_by_drugs(self, drug_list)

    @cached_method
    def get_pathways_by_glycans(self, glycan_list):
        return KeggApi.get_pathways_by_glycans(self, glycan_list)

    @cached_method
    def get_pathways_by_reactions(self, reaction_list):
        return KeggApi.get_pathways_by_reactions(self, reaction_list)

    @cached_method
    def get_pathways_by_kos(self, ko_list):
        return KeggApi.get_pathways_by_kos(self, ko_list)

    @cached_method
    def get_elements_by_pathway(self, pathway_id):
        return KeggApi.get_elements_by_pathway(self, pathway_id)

    @cached_method
    def get_genes_by_pathway(self, pathway_id):
        return KeggApi.get_genes_by_pathway(self, pathway_id)

    @cached_method
    def get_enzymes_by_pathway(self, pathway_id):
        return KeggApi.get_enzymes_by_pathway(self, pathway_id)

    @cached_method
    def get_compounds_by_pathway(self, pathway_id):
        return KeggApi.get_compounds_by_pathway(self, pathway_id)

    @cached_method
    def get_drugs_by_pathway(self, pathway_id):
        return KeggApi.get_drugs_by_pathway(self, pathway_id)

    @cached_method
    def get_glycans_by_pathway(self, pathway_id):
        return KeggApi.get_glycans_by_pathway(self, pathway_id)

    @cached_method
    def get_reactions_by_pathway(self, pathway_id):
        return KeggApi.get_reactions_by_pathway(self, pathway_id)

    @cached_method
    def get_kos_by_pathway(self, pathway_id):
        return KeggApi.get_kos_by_pathway(self, pathway_id)
[1733]627
628
def match_by_ids(ids, entries):
    """
    Pair search `ids` with the text `entries` they refer to.

    The identifier of an entry is the second whitespace separated field
    of its first line. A search id is matched to an entry whose
    identifier equals the whole id or, failing that, equals the part of
    the id after its first ':'. Whole id matches take precedence.

    Return a `(matched_ids, matched_entries)` tuple of two parallel
    lists. Ids and entries without a match are left out; matches are
    listed in the order of `ids` (whole id matches first).
    """
    entries_by_id = {}
    for entry in entries:
        fields = entry.split("\n", 1)[0].split(None, 2)
        if len(fields) < 2:
            # No identifier on the first line; nothing to match against.
            continue
        entries_by_id[fields[1]] = entry

    # Unique search ids in the order given, so the result does not depend
    # on set iteration order.
    seen = set()
    pending = []
    for search_id in ids:
        if search_id not in seen:
            seen.add(search_id)
            pending.append(search_id)

    matched_ids = []
    matched_entries = []

    # First match full search ids
    remaining = []
    for search_id in pending:
        if search_id in entries_by_id:
            matched_ids.append(search_id)
            matched_entries.append(entries_by_id.pop(search_id))
        else:
            remaining.append(search_id)

    # Second pass, split the search ids by ':' to db and identifier part,
    # match by identifier
    for search_id in remaining:
        if ":" not in search_id:
            continue
        identifier = search_id.split(":", 1)[1]
        if identifier in entries_by_id:
            matched_ids.append(search_id)
            matched_entries.append(entries_by_id.pop(identifier))

    return matched_ids, matched_entries
Note: See TracBrowser for help on using the repository browser.