source: orange-bioinformatics/_bioinformatics/obiKEGG/api.py @ 1734:91d14dd2cf0e

Revision 1734:91d14dd2cf0e, 20.1 KB checked in by Ales Erjavec <ales.erjavec@…>, 14 months ago (diff)

obiKEGG code style fixes.

RevLine 
[1532]1"""
2KEGG api interface.
3
4"""
5from __future__ import absolute_import
6
[1733]7from datetime import datetime
[1532]8from contextlib import closing
[1733]9from operator import itemgetter
10import warnings
[1532]11
12from .service import web_service
[1733]13from .types import OrganismSummary, Definition, BInfo, Link
14
15
# Table of known KEGG databases, one tuple per database.
# NOTE(review): the columns look like (title, database name, abbreviation,
# entry identifier prefix), with None where a value does not apply --
# confirm against the code that reads this table.
DATABASES = [
    ("KEGG Pathway", "pathway", "path", None),
    ("KEGG Brite", "brite", "br", None),
    ("KEGG Module", "module", "md", "M"),
    ("KEGG Disease", "disease", "ds", "H"),
    ("KEGG Drug", "drug", "dr", "D"),
    ("KEGG Orthology", "orthology", "ko", "K"),
    ("KEGG Genome", "genome", "genome", "T"),
    ("KEGG Genomes", "genomes", "gn", "T"),
    ("KEGG Genes", "genes", None, None),
    ("KEGG Ligand", "ligand", "ligand", None),
    ("KEGG Compound", "compound", "cpd", "C"),
    ("KEGG Glycan", "glycan", "gl", "G"),
    ("KEGG Reaction", "reaction", "rn", "R"),
    ("KEGG RPair", "rpair", "rp", "RP"),
    ("KEGG RClass", "rclass", "rc", "RC"),
    ("KEGG Enzyme", "enzyme", "ec", "E"),
]
34
35
36def _link_targets(links):
37    return sorted(set(map(itemgetter(1), links)))
38
[1532]39
class KeggApi(object):
    """
    An abstraction of a kegg api.

    Requests are built on the resource object returned by `web_service()`
    and the text replies are converted to Python objects.  Methods for
    which no request is implemented raise `NotImplementedError`.
    """

    def __init__(self):
        self.service = web_service()

    def list_organisms(self):
        """
        Return a list of all available organisms

        One `OrganismSummary.from_str` result is produced per reply line.

        >>> api.list_organisms()
        [OrganismSummary(...

        """
        lines = self.service.list.organism.get().splitlines()
        # A list comprehension (not a bare `map`) so the result is always
        # a real list.
        return [OrganismSummary.from_str(line) for line in lines]

    def list_pathways(self, organism):
        """
        Return a list of all available pathways for `organism`

        >>> api.list_pathways("hsa")
        [Definition(entry_id=...

        """
        lines = self.service.list.pathway(organism).get().splitlines()
        return [Definition.from_str(line) for line in lines]

    def list(self, db):
        """
        Return a list of all available entries in database `db`.
        """
        lines = self.service.list(db).get().splitlines()
        return [Definition.from_str(line) for line in lines]

    #######
    # DBGET
    #######

    def info(self, db):
        """
        Return info for database `db`

        >>> print api.info("pathway")
        BInfo(entry_id='path', definition='KEGG Pathway Database', ...

        """
        result = self.service.info(db).get()
        return BInfo.from_text(str(result))

    def find(self, db, keywords):
        """
        Search database 'db' for keywords.

        `keywords` is a single string or a sequence of strings; multiple
        keywords are joined with '+'.  The raw service reply is returned.
        """
        if isinstance(keywords, basestring):
            keywords = [keywords]

        return self.service.find(db)("+".join(keywords)).get()

    def get(self, ids):
        """
        Retrieve database entries for `ids` list.

        `ids` is a single id string or a sequence of id strings (joined
        with '+').  The raw service reply is returned.
        """
        if not isinstance(ids, basestring):
            # Sequence of ids
            ids = "+".join(ids)

        return self.service.get(ids).get()

    def conv(self, ids):
        """
        Not implemented.
        """
        raise NotImplementedError()

    def link(self, target_db, source_db=None, ids=None):
        """
        Return the links from `source_db` or from `ids` into `target_db`.

        Exactly one of `source_db` and `ids` must be given.  `ids` is a
        single id string or a sequence of id strings.

        Returns a list of `Link` records, one per non-blank reply line.
        Raises `ValueError` if neither or both of `source_db` and `ids`
        are supplied.
        """
        if not (source_db or ids):
            raise ValueError("One of 'source_db' or 'ids' must be supplied")
        if source_db and ids:
            raise ValueError("Only one 'source_db' or 'ids' must be supplied")

        if source_db:
            result = self.service.link(target_db)(source_db).get()
        else:
            if isinstance(ids, basestring):
                # A bare string would otherwise be joined character by
                # character; treat it as one id, as `get` does.
                ids = [ids]
            result = self.service.link(target_db)("+".join(ids)).get()

        # `line.split()` works for both byte and unicode strings (unlike
        # the unbound `str.split`); blank lines carry no link and are
        # skipped instead of being passed to `Link._make`.
        return [Link._make(line.split())
                for line in result.splitlines() if line.strip()]

    def get_genes_by_enzyme(self, enzyme_id, org):
        """
        Return sorted link targets in database `org` for `enzyme_id`.
        """
        return _link_targets(self.link(org, ids=[enzyme_id]))

    def get_enzymes_by_gene(self, gene_id):
        """
        Return sorted link targets in "ec" for `gene_id`.
        """
        return _link_targets(self.link("ec", ids=[gene_id]))

    def get_enzymes_by_compound(self, compound_id):
        """
        Return sorted link targets in "ec" for `compound_id`.
        """
        return _link_targets(self.link("ec", ids=[compound_id]))

    def get_enzymes_by_glycan(self, glycan_id):
        """
        Return sorted link targets in "ec" for `glycan_id`.
        """
        return _link_targets(self.link("ec", ids=[glycan_id]))

    def get_enzymes_by_reaction(self, reaction_id):
        """
        Return sorted link targets in "ec" for `reaction_id`.
        """
        return _link_targets(self.link("ec", ids=[reaction_id]))

    def get_compounds_by_enzyme(self, enzyme_id):
        """
        Return sorted link targets in "compound" for `enzyme_id`.
        """
        return _link_targets(self.link("compound", ids=[enzyme_id]))

    def get_compounds_by_reaction(self, reaction_id):
        """
        Return sorted link targets in "compound" for `reaction_id`.
        """
        return _link_targets(self.link("compound", ids=[reaction_id]))

    def get_glycans_by_enzyme(self, enzyme_id):
        """
        Return sorted link targets in "gl" for `enzyme_id`.
        """
        return _link_targets(self.link("gl", ids=[enzyme_id]))

    def get_glycans_by_reaction(self, reaction_id):
        """
        Return sorted link targets in "gl" for `reaction_id`.
        """
        return _link_targets(self.link("gl", ids=[reaction_id]))

    def get_reactions_by_enzyme(self, enzyme_id):
        """
        Return sorted link targets in "rn" for `enzyme_id`.
        """
        return _link_targets(self.link("rn", ids=[enzyme_id]))

    def get_reactions_by_compound(self, compound_id):
        """
        Return sorted link targets in "rn" for `compound_id`.
        """
        return _link_targets(self.link("rn", ids=[compound_id]))

    def get_reactions_by_glycan(self, glycan_id):
        """
        Return sorted link targets in "rn" for `glycan_id`.
        """
        return _link_targets(self.link("rn", ids=[glycan_id]))

    ######
    # SSDB
    ######

    # No replacement api in the KEGG REST api.
    def get_best_best_neighbors_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    def get_best_neighbors_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    def get_reverse_best_neighbors_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    def get_paralogs_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    #######
    # Motif
    #######

    # No replacement api in KEGG REST api
    def get_motifs_by_gene(self, genes_id, db):
        raise NotImplementedError

    def get_genes_by_motifs(self, motif_id_list, offset, limit):
        raise NotImplementedError

    ####
    # KO
    ####

    def get_ko_by_gene(self, genes_id):
        raise NotImplementedError

    def get_ko_by_ko_class(self, ko_class_id):
        raise NotImplementedError

    def get_genes_by_ko_class(self, ko_class_id, org, offset, limit):
        raise NotImplementedError

    def get_genes_by_ko(self, ko_id, org):
        raise NotImplementedError

    #########
    # Pathway
    #########

    def mark_pathway_by_objects(self, pathway_id, object_id_list):
        raise NotImplementedError

    def color_pathway_by_objects(self, pathway_id, object_id_list,
                                 fg_color_list, bg_color_list):
        raise NotImplementedError

    def color_pathway_by_elements(self, pathway_id, element_id_list,
                                  fg_color_list, bg_color_list):
        raise NotImplementedError

    def get_html_of_marked_pathway_by_objects(self, pathway_id,
                                              object_id_list):
        raise NotImplementedError

    def get_html_of_colored_pathway_by_objects(self, pathway_id,
                                               object_id_list, fg_color_list,
                                               bg_color_list):
        raise NotImplementedError

    def get_html_of_colored_pathway_by_elements(self, pathway_id,
                                                element_id_list, fg_color_list,
                                                bg_color_list):
        raise NotImplementedError

    # NOTE(review): the next two methods forward straight to an attribute
    # of `self.service` and, unlike every other request in this class, do
    # not call `.get()` on the result -- confirm they still return data.
    def get_references_by_pathway(self, pathway_id):
        return self.service.get_references_by_pathway(pathway_id)

    def get_element_relations_by_pathway(self, pathway_id):
        return self.service.get_element_relations_by_pathway(pathway_id)

    def get_genes_by_organism(self, organism, offset=None, limit=None):
        """
        Return the first whitespace separated field of every line listed
        for `organism`.

        `offset` and `limit` are kept in the signature only; passing
        either one raises `NotImplementedError`.
        """
        if offset is not None:
            raise NotImplementedError("offset is no longer supported")
        if limit is not None:
            raise NotImplementedError("limit is no longer supported.")

        res = self.service.list(organism).get().splitlines()
        return [r.split(None, 1)[0] for r in res]

    def get_number_of_genes_by_organism(self, organism):
        raise NotImplementedError

    ####################
    # Objects by pathway
    ####################

    def get_elements_by_pathway(self, pathway_id):
        raise NotImplementedError

    def get_genes_by_pathway(self, pathway_id):
        """
        Return sorted link targets in "genes" for `pathway_id`.
        """
        return _link_targets(self.link("genes", ids=[pathway_id]))

    def get_enzymes_by_pathway(self, pathway_id):
        """
        Return sorted link targets in "ec" for `pathway_id`.
        """
        return _link_targets(self.link("ec", ids=[pathway_id]))

    def get_compounds_by_pathway(self, pathway_id):
        """
        Return sorted link targets in "compound" for `pathway_id`.
        """
        return _link_targets(self.link("compound", ids=[pathway_id]))

    def get_drugs_by_pathway(self, pathway_id):
        """
        Return sorted link targets in "drug" for `pathway_id`.
        """
        return _link_targets(self.link("drug", ids=[pathway_id]))

    def get_glycans_by_pathway(self, pathway_id):
        """
        Return sorted link targets in "gl" for `pathway_id`.
        """
        return _link_targets(self.link("gl", ids=[pathway_id]))

    def get_reactions_by_pathway(self, pathway_id):
        """
        Return sorted link targets in "rn" for `pathway_id`.
        """
        return _link_targets(self.link("rn", ids=[pathway_id]))

    def get_kos_by_pathway(self, pathway_id):
        """
        Return sorted link targets in "ko" for `pathway_id`.
        """
        return _link_targets(self.link("ko", ids=[pathway_id]))

    #####################
    # Pathways by objects
    #####################

    # These functions returned results intersections.
    def get_pathways_by_genes(self, gene_list):
        raise NotImplementedError

    def get_pathways_by_enzymes(self, enzyme_list):
        raise NotImplementedError

    def get_pathways_by_compounds(self, compound_list):
        raise NotImplementedError

    def get_pathways_by_drugs(self, drug_list):
        raise NotImplementedError

    def get_pathways_by_glycans(self, glycan_list):
        raise NotImplementedError

    def get_pathways_by_reactions(self, reaction_list):
        raise NotImplementedError

    def get_pathways_by_kos(self, ko_list):
        raise NotImplementedError

    ##########################
    # Relations among pathways
    ##########################

    def get_linked_pathways(self, pathway_id):
        """
        Return sorted link targets in "pathway" for `pathway_id`.

        A "path:" prefix is added to `pathway_id` when it is missing.
        """
        if not pathway_id.startswith("path:"):
            pathway_id = "path:" + pathway_id
        return _link_targets(self.link("pathway", ids=[pathway_id]))
316
317
[1532]318"""
319KEGG api with caching
320"""
321
322import os
323
324from . import caching
325from .caching import cached_method, cache_entry, touch_dir
326
327try:
328    from functools import lru_cache
329except ImportError:
330    # TODO: move a copy of lru_cache in .caching if distributing this as a
331    # standalone package
[1601]332    from Orange.utils import lru_cache
[1532]333
[1734]334
class CachedKeggApi(KeggApi):
    """
    A `KeggApi` whose request methods are wrapped by caching decorators.

    Most methods are decorated with `cached_method` (from `.caching`) and
    simply delegate to the `KeggApi` implementation; `info` uses
    `lru_cache` and is not persistently cached.
    """

    def __init__(self, store=None):
        KeggApi.__init__(self)
        # Keep a caller supplied store; previously `self.store` was only
        # set when no store was passed.
        self.store = {} if store is None else store

    # Needed API for cached decorator.
    def cache_store(self):
        """
        Return a `caching.Sqlite3Store` on "kegg_api_cache_1.sqlite3"
        inside the directory configured as `conf.params["cache.path"]`.
        """
        from . import conf
        path = conf.params["cache.path"]
        touch_dir(path)
        return caching.Sqlite3Store(os.path.join(path,
                                                 "kegg_api_cache_1.sqlite3"))

    def last_modified(self, args, kwargs=None):
        """
        Return the release set with `set_default_release`, or "" if none
        was set.  `args` and `kwargs` are not used.
        """
        return getattr(self, "default_release", "")

    def set_default_release(self, release):
        """
        Set the value returned by `last_modified`.
        """
        self.default_release = release

    @cached_method
    def list_organisms(self):
        return KeggApi.list_organisms(self)

    @cached_method
    def list_pathways(self, organism):
        return KeggApi.list_pathways(self, organism)

    @cached_method
    def list(self, db):
        return KeggApi.list(self, db)

    @lru_cache()  # not persistently cached
    def info(self, db):
        return KeggApi.info(self, db)

    @cached_method
    def find(self, db, keywords):
        return KeggApi.find(self, db, keywords)

    @cached_method
    def get(self, ids):
        """
        Retrieve database entries for `ids`.

        A single id string is fetched directly; any other value is
        treated as a sequence of ids and handled by `_batch_get`.
        """
        if not isinstance(ids, basestring):
            return self._batch_get(ids)
        else:
            return KeggApi.get(self, ids)

    def _batch_get(self, ids):
        """
        Retrieve the entries for a sequence of at most 10 `ids`.

        Ids without a cache entry are fetched in one request, the reply
        is split into single entries and each entry is stored in the
        cache under the key of its own id.  The result is then assembled
        by calling `get` for every id and concatenating all results that
        are not None.

        Raises `ValueError` if more than 10 ids are given.
        """
        if len(ids) > 10:
            raise ValueError("Can batch at most 10 ids at a time.")

        get = self.get
        uncached = []

        with closing(get.cache_store()) as store:
            # Which ids are already cached
            # TODO: Invalidate entries by release string.
            for entry_id in ids:
                key = get.key_from_args((entry_id,))
                if key not in store:
                    uncached.append(entry_id)

        if uncached:
            # in case there are duplicate ids
            uncached = sorted(set(uncached))
            rval = KeggApi.get(self, uncached)

            if rval is not None:
                entries = rval.split("///\n")
            else:
                entries = []

            if entries and not entries[-1].strip():
                # Delete the last single newline entry if present
                del entries[-1]

            if len(entries) != len(uncached):
                # The reply can not be paired with the ids by position;
                # pair by the id found in each entry instead.
                new_uncached, entries = match_by_ids(uncached, entries)
                unmatched = set(uncached) - set(new_uncached)
                uncached = new_uncached
                warnings.warn("Unable to match entries for keys: %s." %
                              ", ".join(map(repr, unmatched)))

            with closing(get.cache_store()) as store:
                for entry_id, entry in zip(uncached, entries):
                    key = get.key_from_args((entry_id,))
                    if entry is not None:
                        # Restore the terminator removed by the split.
                        entry = entry + "///\n"
                    store[key] = cache_entry(entry, mtime=datetime.now())

        # Finally join all the results, but drop all None objects
        entries = [entry for entry in map(get, ids) if entry is not None]

        rval = "".join(entries)
        return rval

    @cached_method
    def conv(self, ids):
        return KeggApi.conv(self, ids)

    ########
    # LinkDB
    ########

    @cached_method
    def get_genes_by_enzyme(self, enzyme_id, org):
        return KeggApi.get_genes_by_enzyme(self, enzyme_id, org)

    @cached_method
    def get_enzymes_by_gene(self, genes_id):
        return KeggApi.get_enzymes_by_gene(self, genes_id)

    @cached_method
    def get_enzymes_by_compound(self, compound_id):
        return KeggApi.get_enzymes_by_compound(self, compound_id)

    @cached_method
    def get_enzymes_by_glycan(self, glycan_id):
        return KeggApi.get_enzymes_by_glycan(self, glycan_id)

    @cached_method
    def get_enzymes_by_reaction(self, reaction_id):
        return KeggApi.get_enzymes_by_reaction(self, reaction_id)

    @cached_method
    def get_compounds_by_enzyme(self, enzyme_id):
        return KeggApi.get_compounds_by_enzyme(self, enzyme_id)

    @cached_method
    def get_compounds_by_reaction(self, reaction_id):
        return KeggApi.get_compounds_by_reaction(self, reaction_id)

    @cached_method
    def get_glycans_by_enzyme(self, enzyme_id):
        return KeggApi.get_glycans_by_enzyme(self, enzyme_id)

    @cached_method
    def get_glycans_by_reaction(self, reaction_id):
        return KeggApi.get_glycans_by_reaction(self, reaction_id)

    @cached_method
    def get_reactions_by_enzyme(self, enzyme_id):
        return KeggApi.get_reactions_by_enzyme(self, enzyme_id)

    @cached_method
    def get_reactions_by_compound(self, compound_id):
        return KeggApi.get_reactions_by_compound(self, compound_id)

    @cached_method
    def get_reactions_by_glycan(self, glycan_id):
        return KeggApi.get_reactions_by_glycan(self, glycan_id)

    ######
    # SSDB
    ######

    @cached_method
    def get_best_best_neighbors_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_best_best_neighbors_by_gene(self, genes_id, offset,
                                                       limit)

    @cached_method
    def get_best_neighbors_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_best_neighbors_by_gene(self, genes_id, offset,
                                                  limit)

    @cached_method
    def get_reverse_best_neighbors_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_reverse_best_neighbors_by_gene(self, genes_id,
                                                          offset, limit)

    @cached_method
    def get_paralogs_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_paralogs_by_gene(self, genes_id, offset, limit)

    #######
    # Motif
    #######

    @cached_method
    def get_motifs_by_gene(self, genes_id, db):
        return KeggApi.get_motifs_by_gene(self, genes_id, db)

    @cached_method
    def get_genes_by_motifs(self, motif_id_list, offset, limit):
        return KeggApi.get_genes_by_motifs(self, motif_id_list, offset, limit)

    ####
    # KO
    ####

    @cached_method
    def get_ko_by_gene(self, genes_id):
        return KeggApi.get_ko_by_gene(self, genes_id)

    @cached_method
    def get_ko_by_ko_class(self, ko_class_id):
        # Delegate to the base class method like every sibling does;
        # `KeggApi.service` does not exist (service is set per instance).
        return KeggApi.get_ko_by_ko_class(self, ko_class_id)

    @cached_method
    def get_genes_by_ko_class(self, ko_class_id, org, offset, limit):
        return KeggApi.get_genes_by_ko_class(self, ko_class_id, org, offset,
                                             limit)

    @cached_method
    def get_genes_by_ko(self, ko_id, org):
        return KeggApi.get_genes_by_ko(self, ko_id, org)

    #########
    # Pathway
    #########

    @cached_method
    def get_genes_by_organism(self, organism, offset=None, limit=None):
        return KeggApi.get_genes_by_organism(self, organism, offset=offset,
                                             limit=limit)

    @cached_method
    def get_number_of_genes_by_organism(self, organism):
        return KeggApi.get_number_of_genes_by_organism(self, organism)

    @cached_method
    def get_pathways_by_genes(self, gene_list):
        return KeggApi.get_pathways_by_genes(self, gene_list)

    @cached_method
    def get_pathways_by_enzymes(self, enzyme_list):
        return KeggApi.get_pathways_by_enzymes(self, enzyme_list)

    @cached_method
    def get_pathways_by_compounds(self, compound_list):
        return KeggApi.get_pathways_by_compounds(self, compound_list)

    @cached_method
    def get_pathways_by_drugs(self, drug_list):
        return KeggApi.get_pathways_by_drugs(self, drug_list)

    @cached_method
    def get_pathways_by_glycans(self, glycan_list):
        return KeggApi.get_pathways_by_glycans(self, glycan_list)

    @cached_method
    def get_pathways_by_reactions(self, reaction_list):
        return KeggApi.get_pathways_by_reactions(self, reaction_list)

    @cached_method
    def get_pathways_by_kos(self, ko_list):
        return KeggApi.get_pathways_by_kos(self, ko_list)

    @cached_method
    def get_elements_by_pathway(self, pathway_id):
        return KeggApi.get_elements_by_pathway(self, pathway_id)

    @cached_method
    def get_genes_by_pathway(self, pathway_id):
        return KeggApi.get_genes_by_pathway(self, pathway_id)

    @cached_method
    def get_enzymes_by_pathway(self, pathway_id):
        return KeggApi.get_enzymes_by_pathway(self, pathway_id)

    @cached_method
    def get_compounds_by_pathway(self, pathway_id):
        return KeggApi.get_compounds_by_pathway(self, pathway_id)

    @cached_method
    def get_drugs_by_pathway(self, pathway_id):
        return KeggApi.get_drugs_by_pathway(self, pathway_id)

    @cached_method
    def get_glycans_by_pathway(self, pathway_id):
        return KeggApi.get_glycans_by_pathway(self, pathway_id)

    @cached_method
    def get_reactions_by_pathway(self, pathway_id):
        return KeggApi.get_reactions_by_pathway(self, pathway_id)

    @cached_method
    def get_kos_by_pathway(self, pathway_id):
        return KeggApi.get_kos_by_pathway(self, pathway_id)
[1733]616
617
def match_by_ids(ids, entries):
    """
    Pair search ids with the entry texts that belong to them.

    The id of an entry is the second whitespace separated field of its
    first line.  Matching is done in two passes over `ids` (in the order
    given, duplicates ignored):

    1. a search id equal to an entry id,
    2. for search ids containing ':', the part after the first ':' equal
       to an entry id.

    Every entry is matched at most once.  Entries whose first line has no
    second field are ignored.

    Returns a `(matched_ids, matched_entries)` tuple of two lists of equal
    length, where `matched_entries[i]` is the entry for `matched_ids[i]`.
    Ids and entries that could not be paired are left out.
    """
    def entry_id(entry_text):
        # Only the header line is inspected; tolerate entries without a
        # newline or with a header of fewer than three fields.
        fields = entry_text.split("\n", 1)[0].split(None, 2)
        if len(fields) > 1:
            return fields[1]
        return None

    entries_by_id = {}
    for entry in entries:
        eid = entry_id(entry)
        if eid is not None:
            entries_by_id[eid] = entry

    matched_ids = []
    matched_entries = []

    # First match full search ids.  Iterating `ids` itself (instead of a
    # set) keeps the output order deterministic.
    pending = []
    seen = set()
    for search_id in ids:
        if search_id in seen:
            continue
        seen.add(search_id)
        if search_id in entries_by_id:
            matched_ids.append(search_id)
            matched_entries.append(entries_by_id.pop(search_id))
        else:
            pending.append(search_id)

    # Second pass, split the search ids by ':' to db and identifier part,
    # match by identifier
    for search_id in pending:
        if ":" in search_id:
            rest = search_id.split(":", 1)[1]
            if rest in entries_by_id:
                matched_ids.append(search_id)
                matched_entries.append(entries_by_id.pop(rest))

    return matched_ids, matched_entries
Note: See TracBrowser for help on using the repository browser.