source: orange-bioinformatics/orangecontrib/bio/obiKEGG/api.py @ 1873:0810c5708cc5

Revision 1873:0810c5708cc5, 20.7 KB checked in by Ales Erjavec <ales.erjavec@…>, 6 months ago (diff)

Moved '_bioinformatics' into orangecontrib namespace.

Line 
1"""
2KEGG api interface.
3
4"""
5from __future__ import absolute_import
6
7from datetime import datetime
8from contextlib import closing
9from operator import itemgetter
10import warnings
11
12from .service import web_service
13from .types import OrganismSummary, Definition, BInfo, Link
14
15
# Table of the known KEGG databases, one 4-tuple per database:
# (display title, database name, abbreviation, fourth field).
# NOTE(review): the fourth field looks like the letter prefix of the
# database's entry identifiers (None when there is none) -- confirm.
DATABASES = [
    ("KEGG Pathway", "pathway", "path", None),
    ("KEGG Brite", "brite", "br", None),
    ("KEGG Module", "module", "md", "M"),
    ("KEGG Disease", "disease", "ds", "H"),
    ("KEGG Drug", "drug", "dr", "D"),
    ("KEGG Orthology", "orthology", "ko", "K"),
    ("KEGG Genome", "genome", "genome", "T"),
    ("KEGG Genomes", "genomes", "gn", "T"),
    ("KEGG Genes", "genes", None, None),
    ("KEGG Ligand", "ligand", "ligand", None),
    ("KEGG Compound", "compound", "cpd", "C"),
    ("KEGG Glycan", "glycan", "gl", "G"),
    ("KEGG Reaction", "reaction", "rn", "R"),
    ("KEGG RPair", "rpair", "rp", "RP"),
    ("KEGG RClass", "rclass", "rc", "RC"),
    ("KEGG Enzyme", "enzyme", "ec", "E"),
]
35
36
37def _link_targets(links):
38    return sorted(set(map(itemgetter(1), links)))
39
40
class KeggApi(object):
    """
    An abstraction of a rest KEGG API.

    All queries go through `self.service` (the object returned by
    `web_service()`). Methods for which the KEGG REST api offers no
    replacement are kept for interface compatibility and raise
    `NotImplementedError`.
    """

    def __init__(self):
        self.service = web_service()

    def list_organisms(self):
        """
        Return a list of all available organisms

        >>> api.list_organisms()
        [OrganismSummary(entry_id=T0..

        """
        lines = self.service.list.organism.get().splitlines()
        return [OrganismSummary.from_str(line) for line in lines]

    def list_pathways(self, organism):
        """
        Return a list of all available pathways for `organism`

        >>> api.list_pathways("hsa")
        [Definition(entry_id=',...

        """
        lines = self.service.list.pathway(organism).get().splitlines()
        return [Definition.from_str(line) for line in lines]

    def list(self, db):
        """
        Return a list of all available entries in database `db`.
        """
        lines = self.service.list(db).get().splitlines()
        return [Definition.from_str(line) for line in lines]

    #######
    # DBGET
    #######

    def info(self, db):
        """
        Return info for database `db`

        >>> print api.info("pathway")
        BInfo(entry_id='path', definition='KEGG Pathway Database', ...

        """
        result = self.service.info(db).get()
        return BInfo.from_text(str(result))

    def find(self, db, keywords):
        """
        Search database 'db' for keywords.

        `keywords` is a single keyword string or a sequence of keywords;
        the raw response of the service is returned.
        """
        if isinstance(keywords, basestring):
            keywords = [keywords]

        return self.service.find(db)("+".join(keywords)).get()

    def get(self, ids):
        """
        Retrieve database entries for `ids` list.

        `ids` is a single id string or a sequence of ids; the raw response
        of the service is returned.
        """
        if not isinstance(ids, basestring):
            # Sequence of ids
            ids = "+".join(ids)

        return self.service.get(ids).get()

    def conv(self, target_db, source):
        """
        Return a mapping from source to target_db ids as a list of two
        tuples [(source_id, target_id), ...].

        `source` is a single string or a sequence of strings.
        """
        if not isinstance(source, basestring):
            source = "+".join(source)

        res = self.service.conv(target_db)(source).get()
        return [tuple(line.split("\t")) for line in res.splitlines()]

    def link(self, target_db, source_db=None, ids=None):
        """
        Return a list of `Link` tuples for entries linked to `target_db`.

        Exactly one of `source_db` (a database name) or `ids` (a single id
        string or a sequence of ids) must be supplied, otherwise a
        `ValueError` is raised.
        """
        if not (source_db or ids):
            raise ValueError("One of 'source_db' or 'ids' must be supplied")
        if source_db and ids:
            raise ValueError("Only one 'source_db' or 'ids' must be supplied")

        if source_db:
            source = source_db
        elif isinstance(ids, basestring):
            # A single id; joining it would interleave '+' between its
            # characters (same guard as in `get` and `conv`).
            source = ids
        else:
            source = "+".join(ids)

        result = self.service.link(target_db)(source).get()
        # Use each line's own `split` so `str` and `unicode` both work.
        return [Link._make(line.split()) for line in result.splitlines()]

    def get_genes_by_enzyme(self, enzyme_id, org):
        return _link_targets(self.link(org, ids=[enzyme_id]))

    def get_enzymes_by_gene(self, gene_id):
        return _link_targets(self.link("ec", ids=[gene_id]))

    def get_enzymes_by_compound(self, compound_id):
        return _link_targets(self.link("ec", ids=[compound_id]))

    def get_enzymes_by_glycan(self, glycan_id):
        return _link_targets(self.link("ec", ids=[glycan_id]))

    def get_enzymes_by_reaction(self, reaction_id):
        return _link_targets(self.link("ec", ids=[reaction_id]))

    def get_compounds_by_enzyme(self, enzyme_id):
        return _link_targets(self.link("compound", ids=[enzyme_id]))

    def get_compounds_by_reaction(self, reaction_id):
        return _link_targets(self.link("compound", ids=[reaction_id]))

    def get_glycans_by_enzyme(self, enzyme_id):
        return _link_targets(self.link("gl", ids=[enzyme_id]))

    def get_glycans_by_reaction(self, reaction_id):
        return _link_targets(self.link("gl", ids=[reaction_id]))

    def get_reactions_by_enzyme(self, enzyme_id):
        return _link_targets(self.link("rn", ids=[enzyme_id]))

    def get_reactions_by_compound(self, compound_id):
        return _link_targets(self.link("rn", ids=[compound_id]))

    def get_reactions_by_glycan(self, glycan_id):
        return _link_targets(self.link("rn", ids=[glycan_id]))

    ######
    # SSDB
    ######

    # No replacement api in the KEGG REST api.
    def get_best_best_neighbors_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    def get_best_neighbors_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    def get_reverse_best_neighbors_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    def get_paralogs_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    #######
    # Motif
    #######

    # No replacement api in KEGG REST api
    def get_motifs_by_gene(self, genes_id, db):
        raise NotImplementedError

    def get_genes_by_motifs(self, motif_id_list, offset, limit):
        raise NotImplementedError

    ####
    # KO
    ####

    def get_ko_by_gene(self, genes_id):
        raise NotImplementedError

    def get_ko_by_ko_class(self, ko_class_id):
        raise NotImplementedError

    def get_genes_by_ko_class(self, ko_class_id, org, offset, limit):
        raise NotImplementedError

    def get_genes_by_ko(self, ko_id, org):
        raise NotImplementedError

    #########
    # Pathway
    #########

    def mark_pathway_by_objects(self, pathway_id, object_id_list):
        raise NotImplementedError

    def color_pathway_by_objects(self, pathway_id, object_id_list,
                                 fg_color_list, bg_color_list):
        raise NotImplementedError

    def color_pathway_by_elements(self, pathway_id, element_id_list,
                                  fg_color_list, bg_color_list):
        raise NotImplementedError

    def get_html_of_marked_pathway_by_objects(self, pathway_id,
                                              object_id_list):
        raise NotImplementedError

    def get_html_of_colored_pathway_by_objects(self, pathway_id,
                                               object_id_list, fg_color_list,
                                               bg_color_list):
        raise NotImplementedError

    def get_html_of_colored_pathway_by_elements(self, pathway_id,
                                                element_id_list, fg_color_list,
                                                bg_color_list):
        raise NotImplementedError

    # NOTE(review): the two methods below forward directly to an attribute
    # of the same name on `self.service`, unlike every other query in this
    # class -- confirm the service actually provides these operations.
    def get_references_by_pathway(self, pathway_id):
        return self.service.get_references_by_pathway(pathway_id)

    def get_element_relations_by_pathway(self, pathway_id):
        return self.service.get_element_relations_by_pathway(pathway_id)

    def get_genes_by_organism(self, organism, offset=None, limit=None):
        """
        Return the entry ids (first field of every listed line) of
        `organism`.

        `offset` and `limit` are accepted for backward compatibility only;
        passing either one raises `NotImplementedError`.
        """
        if offset is not None:
            raise NotImplementedError("offset is no longer supported")
        if limit is not None:
            raise NotImplementedError("limit is no longer supported.")

        res = self.service.list(organism).get().splitlines()
        return [r.split(None, 1)[0] for r in res]

    def get_number_of_genes_by_organism(self, organism):
        raise NotImplementedError

    ####################
    # Objects by pathway
    ####################

    def get_elements_by_pathway(self, pathway_id):
        raise NotImplementedError

    def get_genes_by_pathway(self, pathway_id):
        return _link_targets(self.link("genes", ids=[pathway_id]))

    def get_enzymes_by_pathway(self, pathway_id):
        return _link_targets(self.link("ec", ids=[pathway_id]))

    def get_compounds_by_pathway(self, pathway_id):
        return _link_targets(self.link("compound", ids=[pathway_id]))

    def get_drugs_by_pathway(self, pathway_id):
        return _link_targets(self.link("drug", ids=[pathway_id]))

    def get_glycans_by_pathway(self, pathway_id):
        return _link_targets(self.link("gl", ids=[pathway_id]))

    def get_reactions_by_pathway(self, pathway_id):
        return _link_targets(self.link("rn", ids=[pathway_id]))

    def get_kos_by_pathway(self, pathway_id):
        return _link_targets(self.link("ko", ids=[pathway_id]))

    #####################
    # Pathways by objects
    #####################

    # These functions returned results intersections.
    def get_pathways_by_genes(self, gene_list):
        raise NotImplementedError

    def get_pathways_by_enzymes(self, enzyme_list):
        raise NotImplementedError

    def get_pathways_by_compounds(self, compound_list):
        raise NotImplementedError

    def get_pathways_by_drugs(self, drug_list):
        raise NotImplementedError

    def get_pathways_by_glycans(self, glycan_list):
        raise NotImplementedError

    def get_pathways_by_reactions(self, reaction_list):
        raise NotImplementedError

    def get_pathways_by_kos(self, ko_list):
        raise NotImplementedError

    ##########################
    # Relations among pathways
    ##########################

    def get_linked_pathways(self, pathway_id):
        """
        Return the sorted pathway ids linked to `pathway_id`; a missing
        'path:' prefix is added before querying.
        """
        if not pathway_id.startswith("path:"):
            pathway_id = "path:" + pathway_id
        return _link_targets(self.link("pathway", ids=[pathway_id]))
326
327
328"""
329KEGG api with caching
330"""
331
332import os
333
334from . import caching
335from .caching import cached_method, cache_entry, touch_dir
336
337try:
338    from functools import lru_cache
339except ImportError:
340    # TODO: move a copy of lru_cache in .caching if distributing this as a
341    # standalone package
342    from Orange.utils import lru_cache
343
344
class CachedKeggApi(KeggApi):
    """
    A `KeggApi` whose queries are wrapped in caching decorators.

    Most methods simply delegate to the `KeggApi` implementation through
    `cached_method`; `info` uses an in-memory `lru_cache` instead.
    `cache_store` and `last_modified` are the hooks the cached decorator
    needs from the instance.
    """

    def __init__(self, store=None):
        KeggApi.__init__(self)
        # Keep a caller supplied store; previously `self.store` was left
        # unset whenever `store` was given.
        self.store = {} if store is None else store

    # Needed API for cached decorator.
    def cache_store(self):
        """
        Return a new `Sqlite3Store` located in the configured cache
        directory (created if needed by `touch_dir`).
        """
        from . import conf
        path = conf.params["cache.path"]
        touch_dir(path)
        return caching.Sqlite3Store(os.path.join(path,
                                                 "kegg_api_cache_1.sqlite3"))

    def last_modified(self, args, kwargs=None):
        """
        Return the release string set by `set_default_release`, or an
        empty string if none was set. `args` and `kwargs` are ignored.
        """
        return getattr(self, "default_release", "")

    def set_default_release(self, release):
        self.default_release = release

    @cached_method
    def list_organisms(self):
        return KeggApi.list_organisms(self)

    @cached_method
    def list_pathways(self, organism):
        return KeggApi.list_pathways(self, organism)

    @cached_method
    def list(self, db):
        return KeggApi.list(self, db)

    @lru_cache()  # not persistently cached
    def info(self, db):
        return KeggApi.info(self, db)

    @cached_method
    def find(self, db, keywords):
        return KeggApi.find(self, db, keywords)

    @cached_method
    def get(self, ids):
        """
        Retrieve database entries for `ids`; a sequence of ids is fetched
        with `_batch_get`, a single id string with `KeggApi.get`.
        """
        if not isinstance(ids, basestring):
            return self._batch_get(ids)
        else:
            return KeggApi.get(self, ids)

    def _batch_get(self, ids):
        """
        Retrieve the entries for at most 10 `ids`, caching every entry
        under the key of its own single-id `get` call.

        Ids without a valid cache entry are fetched in one request and the
        response is split on the '///' entry terminator. Ids for which no
        entry could be matched are reported with a warning and left out of
        the result. Returns the concatenated entry texts.

        Raises `ValueError` if more than 10 ids are given.
        """
        if len(ids) > 10:
            raise ValueError("Can batch at most 10 ids at a time.")

        get = self.get
        uncached = []
        unmatched = set()

        with closing(get.cache_store()) as store:
            # Which ids are already cached
            # TODO: Invalidate entries by release string.
            for entry_id in ids:
                key = get.key_from_args((entry_id,))
                if not get.key_has_valid_cache(key, store):
                    uncached.append(entry_id)

        if uncached:
            # in case there are duplicate ids
            uncached = sorted(set(uncached))

            rval = KeggApi.get(self, uncached)

            if rval is not None:
                entries = rval.split("///\n")
            else:
                entries = []

            if entries and not entries[-1].strip():
                # Delete the last single newline entry if present
                del entries[-1]

            if len(entries) != len(uncached):
                # Entries can not be paired with ids by position
                new_uncached, entries = match_by_ids(uncached, entries)
                unmatched = set(uncached) - set(new_uncached)
                uncached = new_uncached
                if unmatched:
                    warnings.warn("Unable to match entries for keys: %s." %
                                  ", ".join(map(repr, unmatched)))

            with closing(get.cache_store()) as store:
                for entry_id, entry in zip(uncached, entries):
                    key = get.key_from_args((entry_id,))
                    if entry is not None:
                        # Restore the terminator removed by the split
                        entry = entry + "///\n"
                    store[key] = cache_entry(entry, mtime=datetime.now())

        # Finally join all the results, but drop all None objects.
        # The store must be bound here (the earlier ones are closed), and
        # unmatched ids were never stored so they must not be looked up.
        with closing(get.cache_store()) as store:
            entries = [store[get.key_from_args((entry_id,))].value
                       for entry_id in ids if entry_id not in unmatched]

        return "".join([entry for entry in entries if entry is not None])

    @cached_method
    def conv(self, target_db, source):
        return KeggApi.conv(self, target_db, source)

    ########
    # LinkDB
    ########

    @cached_method
    def get_genes_by_enzyme(self, enzyme_id, org):
        return KeggApi.get_genes_by_enzyme(self, enzyme_id, org)

    @cached_method
    def get_enzymes_by_gene(self, genes_id):
        return KeggApi.get_enzymes_by_gene(self, genes_id)

    @cached_method
    def get_enzymes_by_compound(self, compound_id):
        return KeggApi.get_enzymes_by_compound(self, compound_id)

    @cached_method
    def get_enzymes_by_glycan(self, glycan_id):
        return KeggApi.get_enzymes_by_glycan(self, glycan_id)

    @cached_method
    def get_enzymes_by_reaction(self, reaction_id):
        return KeggApi.get_enzymes_by_reaction(self, reaction_id)

    @cached_method
    def get_compounds_by_enzyme(self, enzyme_id):
        return KeggApi.get_compounds_by_enzyme(self, enzyme_id)

    @cached_method
    def get_compounds_by_reaction(self, reaction_id):
        return KeggApi.get_compounds_by_reaction(self, reaction_id)

    @cached_method
    def get_glycans_by_enzyme(self, enzyme_id):
        return KeggApi.get_glycans_by_enzyme(self, enzyme_id)

    @cached_method
    def get_glycans_by_reaction(self, reaction_id):
        return KeggApi.get_glycans_by_reaction(self, reaction_id)

    @cached_method
    def get_reactions_by_enzyme(self, enzyme_id):
        return KeggApi.get_reactions_by_enzyme(self, enzyme_id)

    @cached_method
    def get_reactions_by_compound(self, compound_id):
        return KeggApi.get_reactions_by_compound(self, compound_id)

    @cached_method
    def get_reactions_by_glycan(self, glycan_id):
        return KeggApi.get_reactions_by_glycan(self, glycan_id)

    ######
    # SSDB
    ######

    @cached_method
    def get_best_best_neighbors_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_best_best_neighbors_by_gene(self, genes_id, offset,
                                                       limit)

    @cached_method
    def get_best_neighbors_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_best_neighbors_by_gene(self, genes_id, offset,
                                                  limit)

    @cached_method
    def get_reverse_best_neighbors_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_reverse_best_neighbors_by_gene(self, genes_id,
                                                          offset, limit)

    @cached_method
    def get_paralogs_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_paralogs_by_gene(self, genes_id, offset, limit)

    #######
    # Motif
    #######

    @cached_method
    def get_motifs_by_gene(self, genes_id, db):
        return KeggApi.get_motifs_by_gene(self, genes_id, db)

    @cached_method
    def get_genes_by_motifs(self, motif_id_list, offset, limit):
        return KeggApi.get_genes_by_motifs(self, motif_id_list, offset, limit)

    ####
    # KO
    ####

    @cached_method
    def get_ko_by_gene(self, genes_id):
        return KeggApi.get_ko_by_gene(self, genes_id)

    @cached_method
    def get_ko_by_ko_class(self, ko_class_id):
        # Delegate to the base class method like every other wrapper;
        # `service` is an instance attribute, so `KeggApi.service` raised
        # AttributeError.
        return KeggApi.get_ko_by_ko_class(self, ko_class_id)

    @cached_method
    def get_genes_by_ko_class(self, ko_class_id, org, offset, limit):
        return KeggApi.get_genes_by_ko_class(self, ko_class_id, org, offset,
                                             limit)

    @cached_method
    def get_genes_by_ko(self, ko_id, org):
        return KeggApi.get_genes_by_ko(self, ko_id, org)

    #########
    # Pathway
    #########

    @cached_method
    def get_genes_by_organism(self, organism, offset=None, limit=None):
        return KeggApi.get_genes_by_organism(self, organism, offset=offset,
                                             limit=limit)

    @cached_method
    def get_number_of_genes_by_organism(self, organism):
        return KeggApi.get_number_of_genes_by_organism(self, organism)

    @cached_method
    def get_pathways_by_genes(self, gene_list):
        return KeggApi.get_pathways_by_genes(self, gene_list)

    @cached_method
    def get_pathways_by_enzymes(self, enzyme_list):
        return KeggApi.get_pathways_by_enzymes(self, enzyme_list)

    @cached_method
    def get_pathways_by_compounds(self, compound_list):
        return KeggApi.get_pathways_by_compounds(self, compound_list)

    @cached_method
    def get_pathways_by_drugs(self, drug_list):
        return KeggApi.get_pathways_by_drugs(self, drug_list)

    @cached_method
    def get_pathways_by_glycans(self, glycan_list):
        return KeggApi.get_pathways_by_glycans(self, glycan_list)

    @cached_method
    def get_pathways_by_reactions(self, reaction_list):
        return KeggApi.get_pathways_by_reactions(self, reaction_list)

    @cached_method
    def get_pathways_by_kos(self, ko_list):
        return KeggApi.get_pathways_by_kos(self, ko_list)

    @cached_method
    def get_elements_by_pathway(self, pathway_id):
        return KeggApi.get_elements_by_pathway(self, pathway_id)

    @cached_method
    def get_genes_by_pathway(self, pathway_id):
        return KeggApi.get_genes_by_pathway(self, pathway_id)

    @cached_method
    def get_enzymes_by_pathway(self, pathway_id):
        return KeggApi.get_enzymes_by_pathway(self, pathway_id)

    @cached_method
    def get_compounds_by_pathway(self, pathway_id):
        return KeggApi.get_compounds_by_pathway(self, pathway_id)

    @cached_method
    def get_drugs_by_pathway(self, pathway_id):
        return KeggApi.get_drugs_by_pathway(self, pathway_id)

    @cached_method
    def get_glycans_by_pathway(self, pathway_id):
        return KeggApi.get_glycans_by_pathway(self, pathway_id)

    @cached_method
    def get_reactions_by_pathway(self, pathway_id):
        return KeggApi.get_reactions_by_pathway(self, pathway_id)

    @cached_method
    def get_kos_by_pathway(self, pathway_id):
        return KeggApi.get_kos_by_pathway(self, pathway_id)
632
633
def match_by_ids(ids, entries):
    """
    Pair search `ids` with the entry texts in `entries`.

    The identifier of an entry is taken to be the second whitespace
    separated field of its first line. A search id is matched first by its
    full text and then, if it contains a ':', by the part following the
    first ':'. Each entry is matched to at most one search id.

    Return a tuple `(matched_ids, matched_entries)` of two parallel lists,
    ordered by the first occurrence of each id in `ids`. Ids without a
    matching entry, and entries whose first line carries no identifier,
    are left out.
    """
    def entry_identifier(entry_text):
        # Second field of the first line, or None when there is none.
        first_line = entry_text.split("\n", 1)[0]
        fields = first_line.split(None, 2)
        return fields[1] if len(fields) > 1 else None

    entries_by_id = {}
    for entry in entries:
        eid = entry_identifier(entry)
        if eid is not None:
            entries_by_id[eid] = entry

    # Distinct search ids in their original order, so the result does not
    # depend on set iteration order.
    seen = set()
    pending = []
    for search_id in ids:
        if search_id not in seen:
            seen.add(search_id)
            pending.append(search_id)

    matched = {}

    # First match full search ids
    for search_id in pending:
        if search_id in entries_by_id:
            matched[search_id] = entries_by_id.pop(search_id)

    # Second pass, split the search ids by ':' to db and identifier part,
    # match by identifier
    for search_id in pending:
        if search_id in matched or ":" not in search_id:
            continue
        identifier = search_id.split(":", 1)[1]
        if identifier in entries_by_id:
            matched[search_id] = entries_by_id.pop(identifier)

    matched_ids = [search_id for search_id in pending if search_id in matched]
    matched_entries = [matched[search_id] for search_id in matched_ids]
    return matched_ids, matched_entries
Note: See TracBrowser for help on using the repository browser.