source: orange-bioinformatics/_bioinformatics/obiKEGG/api.py @ 1735:50499d1dc55a

Revision 1735:50499d1dc55a, 20.5 KB checked in by Ales Erjavec <ales.erjavec@…>, 13 months ago (diff)

Changed the Organism.gene_aliases method.

Line 
1"""
2KEGG api interface.
3
4"""
5from __future__ import absolute_import
6
7from datetime import datetime
8from contextlib import closing
9from operator import itemgetter
10import warnings
11
12from .service import web_service
13from .types import OrganismSummary, Definition, BInfo, Link
14
15
# Table of the KEGG databases known to this module, one 4-tuple per
# database.
# NOTE(review): the fields look like (title, database name, short name,
# entry id prefix), with None where a field does not apply -- confirm
# against the code that consumes this table.
DATABASES = [
    ("KEGG Pathway", "pathway", "path", None),
    ("KEGG Brite", "brite", "br", None),
    ("KEGG Module", "module", "md", "M"),
    ("KEGG Disease", "disease", "ds", "H"),
    ("KEGG Drug", "drug", "dr", "D"),
    ("KEGG Orthology", "orthology", "ko", "K"),
    ("KEGG Genome", "genome", "genome", "T"),
    ("KEGG Genomes", "genomes", "gn", "T"),
    ("KEGG Genes", "genes", None, None),
    ("KEGG Ligand", "ligand", "ligand", None),
    ("KEGG Compound", "compound", "cpd", "C"),
    ("KEGG Glycan", "glycan", "gl", "G"),
    ("KEGG Reaction", "reaction", "rn", "R"),
    ("KEGG RPair", "rpair", "rp", "RP"),
    ("KEGG RClass", "rclass", "rc", "RC"),
    ("KEGG Enzyme", "enzyme", "ec", "E"),
]
34
35
36def _link_targets(links):
37    return sorted(set(map(itemgetter(1), links)))
38
39
class KeggApi(object):
    """
    An abstraction of a kegg api.

    Every query is issued through ``self.service`` (the object returned by
    ``web_service()``) and the text response is parsed into python objects.
    Methods for which the KEGG REST api offers no replacement are kept as
    placeholders raising ``NotImplementedError``.
    """

    def __init__(self):
        self.service = web_service()

    def list_organisms(self):
        """
        Return a list of all available organisms

        >>> api.list_organisms()
        [Definition(entry_id='hsa',...

        """
        lines = self.service.list.organism.get().splitlines()
        # A comprehension (not `map`) so the result is a real list
        # regardless of the python version.
        return [OrganismSummary.from_str(line) for line in lines]

    def list_pathways(self, organism):
        """
        Return a list of all available pathways for `organism`

        >>> api.list_pathways("hsa")
        [Definition(entry_id=',...

        """
        lines = self.service.list.pathway(organism).get().splitlines()
        return [Definition.from_str(line) for line in lines]

    def list(self, db):
        """
        Return a list of all available entries in database `db`.
        """
        lines = self.service.list(db).get().splitlines()
        return [Definition.from_str(line) for line in lines]

    #######
    # DBGET
    #######

    def info(self, db):
        """
        Return info for database `db`

        >>> print api.info("pathway")
        BInfo(entry_id='path', definition='KEGG Pathway Database', ...

        """
        result = self.service.info(db).get()
        return BInfo.from_text(str(result))

    def find(self, db, keywords):
        """
        Search database 'db' for keywords.

        `keywords` is a single string or a sequence of strings; multiple
        keywords are joined with '+'. The raw response is returned.
        """
        if isinstance(keywords, basestring):
            keywords = [keywords]

        return self.service.find(db)("+".join(keywords)).get()

    def get(self, ids):
        """
        Retrieve database entries for `ids` list.

        `ids` is a single id string or a sequence of id strings (joined
        with '+'). The raw response is returned.
        """
        if not isinstance(ids, basestring):
            # Sequence of ids
            ids = "+".join(ids)

        return self.service.get(ids).get()

    def conv(self, target_db, source):
        """
        Return a mapping from source to target_db ids as a list of two
        tuples [(source_id, target_id), ...].

        `source` is a single string or a sequence of strings (joined
        with '+').
        """
        if not isinstance(source, basestring):
            source = "+".join(source)

        res = self.service.conv(target_db)(source).get()
        return [tuple(line.split("\t")) for line in res.splitlines()]

    def link(self, target_db, source_db=None, ids=None):
        """
        Return a list of `Link` records from a source to `target_db`.

        Exactly one of `source_db` (a database name) or `ids` (a single id
        string or a sequence of id strings) must be supplied.

        :raises ValueError: if neither or both of `source_db` and `ids`
            are given.
        """
        if not (source_db or ids):
            raise ValueError("One of 'source_db' or 'ids' must be supplied")
        if source_db and ids:
            raise ValueError("Only one 'source_db' or 'ids' must be supplied")

        if source_db:
            query = source_db
        elif isinstance(ids, basestring):
            # A single id; joining it would split it into characters.
            # `get` and `conv` accept a bare string the same way.
            query = ids
        else:
            query = "+".join(ids)

        result = self.service.link(target_db)(query).get()

        # line.split() (instead of the unbound str.split) also works when
        # the response is a unicode string.
        return [Link._make(line.split()) for line in result.splitlines()]

    def get_genes_by_enzyme(self, enzyme_id, org):
        return _link_targets(self.link(org, ids=[enzyme_id]))

    def get_enzymes_by_gene(self, gene_id):
        return _link_targets(self.link("ec", ids=[gene_id]))

    def get_enzymes_by_compound(self, compound_id):
        return _link_targets(self.link("ec", ids=[compound_id]))

    def get_enzymes_by_glycan(self, glycan_id):
        return _link_targets(self.link("ec", ids=[glycan_id]))

    def get_enzymes_by_reaction(self, reaction_id):
        return _link_targets(self.link("ec", ids=[reaction_id]))

    def get_compounds_by_enzyme(self, enzyme_id):
        return _link_targets(self.link("compound", ids=[enzyme_id]))

    def get_compounds_by_reaction(self, reaction_id):
        return _link_targets(self.link("compound", ids=[reaction_id]))

    def get_glycans_by_enzyme(self, enzyme_id):
        return _link_targets(self.link("gl", ids=[enzyme_id]))

    def get_glycans_by_reaction(self, reaction_id):
        return _link_targets(self.link("gl", ids=[reaction_id]))

    def get_reactions_by_enzyme(self, enzyme_id):
        return _link_targets(self.link("rn", ids=[enzyme_id]))

    def get_reactions_by_compound(self, compound_id):
        return _link_targets(self.link("rn", ids=[compound_id]))

    def get_reactions_by_glycan(self, glycan_id):
        return _link_targets(self.link("rn", ids=[glycan_id]))

    ######
    # SSDB
    ######

    # No replacement api in the KEGG REST api.
    def get_best_best_neighbors_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    def get_best_neighbors_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    def get_reverse_best_neighbors_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    def get_paralogs_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    #######
    # Motif
    #######

    # No replacement api in KEGG REST api
    def get_motifs_by_gene(self, genes_id, db):
        raise NotImplementedError

    def get_genes_by_motifs(self, motif_id_list, offset, limit):
        raise NotImplementedError

    ####
    # KO
    ####

    def get_ko_by_gene(self, genes_id):
        raise NotImplementedError

    def get_ko_by_ko_class(self, ko_class_id):
        raise NotImplementedError

    def get_genes_by_ko_class(self, ko_class_id, org, offset, limit):
        raise NotImplementedError

    def get_genes_by_ko(self, ko_id, org):
        raise NotImplementedError

    #########
    # Pathway
    #########

    def mark_pathway_by_objects(self, pathway_id, object_id_list):
        raise NotImplementedError

    def color_pathway_by_objects(self, pathway_id, object_id_list,
                                 fg_color_list, bg_color_list):
        raise NotImplementedError

    def color_pathway_by_elements(self, pathway_id, element_id_list,
                                  fg_color_list, bg_color_list):
        raise NotImplementedError

    def get_html_of_marked_pathway_by_objects(self, pathway_id,
                                              object_id_list):
        raise NotImplementedError

    def get_html_of_colored_pathway_by_objects(self, pathway_id,
                                               object_id_list, fg_color_list,
                                               bg_color_list):
        raise NotImplementedError

    def get_html_of_colored_pathway_by_elements(self, pathway_id,
                                                element_id_list, fg_color_list,
                                                bg_color_list):
        raise NotImplementedError

    # NOTE(review): the next two methods delegate to a same named call on
    # the service object and, unlike the other queries, do not call
    # `.get()` on the result -- confirm the service supports them.
    def get_references_by_pathway(self, pathway_id):
        return self.service.get_references_by_pathway(pathway_id)

    def get_element_relations_by_pathway(self, pathway_id):
        return self.service.get_element_relations_by_pathway(pathway_id)

    def get_genes_by_organism(self, organism, offset=None, limit=None):
        """
        Return the entry ids (first field of every listed line) of
        `organism`.

        :raises NotImplementedError: if `offset` or `limit` is not None.
        """
        if offset is not None:
            raise NotImplementedError("offset is no longer supported")
        if limit is not None:
            raise NotImplementedError("limit is no longer supported.")

        res = self.service.list(organism).get().splitlines()
        return [r.split(None, 1)[0] for r in res]

    def get_number_of_genes_by_organism(self, organism):
        raise NotImplementedError

    ####################
    # Objects by pathway
    ####################

    def get_elements_by_pathway(self, pathway_id):
        raise NotImplementedError

    def get_genes_by_pathway(self, pathway_id):
        return _link_targets(self.link("genes", ids=[pathway_id]))

    def get_enzymes_by_pathway(self, pathway_id):
        return _link_targets(self.link("ec", ids=[pathway_id]))

    def get_compounds_by_pathway(self, pathway_id):
        return _link_targets(self.link("compound", ids=[pathway_id]))

    def get_drugs_by_pathway(self, pathway_id):
        return _link_targets(self.link("drug", ids=[pathway_id]))

    def get_glycans_by_pathway(self, pathway_id):
        return _link_targets(self.link("gl", ids=[pathway_id]))

    def get_reactions_by_pathway(self, pathway_id):
        return _link_targets(self.link("rn", ids=[pathway_id]))

    def get_kos_by_pathway(self, pathway_id):
        return _link_targets(self.link("ko", ids=[pathway_id]))

    #####################
    # Pathways by objects
    #####################

    # These functions returned results intersections.
    def get_pathways_by_genes(self, gene_list):
        raise NotImplementedError

    def get_pathways_by_enzymes(self, enzyme_list):
        raise NotImplementedError

    def get_pathways_by_compounds(self, compound_list):
        raise NotImplementedError

    def get_pathways_by_drugs(self, drug_list):
        raise NotImplementedError

    def get_pathways_by_glycans(self, glycan_list):
        raise NotImplementedError

    def get_pathways_by_reactions(self, reaction_list):
        raise NotImplementedError

    def get_pathways_by_kos(self, ko_list):
        raise NotImplementedError

    ##########################
    # Relations among pathways
    ##########################

    def get_linked_pathways(self, pathway_id):
        """
        Return the sorted pathway ids linked to `pathway_id`; the 'path:'
        prefix is added to `pathway_id` when missing.
        """
        if not pathway_id.startswith("path:"):
            pathway_id = "path:" + pathway_id
        return _link_targets(self.link("pathway", ids=[pathway_id]))
325
326
327"""
328KEGG api with caching
329"""
330
331import os
332
333from . import caching
334from .caching import cached_method, cache_entry, touch_dir
335
336try:
337    from functools import lru_cache
338except ImportError:
339    # TODO: move a copy of lru_cache in .caching if distributing this as a
340    # standalone package
341    from Orange.utils import lru_cache
342
343
class CachedKeggApi(KeggApi):
    """
    A `KeggApi` whose query methods are wrapped with caching decorators.

    Most methods are decorated with `cached_method`, which uses this
    instance's `cache_store` and `last_modified` methods; `info` is only
    cached in memory with `lru_cache`.
    """

    def __init__(self, store=None):
        KeggApi.__init__(self)
        # Always define the attribute; previously a caller supplied `store`
        # was dropped and `self.store` was left undefined.
        self.store = {} if store is None else store

    # Needed API for cached decorator.
    def cache_store(self):
        """
        Return a `caching.Sqlite3Store` for 'kegg_api_cache_1.sqlite3'
        inside the directory configured as ``conf.params["cache.path"]``.
        """
        from . import conf
        path = conf.params["cache.path"]
        touch_dir(path)
        return caching.Sqlite3Store(os.path.join(path,
                                                 "kegg_api_cache_1.sqlite3"))

    def last_modified(self, args, kwargs=None):
        """
        Return the release set by `set_default_release`, or "" if none was
        set. `args` and `kwargs` are ignored.
        """
        return getattr(self, "default_release", "")

    def set_default_release(self, release):
        self.default_release = release

    @cached_method
    def list_organisms(self):
        return KeggApi.list_organisms(self)

    @cached_method
    def list_pathways(self, organism):
        return KeggApi.list_pathways(self, organism)

    @cached_method
    def list(self, db):
        return KeggApi.list(self, db)

    @lru_cache()  # not persistently cached
    def info(self, db):
        return KeggApi.info(self, db)

    @cached_method
    def find(self, db, keywords):
        return KeggApi.find(self, db, keywords)

    @cached_method
    def get(self, ids):
        """
        Retrieve database entries for `ids`; a sequence of ids is routed
        through `_batch_get`, a single id string goes to `KeggApi.get`.
        """
        if not isinstance(ids, basestring):
            return self._batch_get(ids)
        else:
            return KeggApi.get(self, ids)

    def _batch_get(self, ids):
        """
        Retrieve the entries for a sequence of at most 10 `ids`.

        Ids missing from the cache store are requested in a single call,
        the response is split into entries and each one is stored under
        the key of its id. The result is the concatenation of `self.get`
        for every id in `ids`.

        :raises ValueError: if more than 10 ids are passed.
        """
        if len(ids) > 10:
            raise ValueError("Can batch at most 10 ids at a time.")

        get = self.get
        uncached = []
        unmatched = set()

        with closing(get.cache_store()) as store:
            # Which ids are already cached
            # TODO: Invalidate entries by release string.
            for entry_id in ids:
                key = get.key_from_args((entry_id,))
                if key not in store:
                    uncached.append(entry_id)

        if uncached:
            # in case there are duplicate ids
            uncached = sorted(set(uncached))
            rval = KeggApi.get(self, uncached)

            if rval is not None:
                entries = rval.split("///\n")
            else:
                entries = []

            if entries and not entries[-1].strip():
                # Delete the last single newline entry if present
                del entries[-1]

            if len(entries) != len(uncached):
                # Ids and entries can not be paired by position; pair them
                # by the id found in each entry and skip the rest.
                new_uncached, entries = match_by_ids(uncached, entries)
                unmatched = set(uncached) - set(new_uncached)
                uncached = new_uncached
                warnings.warn("Unable to match entries for keys: %s." %
                              ", ".join(map(repr, unmatched)))

            with closing(get.cache_store()) as store:
                for entry_id, entry in zip(uncached, entries):
                    key = get.key_from_args((entry_id,))
                    if entry is not None:
                        # Restore the terminator removed by the split
                        entry = entry + "///\n"
                    store[key] = cache_entry(entry, mtime=datetime.now())

        # Finally join all the results, but drop all None objects
        entries = [entry for entry in map(get, ids) if entry is not None]

        rval = "".join(entries)
        return rval

    @cached_method
    def conv(self, target_db, source):
        return KeggApi.conv(self, target_db, source)

    ########
    # LinkDB
    ########

    @cached_method
    def get_genes_by_enzyme(self, enzyme_id, org):
        return KeggApi.get_genes_by_enzyme(self, enzyme_id, org)

    @cached_method
    def get_enzymes_by_gene(self, genes_id):
        return KeggApi.get_enzymes_by_gene(self, genes_id)

    @cached_method
    def get_enzymes_by_compound(self, compound_id):
        return KeggApi.get_enzymes_by_compound(self, compound_id)

    @cached_method
    def get_enzymes_by_glycan(self, glycan_id):
        return KeggApi.get_enzymes_by_glycan(self, glycan_id)

    @cached_method
    def get_enzymes_by_reaction(self, reaction_id):
        return KeggApi.get_enzymes_by_reaction(self, reaction_id)

    @cached_method
    def get_compounds_by_enzyme(self, enzyme_id):
        return KeggApi.get_compounds_by_enzyme(self, enzyme_id)

    @cached_method
    def get_compounds_by_reaction(self, reaction_id):
        return KeggApi.get_compounds_by_reaction(self, reaction_id)

    @cached_method
    def get_glycans_by_enzyme(self, enzyme_id):
        return KeggApi.get_glycans_by_enzyme(self, enzyme_id)

    @cached_method
    def get_glycans_by_reaction(self, reaction_id):
        return KeggApi.get_glycans_by_reaction(self, reaction_id)

    @cached_method
    def get_reactions_by_enzyme(self, enzyme_id):
        return KeggApi.get_reactions_by_enzyme(self, enzyme_id)

    @cached_method
    def get_reactions_by_compound(self, compound_id):
        return KeggApi.get_reactions_by_compound(self, compound_id)

    @cached_method
    def get_reactions_by_glycan(self, glycan_id):
        return KeggApi.get_reactions_by_glycan(self, glycan_id)

    ######
    # SSDB
    ######

    @cached_method
    def get_best_best_neighbors_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_best_best_neighbors_by_gene(self, genes_id, offset,
                                                       limit)

    @cached_method
    def get_best_neighbors_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_best_neighbors_by_gene(self, genes_id, offset,
                                                  limit)

    @cached_method
    def get_reverse_best_neighbors_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_reverse_best_neighbors_by_gene(self, genes_id,
                                                          offset, limit)

    @cached_method
    def get_paralogs_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_paralogs_by_gene(self, genes_id, offset, limit)

    #######
    # Motif
    #######

    @cached_method
    def get_motifs_by_gene(self, genes_id, db):
        return KeggApi.get_motifs_by_gene(self, genes_id, db)

    @cached_method
    def get_genes_by_motifs(self, motif_id_list, offset, limit):
        return KeggApi.get_genes_by_motifs(self, motif_id_list, offset, limit)

    ####
    # KO
    ####

    @cached_method
    def get_ko_by_gene(self, genes_id):
        return KeggApi.get_ko_by_gene(self, genes_id)

    @cached_method
    def get_ko_by_ko_class(self, ko_class_id):
        # Delegate to the base class method like every sibling does;
        # `KeggApi.service` is an instance attribute and can not be
        # reached through the class.
        return KeggApi.get_ko_by_ko_class(self, ko_class_id)

    @cached_method
    def get_genes_by_ko_class(self, ko_class_id, org, offset, limit):
        return KeggApi.get_genes_by_ko_class(self, ko_class_id, org, offset,
                                             limit)

    @cached_method
    def get_genes_by_ko(self, ko_id, org):
        return KeggApi.get_genes_by_ko(self, ko_id, org)

    #########
    # Pathway
    #########

    @cached_method
    def get_genes_by_organism(self, organism, offset=None, limit=None):
        return KeggApi.get_genes_by_organism(self, organism, offset=offset,
                                             limit=limit)

    @cached_method
    def get_number_of_genes_by_organism(self, organism):
        return KeggApi.get_number_of_genes_by_organism(self, organism)

    @cached_method
    def get_pathways_by_genes(self, gene_list):
        return KeggApi.get_pathways_by_genes(self, gene_list)

    @cached_method
    def get_pathways_by_enzymes(self, enzyme_list):
        return KeggApi.get_pathways_by_enzymes(self, enzyme_list)

    @cached_method
    def get_pathways_by_compounds(self, compound_list):
        return KeggApi.get_pathways_by_compounds(self, compound_list)

    @cached_method
    def get_pathways_by_drugs(self, drug_list):
        return KeggApi.get_pathways_by_drugs(self, drug_list)

    @cached_method
    def get_pathways_by_glycans(self, glycan_list):
        return KeggApi.get_pathways_by_glycans(self, glycan_list)

    @cached_method
    def get_pathways_by_reactions(self, reaction_list):
        return KeggApi.get_pathways_by_reactions(self, reaction_list)

    @cached_method
    def get_pathways_by_kos(self, ko_list):
        return KeggApi.get_pathways_by_kos(self, ko_list)

    @cached_method
    def get_elements_by_pathway(self, pathway_id):
        return KeggApi.get_elements_by_pathway(self, pathway_id)

    @cached_method
    def get_genes_by_pathway(self, pathway_id):
        return KeggApi.get_genes_by_pathway(self, pathway_id)

    @cached_method
    def get_enzymes_by_pathway(self, pathway_id):
        return KeggApi.get_enzymes_by_pathway(self, pathway_id)

    @cached_method
    def get_compounds_by_pathway(self, pathway_id):
        return KeggApi.get_compounds_by_pathway(self, pathway_id)

    @cached_method
    def get_drugs_by_pathway(self, pathway_id):
        return KeggApi.get_drugs_by_pathway(self, pathway_id)

    @cached_method
    def get_glycans_by_pathway(self, pathway_id):
        return KeggApi.get_glycans_by_pathway(self, pathway_id)

    @cached_method
    def get_reactions_by_pathway(self, pathway_id):
        return KeggApi.get_reactions_by_pathway(self, pathway_id)

    @cached_method
    def get_kos_by_pathway(self, pathway_id):
        return KeggApi.get_kos_by_pathway(self, pathway_id)
625
626
def match_by_ids(ids, entries):
    """
    Pair search ids with the entry texts that belong to them.

    The identifier of an entry is the second whitespace separated field of
    its first line. A search id is matched first by its full text; ids
    still unmatched are then matched by the part following their first
    ':' (i.e. with the database prefix removed).

    :param ids: search ids; a repeated id is considered only once.
    :param entries: entry texts. Entries whose first line has fewer than
        two fields are ignored.
    :return: ``(matched_ids, matched_entries)``, two lists of equal length
        where ``matched_entries[i]`` is the entry for ``matched_ids[i]``.
        Full matches come first, followed by the prefix-less matches, each
        group in the order of `ids`. Ids without an entry (and entries
        without an id) are left out.
    """
    matched_ids = []
    matched_entries = []

    entries_by_id = {}
    for entry in entries:
        header = entry.split("\n", 1)[0].split(None, 2)
        if len(header) < 2:
            # No identifier field on the first line; nothing to match on.
            continue
        entries_by_id[header[1]] = entry

    # Unique search ids in their original order, so the result does not
    # depend on set iteration order.
    seen = set()
    pending = []
    for search_id in ids:
        if search_id not in seen:
            seen.add(search_id)
            pending.append(search_id)

    # First match full search ids
    remaining = []
    for search_id in pending:
        if search_id in entries_by_id:
            matched_ids.append(search_id)
            matched_entries.append(entries_by_id.pop(search_id))
        else:
            remaining.append(search_id)

    # Second pass, split the search ids by ':' to db and identifier part,
    # match by identifier
    for search_id in remaining:
        _, sep, rest = search_id.partition(":")
        if sep and rest in entries_by_id:
            matched_ids.append(search_id)
            matched_entries.append(entries_by_id.pop(rest))

    return matched_ids, matched_entries
Note: See TracBrowser for help on using the repository browser.