source: orange-bioinformatics/_bioinformatics/obiKEGG/api.py @ 1735:50499d1dc55a

Revision 1735:50499d1dc55a, 20.5 KB checked in by Ales Erjavec <ales.erjavec@…>, 13 months ago (diff)

Changed the Organism.gene_aliases method.

RevLine 
[1532]1"""
2KEGG api interface.
3
4"""
5from __future__ import absolute_import
6
[1733]7from datetime import datetime
[1532]8from contextlib import closing
[1733]9from operator import itemgetter
10import warnings
[1532]11
12from .service import web_service
[1733]13from .types import OrganismSummary, Definition, BInfo, Link
14
15
# Table of the known KEGG databases, one 4-tuple per database.
# NOTE(review): the columns look like (display title, database name,
# short database prefix, entry id letter prefix) with None where a value
# does not apply -- confirm against the users of this table.
DATABASES = [
    ("KEGG Pathway", "pathway", "path", None),
    ("KEGG Brite", "brite", "br", None),
    ("KEGG Module", "module", "md", "M"),
    ("KEGG Disease", "disease", "ds", "H"),
    ("KEGG Drug", "drug", "dr", "D"),
    ("KEGG Orthology", "orthology", "ko", "K"),
    ("KEGG Genome", "genome", "genome", "T"),
    ("KEGG Genomes", "genomes", "gn", "T"),
    ("KEGG Genes", "genes", None, None),
    ("KEGG Ligand", "ligand", "ligand", None),
    ("KEGG Compound", "compound", "cpd", "C"),
    ("KEGG Glycan", "glycan", "gl", "G"),
    ("KEGG Reaction", "reaction", "rn", "R"),
    ("KEGG RPair", "rpair", "rp", "RP"),
    ("KEGG RClass", "rclass", "rc", "RC"),
    ("KEGG Enzyme", "enzyme", "ec", "E"),
]
34
35
36def _link_targets(links):
37    return sorted(set(map(itemgetter(1), links)))
38
[1532]39
class KeggApi(object):
    """
    An abstraction of a kegg api.

    All requests go through ``self.service`` (the object returned by
    ``web_service()``); the methods here build the request and parse the
    text of the response. Methods that have no replacement in the KEGG
    REST api raise :class:`NotImplementedError`.

    """

    def __init__(self):
        self.service = web_service()

    def list_organisms(self):
        """
        Return a list of all available organisms.

        Every line of the response is parsed with
        ``OrganismSummary.from_str``.

        """
        text = self.service.list.organism.get()
        return [OrganismSummary.from_str(line) for line in text.splitlines()]

    def list_pathways(self, organism):
        """
        Return a list of all available pathways for `organism`

        >>> api.list_pathways("hsa")
        [Definition(entry_id=...

        """
        text = self.service.list.pathway(organism).get()
        return [Definition.from_str(line) for line in text.splitlines()]

    def list(self, db):
        """
        Return a list of all available entries in database `db`.
        """
        text = self.service.list(db).get()
        return [Definition.from_str(line) for line in text.splitlines()]

    #######
    # DBGET
    #######

    def info(self, db):
        """
        Return info for database `db`

        >>> print api.info("pathway")
        BInfo(entry_id='path', definition='KEGG Pathway Database', ...

        """
        result = self.service.info(db).get()
        return BInfo.from_text(str(result))

    def find(self, db, keywords):
        """
        Search database 'db' for keywords.

        `keywords` is a single string or a sequence of strings (joined
        with '+' for the request). The raw response is returned.

        """
        if isinstance(keywords, basestring):
            keywords = [keywords]

        return self.service.find(db)("+".join(keywords)).get()

    def get(self, ids):
        """
        Retrieve database entries for `ids` list.

        `ids` is a single id string or a sequence of id strings (joined
        with '+' for the request). The raw response is returned.

        """
        if not isinstance(ids, basestring):
            # Sequence of ids
            ids = "+".join(ids)

        return self.service.get(ids).get()

    def conv(self, target_db, source):
        """
        Return a mapping from source to target_db ids as a list of
        2-tuples [(source_id, target_id), ...].

        `source` is a single string or a sequence of strings (joined
        with '+' for the request). Every response line is split on tabs.

        """
        if not isinstance(source, basestring):
            source = "+".join(source)

        res = self.service.conv(target_db)(source).get()
        return [tuple(line.split("\t")) for line in res.splitlines()]

    def link(self, target_db, source_db=None, ids=None):
        """
        Return a list of `Link` records relating entries to `target_db`.

        Exactly one of `source_db` (a database name) or `ids` (a single
        id string or a sequence of id strings) must be given; otherwise
        a :class:`ValueError` is raised. Every response line is split on
        whitespace and passed to ``Link._make``.

        """
        if not (source_db or ids):
            raise ValueError("One of 'source_db' or 'ids' must be supplied")
        if source_db and ids:
            raise ValueError("Only one 'source_db' or 'ids' must be supplied")

        if source_db:
            source = source_db
        elif isinstance(ids, basestring):
            # A single id; joining it would interleave '+' between its
            # characters.
            source = ids
        else:
            source = "+".join(ids)

        result = self.service.link(target_db)(source).get()
        # line.split() works for both str and unicode responses
        # (str.split as an unbound method rejects unicode lines).
        return [Link._make(line.split()) for line in result.splitlines()]

    def get_genes_by_enzyme(self, enzyme_id, org):
        return _link_targets(self.link(org, ids=[enzyme_id]))

    def get_enzymes_by_gene(self, gene_id):
        return _link_targets(self.link("ec", ids=[gene_id]))

    def get_enzymes_by_compound(self, compound_id):
        return _link_targets(self.link("ec", ids=[compound_id]))

    def get_enzymes_by_glycan(self, glycan_id):
        return _link_targets(self.link("ec", ids=[glycan_id]))

    def get_enzymes_by_reaction(self, reaction_id):
        return _link_targets(self.link("ec", ids=[reaction_id]))

    def get_compounds_by_enzyme(self, enzyme_id):
        return _link_targets(self.link("compound", ids=[enzyme_id]))

    def get_compounds_by_reaction(self, reaction_id):
        return _link_targets(self.link("compound", ids=[reaction_id]))

    def get_glycans_by_enzyme(self, enzyme_id):
        return _link_targets(self.link("gl", ids=[enzyme_id]))

    def get_glycans_by_reaction(self, reaction_id):
        return _link_targets(self.link("gl", ids=[reaction_id]))

    def get_reactions_by_enzyme(self, enzyme_id):
        return _link_targets(self.link("rn", ids=[enzyme_id]))

    def get_reactions_by_compound(self, compound_id):
        return _link_targets(self.link("rn", ids=[compound_id]))

    def get_reactions_by_glycan(self, glycan_id):
        return _link_targets(self.link("rn", ids=[glycan_id]))

    ######
    # SSDB
    ######

    # No replacement api in the KEGG REST api.
    def get_best_best_neighbors_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    def get_best_neighbors_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    def get_reverse_best_neighbors_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    def get_paralogs_by_gene(self, genes_id, offset, limit):
        raise NotImplementedError

    #######
    # Motif
    #######

    # No replacement api in KEGG REST api
    def get_motifs_by_gene(self, genes_id, db):
        raise NotImplementedError

    def get_genes_by_motifs(self, motif_id_list, offset, limit):
        raise NotImplementedError

    ####
    # KO
    ####

    def get_ko_by_gene(self, genes_id):
        raise NotImplementedError

    def get_ko_by_ko_class(self, ko_class_id):
        raise NotImplementedError

    def get_genes_by_ko_class(self, ko_class_id, org, offset, limit):
        raise NotImplementedError

    def get_genes_by_ko(self, ko_id, org):
        raise NotImplementedError

    #########
    # Pathway
    #########

    def mark_pathway_by_objects(self, pathway_id, object_id_list):
        raise NotImplementedError

    def color_pathway_by_objects(self, pathway_id, object_id_list,
                                 fg_color_list, bg_color_list):
        raise NotImplementedError

    def color_pathway_by_elements(self, pathway_id, element_id_list,
                                  fg_color_list, bg_color_list):
        raise NotImplementedError

    def get_html_of_marked_pathway_by_objects(self, pathway_id,
                                              object_id_list):
        raise NotImplementedError

    def get_html_of_colored_pathway_by_objects(self, pathway_id,
                                               object_id_list, fg_color_list,
                                               bg_color_list):
        raise NotImplementedError

    def get_html_of_colored_pathway_by_elements(self, pathway_id,
                                                element_id_list, fg_color_list,
                                                bg_color_list):
        raise NotImplementedError

    # NOTE(review): the next two methods still forward to a method of the
    # same name on the service object instead of raising
    # NotImplementedError like the other calls without a replacement --
    # confirm that the service actually supports them.
    def get_references_by_pathway(self, pathway_id):
        return self.service.get_references_by_pathway(pathway_id)

    def get_element_relations_by_pathway(self, pathway_id):
        return self.service.get_element_relations_by_pathway(pathway_id)

    def get_genes_by_organism(self, organism, offset=None, limit=None):
        """
        Return the ids of all entries listed for `organism`.

        The id is the first whitespace separated field of every response
        line. `offset` and `limit` must be None; any other value raises
        :class:`NotImplementedError`.

        """
        if offset is not None:
            raise NotImplementedError("offset is no longer supported")
        if limit is not None:
            raise NotImplementedError("limit is no longer supported.")

        res = self.service.list(organism).get().splitlines()
        return [r.split(None, 1)[0] for r in res]

    def get_number_of_genes_by_organism(self, organism):
        raise NotImplementedError

    ####################
    # Objects by pathway
    ####################

    def get_elements_by_pathway(self, pathway_id):
        raise NotImplementedError

    def get_genes_by_pathway(self, pathway_id):
        return _link_targets(self.link("genes", ids=[pathway_id]))

    def get_enzymes_by_pathway(self, pathway_id):
        return _link_targets(self.link("ec", ids=[pathway_id]))

    def get_compounds_by_pathway(self, pathway_id):
        return _link_targets(self.link("compound", ids=[pathway_id]))

    def get_drugs_by_pathway(self, pathway_id):
        return _link_targets(self.link("drug", ids=[pathway_id]))

    def get_glycans_by_pathway(self, pathway_id):
        return _link_targets(self.link("gl", ids=[pathway_id]))

    def get_reactions_by_pathway(self, pathway_id):
        return _link_targets(self.link("rn", ids=[pathway_id]))

    def get_kos_by_pathway(self, pathway_id):
        return _link_targets(self.link("ko", ids=[pathway_id]))

    #####################
    # Pathways by objects
    #####################

    # These functions returned the intersection of the results.
    def get_pathways_by_genes(self, gene_list):
        raise NotImplementedError

    def get_pathways_by_enzymes(self, enzyme_list):
        raise NotImplementedError

    def get_pathways_by_compounds(self, compound_list):
        raise NotImplementedError

    def get_pathways_by_drugs(self, drug_list):
        raise NotImplementedError

    def get_pathways_by_glycans(self, glycan_list):
        raise NotImplementedError

    def get_pathways_by_reactions(self, reaction_list):
        raise NotImplementedError

    def get_pathways_by_kos(self, ko_list):
        raise NotImplementedError

    ##########################
    # Relations among pathways
    ##########################

    def get_linked_pathways(self, pathway_id):
        """
        Return the sorted pathway ids linked to `pathway_id`.

        The "path:" prefix is added to `pathway_id` when it is missing.

        """
        if not pathway_id.startswith("path:"):
            pathway_id = "path:" + pathway_id
        return _link_targets(self.link("pathway", ids=[pathway_id]))
325
326
[1532]327"""
328KEGG api with caching
329"""
330
331import os
332
333from . import caching
334from .caching import cached_method, cache_entry, touch_dir
335
336try:
337    from functools import lru_cache
338except ImportError:
339    # TODO: move a copy of lru_cache in .caching if distributing this as a
340    # standalone package
[1601]341    from Orange.utils import lru_cache
[1532]342
[1734]343
class CachedKeggApi(KeggApi):
    """
    A :class:`KeggApi` whose query methods are cached.

    Most methods are wrapped with ``cached_method``, which relies on the
    `cache_store` and `last_modified` methods defined here; `info` is
    only memoized in memory with ``lru_cache``.

    """

    def __init__(self, store=None):
        KeggApi.__init__(self)
        # Keep a caller supplied store; only fall back to a fresh dict
        # when none was given.
        if store is None:
            store = {}
        self.store = store

    # Needed API for cached decorator.
    def cache_store(self):
        """
        Return a new Sqlite3Store in the configured cache directory.

        The directory is read from ``conf.params["cache.path"]`` and
        passed to ``touch_dir`` before the store is opened.

        """
        from . import conf
        path = conf.params["cache.path"]
        touch_dir(path)
        return caching.Sqlite3Store(os.path.join(path,
                                                 "kegg_api_cache_1.sqlite3"))

    def last_modified(self, args, kwargs=None):
        """
        Return the release set by `set_default_release` ("" if unset).
        """
        return getattr(self, "default_release", "")

    def set_default_release(self, release):
        self.default_release = release

    @cached_method
    def list_organisms(self):
        return KeggApi.list_organisms(self)

    @cached_method
    def list_pathways(self, organism):
        return KeggApi.list_pathways(self, organism)

    @cached_method
    def list(self, db):
        return KeggApi.list(self, db)

    @lru_cache()  # not persistently cached
    def info(self, db):
        return KeggApi.info(self, db)

    @cached_method
    def find(self, db, keywords):
        return KeggApi.find(self, db, keywords)

    @cached_method
    def get(self, ids):
        if not isinstance(ids, basestring):
            return self._batch_get(ids)
        else:
            return KeggApi.get(self, ids)

    def _batch_get(self, ids):
        """
        Retrieve the entries for a sequence of at most 10 `ids`.

        Ids that are not yet in the cache store are fetched with a
        single request and stored one entry per id; the result is then
        assembled by calling the cached `get` for every id. Raises
        :class:`ValueError` for more than 10 ids.

        """
        if len(ids) > 10:
            raise ValueError("Can batch at most 10 ids at a time.")

        get = self.get

        with closing(get.cache_store()) as store:
            # Which ids are not cached yet
            # TODO: Invalidate entries by release string.
            uncached = [entry_id for entry_id in ids
                        if get.key_from_args((entry_id,)) not in store]

        if uncached:
            # in case there are duplicate ids
            uncached = sorted(set(uncached))
            rval = KeggApi.get(self, uncached)

            if rval is not None:
                entries = rval.split("///\n")
            else:
                entries = []

            if entries and not entries[-1].strip():
                # Delete the last single newline entry if present
                del entries[-1]

            if len(entries) != len(uncached):
                # The response can not be paired with the ids by
                # position; pair the entries by their identifiers.
                requested = set(uncached)
                uncached, entries = match_by_ids(uncached, entries)
                unmatched = requested - set(uncached)
                if unmatched:
                    warnings.warn("Unable to match entries for keys: %s." %
                                  ", ".join(map(repr, unmatched)))

            with closing(get.cache_store()) as store:
                for entry_id, entry in zip(uncached, entries):
                    key = get.key_from_args((entry_id,))
                    if entry is not None:
                        # Restore the terminator removed by the split
                        entry = entry + "///\n"
                    store[key] = cache_entry(entry, mtime=datetime.now())

        # Finally join all the results, but drop all None objects
        results = [entry for entry in map(get, ids) if entry is not None]
        return "".join(results)

    @cached_method
    def conv(self, target_db, source):
        return KeggApi.conv(self, target_db, source)

    ########
    # LinkDB
    ########

    @cached_method
    def get_genes_by_enzyme(self, enzyme_id, org):
        return KeggApi.get_genes_by_enzyme(self, enzyme_id, org)

    @cached_method
    def get_enzymes_by_gene(self, genes_id):
        return KeggApi.get_enzymes_by_gene(self, genes_id)

    @cached_method
    def get_enzymes_by_compound(self, compound_id):
        return KeggApi.get_enzymes_by_compound(self, compound_id)

    @cached_method
    def get_enzymes_by_glycan(self, glycan_id):
        return KeggApi.get_enzymes_by_glycan(self, glycan_id)

    @cached_method
    def get_enzymes_by_reaction(self, reaction_id):
        return KeggApi.get_enzymes_by_reaction(self, reaction_id)

    @cached_method
    def get_compounds_by_enzyme(self, enzyme_id):
        return KeggApi.get_compounds_by_enzyme(self, enzyme_id)

    @cached_method
    def get_compounds_by_reaction(self, reaction_id):
        return KeggApi.get_compounds_by_reaction(self, reaction_id)

    @cached_method
    def get_glycans_by_enzyme(self, enzyme_id):
        return KeggApi.get_glycans_by_enzyme(self, enzyme_id)

    @cached_method
    def get_glycans_by_reaction(self, reaction_id):
        return KeggApi.get_glycans_by_reaction(self, reaction_id)

    @cached_method
    def get_reactions_by_enzyme(self, enzyme_id):
        return KeggApi.get_reactions_by_enzyme(self, enzyme_id)

    @cached_method
    def get_reactions_by_compound(self, compound_id):
        return KeggApi.get_reactions_by_compound(self, compound_id)

    @cached_method
    def get_reactions_by_glycan(self, glycan_id):
        return KeggApi.get_reactions_by_glycan(self, glycan_id)

    ######
    # SSDB
    ######

    @cached_method
    def get_best_best_neighbors_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_best_best_neighbors_by_gene(self, genes_id, offset,
                                                       limit)

    @cached_method
    def get_best_neighbors_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_best_neighbors_by_gene(self, genes_id, offset,
                                                  limit)

    @cached_method
    def get_reverse_best_neighbors_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_reverse_best_neighbors_by_gene(self, genes_id,
                                                          offset, limit)

    @cached_method
    def get_paralogs_by_gene(self, genes_id, offset, limit):
        return KeggApi.get_paralogs_by_gene(self, genes_id, offset, limit)

    #######
    # Motif
    #######

    @cached_method
    def get_motifs_by_gene(self, genes_id, db):
        return KeggApi.get_motifs_by_gene(self, genes_id, db)

    @cached_method
    def get_genes_by_motifs(self, motif_id_list, offset, limit):
        return KeggApi.get_genes_by_motifs(self, motif_id_list, offset, limit)

    ####
    # KO
    ####

    @cached_method
    def get_ko_by_gene(self, genes_id):
        return KeggApi.get_ko_by_gene(self, genes_id)

    @cached_method
    def get_ko_by_ko_class(self, ko_class_id):
        # Delegate to the base class method; `service` only exists on
        # instances, so it can not be reached through the class.
        return KeggApi.get_ko_by_ko_class(self, ko_class_id)

    @cached_method
    def get_genes_by_ko_class(self, ko_class_id, org, offset, limit):
        return KeggApi.get_genes_by_ko_class(self, ko_class_id, org, offset,
                                             limit)

    @cached_method
    def get_genes_by_ko(self, ko_id, org):
        return KeggApi.get_genes_by_ko(self, ko_id, org)

    #########
    # Pathway
    #########

    @cached_method
    def get_genes_by_organism(self, organism, offset=None, limit=None):
        return KeggApi.get_genes_by_organism(self, organism, offset=offset,
                                             limit=limit)

    @cached_method
    def get_number_of_genes_by_organism(self, organism):
        return KeggApi.get_number_of_genes_by_organism(self, organism)

    @cached_method
    def get_pathways_by_genes(self, gene_list):
        return KeggApi.get_pathways_by_genes(self, gene_list)

    @cached_method
    def get_pathways_by_enzymes(self, enzyme_list):
        return KeggApi.get_pathways_by_enzymes(self, enzyme_list)

    @cached_method
    def get_pathways_by_compounds(self, compound_list):
        return KeggApi.get_pathways_by_compounds(self, compound_list)

    @cached_method
    def get_pathways_by_drugs(self, drug_list):
        return KeggApi.get_pathways_by_drugs(self, drug_list)

    @cached_method
    def get_pathways_by_glycans(self, glycan_list):
        return KeggApi.get_pathways_by_glycans(self, glycan_list)

    @cached_method
    def get_pathways_by_reactions(self, reaction_list):
        return KeggApi.get_pathways_by_reactions(self, reaction_list)

    @cached_method
    def get_pathways_by_kos(self, ko_list):
        return KeggApi.get_pathways_by_kos(self, ko_list)

    @cached_method
    def get_elements_by_pathway(self, pathway_id):
        return KeggApi.get_elements_by_pathway(self, pathway_id)

    @cached_method
    def get_genes_by_pathway(self, pathway_id):
        return KeggApi.get_genes_by_pathway(self, pathway_id)

    @cached_method
    def get_enzymes_by_pathway(self, pathway_id):
        return KeggApi.get_enzymes_by_pathway(self, pathway_id)

    @cached_method
    def get_compounds_by_pathway(self, pathway_id):
        return KeggApi.get_compounds_by_pathway(self, pathway_id)

    @cached_method
    def get_drugs_by_pathway(self, pathway_id):
        return KeggApi.get_drugs_by_pathway(self, pathway_id)

    @cached_method
    def get_glycans_by_pathway(self, pathway_id):
        return KeggApi.get_glycans_by_pathway(self, pathway_id)

    @cached_method
    def get_reactions_by_pathway(self, pathway_id):
        return KeggApi.get_reactions_by_pathway(self, pathway_id)

    @cached_method
    def get_kos_by_pathway(self, pathway_id):
        return KeggApi.get_kos_by_pathway(self, pathway_id)
[1733]625
626
def match_by_ids(ids, entries):
    """
    Pair the search `ids` with the entry texts in `entries`.

    The identifier of an entry is the second whitespace separated field
    of its first line. A search id is matched first by its full text
    and, failing that, by the part after the first ':' (i.e. without
    the database prefix). Every entry is matched to at most one id.

    Return a ``(matched_ids, matched_entries)`` tuple of two parallel
    lists. The matched ids keep the order (without duplicates) in which
    they appear in `ids`; ids and entries that could not be paired are
    left out. Entries whose first line has no identifier field are
    ignored.

    """
    def entry_identifier(entry_text):
        """
        Return the identifier from the first line or None if missing.
        """
        fields = entry_text.split("\n", 1)[0].split(None, 2)
        if len(fields) < 2:
            return None
        return fields[1]

    entries_by_id = {}
    for entry in entries:
        eid = entry_identifier(entry)
        if eid is not None:
            entries_by_id[eid] = entry

    # Unique search ids in their original order, so that the result does
    # not depend on set iteration order.
    seen = set()
    search_ids = []
    for search_id in ids:
        if search_id not in seen:
            seen.add(search_id)
            search_ids.append(search_id)

    matched = {}

    # First match full search ids
    for search_id in search_ids:
        if search_id in entries_by_id:
            matched[search_id] = entries_by_id.pop(search_id)

    # Second pass, split the search ids by ':' to db and identifier part,
    # match by identifier
    for search_id in search_ids:
        if search_id in matched or ":" not in search_id:
            continue
        identifier = search_id.split(":", 1)[1]
        if identifier in entries_by_id:
            matched[search_id] = entries_by_id.pop(identifier)

    matched_ids = [search_id for search_id in search_ids
                   if search_id in matched]
    matched_entries = [matched[search_id] for search_id in matched_ids]
    return matched_ids, matched_entries
Note: See TracBrowser for help on using the repository browser.