source: orange-bioinformatics/_bioinformatics/obiKEGG/databases.py @ 1760:165df4199ec9

Revision 1760:165df4199ec9, 16.6 KB checked in by Ales Erjavec <ales.erjavec@…>, 12 months ago (diff)

Optimized DBDatabase batch_get/pre_cache w.r.t. number of separate network requests.

RevLine 
[1532]1"""
[1736]2DBGET Database Interface
3========================
4
[1532]5"""
[1535]6from __future__ import absolute_import
7
8import re
[1760]9from contextlib import closing
[1532]10
11from . import entry
12from .entry import fields
13from . import api
14
[1733]15
def iter_take(source_iter, n):
    """
    Collect at most `n` leading items of `source_iter` into a list.

    Fewer than `n` items are returned when the source runs out first.
    No item beyond the `n`-th is pulled from the source.
    """
    iterator = iter(source_iter)
    taken = []
    for _ in range(n):
        try:
            taken.append(next(iterator))
        except StopIteration:
            break
    return taken
22
[1733]23
def batch_iter(source_iter, n):
    """
    Yield successive lists of up to `n` items taken from `source_iter`.

    Iteration stops as soon as a batch comes back empty, so the last
    batch may be shorter than `n`.
    """
    iterator = iter(source_iter)
    batch = iter_take(iterator, n)
    while batch:
        yield batch
        batch = iter_take(iterator, n)
[1733]35
36
def chain_iter(chains_iter):
    """
    Yield every element of every iterable in `chains_iter`, in order.

    `chains_iter` is consumed lazily: the next inner iterable is only
    requested once the previous one is exhausted.
    """
    # The loop variable is deliberately not called `iter`, so the builtin
    # stays usable inside this function.
    for chain in chains_iter:
        for element in chain:
            yield element
41
[1733]42
# TODO: DBDataBase should be constructible from a flat text entry file.
# The pre-caching etc. should be moved into a caching API that creates a
# simple file system hierarchy where the flat database is saved (together
# with the db release string), e.g.
# genes/hsa.dbget
# genes/hsa.release
# genes/sce.dbget
# path.dbget
# module.dbget
# ligand/compound.dbget
53
54
class DBDataBase(object):
    """
    Base class for a DBGET database interface.

    Subclasses set :attr:`DB` (and usually :attr:`ENTRY_TYPE`) and fill
    ``self._keys`` after calling the base ``__init__``.

    """
    #: ENTRY_TYPE constructor (a :class:`~.entry.DBEntry` subclass). This
    #: should be redefined in subclasses.
    ENTRY_TYPE = entry.DBEntry

    #: A database name/abbreviation (e.g. 'pathway'). Needs to be set in a
    #: subclass or object instance's constructor before calling the base.
    #: __init__
    DB = None

    def __init__(self, **kwargs):
        """
        Set up the cached api and pin it to this database's release.

        Raises :class:`TypeError` if :attr:`DB` is not set. The keyword
        arguments are accepted but not used here.
        """
        if not self.DB:
            raise TypeError("Cannot make an instance of abstract base "
                            "class %r." % type(self).__name__)

        self.api = api.CachedKeggApi()
        self.info = self.api.info(self.DB)
        release = self.info.release
        self.api.set_default_release(release)
        # Subclasses replace this with the real key list.
        self._keys = []

    def keys(self):
        """
        Return a list of database keys. These are unique KEGG identifiers
        that can be used to query the database.

        """
        return list(self._keys)

    def iterkeys(self):
        """
        Return an iterator over the `keys`.
        """
        return iter(self._keys)

    def items(self):
        """
        Return a list of all (key, :obj:`DBDataBase.ENTRY_TYPE` instance)
        tuples.

        """
        # NOTE(review): pairing by position assumes `batch_get` returns
        # exactly one entry per key, in key order -- confirm.
        return list(zip(self.keys(), self.batch_get(self.keys())))

    def iteritems(self):
        """
        Return an iterator over the `items`.
        """
        batch_size = 100
        iterkeys = self.iterkeys()
        return chain_iter(zip(batch, self.batch_get(batch))
                          for batch in batch_iter(iterkeys, batch_size))

    def values(self):
        """
        Return a list of all :obj:`DBDataBase.ENTRY_TYPE` instances.
        """
        return self.batch_get(self.keys())

    def itervalues(self):
        """
        Return an iterator over all :obj:`DBDataBase.ENTRY_TYPE` instances.
        """
        batch_size = 100
        iterkeys = self.iterkeys()
        return chain_iter(self.batch_get(batch)
                          for batch in batch_iter(iterkeys, batch_size))

    def get(self, key, default=None):
        """
        Return an :obj:`DBDataBase.ENTRY_TYPE` instance for the `key`, or
        `default` if the key is not found.

        """
        try:
            return self.__getitem__(key)
        except KeyError:
            return default

    def has_key(self, key):
        """
        Return True if `key` is one of the database keys.
        """
        return self.__contains__(key)

    def __getitem__(self, key):
        e = self.get_entry(key)
        if e is None:
            raise KeyError(key)
        else:
            return e

    def __contains__(self, key):
        # A single membership test does not benefit from building a set.
        return key in self.keys()

    def __len__(self):
        return len(self.keys())

    def __iter__(self):
        return iter(self.keys())

    def get_text(self, key):
        """
        Return the database entry for `key` as plain text.
        """
        key = self._add_db(key)
        return self.api.get([key])

    def get_entry(self, key):
        """
        Return the database entry for `key` as an instance of `ENTRY_TYPE`.

        Returns ``None`` when no text (or the literal text "None") is
        retrieved for the key.
        """
        text = self.get_text(key)
        if not text or text == "None":
            return None
        else:
            return self.ENTRY_TYPE(text)

    def find(self, name):
        """
        Find `name` using kegg `find` api.

        Returns the first space-delimited token of every result line.
        """
        res = self.api.find(self.DB, name).splitlines()
        return [r.split(" ", 1)[0] for r in res]

    def pre_cache(self, keys=None, batch_size=10, progress_callback=None):
        """
        Retrieve all the entries for `keys` and cache them locally for faster
        subsequent retrieval. If `keys` is ``None`` then all entries will be
        retrieved.

        `batch_size` must be between 1 and 10, otherwise
        :class:`ValueError` is raised. :class:`TypeError` is raised if
        ``self.api`` is not an :class:`api.CachedKeggApi`. If given,
        `progress_callback` is called after each batch with the percentage
        of keys requested before that batch.

        """
        if not isinstance(self.api, api.CachedKeggApi):
            raise TypeError("Not an instance of api.CachedKeggApi")

        if batch_size > 10 or batch_size < 1:
            raise ValueError("Invalid batch_size")

        if keys is None:
            keys = self.keys()

        # Real lists are required: `len()` and slicing are used below, and
        # `map`/`filter` only return lists on Python 2.
        keys = [self._add_db(key) for key in keys]

        get = self.api.get

        # drop all keys with a valid cache entry to minimize the number
        # of 'get' requests. The filtering must finish while the store is
        # still open, hence the eager list comprehension.
        with closing(get.cache_store()) as store:
            keys = [key for key in keys
                    if not get.key_has_valid_cache(
                        get.key_from_args((key,)), store)]

        for start in range(0, len(keys), batch_size):
            batch = keys[start: start + batch_size]
            self.api.get(batch)

            if progress_callback:
                progress_callback(100.0 * start / len(keys))

    def batch_get(self, keys):
        """
        Batch retrieve all entries for keys. This can be significantly
        faster than getting each entry separately especially if entries
        are not yet cached.

        """
        entries = []
        batch_size = 10
        # A real list is required for `len()` and slicing below.
        keys = [self._add_db(key) for key in keys]

        # Precache the entries first
        self.pre_cache(keys)

        for start in range(0, len(keys), batch_size):
            batch = keys[start: start + batch_size]
            batch_entries = self.api.get(batch)
            if batch_entries is not None:
                batch_entries = batch_entries.split("///\n")
                # Remove possible empty last line
                batch_entries = [e for e in batch_entries if e.strip()]
                entries.extend(map(self.ENTRY_TYPE, batch_entries))

        return entries

    def _add_db(self, key):
        """
        Prefix the key with '%(DB)s:' string if not already prefixed.
        """
        if not key.startswith(self.DB + ":"):
            return self.DB + ":" + key
        else:
            return key
[1733]254
255
@entry.entry_decorate
class GenomeEntry(entry.DBEntry):
    """
    Entry for a KEGG Genome database.
    """
    # (section name, field class) pairs declared for this entry type.
    # NOTE(review): presumably consumed by `entry.entry_decorate` /
    # `entry.DBEntry` when the entry text is parsed -- confirm in `entry`.
    FIELDS = [
        ("ENTRY", fields.DBEntryField),
        ("NAME", fields.DBNameField),
        ("DEFINITION", fields.DBDefinitionField),
        ("ANNOTATION", fields.DBSimpleField),
        ("TAXONOMY", fields.DBTaxonomyField),
        ("DATA_SOURCE", fields.DBSimpleField),
        ("ORIGINAL_DB", fields.DBSimpleField),
        ("KEYWORDS", fields.DBSimpleField),
        ("DISEASE", fields.DBSimpleField),
        ("COMMENT", fields.DBSimpleField),
        ("CHROMOSOME", fields.DBFieldWithSubsections),
        ("PLASMID", fields.DBSimpleField),
        ("STATISTICS", fields.DBSimpleField),
        ("REFERENCE", fields.DBReference)
    ]

    # NOTE(review): presumably the sections that may occur more than once
    # in a single entry -- confirm in `entry`.
    MULTIPLE_FIELDS = ["REFERENCE"]

    def __init__(self, text):
        # Delegates straight to the base class constructor.
        entry.DBEntry.__init__(self, text)

    @property
    def organism_code(self):
        """
        A three or four letter KEGG organism code (e.g. 'hsa', 'sce', ...)

        This is the part of `self.name` before the first comma.
        """
        return self.name.split(",", 1)[0]

    @property
    def taxid(self):
        """
        Organism NCBI taxonomy id.

        Read from the ``taxid`` attribute of the TAXONOMY field.
        """
        return self.TAXONOMY.taxid

    def org_code(self):
        """
        Return the `organism_code` (method form kept for backwards
        compatibility).
        """
        # for backwards compatibility; return the `organism_code`
        return self.organism_code
[1733]300
[1532]301
class Genome(DBDataBase):
    """
    An interface to the KEGG GENOME database.
    """
    DB = "genome"
    ENTRY_TYPE = GenomeEntry

    # For obiTaxonomy.common_taxids mapping
    TAXID_MAP = {
        "562": "511145",   # Escherichia coli K-12 MG1655
        "2104": "272634",  # Mycoplasma pneumoniae M129
        "4530": "39947",   # Oryza sativa ssp. japonica cultivar Nipponbare (Japanese rice)
        "4932": "559292",  # Saccharomyces cerevisiae S288C
        "4896": "284812",  # Schizosaccharomyces pombe 972h-
    }

    def __init__(self):
        DBDataBase.__init__(self)
        organisms = self.api.list_organisms()
        self._org_list = organisms
        self._keys = [organism.entry_id for organism in organisms]

    def _key_to_gn_entry_id(self, key):
        """
        Resolve `key` through `find` to exactly one entry id.

        Raises :class:`KeyError` when nothing matches and
        :class:`ValueError` when more than one entry matches.
        """
        matches = self.find(key)
        if not matches:
            raise KeyError("Unknown key")
        if len(matches) > 1:
            raise ValueError("Not a unique key")
        return matches[0]

    @classmethod
    def common_organisms(cls):
        """
        Return the organism codes of the commonly used organisms.
        """
        return [
            'ath', 'bta', 'cel', 'cre', 'dre', 'ddi',
            'dme', 'eco', 'hsa', 'mmu', 'mpn', 'osa',
            'pfa', 'rno', 'sce', 'spo', 'zma', 'xla',
        ]

    @classmethod
    def essential_organisms(cls):
        """
        Return the organism codes of the essential organisms.
        """
        return ['ddi', 'dme', 'hsa', 'mmu', 'sce']

    def org_code_to_entry_key(self, code):
        """
        Map an organism code ('hsa', 'sce', ...) to the corresponding kegg
        identifier (T + 5 digit number).

        Raises :class:`ValueError` if no listed organism has that code.

        """
        for organism in self._org_list:
            if organism.org_code == code:
                return organism.entry_id
        raise ValueError("Unknown organism code '%s'" % code)

    def search(self, string, relevance=False):
        """
        Search the genome database for `string` using the api ``find``
        call and return the second whitespace-delimited token of each
        result line (taken from the part before the first comma).

        Strings present in `TAXID_MAP` are replaced by their mapped value
        first. `relevance` is no longer supported and raises
        :class:`NotImplementedError` when true.
        """
        if relevance:
            raise NotImplementedError("relevance is no longer supported")

        query = self.TAXID_MAP.get(string, string)

        listing = self.api.find(self.DB, query).strip()
        if not listing:
            return []

        found = []
        for line in listing.splitlines():
            head = line.split(",", 1)[0]
            found.append(head.split(None, 1)[1])
        return found
[1733]372
373
@entry.entry_decorate
class GeneEntry(entry.DBEntry):
    """
    Entry for a KEGG Genes database.
    """
    FIELDS = [
        ("ENTRY", fields.DBEntryField),
        ("NAME", fields.DBNameField),
        ("DEFINITION", fields.DBDefinitionField),
        ("ORTHOLOGY", fields.DBSimpleField),
        ("ORGANISM", fields.DBSimpleField),
        ("PATHWAY", fields.DBPathway),
        ("MODULE", fields.DBSimpleField),
        ("DISEASE", fields.DBSimpleField),
        ("DRUG_TARGET", fields.DBSimpleField),
        ("CLASS", fields.DBSimpleField),
        ("MOTIF", fields.DBSimpleField),
        ("DBLINKS", fields.DBDBLinks),
        ("STRUCTURE", fields.DBSimpleField),
        ("POSITION", fields.DBSimpleField),
        ("AASEQ", fields.DBAASeq),
        ("NTSEQ", fields.DBNTSeq),
    ]

    def aliases(self):
        """
        Return the entry key, followed by the comma separated parts of
        the name (if any), followed by the first id of every dblinks
        item (if any).
        """
        names = [self.entry_key]
        if self.name:
            names = names + self.name.split(",")
        if self.dblinks:
            names = names + [link[1][0] for link in self.dblinks.items()]
        return names

    @property
    def alt_names(self):
        """
        Same as :meth:`aliases`; kept for backwards compatibility.
        """
        return self.aliases()
[1733]407
408
class Genes(DBDataBase):
    """
    Interface to the KEGG Genes database.

    :param str org_code: KEGG organism code (e.g. 'hsa').

    """
    DB = None  # Needs to be set in __init__
    ENTRY_TYPE = GeneEntry

    def __init__(self, org_code):
        # TODO: Map to org code from kegg id (T + 5 digits)
        # DB has to be in place before the base constructor runs.
        self.DB = org_code
        self.org_code = org_code
        DBDataBase.__init__(self)
        self._keys = self.api.get_genes_by_organism(org_code)

    def gene_aliases(self):
        """
        Return a dict mapping every alias of every gene entry to
        '<org_code>:<entry key>'. Later entries win on duplicate aliases.
        """
        alias_map = {}
        for gene in self.itervalues():
            names = gene.aliases()
            target = self.org_code + ":" + gene.entry_key
            for alias in names:
                alias_map[alias] = target
        return alias_map
[1733]435
[1532]436
@entry.entry_decorate
class CompoundEntry(entry.DBEntry):
    # Entry type used by the `Compound` database interface.
    # FIELDS holds (section name, field class) pairs.
    # NOTE(review): presumably consumed by `entry.entry_decorate` /
    # `entry.DBEntry` when the entry text is parsed -- confirm in `entry`.
    FIELDS = [
        ("ENTRY", fields.DBEntryField),
        ("NAME", fields.DBNameField),
        ("FORMULA", fields.DBSimpleField),
        ("EXACT_MASS", fields.DBSimpleField),
        ("MOL_WEIGHT", fields.DBSimpleField),
        ("REMARK", fields.DBSimpleField),
        ("COMMENT", fields.DBSimpleField),
        ("REACTION", fields.DBSimpleField),
        ("PATHWAY", fields.DBPathway),
        ("ENZYME", fields.DBSimpleField),
        ("BRITE", fields.DBSimpleField),
        ("REFERENCE", fields.DBSimpleField),
        ("DBLINKS", fields.DBDBLinks),
        ("ATOM", fields.DBSimpleField),
        ("BOND", fields.DBSimpleField)
    ]
456
457
class Compound(DBDataBase):
    """
    Interface to the KEGG compound ('cpd') database.
    """
    DB = "cpd"
    ENTRY_TYPE = CompoundEntry

    def __init__(self):
        DBDataBase.__init__(self)
        # The keys are the entry ids reported by the api listing.
        listing = self.api.list("cpd")
        self._keys = [item.entry_id for item in listing]
[1532]465
466
@entry.entry_decorate
class ReactionEntry(entry.DBEntry):
    # Entry type used by the `Reaction` database interface.
    # FIELDS holds (section name, field class) pairs.
    # NOTE(review): presumably consumed by `entry.entry_decorate` /
    # `entry.DBEntry` when the entry text is parsed -- confirm in `entry`.
    FIELDS = [
        ("ENTRY", fields.DBEntryField),
        ("NAME", fields.DBNameField),
        ("DEFINITION", fields.DBDefinitionField),
        ("EQUATION", fields.DBSimpleField),
        ("ENZYME", fields.DBSimpleField)
    ]
476
477
class Reaction(DBDataBase):
    """
    Interface to the KEGG reaction ('rn') database.
    """
    DB = "rn"
    ENTRY_TYPE = ReactionEntry

    def __init__(self):
        DBDataBase.__init__(self)
        # The keys are the entry ids reported by the api listing.
        listing = self.api.list("rn")
        self._keys = [item.entry_id for item in listing]
485
486
class Brite(DBDataBase):
    # Interface for the database addressed as 'br'. It overrides neither
    # ENTRY_TYPE nor __init__, so entries use the base `entry.DBEntry`
    # and the key list stays as the base constructor leaves it (empty).
    DB = "br"
[1733]489
490
class Disease(DBDataBase):
    # Interface for the database addressed as 'ds'. It overrides neither
    # ENTRY_TYPE nor __init__, so entries use the base `entry.DBEntry`
    # and the key list stays as the base constructor leaves it (empty).
    DB = "ds"
[1733]493
494
class Drug(DBDataBase):
    # Interface for the database addressed as 'dr'. It overrides neither
    # ENTRY_TYPE nor __init__, so entries use the base `entry.DBEntry`
    # and the key list stays as the base constructor leaves it (empty).
    DB = "dr"
[1733]497
498
@entry.entry_decorate
class EnzymeEntry(entry.DBEntry):
    # Entry type used by the `Enzyme` database interface.
    # FIELDS holds (section name, field class) pairs.
    # NOTE(review): presumably consumed by `entry.entry_decorate` /
    # `entry.DBEntry` when the entry text is parsed -- confirm in `entry`.
    FIELDS = [
        ("ENTRY", fields.DBEntryField),
        ("NAME", fields.DBNameField),
        ("CLASS", fields.DBSimpleField),
        ("SYSNAME", fields.DBSimpleField),
        ("REACTION", fields.DBSimpleField),
        ("ALL_REAC", fields.DBSimpleField),
        ("SUBSTRATE", fields.DBSimpleField),
        ("PRODUCT", fields.DBSimpleField),
        ("COMMENT", fields.DBSimpleField),
        ("REFERENCE", fields.DBReference),
        ("PATHWAY", fields.DBPathway),
        ("ORTHOLOGY", fields.DBSimpleField),
        ("GENES", fields.DBSimpleField),
        ("DBLINKS", fields.DBDBLinks)
    ]

    # NOTE(review): presumably the sections that may occur more than once
    # in a single entry -- confirm in `entry`.
    MULTIPLE_FIELDS = ["REFERENCE"]
[1733]519
520
class Enzyme(DBDataBase):
    """
    Interface to the KEGG enzyme ('ec') database.
    """
    DB = "ec"
    ENTRY_TYPE = EnzymeEntry

    def __init__(self):
        DBDataBase.__init__(self)
        # The keys are the entry ids reported by the api listing.
        listing = self.api.list("ec")
        self._keys = [item.entry_id for item in listing]
528
529
@entry.entry_decorate
class OrthologyEntry(entry.DBEntry):
    # Entry type used by the `Orthology` database interface.
    # FIELDS holds (section name, field class) pairs.
    # NOTE(review): presumably consumed by `entry.entry_decorate` /
    # `entry.DBEntry` when the entry text is parsed -- confirm in `entry`.
    FIELDS = [
        ("ENTRY", fields.DBEntryField),
        ("NAME", fields.DBNameField),
        ("CLASS", fields.DBSimpleField),
        ("DBLINKS", fields.DBDBLinks),
        ("GENES", fields.DBSimpleField),
    ]
539
540
class Orthology(DBDataBase):
    """
    Interface to the KEGG orthology ('ko') database.
    """
    DB = "ko"
    ENTRY_TYPE = OrthologyEntry

    def __init__(self):
        DBDataBase.__init__(self)
        # The keys are the entry ids reported by the api listing.
        listing = self.api.list("ko")
        self._keys = [item.entry_id for item in listing]
548
549
@entry.entry_decorate
class PathwayEntry(entry.DBEntry):
    """
    Entry for a KEGG Pathway database.
    """
    FIELDS = [
        ("ENTRY", fields.DBEntryField),
        ("NAME", fields.DBNameField),
        ("DESCRIPTION", fields.DBSimpleField),
        ("CLASS", fields.DBSimpleField),
        ("PATHWAY_MAP", fields.DBPathwayMapField),
        ("MODULE", fields.DBSimpleField),
        ("DISEASE", fields.DBSimpleField),
        ("DRUG", fields.DBSimpleField),
        ("DBLINKS", fields.DBDBLinks),
        ("ORGANISM", fields.DBSimpleField),
        ("GENE", fields.DBGeneField),
        ("ENZYME", fields.DBEnzymeField),
        ("COMPOUND", fields.DBCompoundField),
        ("REFERENCE", fields.DBReference),
        ("REL_PATHWAY", fields.DBSimpleField),
        ("KO_PATHWAY", fields.DBSimpleField),
    ]

    MULTIPLE_FIELDS = ["REFERENCE"]

    @property
    def gene(self):
        """
        The converted GENE field values, each prefixed with '<org>:'
        when the organism text contains a '[GN:<org>]' tag.

        Returns ``None`` if the entry has no GENE field.
        """
        if not hasattr(self, "GENE"):
            return None
        gene_ids = self.GENE._convert()

        organism = self.organism
        prefix = ""
        if organism:
            codes = re.findall(r"\[GN:([a-z]+)\]", organism)
            if codes:
                # Only the first tag found is used.
                prefix = codes[0] + ":"
        return [prefix + gene_id for gene_id in gene_ids]
588
589
class Pathway(DBDataBase):
    """
    KEGG Pathway database

    :param str prefix:
        KEGG Organism code ('hsa', ...) or 'map', 'ko', 'ec' or 'rn'

    """
    DB = "path"
    ENTRY_TYPE = PathwayEntry

    def __init__(self, prefix="map"):
        DBDataBase.__init__(self)
        self.prefix = prefix

        # Accepted prefixes: every listed organism code plus the
        # reference pathway prefixes.
        organism_codes = [org.org_code for org in self.api.list_organisms()]
        valid = organism_codes + ["map", "ko", "ec", "rn"]
        if prefix not in valid:
            raise ValueError("Invalid prefix %r" % prefix)

        listing = self.api.list("pathway/" + prefix)
        self._keys = [item.entry_id for item in listing]
Note: See TracBrowser for help on using the repository browser.