source: orange-bioinformatics/_bioinformatics/obiKEGG/databases.py @ 1737:729925f75820

Revision 1737:729925f75820, 15.9 KB checked in by Ales Erjavec <ales.erjavec@…>, 13 months ago (diff)

KEGG bug fixes.

RevLine 
"""
DBGET Database Interface
========================

"""
from __future__ import absolute_import

import itertools
import re

from . import entry
from .entry import fields
from . import api
13
[1733]14
def iter_take(source_iter, n):
    """
    Return a list of the first `n` items in `source_iter`.

    Fewer than `n` items are returned when `source_iter` runs out first,
    and a non-positive `n` gives an empty list. If `source_iter` is an
    iterator, nothing beyond the returned items is consumed, so repeated
    calls read consecutive chunks.
    """
    # islice rejects negative counts; clamp so such calls keep returning [].
    return list(itertools.islice(source_iter, max(n, 0)))
21
[1733]22
def batch_iter(source_iter, n):
    """
    Yield consecutive lists of at most `n` items drawn from `source_iter`.

    Iteration stops as soon as an empty batch is drawn.
    """
    iterator = iter(source_iter)
    # iter(callable, sentinel) keeps calling until an empty list comes back.
    for batch in iter(lambda: iter_take(iterator, n), []):
        yield batch
[1733]34
35
def chain_iter(chains_iter):
    """
    Yield every element of every iterable in `chains_iter`, in order.
    """
    # The loop variable used to be called `iter`, hiding the builtin.
    for chain in chains_iter:
        for element in chain:
            yield element
40
[1733]41
# TODO: DBDataBase should be constructible from a flat-text entry file.
# The pre-caching logic etc. should be moved into a caching API that creates
# a simple file system hierarchy where the flat database is saved (together
# with the db release string), e.g.
# genes/hsa.dbget
# genes/hsa.release
# genes/sce.dbget
# path.dbget
# module.dbget
# ligand/compound.dbget
52
53
class DBDataBase(object):
    """
    Base class for a DBGET database interface.

    A subclass names its database in :attr:`DB`, usually overrides
    :attr:`ENTRY_TYPE`, and fills ``self._keys`` once the base ``__init__``
    has run. Instances then offer dict-like read access (``keys``,
    ``items``, ``get``, ``[]``, ``in``, ``len``, iteration) to the entries.

    """
    #: ENTRY_TYPE constructor (a :class:`~.entry.DBEntry` subclass). This
    #: should be redefined in subclasses.
    ENTRY_TYPE = entry.DBEntry

    #: A database name/abbreviation (e.g. 'pathway'). Needs to be set in a
    #: subclass or object instance's constructor before calling the base
    #: __init__.
    DB = None

    def __init__(self, **kwargs):
        """
        Create the cached KEGG api, query the info of :attr:`DB` and pass
        its release to ``api.set_default_release``.

        Keyword arguments are accepted but not used here.

        :raises TypeError: if :attr:`DB` has not been set.

        """
        if not self.DB:
            raise TypeError("Cannot make an instance of abstract base "
                            "class %r." % type(self).__name__)

        self.api = api.CachedKeggApi()
        self.info = self.api.info(self.DB)
        release = self.info.release
        self.api.set_default_release(release)
        # Subclasses replace this with the real list of identifiers.
        self._keys = []

    def keys(self):
        """
        Return a list of database keys. These are unique KEGG identifiers
        that can be used to query the database.

        """
        return list(self._keys)

    def iterkeys(self):
        """
        Return an iterator over the `keys`.
        """
        return iter(self._keys)

    def items(self):
        """
        Return a list of all (key, :obj:`DBDataBase.ENTRY_TYPE` instance)
        tuples.

        """
        keys = self.keys()
        # NOTE(review): pairing is positional. `batch_get` drops empty
        # results, so keys and entries could go out of step if an entry is
        # missing from a reply -- confirm against the api behaviour.
        return list(zip(keys, self.batch_get(keys)))

    def iteritems(self):
        """
        Return an iterator over the `items`.
        """
        batch_size = 100
        iterkeys = self.iterkeys()
        return chain_iter(zip(batch, self.batch_get(batch))
                          for batch in batch_iter(iterkeys, batch_size))

    def values(self):
        """
        Return a list of all :obj:`DBDataBase.ENTRY_TYPE` instances.
        """
        return self.batch_get(self.keys())

    def itervalues(self):
        """
        Return an iterator over all :obj:`DBDataBase.ENTRY_TYPE` instances.
        """
        batch_size = 100
        iterkeys = self.iterkeys()
        return chain_iter(self.batch_get(batch)
                          for batch in batch_iter(iterkeys, batch_size))

    def get(self, key, default=None):
        """
        Return an :obj:`DBDataBase.ENTRY_TYPE` instance for the `key`, or
        `default` if the lookup raises :class:`KeyError`.

        """
        try:
            return self.__getitem__(key)
        except KeyError:
            return default

    def has_key(self, key):
        """
        Return ``True`` if `key` is one of the database keys.
        """
        return self.__contains__(key)

    def __getitem__(self, key):
        """
        Return the entry for `key`; raise :class:`KeyError` when
        `get_entry` finds nothing.
        """
        e = self.get_entry(key)
        if e is None:
            raise KeyError(key)
        else:
            return e

    def __contains__(self, key):
        # Plain membership test; no need to build a throw-away set first.
        return key in self.keys()

    def __len__(self):
        return len(self.keys())

    def __iter__(self):
        return iter(self.keys())

    def get_text(self, key):
        """
        Return the database entry for `key` as plain text.
        """
        key = self._add_db(key)
        return self.api.get([key])

    def get_entry(self, key):
        """
        Return the database entry for `key` as an instance of `ENTRY_TYPE`,
        or ``None`` if no text was retrieved for it.
        """
        text = self.get_text(key)
        # NOTE(review): the literal "None" is presumably a stringified
        # missing result coming back from the cache -- confirm.
        if not text or text == "None":
            return None
        else:
            return self.ENTRY_TYPE(text)

    def find(self, name):
        """
        Find `name` using kegg `find` api.

        Return the first space-delimited token of every result line; an
        empty list if the api returned nothing.
        """
        res = self.api.find(self.DB, name)
        if not res:
            # Same guard as Genome.search: no result must not raise.
            return []
        return [r.split(" ", 1)[0] for r in res.splitlines()]

    def pre_cache(self, keys=None, batch_size=10, progress_callback=None):
        """
        Retrieve all the entries for `keys` and cache them locally for faster
        subsequent retrieval. If `keys` is ``None`` then all entries will be
        retrieved.

        :param keys: iterable of keys to fetch (default: all keys).
        :param batch_size: number of entries per request, from 1 to 10.
        :param progress_callback: optional callable; it is called after
            each batch with the percentage of keys fetched so far.
        :raises TypeError: if ``self.api`` is not an
            :class:`api.CachedKeggApi`.
        :raises ValueError: if `batch_size` is outside 1..10.

        """
        if not isinstance(self.api, api.CachedKeggApi):
            raise TypeError("Not an instance of api.CachedKeggApi")

        if batch_size > 10 or batch_size < 1:
            raise ValueError("Invalid batch_size")

        if keys is None:
            keys = self.keys()

        keys = list(keys)
        total = len(keys)
        for start in range(0, total, batch_size):
            # A real list (not a lazy `map`) so the api can reuse it.
            batch = [self._add_db(key)
                     for key in keys[start: start + batch_size]]

            # Only the caching side effect is wanted; the text is dropped.
            self.api.get(batch)

            if progress_callback:
                # Report the work finished so far, so the final call
                # reaches 100 instead of trailing one batch behind.
                done = min(start + batch_size, total)
                progress_callback(100.0 * done / total)

    def batch_get(self, keys):
        """
        Batch retrieve all entries for keys. This can be significantly
        faster than getting each entry separately especially if entries
        are not yet cached.

        Return a list of :obj:`DBDataBase.ENTRY_TYPE` instances.

        """
        entries = []
        batch_size = 10
        keys = list(keys)
        for start in range(0, len(keys), batch_size):
            # A real list (not a lazy `map`) so the api can reuse it.
            batch = [self._add_db(key)
                     for key in keys[start: start + batch_size]]
            batch_entries = self.api.get(batch)
            if batch_entries is not None:
                # Entries in the reply are separated by a "///" line.
                chunks = batch_entries.split("///\n")
                # Remove possible empty last line
                entries.extend(self.ENTRY_TYPE(chunk)
                               for chunk in chunks if chunk.strip())

        return entries

    def _add_db(self, key):
        """
        Prefix the key with '%(DB)s:' string if not already prefixed.
        """
        if not key.startswith(self.DB + ":"):
            return self.DB + ":" + key
        else:
            return key
[1733]240
241
@entry.entry_decorate
class GenomeEntry(entry.DBEntry):
    """
    A single entry of the KEGG Genome database.
    """
    FIELDS = [
        ("ENTRY", fields.DBEntryField),
        ("NAME", fields.DBNameField),
        ("DEFINITION", fields.DBDefinitionField),
        ("ANNOTATION", fields.DBSimpleField),
        ("TAXONOMY", fields.DBTaxonomyField),
        ("DATA_SOURCE", fields.DBSimpleField),
        ("ORIGINAL_DB", fields.DBSimpleField),
        ("KEYWORDS", fields.DBSimpleField),
        ("DISEASE", fields.DBSimpleField),
        ("COMMENT", fields.DBSimpleField),
        ("CHROMOSOME", fields.DBFieldWithSubsections),
        ("PLASMID", fields.DBSimpleField),
        ("STATISTICS", fields.DBSimpleField),
        ("REFERENCE", fields.DBReference),
    ]

    MULTIPLE_FIELDS = ["REFERENCE"]

    def __init__(self, text):
        entry.DBEntry.__init__(self, text)

    @property
    def organism_code(self):
        """
        The KEGG organism code (e.g. 'hsa', 'sce', ...): everything in
        the entry name up to the first comma.
        """
        code, _, _ = self.name.partition(",")
        return code

    @property
    def taxid(self):
        """
        NCBI taxonomy id of the organism, read from the TAXONOMY field.
        """
        return self.TAXONOMY.taxid

    def org_code(self):
        # Kept for backwards compatibility; same value as `organism_code`.
        return self.organism_code
[1733]286
[1532]287
class Genome(DBDataBase):
    """
    An interface to the KEGG GENOME database.
    """
    DB = "genome"
    ENTRY_TYPE = GenomeEntry

    # For obiTaxonomy.common_taxids mapping
    TAXID_MAP = {
        "562": "511145",   # Escherichia coli K-12 MG1655
        "2104": "272634",  # Mycoplasma pneumoniae M129
        "4530": "39947",   # Oryza sativa ssp. japonica cultivar Nipponbare (Japanese rice)
        "4932": "559292",  # Saccharomyces cerevisiae S288C
        "4896": "284812",  # Schizosaccharomyces pombe 972h-
    }

    def __init__(self):
        DBDataBase.__init__(self)
        self._org_list = self.api.list_organisms()
        self._keys = [org.entry_id for org in self._org_list]

    def _key_to_gn_entry_id(self, key):
        """
        Resolve `key` to a single identifier through :meth:`find`.

        :raises KeyError: if nothing matches.
        :raises ValueError: if more than one identifier matches.
        """
        res = self.find(key)
        if not res:
            raise KeyError("Unknown key")
        if len(res) > 1:
            raise ValueError("Not a unique key")
        return res[0]

    @classmethod
    def common_organisms(cls):
        """
        Return a fixed list of KEGG organism codes.
        """
        return ['ath', 'bta', 'cel', 'cre', 'dre', 'ddi',
                'dme', 'eco', 'hsa', 'mmu', 'mpn', 'osa',
                'pfa', 'rno', 'sce', 'spo', 'zma', 'xla']

    @classmethod
    def essential_organisms(cls):
        """
        Return a fixed, shorter list of KEGG organism codes.
        """
        return ['ddi', 'dme', 'hsa', 'mmu', 'sce']

    def org_code_to_entry_key(self, code):
        """
        Map an organism code ('hsa', 'sce', ...) to the corresponding kegg
        identifier (T + 5 digit number).

        :raises ValueError: if no listed organism has that code.

        """
        for org in self._org_list:
            if org.org_code == code:
                return org.entry_id
        raise ValueError("Unknown organism code '%s'" % code)

    def search(self, string, relevance=False):
        """
        Search the genome database for `string` using the kegg `find` api.

        A `string` that is a key of :attr:`TAXID_MAP` is replaced by the
        mapped taxonomy id first. For every result line the text before
        the first comma is taken and its second whitespace-separated field
        is returned; lines without such a field are skipped.

        :raises NotImplementedError: if `relevance` is true.
        """
        if relevance:
            raise NotImplementedError("relevance is no longer supported")

        string = self.TAXID_MAP.get(string, string)

        res = self.api.find(self.DB, string)
        if not res:
            return []

        codes = []
        for line in res.splitlines():
            head = line.split(",", 1)[0]
            parts = head.split(None, 1)
            # A blank or single-token line has no second field; skip it
            # instead of failing with IndexError.
            if len(parts) == 2:
                codes.append(parts[1])
        return codes
[1733]358
359
@entry.entry_decorate
class GeneEntry(entry.DBEntry):
    """
    A single entry of a KEGG Genes database.
    """
    FIELDS = [
        ("ENTRY", fields.DBEntryField),
        ("NAME", fields.DBNameField),
        ("DEFINITION", fields.DBDefinitionField),
        ("ORTHOLOGY", fields.DBSimpleField),
        ("ORGANISM", fields.DBSimpleField),
        ("PATHWAY", fields.DBPathway),
        ("MODULE", fields.DBSimpleField),
        ("DISEASE", fields.DBSimpleField),
        ("DRUG_TARGET", fields.DBSimpleField),
        ("CLASS", fields.DBSimpleField),
        ("MOTIF", fields.DBSimpleField),
        ("DBLINKS", fields.DBDBLinks),
        ("STRUCTURE", fields.DBSimpleField),
        ("POSITION", fields.DBSimpleField),
        ("AASEQ", fields.DBAASeq),
        ("NTSEQ", fields.DBNTSeq),
    ]

    def aliases(self):
        """
        Return the entry key, followed by the comma separated parts of
        the name (if any) and the first item of every dblinks value (if
        any).
        """
        names = [self.entry_key]
        if self.name:
            names.extend(self.name.split(","))
        if self.dblinks:
            names.extend([ids[0] for _, ids in self.dblinks.items()])
        return names

    @property
    def alt_names(self):
        """
        Same as :meth:`aliases`; kept for backwards compatibility.
        """
        return self.aliases()
[1733]393
394
class Genes(DBDataBase):
    """
    Interface to the KEGG Genes database of one organism.

    :param org_code: KEGG organism code (e.g. 'hsa').
    :type org_code: str

    """
    DB = None  # Needs to be set in __init__
    ENTRY_TYPE = GeneEntry

    def __init__(self, org_code):
        # TODO: Map to org code from kegg id (T + 5 digits)
        # The organism code doubles as the database name, so it has to be
        # in place before the base constructor checks `DB`.
        self.org_code = org_code
        self.DB = org_code
        DBDataBase.__init__(self)
        self._keys = self.api.get_genes_by_organism(org_code)

    def gene_aliases(self):
        """
        Return a dict mapping every alias of every gene to that gene's
        '<org_code>:<entry_key>' identifier.
        """
        alias_map = {}
        for gene in self.itervalues():
            names = gene.aliases()
            target = self.org_code + ":" + gene.entry_key
            for alias in names:
                alias_map[alias] = target

        return alias_map
[1733]422
[1532]423
@entry.entry_decorate
class CompoundEntry(entry.DBEntry):
    """
    A single entry of the KEGG Compound database.
    """
    FIELDS = [
        ("ENTRY", fields.DBEntryField),
        ("NAME", fields.DBNameField),
        ("FORMULA", fields.DBSimpleField),
        ("EXACT_MASS", fields.DBSimpleField),
        ("MOL_WEIGHT", fields.DBSimpleField),
        ("REMARK", fields.DBSimpleField),
        ("COMMENT", fields.DBSimpleField),
        ("REACTION", fields.DBSimpleField),
        ("PATHWAY", fields.DBPathway),
        ("ENZYME", fields.DBSimpleField),
        ("BRITE", fields.DBSimpleField),
        ("REFERENCE", fields.DBSimpleField),
        ("DBLINKS", fields.DBDBLinks),
        ("ATOM", fields.DBSimpleField),
        ("BOND", fields.DBSimpleField),
    ]
443
444
class Compound(DBDataBase):
    """
    Interface to the KEGG Compound ('cpd') database.
    """
    DB = "cpd"
    ENTRY_TYPE = CompoundEntry

    def __init__(self):
        super(Compound, self).__init__()
        listing = self.api.list("cpd")
        self._keys = [item.entry_id for item in listing]
[1532]452
453
@entry.entry_decorate
class ReactionEntry(entry.DBEntry):
    """
    A single entry of the KEGG Reaction database.
    """
    FIELDS = [
        ("ENTRY", fields.DBEntryField),
        ("NAME", fields.DBNameField),
        ("DEFINITION", fields.DBDefinitionField),
        ("EQUATION", fields.DBSimpleField),
        ("ENZYME", fields.DBSimpleField),
    ]
463
464
class Reaction(DBDataBase):
    """
    Interface to the KEGG Reaction ('rn') database.
    """
    DB = "rn"
    ENTRY_TYPE = ReactionEntry

    def __init__(self):
        super(Reaction, self).__init__()
        listing = self.api.list("rn")
        self._keys = [item.entry_id for item in listing]
472
473
class Brite(DBDataBase):
    """
    Interface to the KEGG 'br' database.

    No key list is loaded and the base `ENTRY_TYPE` is used.
    """
    DB = "br"
[1733]476
477
class Disease(DBDataBase):
    """
    Interface to the KEGG 'ds' database.

    No key list is loaded and the base `ENTRY_TYPE` is used.
    """
    DB = "ds"
[1733]480
481
class Drug(DBDataBase):
    """
    Interface to the KEGG 'dr' database.

    No key list is loaded and the base `ENTRY_TYPE` is used.
    """
    DB = "dr"
[1733]484
485
@entry.entry_decorate
class EnzymeEntry(entry.DBEntry):
    """
    A single entry of the KEGG Enzyme database.
    """
    FIELDS = [
        ("ENTRY", fields.DBEntryField),
        ("NAME", fields.DBNameField),
        ("CLASS", fields.DBSimpleField),
        ("SYSNAME", fields.DBSimpleField),
        ("REACTION", fields.DBSimpleField),
        ("ALL_REAC", fields.DBSimpleField),
        ("SUBSTRATE", fields.DBSimpleField),
        ("PRODUCT", fields.DBSimpleField),
        ("COMMENT", fields.DBSimpleField),
        ("REFERENCE", fields.DBReference),
        ("PATHWAY", fields.DBPathway),
        ("ORTHOLOGY", fields.DBSimpleField),
        ("GENES", fields.DBSimpleField),
        ("DBLINKS", fields.DBDBLinks),
    ]

    MULTIPLE_FIELDS = ["REFERENCE"]
[1733]506
507
class Enzyme(DBDataBase):
    """
    Interface to the KEGG Enzyme ('ec') database.
    """
    DB = "ec"
    ENTRY_TYPE = EnzymeEntry

    def __init__(self):
        super(Enzyme, self).__init__()
        listing = self.api.list("ec")
        self._keys = [item.entry_id for item in listing]
515
516
@entry.entry_decorate
class OrthologyEntry(entry.DBEntry):
    """
    A single entry of the KEGG Orthology database.
    """
    FIELDS = [
        ("ENTRY", fields.DBEntryField),
        ("NAME", fields.DBNameField),
        ("CLASS", fields.DBSimpleField),
        ("DBLINKS", fields.DBDBLinks),
        ("GENES", fields.DBSimpleField),
    ]
526
527
class Orthology(DBDataBase):
    """
    Interface to the KEGG Orthology ('ko') database.
    """
    DB = "ko"
    ENTRY_TYPE = OrthologyEntry

    def __init__(self):
        super(Orthology, self).__init__()
        listing = self.api.list("ko")
        self._keys = [item.entry_id for item in listing]
535
536
@entry.entry_decorate
class PathwayEntry(entry.DBEntry):
    """
    A single entry of the KEGG Pathway database.
    """
    FIELDS = [
        ("ENTRY", fields.DBEntryField),
        ("NAME", fields.DBNameField),
        ("DESCRIPTION", fields.DBSimpleField),
        ("CLASS", fields.DBSimpleField),
        ("PATHWAY_MAP", fields.DBPathwayMapField),
        ("MODULE", fields.DBSimpleField),
        ("DISEASE", fields.DBSimpleField),
        ("DRUG", fields.DBSimpleField),
        ("DBLINKS", fields.DBDBLinks),
        ("ORGANISM", fields.DBSimpleField),
        ("GENE", fields.DBGeneField),
        ("ENZYME", fields.DBEnzymeField),
        ("COMPOUND", fields.DBCompoundField),
        ("REFERENCE", fields.DBReference),
        ("REL_PATHWAY", fields.DBSimpleField),
        ("KO_PATHWAY", fields.DBSimpleField),
    ]

    MULTIPLE_FIELDS = ["REFERENCE"]

    @property
    def gene(self):
        """
        The converted GENE field values, each prefixed with '<org>:' when
        the organism text contains a '[GN:<org>]' tag; ``None`` if the
        entry has no GENE attribute.
        """
        if not hasattr(self, "GENE"):
            return None
        gene_ids = self.GENE._convert()

        prefix = ""
        organism = self.organism
        if organism:
            # Only the first '[GN:...]' tag is used.
            found = re.search(r"\[GN:([a-z]+)\]", organism)
            if found:
                prefix = found.group(1) + ":"
        return [prefix + gene_id for gene_id in gene_ids]
575
576
class Pathway(DBDataBase):
    """
    Interface to the KEGG Pathway ('path') database.
    """
    DB = "path"
    ENTRY_TYPE = PathwayEntry

    def __init__(self):
        super(Pathway, self).__init__()
        listing = self.api.list("path")
        self._keys = [item.entry_id for item in listing]
Note: See TracBrowser for help on using the repository browser.