source: orange-bioinformatics/_bioinformatics/obiKEGG/databases.py @ 1747:fee3c71716ef

Revision 1747:fee3c71716ef, 16.2 KB checked in by markotoplak, 12 months ago (diff)

KEGG organism name finding fixes.

RevLine 
[1532]1"""
[1736]2DBGET Database Interface
3========================
4
[1532]5"""
[1535]6from __future__ import absolute_import
7
8import re
[1532]9
10from . import entry
11from .entry import fields
12from . import api
13
[1733]14
def iter_take(source_iter, n):
    """
    Collect at most `n` leading items of `source_iter` into a list.

    Fewer than `n` items are returned when the source runs out early.
    When `source_iter` is an iterator, only the returned items are consumed
    from it, so repeated calls continue where the previous one stopped.
    """
    iterator = iter(source_iter)
    taken = []
    for _ in range(n):
        try:
            taken.append(next(iterator))
        except StopIteration:
            # Source exhausted before `n` items were collected.
            break
    return taken
21
[1733]22
def batch_iter(source_iter, n):
    """
    Yield successive lists of up to `n` items taken from `source_iter`.

    Every batch except possibly the last holds exactly `n` items; iteration
    ends as soon as an empty batch would be produced.
    """
    items = iter(source_iter)
    chunk = iter_take(items, n)
    while chunk:
        yield chunk
        chunk = iter_take(items, n)
[1733]34
35
def chain_iter(chains_iter):
    """
    Lazily yield every element of every iterable in `chains_iter`, in order.

    :param chains_iter: an iterable whose items are themselves iterable.
    """
    # The loop variable used to be called `iter`, shadowing the builtin.
    for chain in chains_iter:
        for element in chain:
            yield element
40
[1733]41
# TODO: DBDataBase should be constructible from a flat text entry file.
# The pre-caching logic etc. should be moved into the caching API, which
# would create a simple file system hierarchy where each flat database is
# saved (together with its db release string), e.g.
46# genes/hsa.dbget
47# genes/hsa.release
48# genes/sce.dbget
49# path.dbget
50# module.dbget
51# ligand/compound.dbget
52
53
class DBDataBase(object):
    """
    Base class for a DBGET database interface.

    Offers a read-only, dict-like view (``keys``, ``items``, ``values``,
    ``get``, ``[]``, ``in``, ``len``, iteration) over the entries of a single
    database. Subclasses set :attr:`DB` and :attr:`ENTRY_TYPE` and fill
    ``self._keys`` after calling the base ``__init__``.

    """
    #: ENTRY_TYPE constructor (a :class:`~.entry.DBEntry` subclass). This
    #: should be redefined in subclasses.
    ENTRY_TYPE = entry.DBEntry

    #: A database name/abbreviation (e.g. 'pathway'). Needs to be set in a
    #: subclass or object instance's constructor before calling the base.
    #: __init__
    DB = None

    def __init__(self, **kwargs):
        # NOTE: `kwargs` is accepted but not used by this base implementation.
        if not self.DB:
            raise TypeError("Cannot make an instance of abstract base "
                            "class %r." % type(self).__name__)

        self.api = api.CachedKeggApi()
        self.info = self.api.info(self.DB)
        # Pin the api's default release to the release reported for this db.
        release = self.info.release
        self.api.set_default_release(release)
        self._keys = []

    def keys(self):
        """
        Return a list of database keys. These are unique KEGG identifiers
        that can be used to query the database.

        """
        return list(self._keys)

    def iterkeys(self):
        """
        Return an iterator over the `keys`.
        """
        return iter(self._keys)

    def items(self):
        """
        Return a list of all (key, :obj:`DBDataBase.ENTRY_TYPE` instance)
        tuples.

        """
        # NOTE(review): pairing by position assumes `batch_get` returns
        # exactly one entry per key, in key order -- confirm.
        return list(zip(self.keys(), self.batch_get(self.keys())))

    def iteritems(self):
        """
        Return an iterator over the `items`.
        """
        batch_size = 100
        iterkeys = self.iterkeys()
        return chain_iter(zip(batch, self.batch_get(batch))
                          for batch in batch_iter(iterkeys, batch_size))

    def values(self):
        """
        Return a list of all :obj:`DBDataBase.ENTRY_TYPE` instances.
        """
        return self.batch_get(self.keys())

    def itervalues(self):
        """
        Return an iterator over all :obj:`DBDataBase.ENTRY_TYPE` instances.
        """
        batch_size = 100
        iterkeys = self.iterkeys()
        return chain_iter(self.batch_get(batch)
                          for batch in batch_iter(iterkeys, batch_size))

    def get(self, key, default=None):
        """
        Return an :obj:`DBDataBase.ENTRY_TYPE` instance for the `key`, or
        `default` if the lookup raises :class:`KeyError`.

        """
        try:
            return self.__getitem__(key)
        except KeyError:
            return default

    def has_key(self, key):
        """
        Return True if `key` is one of the database keys (same as `in`).
        """
        return self.__contains__(key)

    def __getitem__(self, key):
        e = self.get_entry(key)
        if e is None:
            raise KeyError(key)
        else:
            return e

    def __contains__(self, key):
        # Plain list membership; no need to build a throwaway set per call.
        return key in self.keys()

    def __len__(self):
        return len(self.keys())

    def __iter__(self):
        return iter(self.keys())

    def get_text(self, key):
        """
        Return the database entry for `key` as plain text.
        """
        key = self._add_db(key)
        return self.api.get([key])

    def get_entry(self, key):
        """
        Return the database entry for `key` as an instance of `ENTRY_TYPE`,
        or ``None`` if no text could be retrieved for it.
        """
        text = self.get_text(key)
        # An empty response or the literal string "None" means "no entry".
        if not text or text == "None":
            return None
        else:
            return self.ENTRY_TYPE(text)

    def find(self, name):
        """
        Find `name` using kegg `find` api.

        Return the list of entry identifiers, i.e. the first
        whitespace-delimited token of every non-blank result line.
        """
        res = self.api.find(self.DB, name).splitlines()
        # Split on any whitespace: the identifier may be followed by a tab
        # rather than a space. Blank lines carry no identifier and are
        # skipped.
        return [r.split(None, 1)[0] for r in res if r.strip()]

    def pre_cache(self, keys=None, batch_size=10, progress_callback=None):
        """
        Retrieve all the entries for `keys` and cache them locally for faster
        subsequent retrieval. If `keys` is ``None`` then all entries will be
        retrieved.

        :param keys: iterable of keys to retrieve (default: all keys).
        :param int batch_size: number of entries per request (1 to 10).
        :param progress_callback: optional callable; called after each batch
            with the percentage (0-100 float) of keys that preceded it.
        :raises TypeError: if ``self.api`` is not an `api.CachedKeggApi`.
        :raises ValueError: if `batch_size` is outside the 1..10 range.

        """
        if not isinstance(self.api, api.CachedKeggApi):
            raise TypeError("Not an instance of api.CachedKeggApi")

        if batch_size > 10 or batch_size < 1:
            raise ValueError("Invalid batch_size")

        if keys is None:
            keys = self.keys()

        keys = list(keys)
        for start in range(0, len(keys), batch_size):
            # A real list (not a lazy `map` object) so the api receives the
            # same type on Python 2 and Python 3.
            batch = [self._add_db(key)
                     for key in keys[start: start + batch_size]]

            # The result is discarded; the call is made only to fill the
            # cache.
            self.api.get(batch)

            if progress_callback:
                progress_callback(100.0 * start / len(keys))

    def batch_get(self, keys):
        """
        Batch retrieve all entries for keys. This can be significantly
        faster than getting each entry separately especially if entries
        are not yet cached.

        Return a list of `ENTRY_TYPE` instances.

        """
        entries = []
        batch_size = 10
        keys = list(keys)
        for start in range(0, len(keys), batch_size):
            # A real list (not a lazy `map` object) so the api receives the
            # same type on Python 2 and Python 3.
            batch = [self._add_db(key)
                     for key in keys[start: start + batch_size]]
            batch_entries = self.api.get(batch)
            if batch_entries is not None:
                # Entries in the flat text are terminated by a '///' line.
                batch_entries = batch_entries.split("///\n")
                # Remove possible empty last line
                batch_entries = [e for e in batch_entries if e.strip()]
                entries.extend(self.ENTRY_TYPE(text)
                               for text in batch_entries)

        return entries

    def _add_db(self, key):
        """
        Prefix the key with '%(DB)s:' string if not already prefixed.
        """
        if not key.startswith(self.DB + ":"):
            return self.DB + ":" + key
        else:
            return key
[1733]240
241
@entry.entry_decorate
class GenomeEntry(entry.DBEntry):
    """
    A single entry of the KEGG Genome database.
    """
    # Recognized section names paired with the field class that parses them.
    FIELDS = [
        ("ENTRY", fields.DBEntryField),
        ("NAME", fields.DBNameField),
        ("DEFINITION", fields.DBDefinitionField),
        ("ANNOTATION", fields.DBSimpleField),
        ("TAXONOMY", fields.DBTaxonomyField),
        ("DATA_SOURCE", fields.DBSimpleField),
        ("ORIGINAL_DB", fields.DBSimpleField),
        ("KEYWORDS", fields.DBSimpleField),
        ("DISEASE", fields.DBSimpleField),
        ("COMMENT", fields.DBSimpleField),
        ("CHROMOSOME", fields.DBFieldWithSubsections),
        ("PLASMID", fields.DBSimpleField),
        ("STATISTICS", fields.DBSimpleField),
        ("REFERENCE", fields.DBReference),
    ]

    # Sections that may occur more than once in one entry.
    MULTIPLE_FIELDS = ["REFERENCE"]

    def __init__(self, text):
        entry.DBEntry.__init__(self, text)

    @property
    def organism_code(self):
        """
        The KEGG organism code (e.g. 'hsa', 'sce', ...): the part of the
        entry name that precedes the first comma.
        """
        name_parts = self.name.split(",", 1)
        return name_parts[0]

    @property
    def taxid(self):
        """
        NCBI taxonomy id of the organism, read from the TAXONOMY field.
        """
        taxonomy = self.TAXONOMY
        return taxonomy.taxid

    def org_code(self):
        # Kept for backwards compatibility; same value as `organism_code`.
        return self.organism_code
[1733]286
[1532]287
class Genome(DBDataBase):
    """
    An interface to the KEGG GENOME database.
    """
    DB = "genome"
    ENTRY_TYPE = GenomeEntry

    # For obiTaxonomy.common_taxids mapping
    TAXID_MAP = {
        "562": "511145",   # Escherichia coli K-12 MG1655
        "2104": "272634",  # Mycoplasma pneumoniae M129
        "4530": "39947",   # Oryza sativa ssp. japonica cultivar Nipponbare (Japanese rice)
        "4932": "559292",  # Saccharomyces cerevisiae S288C
        "4896": "284812",  # Schizosaccharomyces pombe 972h-
    }

    def __init__(self):
        DBDataBase.__init__(self)
        self._org_list = self.api.list_organisms()
        self._keys = [org.entry_id for org in self._org_list]

    def _key_to_gn_entry_id(self, key):
        """
        Resolve `key` to a single genome entry id using :meth:`find`.

        :raises KeyError: if nothing matches `key`.
        :raises ValueError: if more than one entry matches `key`.
        """
        res = self.find(key)
        if len(res) == 0:
            raise KeyError("Unknown key")
        elif len(res) > 1:
            raise ValueError("Not a unique key")
        else:
            return res[0]

    @classmethod
    def common_organisms(cls):
        """
        Return a list of organism codes of commonly used organisms.
        """
        return ['ath', 'bta', 'cel', 'cre', 'dre', 'ddi',
                'dme', 'eco', 'hsa', 'mmu', 'mpn', 'osa',
                'pfa', 'rno', 'sce', 'spo', 'zma', 'xla']

    @classmethod
    def essential_organisms(cls):
        """
        Return the (smaller) list of essential organism codes.
        """
        return ['ddi', 'dme', 'hsa', 'mmu', 'sce']

    def org_code_to_entry_key(self, code):
        """
        Map an organism code ('hsa', 'sce', ...) to the corresponding kegg
        identifier (T + 5 digit number).

        :raises ValueError: if no listed organism has the code `code`.

        """
        for org in self._org_list:
            if org.org_code == code:
                return org.entry_id
        # Reached only when no organism matched (the loop never breaks, so a
        # `for ... else` clause would add nothing here).
        raise ValueError("Unknown organism code '%s'" % code)

    def search(self, string, relevance=False):
        """
        Search the genome database for `string` using the kegg `find` api.

        Taxonomy ids listed in :attr:`TAXID_MAP` are replaced by their mapped
        ids before searching. Return a list with one item per usable result
        line: the second whitespace-delimited token of the text preceding
        the line's first comma.

        :raises NotImplementedError: if `relevance` is true.
        """
        if relevance:
            raise NotImplementedError("relevance is no longer supported")

        if string in self.TAXID_MAP:
            string = self.TAXID_MAP[string]

        res = self.api.find(self.DB, string).strip()
        if not res:
            return []

        found = []
        for line in res.splitlines():
            head = line.split(",", 1)[0]
            tokens = head.split(None, 1)
            # Skip lines without a second token instead of failing with an
            # IndexError on a malformed or blank result line.
            if len(tokens) == 2:
                found.append(tokens[1])
        return found
[1733]358
359
@entry.entry_decorate
class GeneEntry(entry.DBEntry):
    """
    A single entry of a KEGG Genes database (entry type of :class:`Genes`).
    """
    # Recognized section names paired with the field class that parses them.
    FIELDS = [
        ("ENTRY", fields.DBEntryField),
        ("NAME", fields.DBNameField),
        ("DEFINITION", fields.DBDefinitionField),
        ("ORTHOLOGY", fields.DBSimpleField),
        ("ORGANISM", fields.DBSimpleField),
        ("PATHWAY", fields.DBPathway),
        ("MODULE", fields.DBSimpleField),
        ("DISEASE", fields.DBSimpleField),
        ("DRUG_TARGET", fields.DBSimpleField),
        ("CLASS", fields.DBSimpleField),
        ("MOTIF", fields.DBSimpleField),
        ("DBLINKS", fields.DBDBLinks),
        ("STRUCTURE", fields.DBSimpleField),
        ("POSITION", fields.DBSimpleField),
        ("AASEQ", fields.DBAASeq),
        ("NTSEQ", fields.DBNTSeq),
    ]

    def aliases(self):
        """
        Return a list of names for this gene: the entry key, then the
        comma-separated parts of the name (if any), then the first value of
        every db link (if any).
        """
        names = [self.entry_key]
        if self.name:
            names = names + self.name.split(",")
        if self.dblinks:
            names = names + [item[1][0] for item in self.dblinks.items()]
        return names

    @property
    def alt_names(self):
        """
        Same as :meth:`aliases`; kept for backwards compatibility.
        """
        return self.aliases()
[1733]393
394
class Genes(DBDataBase):
    """
    Interface to the KEGG Genes database of a single organism.

    :param str org_code: KEGG organism code (e.g. 'hsa').

    """
    DB = None  # Needs to be set in __init__
    ENTRY_TYPE = GeneEntry

    def __init__(self, org_code):
        # TODO: Map to org code from kegg id (T + 5 digits)
        # The organism code doubles as the database name, so it must be set
        # before the base constructor checks `self.DB`.
        self.org_code = org_code
        self.DB = org_code
        DBDataBase.__init__(self)
        self._keys = self.api.get_genes_by_organism(org_code)

    def gene_aliases(self):
        """
        Return a dict mapping every alias of every gene to that gene's
        '<org_code>:<entry_key>' identifier.
        """
        alias_map = {}
        for gene in self.itervalues():
            names = gene.aliases()
            target = self.org_code + ":" + gene.entry_key
            for alias in names:
                alias_map[alias] = target
        return alias_map
[1733]421
[1532]422
@entry.entry_decorate
class CompoundEntry(entry.DBEntry):
    """
    A single compound entry (entry type of :class:`Compound`).
    """
    # Recognized section names paired with the field class that parses them.
    FIELDS = [
        ("ENTRY", fields.DBEntryField),
        ("NAME", fields.DBNameField),
        ("FORMULA", fields.DBSimpleField),
        ("EXACT_MASS", fields.DBSimpleField),
        ("MOL_WEIGHT", fields.DBSimpleField),
        ("REMARK", fields.DBSimpleField),
        ("COMMENT", fields.DBSimpleField),
        ("REACTION", fields.DBSimpleField),
        ("PATHWAY", fields.DBPathway),
        ("ENZYME", fields.DBSimpleField),
        ("BRITE", fields.DBSimpleField),
        ("REFERENCE", fields.DBSimpleField),
        ("DBLINKS", fields.DBDBLinks),
        ("ATOM", fields.DBSimpleField),
        ("BOND", fields.DBSimpleField),
    ]
442
443
class Compound(DBDataBase):
    """
    Interface to the 'cpd' (compound) database.
    """
    DB = "cpd"
    ENTRY_TYPE = CompoundEntry

    def __init__(self):
        DBDataBase.__init__(self)
        # The keys are the entry ids of everything the api lists for 'cpd'.
        listing = self.api.list("cpd")
        self._keys = [item.entry_id for item in listing]
[1532]451
452
@entry.entry_decorate
class ReactionEntry(entry.DBEntry):
    """
    A single reaction entry (entry type of :class:`Reaction`).
    """
    # Recognized section names paired with the field class that parses them.
    FIELDS = [
        ("ENTRY", fields.DBEntryField),
        ("NAME", fields.DBNameField),
        ("DEFINITION", fields.DBDefinitionField),
        ("EQUATION", fields.DBSimpleField),
        ("ENZYME", fields.DBSimpleField),
    ]
462
463
class Reaction(DBDataBase):
    """
    Interface to the 'rn' (reaction) database.
    """
    DB = "rn"
    ENTRY_TYPE = ReactionEntry

    def __init__(self):
        DBDataBase.__init__(self)
        # The keys are the entry ids of everything the api lists for 'rn'.
        listing = self.api.list("rn")
        self._keys = [item.entry_id for item in listing]
471
472
class Brite(DBDataBase):
    """
    Interface to the 'br' database.

    Uses the base entry type and does not populate its key list.
    """
    DB = "br"
[1733]475
476
class Disease(DBDataBase):
    """
    Interface to the 'ds' database.

    Uses the base entry type and does not populate its key list.
    """
    DB = "ds"
[1733]479
480
class Drug(DBDataBase):
    """
    Interface to the 'dr' database.

    Uses the base entry type and does not populate its key list.
    """
    DB = "dr"
[1733]483
484
@entry.entry_decorate
class EnzymeEntry(entry.DBEntry):
    """
    A single enzyme entry (entry type of :class:`Enzyme`).
    """
    # Recognized section names paired with the field class that parses them.
    FIELDS = [
        ("ENTRY", fields.DBEntryField),
        ("NAME", fields.DBNameField),
        ("CLASS", fields.DBSimpleField),
        ("SYSNAME", fields.DBSimpleField),
        ("REACTION", fields.DBSimpleField),
        ("ALL_REAC", fields.DBSimpleField),
        ("SUBSTRATE", fields.DBSimpleField),
        ("PRODUCT", fields.DBSimpleField),
        ("COMMENT", fields.DBSimpleField),
        ("REFERENCE", fields.DBReference),
        ("PATHWAY", fields.DBPathway),
        ("ORTHOLOGY", fields.DBSimpleField),
        ("GENES", fields.DBSimpleField),
        ("DBLINKS", fields.DBDBLinks),
    ]

    # Sections that may occur more than once in one entry.
    MULTIPLE_FIELDS = ["REFERENCE"]
[1733]505
506
class Enzyme(DBDataBase):
    """
    Interface to the 'ec' (enzyme) database.
    """
    DB = "ec"
    ENTRY_TYPE = EnzymeEntry

    def __init__(self):
        DBDataBase.__init__(self)
        # The keys are the entry ids of everything the api lists for 'ec'.
        listing = self.api.list("ec")
        self._keys = [item.entry_id for item in listing]
514
515
@entry.entry_decorate
class OrthologyEntry(entry.DBEntry):
    """
    A single orthology entry (entry type of :class:`Orthology`).
    """
    # Recognized section names paired with the field class that parses them.
    FIELDS = [
        ("ENTRY", fields.DBEntryField),
        ("NAME", fields.DBNameField),
        ("CLASS", fields.DBSimpleField),
        ("DBLINKS", fields.DBDBLinks),
        ("GENES", fields.DBSimpleField),
    ]
525
526
class Orthology(DBDataBase):
    """
    Interface to the 'ko' (orthology) database.
    """
    DB = "ko"
    ENTRY_TYPE = OrthologyEntry

    def __init__(self):
        DBDataBase.__init__(self)
        # The keys are the entry ids of everything the api lists for 'ko'.
        listing = self.api.list("ko")
        self._keys = [item.entry_id for item in listing]
534
535
@entry.entry_decorate
class PathwayEntry(entry.DBEntry):
    """
    A single pathway entry (entry type of :class:`Pathway`).
    """
    # Recognized section names paired with the field class that parses them.
    FIELDS = [
        ("ENTRY", fields.DBEntryField),
        ("NAME", fields.DBNameField),
        ("DESCRIPTION", fields.DBSimpleField),
        ("CLASS", fields.DBSimpleField),
        ("PATHWAY_MAP", fields.DBPathwayMapField),
        ("MODULE", fields.DBSimpleField),
        ("DISEASE", fields.DBSimpleField),
        ("DRUG", fields.DBSimpleField),
        ("DBLINKS", fields.DBDBLinks),
        ("ORGANISM", fields.DBSimpleField),
        ("GENE", fields.DBGeneField),
        ("ENZYME", fields.DBEnzymeField),
        ("COMPOUND", fields.DBCompoundField),
        ("REFERENCE", fields.DBReference),
        ("REL_PATHWAY", fields.DBSimpleField),
        ("KO_PATHWAY", fields.DBSimpleField),
    ]

    # Sections that may occur more than once in one entry.
    MULTIPLE_FIELDS = ["REFERENCE"]

    @property
    def gene(self):
        """
        The converted contents of the GENE field as a list, or ``None`` when
        the entry has no GENE field.

        When the organism text contains a '[GN:<code>]' tag, every item is
        prefixed with '<code>:'.
        """
        if not hasattr(self, "GENE"):
            return None
        gene_ids = self.GENE._convert()

        prefix = ""
        organism = self.organism
        if organism:
            # Only the first '[GN:...]' tag in the organism text is used.
            tag = re.search(r"\[GN:([a-z]+)\]", organism)
            if tag is not None:
                prefix = tag.group(1) + ":"
        return [prefix + gene_id for gene_id in gene_ids]
574
575
class Pathway(DBDataBase):
    """
    KEGG Pathway database

    :param str prefix:
        KEGG Organism code ('hsa', ...) or 'map', 'ko', 'ec' or 'rn'

    :raises ValueError: if `prefix` is none of the above.

    """
    DB = "path"
    ENTRY_TYPE = PathwayEntry

    def __init__(self, prefix="map"):
        DBDataBase.__init__(self)
        self.prefix = prefix

        # Every listed organism code is accepted, plus the reference prefixes.
        allowed = [org.org_code for org in self.api.list_organisms()]
        allowed = allowed + ["map", "ko", "ec", "rn"]
        if prefix not in allowed:
            raise ValueError("Invalid prefix %r" % prefix)

        listing = self.api.list("pathway/" + prefix)
        self._keys = [item.entry_id for item in listing]
Note: See TracBrowser for help on using the repository browser.