Changeset 1733:548d1187a29f in orange-bioinformatics for _bioinformatics/obiKEGG/__init__.py


Ignore:
Timestamp:
03/05/13 19:48:00 (14 months ago)
Author:
Ales Erjavec <ales.erjavec@…>
Branch:
default
Message:

Porting obiKEGG to use the new REST KEGG API.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • _bioinformatics/obiKEGG/__init__.py

    r1716 r1733  
    44============================================== 
    55 
    6 This is a python module for access to `KEGG`_ using its web services. To use this module you need to have 
    7 `SUDS`_ python library installed (other backends are planed).  
     6This is a python module for access to `KEGG`_ using its web services. 
     7 
     8To use this module you need to have `slumber` and `requests` package 
     9installed. 
    810 
    911.. _`KEGG`: http://www.genome.jp/kegg/ 
    1012 
    11 .. _`SUDS`: http://pypi.python.org/pypi/suds/ 
    1213 
    1314""" 
    1415from __future__ import absolute_import 
    1516 
     17import os 
     18import sys 
    1619import urllib2 
    17 import os, sys 
     20 
    1821from collections import defaultdict 
    1922 
     
    2124 
    2225from Orange.utils import lru_cache 
     26from Orange.utils import progress_bar_milestones 
     27from Orange.utils import deprecated_keywords, deprecated_attribute, \ 
     28                         deprecated_function_name 
     29 
     30from .. import obiProb 
    2331 
    2432from . import databases 
     
    3341KEGGGenome = databases.Genome 
    3442KEGGGenes = databases.Genes 
    35 KEGGEnzymes = databases.Enzymes 
    36 KEGGReaction = databases.Reactions 
    37 KEGGPathways = databases.Pathways 
     43KEGGEnzyme = databases.Enzyme 
     44KEGGReaction = databases.Reaction 
     45KEGGPathways = databases.Pathway 
     46KEGGCompound = databases.Compound 
    3847 
    3948KEGGBrite = Brite 
     
    4554 
    4655 
    47 from .. import obiProb 
    48 from Orange.utils import deprecated_keywords, deprecated_attribute 
    49  
    50 class OrganismNotFoundError(Exception): pass 
     56class OrganismNotFoundError(Exception): 
     57    pass 
     58 
    5159 
    5260class Organism(object): 
     61    """ 
     62    A convenience class for retrieving information regarding an 
     63    organism in the KEGG Genes database. 
     64 
     65    :param org: KEGG organism code (e.g. "hsa", "sce") 
     66    :type org: str 
     67 
     68    """ 
    5369    def __init__(self, org, genematcher=None): 
    5470        self.org_code = self.organism_name_search(org) 
    5571        self.genematcher = genematcher 
    5672        self.api = api.CachedKeggApi() 
    57          
     73 
    5874    @property 
    5975    def org(self): 
     76        """ 
     77        KEGG organism code. 
     78        """ 
    6079        return self.org_code 
    61      
     80 
    6281    @property 
    6382    def genes(self): 
     83        """ 
     84        A :class:`Genes` database instance for this organism. 
     85        """ 
     86        # TODO: This should not be a property but a method. 
     87        # I think it was only put here for backward compatibility with old obiKEGG. 
    6488        if not hasattr(self, "_genes"): 
    6589            genes = KEGGGenes(self.org_code) 
    6690            self._genes = genes 
    6791        return self._genes 
    68      
     92 
    6993    def gene_aliases(self): 
    70         return self.genes().gene_aliases() 
    71      
     94        """ 
     95        Return known gene aliases (synonyms in other databases). 
     96        """ 
     97        return self.genes.gene_aliases() 
     98 
    7299    def pathways(self, with_ids=None): 
     100        """ 
     101        Return a list of all pathways for this organism. 
     102        """ 
    73103        if with_ids is not None: 
    74104            return self.api.get_pathways_by_genes(with_ids) 
     
    77107     
    78108    def list_pathways(self): 
     109        """ 
     110        List all pathways. 
     111        """ 
     112        # NOTE: remove/deprecate and use pathways() 
    79113        return self.pathways() 
    80114     
     
    97131                tabs = l.split("\t") 
    98132                cset = set([tabs[0]]) 
     133 
     134                if ":" in tabs[0]: 
     135                    # also add 'identifier' from 'org_code:identifier' 
     136                    cset.add(tabs[0].split(":", 1)[-1]) 
     137 
    99138                try: 
    100139                    rest = tabs[1].split(";")[0] 
    101140                    cset |= set(rest.split(", ")) 
    102141                except: 
    103                     pass #do not crash if a line does not conform 
     142                    pass  # do not crash if a line does not conform 
    104143                out.append(cset) 
    105144        return out 
    106145 
    107     def get_enriched_pathways(self, genes, reference=None, prob=obiProb.Binomial(), callback=None): 
    108         """ Return a dictionary with enriched pathways ids as keys 
    109         and (list_of_genes, p_value, num_of_reference_genes) tuples  
     146    def get_enriched_pathways(self, genes, reference=None, 
     147                              prob=obiProb.Binomial(), callback=None): 
     148        """ 
     149        Return a dictionary with enriched pathways ids as keys 
     150        and (list_of_genes, p_value, num_of_reference_genes) tuples 
    110151        as items. 
    111          
    112         """ 
    113         allPathways = defaultdict(lambda :[[], 1.0, []]) 
    114         from Orange.orng import orngMisc 
    115         milestones = orngMisc.progressBarMilestones(len(genes), 100) 
     152 
     153        """ 
     154        if reference is None: 
     155            reference = self.genes.keys() 
     156        reference = set(reference) 
     157 
     158        allPathways = defaultdict(lambda: [[], 1.0, []]) 
     159        milestones = progress_bar_milestones(len(genes), 100) 
    116160        pathways_db = KEGGPathways() 
    117          
     161 
    118162        pathways_for_gene = [] 
    119163        for i, gene in enumerate(genes): 
    120164            pathways_for_gene.append(self.pathways([gene])) 
    121165            if callback and i in milestones: 
    122                 callback(i*50.0/len(genes)) 
    123                  
    124         # precache for speed  
    125         pathways_db.pre_cache([pid for pfg in pathways_for_gene for pid in pfg])  
     166                callback(i * 50.0 / len(genes)) 
     167 
     168        # pre-cache for speed 
     169        pathways_db.pre_cache([pid for pfg in pathways_for_gene 
     170                               for pid in pfg]) 
    126171        for i, (gene, pathways) in enumerate(zip(genes, pathways_for_gene)): 
    127172            for pathway in pathways: 
    128                 if pathways_db.get_entry(pathway).gene:  
     173                if pathways_db.get_entry(pathway).gene: 
    129174                    allPathways[pathway][0].append(gene) 
    130175            if callback and i in milestones: 
    131                 callback(50.0 + i*50.0/len(genes)) 
    132         reference = set(reference if reference is not None else self.genes.keys()) 
    133          
     176                callback(50.0 + i * 50.0 / len(genes)) 
     177 
    134178        pItems = allPathways.items() 
    135          
     179 
    136180        for i, (p_id, entry) in enumerate(pItems): 
    137181            pathway = pathways_db.get_entry(p_id) 
    138182            entry[2].extend(reference.intersection(pathway.gene or [])) 
    139             entry[1] = prob.p_value(len(entry[0]), len(reference), len(entry[2]), len(genes)) 
    140         return dict([(pid, (genes, p, len(ref))) for pid, (genes, p, ref) in allPathways.items()]) 
    141          
     183            entry[1] = prob.p_value(len(entry[0]), len(reference), 
     184                                    len(entry[2]), len(genes)) 
     185        return dict([(pid, (genes, p, len(ref))) 
     186                     for pid, (genes, p, ref) in allPathways.items()]) 
     187 
    142188    def get_genes_by_enzyme(self, enzyme): 
    143         enzyme = Enzymes().get_entry(enzyme) 
     189        enzyme = KEGGEnzyme().get_entry(enzyme) 
    144190        return enzyme.genes.get(self.org_code, []) if enzyme.genes else [] 
    145      
     191 
    146192    def get_genes_by_pathway(self, pathway_id): 
    147193        return KEGGPathway(pathway_id).genes() 
    148      
     194 
    149195    def get_enzymes_by_pathway(self, pathway_id): 
    150196        return KEGGPathway(pathway_id).enzymes() 
     
    156202        return self.api.get_pathways_by_genes(gene_ids) 
    157203        gene_ids = set(gene_ids) 
    158         pathways = [self.genes[id].pathway for id in gene_ids if self.genes[id].pathway] 
     204        pathways = [self.genes[id].pathway for id in gene_ids 
     205                    if self.genes[id].pathway] 
    159206        pathways = reduce(set.union, pathways, set()) 
    160         return [id for id in pathways if gene_ids.issubset(KEGGPathway(id).genes())]  
    161      
     207        return [id for id in pathways 
     208                if gene_ids.issubset(KEGGPathway(id).genes())] 
     209 
    162210    def get_pathways_by_enzymes(self, enzyme_ids): 
    163211        enzyme_ids = set(enzyme_ids) 
    164         pathways = [KEGGEnzymes()[id].pathway for id in enzyme_ids] 
    165         pathwats = reduce(set.union, pathways, set()) 
    166         return [id for id in pathways if enzyme_ids.issubset(KEGGPathway(id).enzymes())] 
    167      
     212        pathways = [KEGGEnzyme()[id].pathway for id in enzyme_ids] 
     213        pathways = reduce(set.union, pathways, set()) 
     214        return [id for id in pathways 
     215                if enzyme_ids.issubset(KEGGPathway(id).enzymes())] 
     216 
    168217    def get_pathways_by_compounds(self, compound_ids): 
    169218        compound_ids = set(compound_ids) 
    170         pathways = [KEGGCompounds()[id].pathway for id in compound_ids] 
    171         pathwats = reduce(set.union, pathways, set()) 
    172         return [id for id in pathways if compound_ids.issubset(KEGGPathway(id).compounds())] 
    173      
     219        pathways = [KEGGCompound()[id].pathway for id in compound_ids] 
     220        pathways = reduce(set.union, pathways, set()) 
     221        return [id for id in pathways 
     222                if compound_ids.issubset(KEGGPathway(id).compounds())] 
     223 
    174224    def get_enzymes_by_compound(self, compound_id): 
    175225        return KEGGCompound()[compound_id].enzyme 
    176      
     226 
    177227    def get_enzymes_by_gene(self, gene_id): 
    178228        return self.genes[gene_id].enzymes 
    179      
     229 
    180230    def get_compounds_by_enzyme(self, enzyme_id): 
    181231        return self._enzymes_to_compounds.get(enzyme_id) 
     
    183233    @deprecated_keywords({"caseSensitive": "case_sensitive"}) 
    184234    def get_unique_gene_ids(self, genes, case_sensitive=True): 
    185         """Return a tuple with three elements. The first is a dictionary mapping from unique gene 
    186         ids to gene names in genes, the second is a list of conflicting gene names and the third is a list 
    187         of unknown genes. 
     235        """ 
     236        Return a tuple with three elements. The first is a dictionary 
     237        mapping from unique gene ids to gene names in genes, the second 
     238        is a list of conflicting gene names and the third is a list of 
     239        unknown genes. 
     240 
    188241        """ 
    189242        unique, conflicting, unknown = {}, [], [] 
     
    197250                conflicting.append(gene) 
    198251        return unique, conflicting, unknown 
    199      
     252 
     253    @deprecated_function_name 
    200254    def get_genes(self): 
    201255        return self.genes 
    202      
     256 
    203257    @classmethod 
    204258    def organism_name_search(cls, name): 
     
    213267             
    214268        try: 
    215             return genome[name].entry_key 
     269            return genome[name].organism_code 
    216270        except KeyError: 
    217271            raise OrganismNotFoundError(name) 
     
    221275        name = cls.organism_name_search(name) 
    222276        genome = KEGGGenome() 
    223         info = genome.api.binfo(name) 
     277        info = genome.api.info(name) 
    224278        return info.release 
    225      
     279 
    226280    def _set_genematcher(self, genematcher): 
    227281        setattr(self, "_genematcher", genematcher) 
     
    231285            from .. import obiGene 
    232286            if self.org_code == "ddi": 
    233                 self._genematcher = obiGene.matcher([obiGene.GMKEGG(self.org_code), obiGene.GMDicty(), 
    234                                                      [obiGene.GMKEGG(self.org_code), obiGene.GMDicty()]]) 
     287                self._genematcher = obiGene.matcher( 
     288                    [obiGene.GMKEGG(self.org_code), obiGene.GMDicty(), 
     289                    [obiGene.GMKEGG(self.org_code), obiGene.GMDicty()]] 
     290                ) 
    235291            else: 
    236                 self._genematcher = obiGene.matcher([obiGene.GMKEGG(self.org_code)]) 
     292                self._genematcher = obiGene.matcher( 
     293                    [obiGene.GMKEGG(self.org_code)]) 
     294 
    237295            self._genematcher.set_targets(self.genes.keys()) 
    238296        return self._genematcher 
    239297     
    240298    genematcher = property(_get_genematcher, _set_genematcher) 
    241      
     299 
     300 
    242301KEGGOrganism = Organism 
    243      
     302 
     303 
    244304def organism_name_search(name): 
    245305    return KEGGOrganism.organism_name_search(name) 
     
    259319        if e.taxid in [taxid,  genome.TAXID_MAP.get(taxid, taxid)]: 
    260320            return e.org_code() 
    261          
     321 
    262322    return None 
    263323 
     
    266326    if name in genome: 
    267327        return genome[name].taxid 
    268      
     328 
    269329    keys = genome.search(name) 
    270330    if keys: 
     
    276336    pass 
    277337 
     338 
    278339def main(): 
    279340    KEGGGenome() 
     
    282343    doctest.testmod(optionflags=doctest.ELLIPSIS, extraglobs=extraglobs) 
    283344 
     345 
    284346if __name__ == "__main__": 
    285347    sys.exit(main()) 
Note: See TracChangeset for help on using the changeset viewer.