source: orange-bioinformatics/Orange/bioinformatics/obiHomoloGene.py @ 1632:9cf919d0f343

Revision 1632:9cf919d0f343, 7.0 KB checked in by mitar, 2 years ago (diff)

Fixing imports.

Line 
1import sys, os
2import urllib2
3
4from collections import defaultdict
5
6from Orange.orng import orngServerFiles
7
class _homolog(object):
    """ A single record parsed from one tab-separated HomoloGene line.

    Only the leading columns are kept, one per name in ``__slots__``; any
    further columns on the line are ignored. If the line has fewer columns
    than there are slots, the remaining attributes are left unset.
    """
    __slots__ = ["group_id", "taxonomy_id", "gene_id", "gene_symbol"]

    def __init__(self, homolog_line):
        columns = homolog_line.split("\t")
        # Never index past the columns that are actually present.
        for index, name in enumerate(self.__slots__[:len(columns)]):
            setattr(self, name, columns[index])
13   
14   
class _Homologs(object):
    """ A base class for homolog mappers.

    Subclasses must implement :meth:`all_genes` and :meth:`homologs`;
    :meth:`homolog` is built on top of :meth:`homologs`.
    """
    def all_genes(self):
        """ Return all genes in this class instance.

        Raises NotImplementedError unless overridden by a subclass.
        """
        # NotImplemented is a comparison sentinel, not an exception; raising
        # it fails with a TypeError instead of signalling an abstract method.
        raise NotImplementedError

    def homologs(self, gene, taxid):
        """ Return all (homolotaxid, homolog) pairs of gene from organism with taxid.

        Raises NotImplementedError unless overridden by a subclass.
        """
        raise NotImplementedError

    def homolog(self, gene, taxid, homolotaxid):
        """ Return homolog of gene from organism with *taxid* in organism
        with *homolotaxid*, or None if there is no such homolog.
        """
        # If several pairs share the same homolotaxid the last one wins.
        homologs = dict(self.homologs(gene, taxid))
        return homologs.get(homolotaxid, None)
33   
class HomoloGene(_Homologs):
    """ Homolog mapper backed by the NCBI ``homologene.data`` file.

    The data file is located through ``orngServerFiles.localpath_download``
    and parsed into two in-memory indexes when the instance is created:

    * ``_homologs``: ``(taxonomy_id, gene_symbol) -> _homolog``
    * ``_homologs_by_group``: ``group_id -> [_homolog, ...]``

    All identifiers are kept as the strings read from the file.
    """
    DEFAULT_DATABASE_PATH = orngServerFiles.localpath("HomoloGene")
    VERSION = 1
    DOMAIN = "HomoloGene"
    FILENAME = "homologene.data"

    def __init__(self, local_database_path=None):
        """ Remember the database path and load the data file.

        NOTE(review): ``local_database_path`` is only stored; ``load`` always
        resolves the file through ``orngServerFiles.localpath_download``.
        """
        self.local_database_path = local_database_path if local_database_path else self.DEFAULT_DATABASE_PATH
        self.load()

    @classmethod
    def download_from_NCBI(cls, file=None):
        """ Download ``homologene.data`` from the NCBI FTP server.

        :param file: where to store the data. ``None`` writes to the default
            location under ``orngServerFiles.localpath("HomoloGene")``; an
            object with a ``write`` method is written to (and flushed, but
            left open for the caller); anything else is treated as a path.
        """
        response = urllib2.urlopen("ftp://ftp.ncbi.nlm.nih.gov/pub/HomoloGene/current/homologene.data")
        try:
            data = response.read()
        finally:
            response.close()

        if file is None:
            try:
                os.mkdir(orngServerFiles.localpath("HomoloGene"))
            except OSError:
                # Most likely the directory already exists; a genuine failure
                # surfaces when the file is opened below.
                pass
            file = orngServerFiles.localpath("HomoloGene", "homologene.data")

        if hasattr(file, "write"):
            # Caller-owned file object: write and flush, but do not close it.
            file.write(data)
            file.flush()
        else:
            # We opened it, so we are responsible for closing it.
            with open(file, "wb") as stream:
                stream.write(data)

    @classmethod
    def get_instance(cls):
        """ Return an instance sharing its state with all other instances
        obtained through this method, so the data file is loaded only once.
        """
        if not hasattr(cls, "_shared_dict"):
            h = cls()
            cls._shared_dict = h.__dict__
        # Bypass __init__ (and therefore load) and attach the shared state.
        h = cls.__new__(cls)
        h.__dict__ = cls._shared_dict
        return h

    def load(self):
        """ Read the data file and (re)build the lookup indexes.
        """
        path = orngServerFiles.localpath_download(self.DOMAIN, self.FILENAME)
        with open(path, "rb") as stream:
            content = stream.read()
        # Skip empty lines instead of blindly dropping the last element, so
        # the final record survives when the file lacks a trailing newline.
        records = [_homolog(line) for line in content.split("\n") if line]
        self._homologs = dict(((h.taxonomy_id, h.gene_symbol), h) for h in records)
        # Group only the records that survived de-duplication by
        # (taxonomy_id, gene_symbol).
        by_group = defaultdict(list)
        for h in self._homologs.values():
            by_group[h.group_id].append(h)
        self._homologs_by_group = by_group

    def all_genes(self, taxid=None):
        """ Return a list of gene symbols of the organism with *taxid*.

        Taxonomy ids are compared as read from the file (strings), so the
        default ``taxid=None`` matches no record and yields an empty list.
        """
        return [h.gene_symbol for h in self._homologs.values() if h.taxonomy_id == taxid]

    def homologs(self, gene, taxid):
        """ Return a list of (taxonomy_id, gene_symbol) pairs for every member
        of the homology group that *gene* of organism *taxid* belongs to.

        Returns an empty list when the gene is not known.
        """
        record = self._homologs.get((taxid, gene))
        if record is None:
            return []
        # .get avoids inserting empty groups into the defaultdict on lookup.
        members = self._homologs_by_group.get(record.group_id, [])
        return [(h.taxonomy_id, h.gene_symbol) for h in members]

    def homolog(self, gene, taxid, homolotaxid):
        """ Return the homolog of *gene* (organism *taxid*) in the organism
        with *homolotaxid*, or None if there is none.
        """
        homologs = dict(self.homologs(gene, taxid))
        return homologs.get(homolotaxid, None)
88       
def _parseOrthoXML(file):
    """ Parse an orthoXML document into a list of
    (cluster_id, taxid, gene_id) tuples, one per ``geneRef`` of every
    ``cluster`` element, in document order.

    *file* is passed straight to ``xml.dom.minidom.parse``.
    """
    from xml.dom.minidom import parse

    def attribute(node, name):
        # Fails with AttributeError when the attribute is missing.
        return node.attributes.get(name).value

    document = parse(file)

    # Internal gene reference id -> (geneId, protId), and geneId -> taxid.
    gene_by_ref = {}
    taxid_by_gene = {}
    for species_node in document.getElementsByTagName("species"):
        taxid = attribute(species_node, "NCBITaxId")
        for gene_node in species_node.getElementsByTagName("gene"):
            gene_id = attribute(gene_node, "geneId")
            gene_by_ref[attribute(gene_node, "id")] = (gene_id, attribute(gene_node, "protId"))
            taxid_by_gene[gene_id] = taxid

    orthologs = []
    for cluster_node in document.getElementsByTagName("cluster"):
        cluster_id = attribute(cluster_node, "id")
        for ref_node in cluster_node.getElementsByTagName("geneRef"):
            gene_id = gene_by_ref[attribute(ref_node, "id")][0]
            orthologs.append((cluster_id, taxid_by_gene[gene_id], gene_id))
    return orthologs
112       
class InParanoid(object):
    """ InParanoid: Eukaryotic Ortholog Groups

    Thin query layer over an sqlite database with a ``homologs`` table
    exposing ``groupid``, ``taxid`` and ``geneid`` columns.
    """
    VERSION = 1

    def __init__(self):
        import sqlite3
        database_path = orngServerFiles.localpath_download("HomoloGene", "InParanoid.sqlite")
        self.con = sqlite3.connect(database_path)

    def all_genes(self, taxid):
        """ Return a list of the distinct gene ids stored for the given taxid
        """
        cursor = self.con.execute("select distinct geneid from homologs where homologs.taxid=?", (taxid,))
        return [row[0] for row in cursor.fetchall()]

    def all_taxids(self):
        """ Return a list of the distinct taxids present in the database
        """
        cursor = self.con.execute("select distinct taxid from homologs")
        return [row[0] for row in cursor.fetchall()]

    def _groups(self, gene, taxid):
        """ Return the distinct group ids (as one-element rows) that the
        gene, taxid pair belongs to
        """
        cursor = self.con.execute("select distinct groupid from homologs where homologs.taxid=? and homologs.geneid=?", (taxid, gene))
        return cursor.fetchall()

    def orthologs(self, gene, taxid, ortholog_taxid=None):
        """ Return all orthologs of gene from organism with taxid.

        Without ortholog_taxid the result is a sorted list of unique
        (taxid, geneid) pairs; with it, the search is limited to that
        organism and only the gene ids are returned.
        """
        found = set()
        for group in self._groups(gene, taxid):
            if ortholog_taxid:
                cursor = self.con.execute("select distinct taxid, geneid from homologs where homologs.groupid=? and homologs.taxid=?", (group[0], ortholog_taxid))
            else:
                # Each group row is already a one-element parameter tuple.
                cursor = self.con.execute("select distinct taxid, geneid from homologs where homologs.groupid=?", group)
            found.update(cursor.fetchall())
        ordered = sorted(found)
        if ortholog_taxid:
            return [geneid for _, geneid in ordered]
        return ordered
151       
def all_genes(taxid):
    """ Return a list of all HomoloGene gene symbols for organism with taxid
    """
    mapper = HomoloGene.get_instance()
    return mapper.all_genes(taxid)
156
def homologs(genename, taxid):
    """ Return a list of (taxid, genename) pairs for all members of the
    homolog group that the given genename, taxid pair belongs to
    """
    mapper = HomoloGene.get_instance()
    return mapper.homologs(genename, taxid)
161
def homolog(genename, taxid, homolotaxid):
    """ Return the homolog of genename, taxid in the organism with
    homolotaxid, or None if no such homolog exists.
    """
    mapper = HomoloGene.get_instance()
    return mapper.homolog(genename, taxid, homolotaxid)
166
def all_genes_inParanoid(taxid):
    """ Return a list of all genes for organism with taxid in the
    InParanoid database
    """
    # A fresh InParanoid instance is created for every call.
    database = InParanoid()
    return database.all_genes(taxid)
171
def orthologs(genename, taxid, ortholog_taxid=None):
    """ Return all InParanoid orthologs of genename from organism with taxid.
    When ortholog_taxid is given, only orthologs from that organism are
    returned.
    """
    # A fresh InParanoid instance is created for every call.
    database = InParanoid()
    return database.orthologs(genename, taxid, ortholog_taxid)
Note: See TracBrowser for help on using the repository browser.