source: orange-bioinformatics/Orange/bioinformatics/obiHomoloGene.py @ 1625:cefeb35cbfc9

Revision 1625:cefeb35cbfc9, 6.9 KB checked in by mitar, 2 years ago (diff)

Moving files around.

Line 
1import orngServerFiles
2import sys, os
3import urllib2
4
5from collections import defaultdict
6
7class _homolog(object):
8    __slots__ = ["group_id", "taxonomy_id", "gene_id", "gene_symbol"]
9    def __init__(self, homolog_line):
10        for attr, val in zip(self.__slots__, homolog_line.split("\t")):
11            setattr(self, attr, val)
12   
13   
14class _Homologs(object):
15    """ A base class for homolog mappers
16    """
17    def all_genes(self):
18        """ Return all genes in this class instance
19        """
20        raise NotImplemented
21   
22    def homologs(self, gene, taxid):
23        """ Return all (homolotaxid, homolog) pairs of gene from organism with taxid
24        """
25        raise NotImplemented
26   
27    def homolog(self, gene, taxid, homolotaxid):
28        """ Return homolog of gene from organism with *taxid* in organism with "homolotaxid*
29        """
30        homologs = dict(self.homologs(gene, taxid))
31        return homologs.get(homolotaxid, None)
32   
class HomoloGene(_Homologs):
    """ Homolog mapper backed by the NCBI HomoloGene ``homologene.data`` file.

    Records are indexed both by ``(taxonomy_id, gene_symbol)`` and by
    homology group id. All identifiers are kept as the strings read from
    the data file. ``homolog()`` is inherited from ``_Homologs``.
    """
    DEFAULT_DATABASE_PATH = orngServerFiles.localpath("HomoloGene")
    VERSION = 1
    DOMAIN = "HomoloGene"
    FILENAME = "homologene.data"

    def __init__(self, local_database_path=None):
        """ Remember the database path and load the data file.
        """
        # NOTE(review): the path is stored but load() resolves the file
        # through orngServerFiles and does not read this attribute.
        self.local_database_path = local_database_path if local_database_path else self.DEFAULT_DATABASE_PATH
        self.load()

    @classmethod
    def download_from_NCBI(cls, file=None):
        """ Download ``homologene.data`` from the NCBI FTP server.

        *file* may be None (write to the local "HomoloGene" server-files
        directory), a path string, or an open file-like object. Files
        opened here are closed before returning; a caller-supplied file
        object is only flushed and stays open.
        """
        response = urllib2.urlopen("ftp://ftp.ncbi.nlm.nih.gov/pub/HomoloGene/current/homologene.data")
        try:
            data = response.read()
        finally:
            response.close()

        opened_here = True
        if file is None:
            try:
                os.mkdir(orngServerFiles.localpath("HomoloGene"))
            except OSError:
                # Most likely the directory already exists; any other
                # failure surfaces from open() just below.
                pass
            target = open(orngServerFiles.localpath("HomoloGene", "homologene.data"), "wb")
        elif isinstance(file, basestring):
            target = open(file, "wb")
        else:
            target = file
            opened_here = False

        try:
            target.write(data)
            target.flush()
        finally:
            if opened_here:
                target.close()

    @classmethod
    def get_instance(cls):
        """ Return an instance sharing its state with all other instances
        obtained through this method.

        The data file is loaded only on the first call; later calls create
        a new object whose ``__dict__`` is the same shared dictionary.
        """
        if not hasattr(cls, "_shared_dict"):
            h = cls()
            cls._shared_dict = h.__dict__
        h = cls.__new__(cls)
        h.__dict__ = cls._shared_dict
        return h

    def load(self):
        """ Read the data file and (re)build the lookup tables.

        Blank lines are skipped, so the last record is kept whether or not
        the file ends with a newline.
        """
        path = orngServerFiles.localpath_download(self.DOMAIN, self.FILENAME)
        with open(path, "rb") as data_file:
            content = data_file.read()
        records = [_homolog(line) for line in content.splitlines() if line.strip()]

        # Later records replace earlier ones with the same (taxid, symbol).
        self._homologs = dict(((h.taxonomy_id, h.gene_symbol), h) for h in records)

        # Group only the records that survived the de-duplication above.
        by_group = defaultdict(list)
        for h in self._homologs.values():
            by_group[h.group_id].append(h)
        self._homologs_by_group = by_group

    def all_genes(self, taxid=None):
        """ Return a list of gene symbols of all genes with taxonomy id *taxid*.

        Loaded taxonomy ids are strings, so the default ``taxid=None``
        matches nothing and yields an empty list.
        """
        return [h.gene_symbol for h in self._homologs.values() if h.taxonomy_id == taxid]

    def homologs(self, gene, taxid):
        """ Return all (homolotaxid, homolog) pairs in the homology group of
        *gene* from organism *taxid* (the gene itself included).

        Returns an empty list when the (taxid, gene) pair is unknown.
        """
        record = self._homologs.get((taxid, gene))
        if record is None:
            return []
        # .get() avoids inserting empty groups into the defaultdict.
        members = self._homologs_by_group.get(record.group_id, [])
        return [(h.taxonomy_id, h.gene_symbol) for h in members]
87       
88def _parseOrthoXML(file):
89    """ Return (cluster_id, taxid, gene_id) tuples from orthoXML file
90    """
91    from xml.dom.minidom import parse
92    doc = parse(file)
93    species = doc.getElementsByTagName("species")
94    geneIds = {}
95    geneIdToTaxid = {}
96    for sp in species:
97        taxid = sp.attributes.get("NCBITaxId").value
98        genes = sp.getElementsByTagName("gene")
99        geneIds.update(dict([(gene.attributes.get("id").value, (gene.attributes.get("geneId").value,
100                  gene.attributes.get("protId").value)) for gene in genes]))
101        geneIdToTaxid.update(dict.fromkeys([gene.attributes.get("geneId").value for gene in genes], taxid))
102       
103    orthologs = []
104    clusters = doc.getElementsByTagName("cluster")
105    for cl in clusters:
106        clId = cl.attributes.get("id").value
107        geneRefs = cl.getElementsByTagName("geneRef")
108        ids = [ref.attributes.get("id").value for ref in geneRefs]
109        orthologs.extend([(clId, geneIdToTaxid[geneIds[id][0]], geneIds[id][0]) for id in ids])
110    return orthologs
111       
class InParanoid(object):
    """ InParanoid: Eukaryotic Ortholog Groups

    Thin query layer over an sqlite database with a ``homologs`` table
    that has ``groupid``, ``taxid`` and ``geneid`` columns.
    """
    VERSION = 1

    def __init__(self):
        import sqlite3
        database_path = orngServerFiles.localpath_download("HomoloGene", "InParanoid.sqlite")
        self.con = sqlite3.connect(database_path)

    def all_genes(self, taxid):
        """ Return a list of the distinct gene ids stored for *taxid*
        """
        cursor = self.con.execute("select distinct geneid from homologs where homologs.taxid=?", (taxid,))
        return [row[0] for row in cursor.fetchall()]

    def all_taxids(self):
        """ Return a list of the distinct taxids stored in the database
        """
        cursor = self.con.execute("select distinct taxid from homologs")
        return [row[0] for row in cursor.fetchall()]

    def _groups(self, gene, taxid):
        """ Return the rows (one-element tuples) holding the distinct group
        identifiers the gene, taxid pair belongs to
        """
        cursor = self.con.execute("select distinct groupid from homologs where homologs.taxid=? and homologs.geneid=?", (taxid, gene))
        return cursor.fetchall()

    def orthologs(self, gene, taxid, ortholog_taxid=None):
        """ Return all orthologs of gene from organism with taxid.

        Without *ortholog_taxid* the result is a sorted list of distinct
        (taxid, geneid) pairs from every group the gene is in. With a
        truthy *ortholog_taxid* only that organism is queried and just the
        gene ids are returned.
        """
        found = set()
        for group in self._groups(gene, taxid):
            if ortholog_taxid:
                cursor = self.con.execute("select distinct taxid, geneid from homologs where homologs.groupid=? and homologs.taxid=?", (group[0], ortholog_taxid))
            else:
                # The row tuple itself serves as the parameter sequence.
                cursor = self.con.execute("select distinct taxid, geneid from homologs where homologs.groupid=?", group)
            found.update(cursor.fetchall())
        ordered = sorted(found)
        if not ortholog_taxid:
            return ordered
        return [pair[1] for pair in ordered]
150       
def all_genes(taxid):
    """ Return a list of all gene symbols for organism with taxid, looked
    up in the shared HomoloGene instance
    """
    homologene = HomoloGene.get_instance()
    return homologene.all_genes(taxid)
155
def homologs(genename, taxid):
    """ Return a list of (taxid, genename) homologs from the homolog group
    that the genename, taxid pair belongs to (shared HomoloGene instance)
    """
    homologene = HomoloGene.get_instance()
    return homologene.homologs(genename, taxid)
160
def homolog(genename, taxid, homolotaxid):
    """ Return the homolog of genename, taxid in the organism with
    homolotaxid, or None if such a homolog does not exist.
    """
    homologene = HomoloGene.get_instance()
    return homologene.homolog(genename, taxid, homolotaxid)
165
def all_genes_inParanoid(taxid):
    """ Return a list of all gene ids for organism with taxid in the
    InParanoid database.

    A fresh InParanoid connection is opened for the call and closed again
    before returning, instead of being left for garbage collection.
    """
    database = InParanoid()
    try:
        # The result is fully fetched into a list, so closing is safe.
        return database.all_genes(taxid)
    finally:
        database.con.close()
170
def orthologs(genename, taxid, ortholog_taxid=None):
    """ Return all orthologs of genename from organism with taxid.

    If ortholog_taxid is given, limit to orthologs from that organism only
    (the result is then a list of gene ids rather than (taxid, geneid)
    pairs). A fresh InParanoid connection is opened for the call and
    closed again before returning.
    """
    database = InParanoid()
    try:
        # The result is fully fetched into a list, so closing is safe.
        return database.orthologs(genename, taxid, ortholog_taxid)
    finally:
        database.con.close()
Note: See TracBrowser for help on using the repository browser.