source: orange-bioinformatics/_bioinformatics/obiGeneSets.py @ 1719:6e8861564778

Revision 1719:6e8861564778, 11.8 KB checked in by markotoplak, 20 months ago (diff)

Fixed update scripts for MeSH, GO, HomoloGene, NCBI_geneinfo, OMIM, PPI. Moved wget to Orange.utils.

RevLine 
[461]1"""
2Maintainer: Marko Toplak
3"""
[1043]4
[1632]5from __future__ import absolute_import, with_statement
6
[1707]7if __name__ == "__main__":
8    __package__ = "Orange.bio"
9
[1632]10import cPickle as pickle, os, tempfile, sys
[1108]11from collections import defaultdict
[1106]12
[1707]13import Orange.core as orange
[1632]14from Orange.orng import orngServerFiles
15
[1643]16from . import obiGO, obiKEGG, obiTaxonomy
[1632]17
[1106]18sfdomain = "gene_sets"
[461]19
[485]20def nth(l,n):
21    return [ a[n] for a in l]
22
[1712]23from Orange.bio.geneset import GeneSet, GeneSets, GenesetRegException
[1107]24
[1094]25def goGeneSets(org):
26    """Returns gene sets from GO."""
[1141]27    ontology = obiGO.Ontology()
28    annotations = obiGO.Annotations(org, ontology=ontology)
[626]29
[1094]30    genesets = []
[1141]31    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
[1094]32    for termn, term in ontology.terms.items():
33        genes = annotations.GetAllGenes(termn)
34        hier = ("GO", term.namespace)
[1108]35        if len(genes) > 0:
[1712]36            gs = GeneSet(id=termn, name=term.name, genes=genes, hierarchy=hier, organism=org, link=link_fmt % termn) 
[1108]37            genesets.append(gs)
[461]38
[1712]39    return GeneSets(genesets)
[461]40
[1094]41def keggGeneSets(org):
42    """
43    Returns gene sets from KEGG pathways.
44    """
[1549]45   
[1094]46    kegg = obiKEGG.KEGGOrganism(org)
[461]47
[1094]48    genesets = []
49    for id in kegg.pathways():
[1712]50        print id
[1094]51        pway = obiKEGG.KEGGPathway(id)
[1549]52        hier = ("KEGG","pathways")
[1712]53        gs = GeneSet(id=id,
[1549]54                                 name=pway.title,
55                                 genes=kegg.get_genes_by_pathway(id),
56                                 hierarchy=hier,
57                                 organism=org,
58                                 link=pway.link)
[1094]59        genesets.append(gs)
[461]60
[1712]61    return GeneSets(genesets)
[461]62
[1141]63def omimGeneSets():
64    """
65    Return gene sets from OMIM (Online Mendelian Inheritance in Man) diseses
66    """
[1632]67    from . import obiOMIM
[1141]68    genesets = [GeneSet(id=disease.id, name=disease.name, genes=obiOMIM.disease_genes(disease), hierarchy=("OMIM",), organism="9606",
69                    link=("http://www.ncbi.nlm.nih.gov/entrez/dispomim.cgi?id=" % disease.id if disease.id else None)) \
70                    for disease in obiOMIM.diseases()]
71    return GeneSets(genesets)
72
[1144]73def miRNAGeneSets(org):
74    """
75    Return gene sets from miRNA targets
76    """
[1716]77    from . import obimiRNA
78    org_code = obiKEGG.from_taxid(org)
[1144]79    link_fmt = "http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s"
80    mirnas = [(id, obimiRNA.get_info(id)) for id in obimiRNA.ids(org_code)]
[1549]81    genesets = [GeneSet(id=mirna.matACC, name=mirna.matID, genes=mirna.targets.split(","), hierarchy=("miRNA", "Targets"),
[1144]82                        organism=org, link=link_fmt % mirna.matID) for id, mirna in mirnas]
83    return GeneSets(genesets)
84
[1549]85def go_miRNASets(org, ontology=None, enrichment=True, pval=0.05, treshold=0.04):
[1632]86    from . import obimiRNA, obiGO
[1549]87    mirnas = obimiRNA.ids(int(org))
88    if ontology is None:
89        ontology = obiGO.Ontology()
90         
91    annotations = obiGO.Annotations(org, ontology=ontology)
92   
93    go_sets = obimiRNA.get_GO(mirnas, annotations, enrichment=enrichment, pval=pval, goSwitch=False)
94    print go_sets
95   
96    go_sets = obimiRNA.filter_GO(go_sets, annotations, treshold=treshold)
97   
98    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
[1712]99    gsets = [GeneSet(id=key, name=ontology[key].name, genes=value, hierarchy=("miRNA", "go_sets",),
[1549]100                        organism=org, link=link_fmt % key) for key, value in go_sets.items()]
[1712]101    gset = GeneSets(gsets)
[1549]102    return gset
103
104
[1095]105def loadGMT(contents, name):
[485]106    """
107    Eech line consists of tab separated elements. First is
[1608]108    the geneset name, next is it's description.
[485]109   
110    For now the description is skipped.
111    """
[1643]112
[485]113    def hline(s):
[1608]114        tabs = [tab.strip() for tab in s.split("\t")]
[1712]115        return GeneSet(id=tabs[0], description=tabs[1],
[1608]116                                   hierarchy=(name,), genes=tabs[2:])
[485]117
[1106]118    def handleNELines(s, fn):
119        """
120        Run function on nonempty lines of a string.
121        Return a list of results for each line.
122        """
[1608]123        lines = (l.strip() for l in s.splitlines())
124        return [fn(l) for l in lines if l]
[1106]125
[1712]126    return GeneSets(handleNELines(contents, hline))
[485]127
[1106]128"""
129We have multiple paths for gene set data:
130buffer/bigfiles/gene_sets
131and
132buffer/gene_sets_local
133both have available.txt
134"""
135
136def omakedirs(dir):
137    try:
138        os.makedirs(dir)
139    except OSError:
140        pass
141
142def local_path():
143    """ Returns local path for gene sets. Creates it if it does not exists
144    yet. """
[1632]145    from Orange.orng import orngEnviron
[1108]146    pth = os.path.join(orngEnviron.directoryNames["bufferDir"], "gene_sets_local")
[1106]147    omakedirs(pth)
148    return pth
149
150def build_index(dir):
151    """ Returns gene set availability index for some folder. """
152    pass
153
154def filename(hierarchy, organism):
155    """ Obtain a filename for given hierarchy and organism. """
156    return "gs_" + "_._".join(hierarchy + \
157        (organism if organism != None else "",)) + ".pck"
158
159def filename_parse(fn):
160    """ Returns a hierarchy and the organism from the filename."""
161    fn = fn[3:-4]
162    parts = fn.split("_._")
163    hierarchy = tuple(parts[:-1])
164    org = parts[-1] if parts[-1] != "" else None
165    return hierarchy, org
166
167def is_genesets_file(fn):
168    return fn.startswith("gs_") and fn.endswith(".pck")
169
170def list_local():
171    """ Returns available gene sets from the local repository:
172    a list of (hierarchy, organism, on_local) """
173    pth = local_path()
174    gs_files = filter(is_genesets_file, os.listdir(pth))
[1108]175    return [ filename_parse(fn) + (True,) for fn in gs_files ]
[1106]176   
[1108]177def list_serverfiles_from_flist(flist):
178    gs_files = filter(is_genesets_file, flist)
179    localfiles = set(orngServerFiles.listfiles(sfdomain))
180    return [ filename_parse(fn) + \
181        ((True,) if fn in localfiles else (False,)) for fn in gs_files ]
182
[1106]183def list_serverfiles_conn(serverfiles=None):
184    """ Returns available gene sets from the server files
185    repository: a list of (hierarchy, organism, on_local) """
186    if serverfiles == None:
187        serverfiles = orngServerFiles.ServerFiles()
[1108]188    flist = serverfiles.listfiles(sfdomain)
189    return list_serverfiles_from_flist(flist)
190
191def list_serverfiles():
192    fname = orngServerFiles.localpath_download(sfdomain, "index.pck")
193    flist = pickle.load(open(fname, 'r'))
194    return list_serverfiles_from_flist(flist)
195
196def list_all():
197    """
198    return a list of (hier, org, avalable_locally)
199    If something for a specific (hier, org) is not downloaded
200    yet, show it as not-local. """
201    flist = list_local() + list_serverfiles()
202    d = {}
203    for h,o,local in flist:
204        d[h,o] = min(local, d.get((h,o),True))
205    return [ (h,o,local) for (h,o),local in d.items() ]
[1106]206
207def update_server_list(serverfiles_upload, serverfiles_list=None):
208    if serverfiles_list == None:
209        serverfiles_list = orngServerFiles.ServerFiles()
[1108]210
211    flist = map(lambda x: filename(*x[:2]), list_serverfiles_conn(serverfiles_list))
[1106]212
213    tfname = pickle_temp(flist)
214   
215    try:
216        fn = "index.pck"
217        title = "Gene sets: index"
218        tags = [ "gene sets", "index", "essential" ]
219        serverfiles_upload.upload(sfdomain, fn, tfname, title, tags)
220        serverfiles_upload.unprotect(sfdomain, fn)
[1111]221    except Exception,e:
[1106]222        raise e
223    finally:
224        os.remove(tfname)
225
[1712]226def _register_local(genesets):
[1106]227    """ Registers using the common hierarchy and organism. """
228    pth = local_path()
229
230    org = genesets.common_org()
231    hierarchy = genesets.common_hierarchy()
[1108]232    fn = filename(hierarchy, org)
[1106]233
234    with open(os.path.join(pth, fn), "w") as f:
235        pickle.dump(genesets, f)
236
237    return fn
238
239def pickle_temp(obj):
[1712]240    """ Pickle a file to a temporary file and returns its name """
[1106]241    fd,tfname = tempfile.mkstemp()
242    os.close(fd)
243    f = open(tfname, 'wb')
244    pickle.dump(obj, f)
245    f.close()
246    return tfname
247
[1712]248def _register_serverfiles(genesets, serverFiles):
[1106]249    """ Registers using the common hierarchy and organism. """
250    org = genesets.common_org()
251    hierarchy = genesets.common_hierarchy()
252    fn = filename(hierarchy, org)
253
254    #save to temporary file
255    tfname = pickle_temp(genesets)
256   
257    try:
[1108]258        taxname = obiTaxonomy.name(org)
[1106]259        title = "Gene sets: " + ", ".join(hierarchy) + \
[1108]260            ((" (" + taxname + ")") if org != None else "")
261        tags = list(hierarchy) + [ "gene sets", taxname ] + \
262            ([ "essential" ] if org in obiTaxonomy.essential_taxids() else [] )
[1106]263        serverFiles.upload(sfdomain, fn, tfname, title, tags)
264        serverFiles.unprotect(sfdomain, fn)
265    finally:
266        os.remove(tfname)
267
268    update_server_list(serverFiles)
269
270def register(genesets, serverFiles=None):
271    """
272    Hierarchy is induced from the gene set names.
273    """
274    if serverFiles == None:
[1712]275        _register_local(genesets)
[1106]276    else:
[1712]277        _register_serverfiles(genesets, serverFiles)
[1106]278
[1108]279def build_hierarchy_dict(files):
280    hierd = defaultdict(list)
281    for ind,f in enumerate(files):
282        hier, org = f
283        for i in range(len(hier)+1):
284            hierd[(hier[:i], org)].append(ind)
285    return hierd
286
287def load_local(hierarchy, organism):
288    files = map(lambda x: x[:2], list_local())
289    hierd = build_hierarchy_dict(files)
290
291    out = GeneSets()
292    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
293        fname = os.path.join(local_path(), filename(h, o))
294        out.update(pickle.load(open(fname, 'r')))
295    return out
296
297def load_serverfiles(hierarchy, organism):
298    files = map(lambda x: x[:2], list_serverfiles())
299    hierd = build_hierarchy_dict(files)
300    out = GeneSets()
301    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
302        fname = orngServerFiles.localpath_download(sfdomain, 
303            filename(h, o))
304        out.update(pickle.load(open(fname, 'r')))
305    return out
306
307def load(hierarchy, organism):
[1712]308    """ First try to load from the local registred folder. If the file
309    is not available, load it from the server files. """
[1108]310    ret = load_local(hierarchy, organism)
[1712]311    if len(ret) == 0:
312        ret.update(load_serverfiles(hierarchy, organism))
[1108]313    return ret
314
315def collections(*args):
[485]316    """
317    Input is a list of collections.
[1108]318    Collection can either be a tuple (hierarchy, orgranism), where
319    hierarchy is a tuple also.
[485]320    """
[1712]321    result = GeneSets()
[485]322
[1108]323    for collection in args:
[1569]324        try:
[1108]325            result.update(collection)
[1713]326        except (ValueError, TypeError):
[1569]327            if issequencens(collection): #have a hierarchy, organism specification
328                new = load(*collection)
329                result.update(new)
[1108]330            else:
[1569]331                if collection.lower()[-4:] == ".gmt": #format from webpage
332                    result.update(loadGMT(open(collection,"rt").read(), collection))
333                else:
334                    raise Exception("collection() accepts files in .gmt format only.")
335
[1108]336    return result
[485]337
[1108]338def issequencens(x):
339    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and it is not an instance of basestring."
340    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
[485]341
[1214]342class TException(Exception): pass
343
[1108]344def upload_genesets(rsf):
345    """
346    Builds the default gene sets and
347    """
348    orngServerFiles.update_local_files()
[485]349
[1713]350    genesetsfn = [ keggGeneSets, goGeneSets, miRNAGeneSets]
[1108]351    organisms = obiTaxonomy.common_taxids()
352    for fn in genesetsfn:
353        for org in organisms:
354            try:
[1712]355                print "Uploading ORG", org, fn
[1108]356                genesets = fn(org).split_by_hierarchy()
357                for gs in genesets:
358                    print "registering", gs.common_hierarchy()
[1713]359                    register(gs, rsf) #server files
360                    #register(gs)
[1214]361                    print "successful", gs.common_hierarchy()
[1712]362            except (obiKEGG.OrganismNotFoundError, GenesetRegException):
363                print "organism not found", org
364
[461]365if __name__ == "__main__":
[1214]366    rsf = orngServerFiles.ServerFiles(username=sys.argv[1], password=sys.argv[2])
367    upload_genesets(rsf)
[1712]368    pass
Note: See TracBrowser for help on using the repository browser.