source: orange-bioinformatics/_bioinformatics/obiGeneSets.py @ 1805:c2e4880abd58

Revision 1805:c2e4880abd58, 14.3 KB checked in by Flashpoint <vid.flashpoint@…>, 11 months ago (diff)

Added functionality to obiGeneSets.py to remove locally stored gene sets. Created a prototype widget OWtest.py to store and remove own gene sets. The front-end still needs work

Line 
1"""
2Maintainer: Marko Toplak
3"""
4
5from __future__ import absolute_import, with_statement
6
7if __name__ == "__main__":
8    __package__ = "Orange.bio"
9
10import cPickle as pickle, os, tempfile, sys
11from collections import defaultdict
12
13import Orange.core as orange
14from Orange.orng import orngServerFiles
15
16from . import obiGO, obiKEGG, obiTaxonomy
17
18sfdomain = "gene_sets"
19
20def nth(l,n):
21    return [ a[n] for a in l]
22
23from Orange.bio.geneset import GeneSet, GeneSets, GenesetRegException
24
25def goGeneSets(org):
26    """Returns gene sets from GO."""
27    ontology = obiGO.Ontology()
28    annotations = obiGO.Annotations(org, ontology=ontology)
29
30    genesets = []
31    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
32    for termn, term in ontology.terms.items():
33        genes = annotations.GetAllGenes(termn)
34        hier = ("GO", term.namespace)
35        if len(genes) > 0:
36            gs = GeneSet(id=termn, name=term.name, genes=genes, hierarchy=hier, organism=org, link=link_fmt % termn)
37            genesets.append(gs)
38
39    return GeneSets(genesets)
40
41def keggGeneSets(org):
42    """
43    Returns gene sets from KEGG pathways.
44    """
45
46    kegg = obiKEGG.KEGGOrganism(org)
47
48    genesets = []
49    for id in kegg.pathways():
50        print id
51        pway = obiKEGG.KEGGPathway(id)
52        hier = ("KEGG","pathways")
53        gs = GeneSet(id=id,
54                                 name=pway.title,
55                                 genes=kegg.get_genes_by_pathway(id),
56                                 hierarchy=hier,
57                                 organism=org,
58                                 link=pway.link)
59        genesets.append(gs)
60
61    return GeneSets(genesets)
62
63def dictyMutantSets():
64    """
65    Return dicty mutant phenotype gene sets from Dictybase
66    """
67    from . import obiDictyMutants
68    link_fmt = "http://dictybase.org/db/cgi-bin/dictyBase/SC/scsearch.pl?searchdb=strains&search_term=%s&column=all&B1=Submit" 
69    #genesets = [GeneSet(id=mutant.name, name=mutant.descriptor, genes=obiDictyMutants.mutant_genes(mutant), hierarchy=("Dictybase", "Mutants"), organism="352472", # 352472 gathered from obiGO.py code_map -> Dicty identifier
70    #                    link=(link_fmt % mutant.name if mutant.name else None)) \
71    #                    for mutant in obiDictyMutants.mutants()]
72 
73    genesets = [GeneSet(id=phenotype, name=phenotype, genes=[obiDictyMutants.mutant_genes(mutant)[0] for mutant in mutants], hierarchy=("Dictybase", "Phenotypes"), organism="352472", # 352472 gathered from obiGO.py code_map -> Dicty identifier
74                        link="") \
75                        for phenotype, mutants in obiDictyMutants.phenotype_mutants().items()]
76
77    return GeneSets(genesets)
78
79def cytobandGeneSets():
80    """
81    Create cytoband gene sets from Stanford Microarray Database
82    """
83    import urllib2
84
85    url = "http://www-stat.stanford.edu/~tibs/GSA/cytobands-stanford.gmt"
86    stream = urllib2.urlopen(url)
87    data = stream.read().splitlines()
88
89    genesets = []
90    for band in data:
91        b = band.split("\t")
92        genesets.append(GeneSet(id=b[0], name=b[1], genes=b[2:] if b[2:] else [], hierarchy=("Cytobands",), organism="9606", link=""))         
93
94    return GeneSets(genesets)
95
96def reactomePathwaysGeneSets():
97    """
98    Prepare human pathways gene sets from reactome pathways
99    """
100    import urllib
101    import io
102    from zipfile import ZipFile
103
104    url = urllib.urlopen("http://www.reactome.org/download/current/ReactomePathways.gmt.zip")
105    memfile = io.BytesIO(url.read())
106    with ZipFile(memfile, "r") as myzip:
107        f = myzip.open("ReactomePathways.gmt")
108        content = f.read().splitlines()     
109
110    genesets = [GeneSet(id=path.split("\t")[0], name=path.split("\t")[0], genes=path.split("\t")[2:] if path.split("\t")[2:] else [], hierarchy=("Reactome", "Pathways"), organism="9606", link="") for path in content]
111    return GeneSets(genesets)
112
113
114def omimGeneSets():
115    """
116    Return gene sets from OMIM (Online Mendelian Inheritance in Man) diseses
117    """
118    from . import obiOMIM   
119    genesets = [GeneSet(id=disease.id, name=disease.name, genes=obiOMIM.disease_genes(disease), hierarchy=("OMIM",), organism="9606",
120                    link=("http://www.omim.org/entry/%s" % disease.id if disease.id else None)) \
121                    for disease in obiOMIM.diseases()]
122    return GeneSets(genesets)
123
124def miRNAGeneSets(org):
125    """
126    Return gene sets from miRNA targets
127    """
128    from . import obimiRNA
129    org_code = obiKEGG.from_taxid(org)
130    link_fmt = "http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s"
131    mirnas = [(id, obimiRNA.get_info(id)) for id in obimiRNA.ids(org_code)]
132    genesets = [GeneSet(id=mirna.matACC, name=mirna.matID, genes=mirna.targets.split(","), hierarchy=("miRNA", "Targets"),
133                        organism=org, link=link_fmt % mirna.matID) for id, mirna in mirnas]
134    return GeneSets(genesets)
135
136def go_miRNASets(org, ontology=None, enrichment=True, pval=0.05, treshold=0.04):
137    from . import obimiRNA, obiGO
138    mirnas = obimiRNA.ids(int(org))
139    if ontology is None:
140        ontology = obiGO.Ontology()
141
142    annotations = obiGO.Annotations(org, ontology=ontology)
143
144    go_sets = obimiRNA.get_GO(mirnas, annotations, enrichment=enrichment, pval=pval, goSwitch=False)
145
146    go_sets = obimiRNA.filter_GO(go_sets, annotations, treshold=treshold)
147
148    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
149    gsets = [GeneSet(id=key, name=ontology[key].name, genes=value, hierarchy=("miRNA", "go_sets",),
150                        organism=org, link=link_fmt % key) for key, value in go_sets.items()]
151    gset = GeneSets(gsets)
152    return gset
153
154def loadGMT(contents, name):
155    """
156    Eech line consists of tab separated elements. First is
157    the geneset name, next is it's description.
158
159    For now the description is skipped.
160    """
161
162    def hline(s):
163        tabs = [tab.strip() for tab in s.split("\t")]
164        return GeneSet(id=tabs[0], description=tabs[1],
165                                   hierarchy=(name,), genes=tabs[2:])
166
167    def handleNELines(s, fn):
168        """
169        Run function on nonempty lines of a string.
170        Return a list of results for each line.
171        """
172        lines = (l.strip() for l in s.splitlines())
173        return [fn(l) for l in lines if l]
174
175    return GeneSets(handleNELines(contents, hline))
176
177"""
178We have multiple paths for gene set data:
179buffer/bigfiles/gene_sets
180and
181buffer/gene_sets_local
182both have available.txt
183"""
184
185def omakedirs(dir):
186    try:
187        os.makedirs(dir)
188    except OSError:
189        pass
190
191def local_path():
192    """ Returns local path for gene sets. Creates it if it does not exists
193    yet. """
194    from Orange.orng import orngEnviron
195    pth = os.path.join(orngEnviron.directoryNames["bufferDir"], "gene_sets_local")
196    omakedirs(pth)
197    return pth
198
199def build_index(dir):
200    """ Returns gene set availability index for some folder. """
201    pass
202
203def filename(hierarchy, organism):
204    """ Obtain a filename for given hierarchy and organism. """
205    return "gs_" + "_._".join(hierarchy + \
206        (organism if organism != None else "",)) + ".pck"
207
208def filename_parse(fn):
209    """ Returns a hierarchy and the organism from the filename."""
210    fn = fn[3:-4]
211    parts = fn.split("_._")
212    hierarchy = tuple(parts[:-1])
213    org = parts[-1] if parts[-1] != "" else None
214    return hierarchy, org
215
216def is_genesets_file(fn):
217    return fn.startswith("gs_") and fn.endswith(".pck")
218
219def list_local():
220    """ Returns available gene sets from the local repository:
221    a list of (hierarchy, organism, on_local) """
222    pth = local_path()
223    gs_files = filter(is_genesets_file, os.listdir(pth))
224    return [ filename_parse(fn) + (True,) for fn in gs_files ]
225
226def remove_local(gene_set):
227    """ Removes a given gene set from the local repository. """
228    pth = local_path()
229    gs_files = filter(is_genesets_file, os.listdir(pth)) 
230    for setfile in gs_files:
231        if setfile.__contains__(gene_set):
232            setBgone = os.path.join(pth, setfile)
233            os.remove(setBgone) 
234
235def list_serverfiles_from_flist(flist):
236    gs_files = filter(is_genesets_file, flist)
237    localfiles = set(orngServerFiles.listfiles(sfdomain))
238    return [ filename_parse(fn) + \
239        ((True,) if fn in localfiles else (False,)) for fn in gs_files ]
240
241def list_serverfiles_conn(serverfiles=None):
242    """ Returns available gene sets from the server files
243    repository: a list of (hierarchy, organism, on_local) """
244    if serverfiles == None:
245        serverfiles = orngServerFiles.ServerFiles()
246    flist = serverfiles.listfiles(sfdomain)
247    return list_serverfiles_from_flist(flist)
248
249def list_serverfiles():
250    fname = orngServerFiles.localpath_download(sfdomain, "index.pck")
251    flist = pickle.load(open(fname, 'r'))
252    return list_serverfiles_from_flist(flist)
253
254def list_all():
255    """
256    return a list of (hier, org, avalable_locally)
257    If something for a specific (hier, org) is not downloaded
258    yet, show it as not-local. """
259    flist = list_local() + list_serverfiles()
260    d = {}
261    for h,o,local in flist:
262        d[h,o] = min(local, d.get((h,o),True))
263    return [ (h,o,local) for (h,o),local in d.items() ]
264
265def update_server_list(serverfiles_upload, serverfiles_list=None):
266    if serverfiles_list == None:
267        serverfiles_list = orngServerFiles.ServerFiles()
268
269    flist = map(lambda x: filename(*x[:2]), list_serverfiles_conn(serverfiles_list))
270
271    tfname = pickle_temp(flist)
272
273    try:
274        fn = "index.pck"
275        title = "Gene sets: index"
276        tags = [ "gene sets", "index", "essential" ]
277        serverfiles_upload.upload(sfdomain, fn, tfname, title, tags)
278        serverfiles_upload.unprotect(sfdomain, fn)
279    except Exception,e:
280        raise e
281    finally:
282        os.remove(tfname)
283
284def _register_local(genesets):
285    """ Registers using the common hierarchy and organism. """
286    pth = local_path()
287
288    org = genesets.common_org()
289    hierarchy = genesets.common_hierarchy()
290    fn = filename(hierarchy, org)
291
292    with open(os.path.join(pth, fn), "w") as f:
293        pickle.dump(genesets, f)
294
295    return fn
296
297def pickle_temp(obj):
298    """ Pickle a file to a temporary file and returns its name """
299    fd,tfname = tempfile.mkstemp()
300    os.close(fd)
301    f = open(tfname, 'wb')
302    pickle.dump(obj, f)
303    f.close()
304    return tfname
305
306def _register_serverfiles(genesets, serverFiles):
307    """ Registers using the common hierarchy and organism. """
308    org = genesets.common_org()
309    hierarchy = genesets.common_hierarchy()
310    fn = filename(hierarchy, org)
311
312    #save to temporary file
313    tfname = pickle_temp(genesets)
314
315    try:
316        taxname = obiTaxonomy.name(org)
317        title = "Gene sets: " + ", ".join(hierarchy) + \
318            ((" (" + taxname + ")") if org != None else "")
319        tags = list(hierarchy) + [ "gene sets", taxname ] + \
320            ([ "essential" ] if org in obiTaxonomy.essential_taxids() else [] )
321        serverFiles.upload(sfdomain, fn, tfname, title, tags)
322        serverFiles.unprotect(sfdomain, fn)
323    finally:
324        os.remove(tfname)
325
326    update_server_list(serverFiles)
327
328def register(genesets, serverFiles=None):
329    """
330    Hierarchy is induced from the gene set names.
331    """
332    if serverFiles == None:
333        _register_local(genesets)
334    else:
335        _register_serverfiles(genesets, serverFiles)
336
337def build_hierarchy_dict(files):
338    hierd = defaultdict(list)
339    for ind,f in enumerate(files):
340        hier, org = f
341        for i in range(len(hier)+1):
342            hierd[(hier[:i], org)].append(ind)
343    return hierd
344
345def load_local(hierarchy, organism):
346    files = map(lambda x: x[:2], list_local())
347    hierd = build_hierarchy_dict(files)
348
349    out = GeneSets()
350    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
351        fname = os.path.join(local_path(), filename(h, o))
352        out.update(pickle.load(open(fname, 'r')))
353    return out
354
355def load_serverfiles(hierarchy, organism):
356    files = map(lambda x: x[:2], list_serverfiles())
357    hierd = build_hierarchy_dict(files)
358    out = GeneSets()
359    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
360        fname = orngServerFiles.localpath_download(sfdomain,
361            filename(h, o))
362        out.update(pickle.load(open(fname, 'r')))
363    return out
364
365def load(hierarchy, organism):
366    """ First try to load from the local registred folder. If the file
367    is not available, load it from the server files. """
368    ret = load_local(hierarchy, organism)
369    if len(ret) == 0:
370        ret.update(load_serverfiles(hierarchy, organism))
371    return ret
372
373def collections(*args):
374    """
375    Input is a list of collections.
376    Collection can either be a tuple (hierarchy, orgranism), where
377    hierarchy is a tuple also.
378    """
379    result = GeneSets()
380
381    for collection in args:
382        try:
383            result.update(collection)
384        except (ValueError, TypeError):
385            if issequencens(collection): #have a hierarchy, organism specification
386                new = load(*collection)
387                result.update(new)
388            else:
389                if collection.lower()[-4:] == ".gmt": #format from webpage
390                    result.update(loadGMT(open(collection,"rt").read(), collection))
391                else:
392                    raise Exception("collection() accepts files in .gmt format only.")
393
394    return result
395
396def issequencens(x):
397    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and it is not an instance of basestring."
398    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
399
400class TException(Exception): pass
401
402def upload_genesets(rsf):
403    """
404    Builds the default gene sets and
405    """
406    orngServerFiles.update_local_files()
407
408    genesetsfn = [ keggGeneSets, goGeneSets, miRNAGeneSets]
409    organisms = obiTaxonomy.common_taxids()
410    for fn in genesetsfn:
411        for org in organisms:
412            try:
413                print "Uploading ORG", org, fn
414                genesets = fn(org).split_by_hierarchy()
415                for gs in genesets:
416                    print "registering", gs.common_hierarchy()
417                    register(gs, rsf) #server files
418                    #register(gs)
419                    print "successful", gs.common_hierarchy()
420            except (obiKEGG.OrganismNotFoundError, GenesetRegException):
421                print "organism not found", org
422
423if __name__ == "__main__":
424    rsf = orngServerFiles.ServerFiles(username=sys.argv[1], password=sys.argv[2])
425    upload_genesets(rsf)
426    pass
427
428
Note: See TracBrowser for help on using the repository browser.