source: orange-bioinformatics/_bioinformatics/obiGeneSets.py @ 1812:6871643ec38f

Revision 1812:6871643ec38f, 14.4 KB checked in by Flashpoint <vid.flashpoint@…>, 11 months ago (diff)

Custom genesets now use Custom Genesets hierarchy in obiGeneSets.py. Renamed OWtest.py to OWCustomSets.py and renamed the widget from test to Custom Gene Sets. Added a preview window to quickly view geneset files before importing them.

Line 
1"""
2Maintainer: Marko Toplak
3"""
4
5from __future__ import absolute_import, with_statement
6
7if __name__ == "__main__":
8    __package__ = "Orange.bio"
9
10import cPickle as pickle, os, tempfile, sys
11from collections import defaultdict
12import datetime
13
14import Orange.core as orange
15from Orange.orng import orngServerFiles
16
17from . import obiGO, obiKEGG, obiTaxonomy
18
19sfdomain = "gene_sets"
20
21def nth(l,n):
22    return [ a[n] for a in l]
23
24from Orange.bio.geneset import GeneSet, GeneSets, GenesetRegException
25
26def goGeneSets(org):
27    """Returns gene sets from GO."""
28    ontology = obiGO.Ontology()
29    annotations = obiGO.Annotations(org, ontology=ontology)
30
31    genesets = []
32    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
33    for termn, term in ontology.terms.items():
34        genes = annotations.GetAllGenes(termn)
35        hier = ("GO", term.namespace)
36        if len(genes) > 0:
37            gs = GeneSet(id=termn, name=term.name, genes=genes, hierarchy=hier, organism=org, link=link_fmt % termn)
38            genesets.append(gs)
39
40    return GeneSets(genesets)
41
42def keggGeneSets(org):
43    """
44    Returns gene sets from KEGG pathways.
45    """
46
47    kegg = obiKEGG.KEGGOrganism(org)
48
49    genesets = []
50    for id in kegg.pathways():
51        print id
52        pway = obiKEGG.KEGGPathway(id)
53        hier = ("KEGG","pathways")
54        gs = GeneSet(id=id,
55                                 name=pway.title,
56                                 genes=kegg.get_genes_by_pathway(id),
57                                 hierarchy=hier,
58                                 organism=org,
59                                 link=pway.link)
60        genesets.append(gs)
61
62    return GeneSets(genesets)
63
64def dictyMutantSets():
65    """
66    Return dicty mutant phenotype gene sets from Dictybase
67    """
68    from . import obiDictyMutants
69    link_fmt = "http://dictybase.org/db/cgi-bin/dictyBase/SC/scsearch.pl?searchdb=strains&search_term=%s&column=all&B1=Submit" 
70    #genesets = [GeneSet(id=mutant.name, name=mutant.descriptor, genes=obiDictyMutants.mutant_genes(mutant), hierarchy=("Dictybase", "Mutants"), organism="352472", # 352472 gathered from obiGO.py code_map -> Dicty identifier
71    #                    link=(link_fmt % mutant.name if mutant.name else None)) \
72    #                    for mutant in obiDictyMutants.mutants()]
73 
74    genesets = [GeneSet(id=phenotype, name=phenotype, genes=[obiDictyMutants.mutant_genes(mutant)[0] for mutant in mutants], hierarchy=("Dictybase", "Phenotypes"), organism="352472", # 352472 gathered from obiGO.py code_map -> Dicty identifier
75                        link="") \
76                        for phenotype, mutants in obiDictyMutants.phenotype_mutants().items()]
77
78    return GeneSets(genesets)
79
80def cytobandGeneSets():
81    """
82    Create cytoband gene sets from Stanford Microarray Database
83    """
84    import urllib2
85
86    url = "http://www-stat.stanford.edu/~tibs/GSA/cytobands-stanford.gmt"
87    stream = urllib2.urlopen(url)
88    data = stream.read().splitlines()
89
90    genesets = []
91    for band in data:
92        b = band.split("\t")
93        genesets.append(GeneSet(id=b[0], name=b[1], genes=b[2:] if b[2:] else [], hierarchy=("Cytobands",), organism="9606", link=""))         
94
95    return GeneSets(genesets)
96
97def reactomePathwaysGeneSets():
98    """
99    Prepare human pathways gene sets from reactome pathways
100    """
101    import urllib
102    import io
103    from zipfile import ZipFile
104
105    url = urllib.urlopen("http://www.reactome.org/download/current/ReactomePathways.gmt.zip")
106    memfile = io.BytesIO(url.read())
107    with ZipFile(memfile, "r") as myzip:
108        f = myzip.open("ReactomePathways.gmt")
109        content = f.read().splitlines()     
110
111    genesets = [GeneSet(id=path.split("\t")[0], name=path.split("\t")[0], genes=path.split("\t")[2:] if path.split("\t")[2:] else [], hierarchy=("Reactome", "Pathways"), organism="9606", link="") for path in content]
112    return GeneSets(genesets)
113
114
115def omimGeneSets():
116    """
117    Return gene sets from OMIM (Online Mendelian Inheritance in Man) diseses
118    """
119    from . import obiOMIM   
120    genesets = [GeneSet(id=disease.id, name=disease.name, genes=obiOMIM.disease_genes(disease), hierarchy=("OMIM",), organism="9606",
121                    link=("http://www.omim.org/entry/%s" % disease.id if disease.id else None)) \
122                    for disease in obiOMIM.diseases()]
123    return GeneSets(genesets)
124
125def miRNAGeneSets(org):
126    """
127    Return gene sets from miRNA targets
128    """
129    from . import obimiRNA
130    org_code = obiKEGG.from_taxid(org)
131    link_fmt = "http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s"
132    mirnas = [(id, obimiRNA.get_info(id)) for id in obimiRNA.ids(org_code)]
133    genesets = [GeneSet(id=mirna.matACC, name=mirna.matID, genes=mirna.targets.split(","), hierarchy=("miRNA", "Targets"),
134                        organism=org, link=link_fmt % mirna.matID) for id, mirna in mirnas]
135    return GeneSets(genesets)
136
137def go_miRNASets(org, ontology=None, enrichment=True, pval=0.05, treshold=0.04):
138    from . import obimiRNA, obiGO
139    mirnas = obimiRNA.ids(int(org))
140    if ontology is None:
141        ontology = obiGO.Ontology()
142
143    annotations = obiGO.Annotations(org, ontology=ontology)
144
145    go_sets = obimiRNA.get_GO(mirnas, annotations, enrichment=enrichment, pval=pval, goSwitch=False)
146
147    go_sets = obimiRNA.filter_GO(go_sets, annotations, treshold=treshold)
148
149    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
150    gsets = [GeneSet(id=key, name=ontology[key].name, genes=value, hierarchy=("miRNA", "go_sets",),
151                        organism=org, link=link_fmt % key) for key, value in go_sets.items()]
152    gset = GeneSets(gsets)
153    return gset
154
155def loadGMT(contents, name):
156    """
157    Eech line consists of tab separated elements. First is
158    the geneset name, next is it's description.
159
160    For now the description is skipped.
161    """
162
163    def hline(s):
164        tabs = [tab.strip() for tab in s.split("\t")]
165        return GeneSet(id=tabs[0], description=tabs[1],
166                                   hierarchy=("Custom Genesets",name), genes=tabs[2:])
167
168    def handleNELines(s, fn):
169        """
170        Run function on nonempty lines of a string.
171        Return a list of results for each line.
172        """
173        lines = (l.strip() for l in s.splitlines())
174        return [fn(l) for l in lines if l]
175
176    return GeneSets(handleNELines(contents, hline))
177
178"""
179We have multiple paths for gene set data:
180buffer/bigfiles/gene_sets
181and
182buffer/gene_sets_local
183both have available.txt
184"""
185
186def omakedirs(dir):
187    try:
188        os.makedirs(dir)
189    except OSError:
190        pass
191
192def local_path():
193    """ Returns local path for gene sets. Creates it if it does not exists
194    yet. """
195    from Orange.orng import orngEnviron
196    pth = os.path.join(orngEnviron.directoryNames["bufferDir"], "gene_sets_local")
197    omakedirs(pth)
198    return pth
199
200def build_index(dir):
201    """ Returns gene set availability index for some folder. """
202    pass
203
204def filename(hierarchy, organism):
205    """ Obtain a filename for given hierarchy and organism. """
206    return "gs_" + "_._".join(hierarchy + \
207        (organism if organism != None else "",)) + ".pck"
208
209def filename_parse(fn):
210    """ Returns a hierarchy and the organism from the filename."""
211    fn = fn[3:-4]
212    parts = fn.split("_._")
213    hierarchy = tuple(parts[:-1])
214    org = parts[-1] if parts[-1] != "" else None
215    return hierarchy, org
216
217def is_genesets_file(fn):
218    return fn.startswith("gs_") and fn.endswith(".pck")
219
220def list_local():
221    """ Returns available gene sets from the local repository:
222    a list of (hierarchy, organism, on_local) """
223    pth = local_path()
224    gs_files = filter(is_genesets_file, os.listdir(pth))
225    return [ filename_parse(fn) + (True,) for fn in gs_files ]
226
227def remove_local(gene_set):
228    """ Removes a given gene set from the local repository. """
229    pth = local_path()
230    gs_files = filter(is_genesets_file, os.listdir(pth)) 
231    for setfile in gs_files:
232        if setfile.__contains__(gene_set):
233            setBgone = os.path.join(pth, setfile)
234            os.remove(setBgone) 
235
236def modification_date(file):
237    t = os.path.getmtime(file)
238    return datetime.datetime.fromtimestamp(t)
239
240def list_serverfiles_from_flist(flist):
241    gs_files = filter(is_genesets_file, flist)
242    localfiles = set(orngServerFiles.listfiles(sfdomain))
243    return [ filename_parse(fn) + \
244        ((True,) if fn in localfiles else (False,)) for fn in gs_files ]
245
246def list_serverfiles_conn(serverfiles=None):
247    """ Returns available gene sets from the server files
248    repository: a list of (hierarchy, organism, on_local) """
249    if serverfiles == None:
250        serverfiles = orngServerFiles.ServerFiles()
251    flist = serverfiles.listfiles(sfdomain)
252    return list_serverfiles_from_flist(flist)
253
254def list_serverfiles():
255    fname = orngServerFiles.localpath_download(sfdomain, "index.pck")
256    flist = pickle.load(open(fname, 'r'))
257    return list_serverfiles_from_flist(flist)
258
259def list_all():
260    """
261    return a list of (hier, org, avalable_locally)
262    If something for a specific (hier, org) is not downloaded
263    yet, show it as not-local. """
264    flist = list_local() + list_serverfiles()
265    d = {}
266    for h,o,local in flist:
267        d[h,o] = min(local, d.get((h,o),True))
268    return [ (h,o,local) for (h,o),local in d.items() ]
269
270def update_server_list(serverfiles_upload, serverfiles_list=None):
271    if serverfiles_list == None:
272        serverfiles_list = orngServerFiles.ServerFiles()
273
274    flist = map(lambda x: filename(*x[:2]), list_serverfiles_conn(serverfiles_list))
275
276    tfname = pickle_temp(flist)
277
278    try:
279        fn = "index.pck"
280        title = "Gene sets: index"
281        tags = [ "gene sets", "index", "essential" ]
282        serverfiles_upload.upload(sfdomain, fn, tfname, title, tags)
283        serverfiles_upload.unprotect(sfdomain, fn)
284    except Exception,e:
285        raise e
286    finally:
287        os.remove(tfname)
288
289def _register_local(genesets):
290    """ Registers using the common hierarchy and organism. """
291    pth = local_path()
292
293    org = genesets.common_org()
294    hierarchy = genesets.common_hierarchy()
295    fn = filename(hierarchy, org)
296
297    with open(os.path.join(pth, fn), "w") as f:
298        pickle.dump(genesets, f)
299
300    return fn
301
302def pickle_temp(obj):
303    """ Pickle a file to a temporary file and returns its name """
304    fd,tfname = tempfile.mkstemp()
305    os.close(fd)
306    f = open(tfname, 'wb')
307    pickle.dump(obj, f)
308    f.close()
309    return tfname
310
311def _register_serverfiles(genesets, serverFiles):
312    """ Registers using the common hierarchy and organism. """
313    org = genesets.common_org()
314    hierarchy = genesets.common_hierarchy()
315    fn = filename(hierarchy, org)
316
317    #save to temporary file
318    tfname = pickle_temp(genesets)
319
320    try:
321        taxname = obiTaxonomy.name(org)
322        title = "Gene sets: " + ", ".join(hierarchy) + \
323            ((" (" + taxname + ")") if org != None else "")
324        tags = list(hierarchy) + [ "gene sets", taxname ] + \
325            ([ "essential" ] if org in obiTaxonomy.essential_taxids() else [] )
326        serverFiles.upload(sfdomain, fn, tfname, title, tags)
327        serverFiles.unprotect(sfdomain, fn)
328    finally:
329        os.remove(tfname)
330
331    update_server_list(serverFiles)
332
333def register(genesets, serverFiles=None):
334    """
335    Hierarchy is induced from the gene set names.
336    """
337    if serverFiles == None:
338        _register_local(genesets)
339    else:
340        _register_serverfiles(genesets, serverFiles)
341
342def build_hierarchy_dict(files):
343    hierd = defaultdict(list)
344    for ind,f in enumerate(files):
345        hier, org = f
346        for i in range(len(hier)+1):
347            hierd[(hier[:i], org)].append(ind)
348    return hierd
349
350def load_local(hierarchy, organism):
351    files = map(lambda x: x[:2], list_local())
352    hierd = build_hierarchy_dict(files)
353
354    out = GeneSets()
355    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
356        fname = os.path.join(local_path(), filename(h, o))
357        out.update(pickle.load(open(fname, 'r')))
358    return out
359
360def load_serverfiles(hierarchy, organism):
361    files = map(lambda x: x[:2], list_serverfiles())
362    hierd = build_hierarchy_dict(files)
363    out = GeneSets()
364    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
365        fname = orngServerFiles.localpath_download(sfdomain,
366            filename(h, o))
367        out.update(pickle.load(open(fname, 'r')))
368    return out
369
370def load(hierarchy, organism):
371    """ First try to load from the local registred folder. If the file
372    is not available, load it from the server files. """
373    ret = load_local(hierarchy, organism)
374    if len(ret) == 0:
375        ret.update(load_serverfiles(hierarchy, organism))
376    return ret
377
378def collections(*args):
379    """
380    Input is a list of collections.
381    Collection can either be a tuple (hierarchy, orgranism), where
382    hierarchy is a tuple also.
383    """
384    result = GeneSets()
385
386    for collection in args:
387        try:
388            result.update(collection)
389        except (ValueError, TypeError):
390            if issequencens(collection): #have a hierarchy, organism specification
391                new = load(*collection)
392                result.update(new)
393            else:
394                if collection.lower()[-4:] == ".gmt": #format from webpage
395                    result.update(loadGMT(open(collection,"rt").read(), collection))
396                else:
397                    raise Exception("collection() accepts files in .gmt format only.")
398
399    return result
400
401def issequencens(x):
402    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and it is not an instance of basestring."
403    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
404
405class TException(Exception): pass
406
407def upload_genesets(rsf):
408    """
409    Builds the default gene sets and
410    """
411    orngServerFiles.update_local_files()
412
413    genesetsfn = [ keggGeneSets, goGeneSets, miRNAGeneSets]
414    organisms = obiTaxonomy.common_taxids()
415    for fn in genesetsfn:
416        for org in organisms:
417            try:
418                print "Uploading ORG", org, fn
419                genesets = fn(org).split_by_hierarchy()
420                for gs in genesets:
421                    print "registering", gs.common_hierarchy()
422                    register(gs, rsf) #server files
423                    #register(gs)
424                    print "successful", gs.common_hierarchy()
425            except (obiKEGG.OrganismNotFoundError, GenesetRegException):
426                print "organism not found", org
427
428if __name__ == "__main__":
429    rsf = orngServerFiles.ServerFiles(username=sys.argv[1], password=sys.argv[2])
430    upload_genesets(rsf)
431    pass
432
433
Note: See TracBrowser for help on using the repository browser.