source: orange-bioinformatics/_bioinformatics/obiGeneSets.py @ 1860:504bc511801c

Revision 1860:504bc511801c, 15.4 KB checked in by markotoplak, 7 months ago (diff)

obiGeneSets.collections raises an exception if gene sets are not found. Temporarily disabled gene sets without species in Gene Set Enrichment widget.

Line 
1"""
2Maintainer: Marko Toplak
3"""
4
5from __future__ import absolute_import, with_statement
6
7if __name__ == "__main__":
8    __package__ = "Orange.bio"
9
10import cPickle as pickle, os, tempfile, sys
11from collections import defaultdict
12import datetime
13
14import Orange.core as orange
15from Orange.orng import orngServerFiles
16
17from . import obiGO, obiKEGG, obiTaxonomy
18
19sfdomain = "gene_sets"
20
21def nth(l,n):
22    return [ a[n] for a in l]
23
24from Orange.bio.geneset import GeneSet, GeneSets, GenesetRegException
25
26class NoGenesetsException(Exception): pass
27
28def goGeneSets(org):
29    """Returns gene sets from GO."""
30    ontology = obiGO.Ontology()
31    annotations = obiGO.Annotations(org, ontology=ontology)
32
33    genesets = []
34    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
35    for termn, term in ontology.terms.items():
36        genes = annotations.GetAllGenes(termn)
37        hier = ("GO", term.namespace)
38        if len(genes) > 0:
39            gs = GeneSet(id=termn, name=term.name, genes=genes, hierarchy=hier, organism=org, link=link_fmt % termn)
40            genesets.append(gs)
41
42    return GeneSets(genesets)
43
44def keggGeneSets(org):
45    """
46    Returns gene sets from KEGG pathways.
47    """
48
49    kegg = obiKEGG.KEGGOrganism(org)
50
51    genesets = []
52    for id in kegg.pathways():
53        print id
54        pway = obiKEGG.KEGGPathway(id)
55        hier = ("KEGG","pathways")
56        gs = GeneSet(id=id,
57                                 name=pway.title,
58                                 genes=kegg.get_genes_by_pathway(id),
59                                 hierarchy=hier,
60                                 organism=org,
61                                 link=pway.link)
62        genesets.append(gs)
63
64    return GeneSets(genesets)
65
66def dictyMutantSets():
67    """
68    Return dicty mutant phenotype gene sets from Dictybase
69    """
70    from . import obiDictyMutants
71    link_fmt = "http://dictybase.org/db/cgi-bin/dictyBase/SC/scsearch.pl?searchdb=strains&search_term=%s&column=all&B1=Submit" 
72    #genesets = [GeneSet(id=mutant.name, name=mutant.descriptor, genes=obiDictyMutants.mutant_genes(mutant), hierarchy=("Dictybase", "Mutants"), organism="352472", # 352472 gathered from obiGO.py code_map -> Dicty identifier
73    #                    link=(link_fmt % mutant.name if mutant.name else None)) \
74    #                    for mutant in obiDictyMutants.mutants()]
75 
76    genesets = [GeneSet(id=phenotype, name=phenotype, genes=[obiDictyMutants.mutant_genes(mutant)[0] for mutant in mutants], hierarchy=("Dictybase", "Phenotypes"), organism="352472", # 352472 gathered from obiGO.py code_map -> Dicty identifier
77                        link="") \
78                        for phenotype, mutants in obiDictyMutants.phenotype_mutants().items()]
79
80    return GeneSets(genesets)
81
82def cytobandGeneSets():
83    """
84    Create cytoband gene sets from Stanford Microarray Database
85    """
86    import urllib2
87
88    url = "http://www-stat.stanford.edu/~tibs/GSA/cytobands-stanford.gmt"
89    stream = urllib2.urlopen(url)
90    data = stream.read().splitlines()
91
92    genesets = []
93    for band in data:
94        b = band.split("\t")
95        genesets.append(GeneSet(id=b[0], name=b[1], genes=b[2:] if b[2:] else [], hierarchy=("Cytobands",), organism="9606", link=""))         
96
97    return GeneSets(genesets)
98
99def reactomePathwaysGeneSets():
100    """
101    Prepare human pathways gene sets from reactome pathways
102    """
103    import urllib
104    import io
105    from zipfile import ZipFile
106
107    url = urllib.urlopen("http://www.reactome.org/download/current/ReactomePathways.gmt.zip")
108    memfile = io.BytesIO(url.read())
109    with ZipFile(memfile, "r") as myzip:
110        f = myzip.open("ReactomePathways.gmt")
111        content = f.read().splitlines()     
112
113    genesets = [GeneSet(id=path.split("\t")[0], name=path.split("\t")[0], genes=path.split("\t")[2:] if path.split("\t")[2:] else [], hierarchy=("Reactome", "Pathways"), organism="9606", link="") for path in content]
114    return GeneSets(genesets)
115
116
117def omimGeneSets():
118    """
119    Return gene sets from OMIM (Online Mendelian Inheritance in Man) diseses
120    """
121    from . import obiOMIM   
122    genesets = [GeneSet(id=disease.id, name=disease.name, genes=obiOMIM.disease_genes(disease), hierarchy=("OMIM",), organism="9606",
123                    link=("http://www.omim.org/entry/%s" % disease.id if disease.id else None)) \
124                    for disease in obiOMIM.diseases()]
125    return GeneSets(genesets)
126
127def miRNAGeneSets(org):
128    """
129    Return gene sets from miRNA targets
130    """
131    from . import obimiRNA
132    org_code = obiKEGG.from_taxid(org)
133    link_fmt = "http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s"
134    mirnas = [(id, obimiRNA.get_info(id)) for id in obimiRNA.ids(org_code)]
135    genesets = [GeneSet(id=mirna.matACC, name=mirna.matID, genes=mirna.targets.split(","), hierarchy=("miRNA", "Targets"),
136                        organism=org, link=link_fmt % mirna.matID) for id, mirna in mirnas]
137    return GeneSets(genesets)
138
139def go_miRNASets(org, ontology=None, enrichment=True, pval=0.05, treshold=0.04):
140    from . import obimiRNA, obiGO
141    mirnas = obimiRNA.ids(int(org))
142    if ontology is None:
143        ontology = obiGO.Ontology()
144
145    annotations = obiGO.Annotations(org, ontology=ontology)
146
147    go_sets = obimiRNA.get_GO(mirnas, annotations, enrichment=enrichment, pval=pval, goSwitch=False)
148
149    go_sets = obimiRNA.filter_GO(go_sets, annotations, treshold=treshold)
150
151    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
152    gsets = [GeneSet(id=key, name=ontology[key].name, genes=value, hierarchy=("miRNA", "go_sets",),
153                        organism=org, link=link_fmt % key) for key, value in go_sets.items()]
154    gset = GeneSets(gsets)
155    return gset
156
157def loadGMT(contents, name):
158    """
159    Eech line consists of tab separated elements. First is
160    the geneset name, next is it's description.
161
162    For now the description is skipped.
163
164    Example Gene Set (.gmt) file:
165    anti-liver_sw   anti-liver_sw   BMPR1A  APOD    WSB1    BMI1    SLC2A1  ...
166    B-cells_sw  B-cells_sw  E2F5    NCF1    PALM2-AKAP2 IRF4    SLC2A1  ...
167    Bladder_sw  Bladder_sw  PLCD4   ANGPTL1 LOC286191   ST0N1   LOC283904   ...
168    cerebellum_sw   cerebellum_sw   C19orf43    LOC653464   KI110802    ...
169    Cervix_sw   Cervix_sw   LAMA4   GSTM5   SNX19   DKK1    NT5E    ...
170    """
171
172    def hline(s):
173        tabs = [tab.strip() for tab in s.split("\t")]
174        return GeneSet(id=tabs[0], description=tabs[1],
175                                   hierarchy=("Custom Gene Sets",name), genes=tabs[2:])
176
177    def handleNELines(s, fn):
178        """
179        Run function on nonempty lines of a string.
180        Return a list of results for each line.
181        """
182        lines = (l.strip() for l in s.splitlines())
183        return [fn(l) for l in lines if l]
184
185    return GeneSets(handleNELines(contents, hline))
186
187def getGenesetsStats(genesets):
188    num_sets = len(genesets)
189    unique_genes = len(set([gene for geneset in genesets for gene in geneset.genes]))
190    genes_per_geneset = sum([len(geneset.genes) for geneset in genesets])/num_sets
191    return num_sets, unique_genes, genes_per_geneset
192"""
193We have multiple paths for gene set data:
194buffer/bigfiles/gene_sets
195and
196buffer/gene_sets_local
197both have available.txt
198"""
199
200def omakedirs(dir):
201    try:
202        os.makedirs(dir)
203    except OSError:
204        pass
205
206def local_path():
207    """ Returns local path for gene sets. Creates it if it does not exists
208    yet. """
209    from Orange.orng import orngEnviron
210    pth = os.path.join(orngEnviron.directoryNames["bufferDir"], "gene_sets_local")
211    omakedirs(pth)
212    return pth
213
214def build_index(dir):
215    """ Returns gene set availability index for some folder. """
216    pass
217
218def filename(hierarchy, organism):
219    """ Obtain a filename for given hierarchy and organism. """
220    return "gs_" + "_._".join(hierarchy + \
221        (organism if organism != None else "",)) + ".pck"
222
223def filename_parse(fn):
224    """ Returns a hierarchy and the organism from the filename."""
225    fn = fn[3:-4]
226    parts = fn.split("_._")
227    hierarchy = tuple(parts[:-1])
228    org = parts[-1] if parts[-1] != "" else None
229    return hierarchy, org
230
231def is_genesets_file(fn):
232    return fn.startswith("gs_") and fn.endswith(".pck")
233
234def list_local():
235    """ Returns available gene sets from the local repository:
236    a list of (hierarchy, organism, on_local) """
237    pth = local_path()
238    gs_files = filter(is_genesets_file, os.listdir(pth))
239    return [ filename_parse(fn) + (True,) for fn in gs_files ]
240
241def remove_local(gene_set):
242    """ Removes a given gene set from the local repository. """
243    pth = local_path()
244    gs_files = filter(is_genesets_file, os.listdir(pth)) 
245    for setfile in gs_files:
246        if setfile.__contains__(gene_set):
247            setBgone = os.path.join(pth, setfile)
248            os.remove(setBgone) 
249
250def modification_date(file):
251    t = os.path.getmtime(file)
252    return datetime.datetime.fromtimestamp(t)
253
254def list_serverfiles_from_flist(flist):
255    gs_files = filter(is_genesets_file, flist)
256    localfiles = set(orngServerFiles.listfiles(sfdomain))
257    return [ filename_parse(fn) + \
258        ((True,) if fn in localfiles else (False,)) for fn in gs_files ]
259
260def list_serverfiles_conn(serverfiles=None):
261    """ Returns available gene sets from the server files
262    repository: a list of (hierarchy, organism, on_local) """
263    if serverfiles == None:
264        serverfiles = orngServerFiles.ServerFiles()
265    flist = serverfiles.listfiles(sfdomain)
266    return list_serverfiles_from_flist(flist)
267
268def list_serverfiles():
269    fname = orngServerFiles.localpath_download(sfdomain, "index.pck")
270    flist = pickle.load(open(fname, 'r'))
271    return list_serverfiles_from_flist(flist)
272
273def list_all():
274    """
275    return a list of (hier, org, avalable_locally)
276    If something for a specific (hier, org) is not downloaded
277    yet, show it as not-local. """
278    flist = list_local() + list_serverfiles()
279    d = {}
280    for h,o,local in flist:
281        d[h,o] = min(local, d.get((h,o),True))
282    return [ (h,o,local) for (h,o),local in d.items() ]
283
284def update_server_list(serverfiles_upload, serverfiles_list=None):
285    if serverfiles_list == None:
286        serverfiles_list = orngServerFiles.ServerFiles()
287
288    flist = map(lambda x: filename(*x[:2]), list_serverfiles_conn(serverfiles_list))
289
290    tfname = pickle_temp(flist)
291
292    try:
293        fn = "index.pck"
294        title = "Gene sets: index"
295        tags = [ "gene sets", "index", "essential" ]
296        serverfiles_upload.upload(sfdomain, fn, tfname, title, tags)
297        serverfiles_upload.unprotect(sfdomain, fn)
298    except Exception,e:
299        raise e
300    finally:
301        os.remove(tfname)
302
303def _register_local(genesets):
304    """ Registers using the common hierarchy and organism. """
305    pth = local_path()
306    org = genesets.common_org()
307    hierarchy = genesets.common_hierarchy()
308    fn = filename(hierarchy, org)
309
310    with open(os.path.join(pth, fn), "w") as f:
311        pickle.dump(genesets, f)
312
313    return fn
314
315def pickle_temp(obj):
316    """ Pickle a file to a temporary file and returns its name """
317    fd,tfname = tempfile.mkstemp()
318    os.close(fd)
319    f = open(tfname, 'wb')
320    pickle.dump(obj, f)
321    f.close()
322    return tfname
323
324def _register_serverfiles(genesets, serverFiles):
325    """ Registers using the common hierarchy and organism. """
326    org = genesets.common_org()
327    hierarchy = genesets.common_hierarchy()
328    fn = filename(hierarchy, org)
329
330    #save to temporary file
331    tfname = pickle_temp(genesets)
332
333    try:
334        taxname = obiTaxonomy.name(org)
335        title = "Gene sets: " + ", ".join(hierarchy) + \
336            ((" (" + taxname + ")") if org != None else "")
337        tags = list(hierarchy) + [ "gene sets", taxname ] + obiTaxonomy.shortname(org) +\
338            ([ "essential" ] if org in obiTaxonomy.essential_taxids() else [] )
339        serverFiles.upload(sfdomain, fn, tfname, title, tags)
340        serverFiles.unprotect(sfdomain, fn)
341    finally:
342        os.remove(tfname)
343
344    update_server_list(serverFiles)
345
346def register(genesets, serverFiles=None):
347    """
348    Hierarchy is induced from the gene set names.
349    """
350    if serverFiles == None:
351        _register_local(genesets)
352    else:
353        _register_serverfiles(genesets, serverFiles)
354
355def build_hierarchy_dict(files):
356    hierd = defaultdict(list)
357    for ind,f in enumerate(files):
358        hier, org = f
359        for i in range(len(hier)+1):
360            hierd[(hier[:i], org)].append(ind)
361    return hierd
362
363def load_local(hierarchy, organism):
364    return load_fn(hierarchy, organism, list_local, 
365        lambda h,o: os.path.join(local_path(), filename(h, o)))
366
367def load_serverfiles(hierarchy, organism):
368    return load_fn(hierarchy, organism, list_serverfiles, 
369        lambda h,o: orngServerFiles.localpath_download(sfdomain, filename(h, o)))
370
371def load_fn(hierarchy, organism, fnlist, fnget):
372    files = map(lambda x: x[:2], fnlist())
373    hierd = build_hierarchy_dict(files)
374    out = GeneSets()
375    matches = hierd[(hierarchy, organism)]
376    if not matches:
377        exstr = "No gene sets for " + str(hierarchy) + \
378                " (org " + str(organism) + ")"
379        raise NoGenesetsException(exstr)
380    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
381        fname = fnget(h, o)
382        out.update(pickle.load(open(fname, 'r')))
383    return out
384
385def load(hierarchy, organism):
386    """ First try to load from the local registred folder. If the file
387    is not available, load it from the server files. """
388    try:
389        return load_local(hierarchy, organism)
390    except NoGenesetsException:
391        return load_serverfiles(hierarchy, organism)
392
393def collections(*args):
394    """
395    Input is a list of collections.
396    Collection can either be a tuple (hierarchy, orgranism), where
397    hierarchy is a tuple also.
398    """
399    result = GeneSets()
400
401    for collection in args:
402        try:
403            result.update(collection)
404        except (ValueError, TypeError):
405            if issequencens(collection): #have a hierarchy, organism specification
406                new = load(*collection)
407                result.update(new)
408            else:
409                if collection.lower()[-4:] == ".gmt": #format from webpage
410                    result.update(loadGMT(open(collection,"rt").read(), collection))
411                else:
412                    raise Exception("collection() accepts files in .gmt format only.")
413
414    return result
415
416def issequencens(x):
417    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and it is not an instance of basestring."
418    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
419
420class TException(Exception): pass
421
422def upload_genesets(rsf):
423    """
424    Builds the default gene sets and
425    """
426    orngServerFiles.update_local_files()
427
428    genesetsfn = [ keggGeneSets, goGeneSets, miRNAGeneSets]
429    organisms = obiTaxonomy.common_taxids()
430    for fn in genesetsfn:
431        for org in organisms:
432            try:
433                print "Uploading ORG", org, fn
434                genesets = fn(org).split_by_hierarchy()
435                for gs in genesets:
436                    print "registering", gs.common_hierarchy()
437                    register(gs, rsf) #server files
438                    #register(gs)
439                    print "successful", gs.common_hierarchy()
440            except obiTaxonomy.UnknownSpeciesIdentifier:
441                print "Organism ontology not available", org
442            except GenesetRegException:
443                print "Empty gene sets.", org
444
445if __name__ == "__main__":
446    rsf = orngServerFiles.ServerFiles(username=sys.argv[1], password=sys.argv[2])
447    upload_genesets(rsf)
448    pass
449
450
Note: See TracBrowser for help on using the repository browser.