source: orange-bioinformatics/_bioinformatics/obiGeneSets.py @ 1860:504bc511801c

Revision 1860:504bc511801c, 15.4 KB checked in by markotoplak, 6 months ago (diff)

obiGeneSets.collections raises an exception if gene sets are not found. Temporarily disabled gene sets without species in Gene Set Enrichment widget.

RevLine 
[461]1"""
2Maintainer: Marko Toplak
3"""
[1043]4
[1632]5from __future__ import absolute_import, with_statement
6
[1707]7if __name__ == "__main__":
8    __package__ = "Orange.bio"
9
[1632]10import cPickle as pickle, os, tempfile, sys
[1108]11from collections import defaultdict
[1807]12import datetime
[1106]13
[1707]14import Orange.core as orange
[1632]15from Orange.orng import orngServerFiles
16
[1643]17from . import obiGO, obiKEGG, obiTaxonomy
[1632]18
[1106]19sfdomain = "gene_sets"
[461]20
[485]21def nth(l,n):
22    return [ a[n] for a in l]
23
[1712]24from Orange.bio.geneset import GeneSet, GeneSets, GenesetRegException
[1107]25
[1860]26class NoGenesetsException(Exception): pass
27
[1094]28def goGeneSets(org):
29    """Returns gene sets from GO."""
[1141]30    ontology = obiGO.Ontology()
31    annotations = obiGO.Annotations(org, ontology=ontology)
[626]32
[1094]33    genesets = []
[1141]34    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
[1094]35    for termn, term in ontology.terms.items():
36        genes = annotations.GetAllGenes(termn)
37        hier = ("GO", term.namespace)
[1108]38        if len(genes) > 0:
[1761]39            gs = GeneSet(id=termn, name=term.name, genes=genes, hierarchy=hier, organism=org, link=link_fmt % termn)
[1108]40            genesets.append(gs)
[461]41
[1712]42    return GeneSets(genesets)
[461]43
[1094]44def keggGeneSets(org):
45    """
46    Returns gene sets from KEGG pathways.
47    """
[1761]48
[1094]49    kegg = obiKEGG.KEGGOrganism(org)
[461]50
[1094]51    genesets = []
52    for id in kegg.pathways():
[1712]53        print id
[1094]54        pway = obiKEGG.KEGGPathway(id)
[1549]55        hier = ("KEGG","pathways")
[1712]56        gs = GeneSet(id=id,
[1549]57                                 name=pway.title,
58                                 genes=kegg.get_genes_by_pathway(id),
59                                 hierarchy=hier,
60                                 organism=org,
61                                 link=pway.link)
[1094]62        genesets.append(gs)
[461]63
[1712]64    return GeneSets(genesets)
[461]65
[1761]66def dictyMutantSets():
67    """
68    Return dicty mutant phenotype gene sets from Dictybase
69    """
70    from . import obiDictyMutants
[1780]71    link_fmt = "http://dictybase.org/db/cgi-bin/dictyBase/SC/scsearch.pl?searchdb=strains&search_term=%s&column=all&B1=Submit" 
[1782]72    #genesets = [GeneSet(id=mutant.name, name=mutant.descriptor, genes=obiDictyMutants.mutant_genes(mutant), hierarchy=("Dictybase", "Mutants"), organism="352472", # 352472 gathered from obiGO.py code_map -> Dicty identifier
[1781]73    #                    link=(link_fmt % mutant.name if mutant.name else None)) \
74    #                    for mutant in obiDictyMutants.mutants()]
75 
[1782]76    genesets = [GeneSet(id=phenotype, name=phenotype, genes=[obiDictyMutants.mutant_genes(mutant)[0] for mutant in mutants], hierarchy=("Dictybase", "Phenotypes"), organism="352472", # 352472 gathered from obiGO.py code_map -> Dicty identifier
[1781]77                        link="") \
78                        for phenotype, mutants in obiDictyMutants.phenotype_mutants().items()]
[1780]79
[1761]80    return GeneSets(genesets)
81
[1794]82def cytobandGeneSets():
83    """
[1795]84    Create cytoband gene sets from Stanford Microarray Database
[1794]85    """
[1795]86    import urllib2
87
88    url = "http://www-stat.stanford.edu/~tibs/GSA/cytobands-stanford.gmt"
89    stream = urllib2.urlopen(url)
90    data = stream.read().splitlines()
91
92    genesets = []
93    for band in data:
94        b = band.split("\t")
95        genesets.append(GeneSet(id=b[0], name=b[1], genes=b[2:] if b[2:] else [], hierarchy=("Cytobands",), organism="9606", link=""))         
96
[1794]97    return GeneSets(genesets)
98
[1798]99def reactomePathwaysGeneSets():
100    """
101    Prepare human pathways gene sets from reactome pathways
102    """
103    import urllib
104    import io
105    from zipfile import ZipFile
106
107    url = urllib.urlopen("http://www.reactome.org/download/current/ReactomePathways.gmt.zip")
108    memfile = io.BytesIO(url.read())
109    with ZipFile(memfile, "r") as myzip:
110        f = myzip.open("ReactomePathways.gmt")
111        content = f.read().splitlines()     
112
113    genesets = [GeneSet(id=path.split("\t")[0], name=path.split("\t")[0], genes=path.split("\t")[2:] if path.split("\t")[2:] else [], hierarchy=("Reactome", "Pathways"), organism="9606", link="") for path in content]
114    return GeneSets(genesets)
115
116
[1141]117def omimGeneSets():
118    """
119    Return gene sets from OMIM (Online Mendelian Inheritance in Man) diseses
120    """
[1778]121    from . import obiOMIM   
[1141]122    genesets = [GeneSet(id=disease.id, name=disease.name, genes=obiOMIM.disease_genes(disease), hierarchy=("OMIM",), organism="9606",
[1778]123                    link=("http://www.omim.org/entry/%s" % disease.id if disease.id else None)) \
[1141]124                    for disease in obiOMIM.diseases()]
125    return GeneSets(genesets)
126
[1144]127def miRNAGeneSets(org):
128    """
129    Return gene sets from miRNA targets
130    """
[1716]131    from . import obimiRNA
132    org_code = obiKEGG.from_taxid(org)
[1144]133    link_fmt = "http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s"
134    mirnas = [(id, obimiRNA.get_info(id)) for id in obimiRNA.ids(org_code)]
[1549]135    genesets = [GeneSet(id=mirna.matACC, name=mirna.matID, genes=mirna.targets.split(","), hierarchy=("miRNA", "Targets"),
[1144]136                        organism=org, link=link_fmt % mirna.matID) for id, mirna in mirnas]
137    return GeneSets(genesets)
138
[1549]139def go_miRNASets(org, ontology=None, enrichment=True, pval=0.05, treshold=0.04):
[1632]140    from . import obimiRNA, obiGO
[1549]141    mirnas = obimiRNA.ids(int(org))
142    if ontology is None:
143        ontology = obiGO.Ontology()
[1761]144
[1549]145    annotations = obiGO.Annotations(org, ontology=ontology)
[1761]146
[1549]147    go_sets = obimiRNA.get_GO(mirnas, annotations, enrichment=enrichment, pval=pval, goSwitch=False)
[1761]148
[1549]149    go_sets = obimiRNA.filter_GO(go_sets, annotations, treshold=treshold)
[1761]150
[1549]151    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
[1712]152    gsets = [GeneSet(id=key, name=ontology[key].name, genes=value, hierarchy=("miRNA", "go_sets",),
[1549]153                        organism=org, link=link_fmt % key) for key, value in go_sets.items()]
[1712]154    gset = GeneSets(gsets)
[1549]155    return gset
156
[1095]157def loadGMT(contents, name):
[485]158    """
159    Eech line consists of tab separated elements. First is
[1608]160    the geneset name, next is it's description.
[1761]161
[485]162    For now the description is skipped.
[1818]163
164    Example Gene Set (.gmt) file:
165    anti-liver_sw   anti-liver_sw   BMPR1A  APOD    WSB1    BMI1    SLC2A1  ...
166    B-cells_sw  B-cells_sw  E2F5    NCF1    PALM2-AKAP2 IRF4    SLC2A1  ...
167    Bladder_sw  Bladder_sw  PLCD4   ANGPTL1 LOC286191   ST0N1   LOC283904   ...
168    cerebellum_sw   cerebellum_sw   C19orf43    LOC653464   KI110802    ...
169    Cervix_sw   Cervix_sw   LAMA4   GSTM5   SNX19   DKK1    NT5E    ...
[485]170    """
[1643]171
[485]172    def hline(s):
[1608]173        tabs = [tab.strip() for tab in s.split("\t")]
[1712]174        return GeneSet(id=tabs[0], description=tabs[1],
[1816]175                                   hierarchy=("Custom Gene Sets",name), genes=tabs[2:])
[485]176
[1106]177    def handleNELines(s, fn):
178        """
179        Run function on nonempty lines of a string.
180        Return a list of results for each line.
181        """
[1608]182        lines = (l.strip() for l in s.splitlines())
183        return [fn(l) for l in lines if l]
[1106]184
[1712]185    return GeneSets(handleNELines(contents, hline))
[485]186
[1814]187def getGenesetsStats(genesets):
188    num_sets = len(genesets)
189    unique_genes = len(set([gene for geneset in genesets for gene in geneset.genes]))
190    genes_per_geneset = sum([len(geneset.genes) for geneset in genesets])/num_sets
191    return num_sets, unique_genes, genes_per_geneset
[1106]192"""
193We have multiple paths for gene set data:
194buffer/bigfiles/gene_sets
195and
196buffer/gene_sets_local
197both have available.txt
198"""
199
200def omakedirs(dir):
201    try:
202        os.makedirs(dir)
203    except OSError:
204        pass
205
206def local_path():
207    """ Returns local path for gene sets. Creates it if it does not exists
208    yet. """
[1632]209    from Orange.orng import orngEnviron
[1108]210    pth = os.path.join(orngEnviron.directoryNames["bufferDir"], "gene_sets_local")
[1106]211    omakedirs(pth)
212    return pth
213
214def build_index(dir):
215    """ Returns gene set availability index for some folder. """
216    pass
217
218def filename(hierarchy, organism):
219    """ Obtain a filename for given hierarchy and organism. """
220    return "gs_" + "_._".join(hierarchy + \
221        (organism if organism != None else "",)) + ".pck"
222
223def filename_parse(fn):
224    """ Returns a hierarchy and the organism from the filename."""
225    fn = fn[3:-4]
226    parts = fn.split("_._")
227    hierarchy = tuple(parts[:-1])
228    org = parts[-1] if parts[-1] != "" else None
229    return hierarchy, org
230
231def is_genesets_file(fn):
232    return fn.startswith("gs_") and fn.endswith(".pck")
233
234def list_local():
235    """ Returns available gene sets from the local repository:
236    a list of (hierarchy, organism, on_local) """
237    pth = local_path()
238    gs_files = filter(is_genesets_file, os.listdir(pth))
[1108]239    return [ filename_parse(fn) + (True,) for fn in gs_files ]
[1761]240
[1805]241def remove_local(gene_set):
242    """ Removes a given gene set from the local repository. """
243    pth = local_path()
244    gs_files = filter(is_genesets_file, os.listdir(pth)) 
245    for setfile in gs_files:
246        if setfile.__contains__(gene_set):
247            setBgone = os.path.join(pth, setfile)
248            os.remove(setBgone) 
249
[1807]250def modification_date(file):
251    t = os.path.getmtime(file)
252    return datetime.datetime.fromtimestamp(t)
253
[1108]254def list_serverfiles_from_flist(flist):
255    gs_files = filter(is_genesets_file, flist)
256    localfiles = set(orngServerFiles.listfiles(sfdomain))
257    return [ filename_parse(fn) + \
258        ((True,) if fn in localfiles else (False,)) for fn in gs_files ]
259
[1106]260def list_serverfiles_conn(serverfiles=None):
261    """ Returns available gene sets from the server files
262    repository: a list of (hierarchy, organism, on_local) """
263    if serverfiles == None:
264        serverfiles = orngServerFiles.ServerFiles()
[1108]265    flist = serverfiles.listfiles(sfdomain)
266    return list_serverfiles_from_flist(flist)
267
268def list_serverfiles():
269    fname = orngServerFiles.localpath_download(sfdomain, "index.pck")
270    flist = pickle.load(open(fname, 'r'))
271    return list_serverfiles_from_flist(flist)
272
273def list_all():
274    """
275    return a list of (hier, org, avalable_locally)
276    If something for a specific (hier, org) is not downloaded
277    yet, show it as not-local. """
278    flist = list_local() + list_serverfiles()
279    d = {}
280    for h,o,local in flist:
281        d[h,o] = min(local, d.get((h,o),True))
282    return [ (h,o,local) for (h,o),local in d.items() ]
[1106]283
284def update_server_list(serverfiles_upload, serverfiles_list=None):
285    if serverfiles_list == None:
286        serverfiles_list = orngServerFiles.ServerFiles()
[1108]287
288    flist = map(lambda x: filename(*x[:2]), list_serverfiles_conn(serverfiles_list))
[1106]289
290    tfname = pickle_temp(flist)
[1761]291
[1106]292    try:
293        fn = "index.pck"
294        title = "Gene sets: index"
295        tags = [ "gene sets", "index", "essential" ]
296        serverfiles_upload.upload(sfdomain, fn, tfname, title, tags)
297        serverfiles_upload.unprotect(sfdomain, fn)
[1111]298    except Exception,e:
[1106]299        raise e
300    finally:
301        os.remove(tfname)
302
[1712]303def _register_local(genesets):
[1106]304    """ Registers using the common hierarchy and organism. """
305    pth = local_path()
306    org = genesets.common_org()
307    hierarchy = genesets.common_hierarchy()
[1108]308    fn = filename(hierarchy, org)
[1106]309
310    with open(os.path.join(pth, fn), "w") as f:
311        pickle.dump(genesets, f)
312
313    return fn
314
315def pickle_temp(obj):
[1712]316    """ Pickle a file to a temporary file and returns its name """
[1106]317    fd,tfname = tempfile.mkstemp()
318    os.close(fd)
319    f = open(tfname, 'wb')
320    pickle.dump(obj, f)
321    f.close()
322    return tfname
323
[1712]324def _register_serverfiles(genesets, serverFiles):
[1106]325    """ Registers using the common hierarchy and organism. """
326    org = genesets.common_org()
327    hierarchy = genesets.common_hierarchy()
328    fn = filename(hierarchy, org)
329
330    #save to temporary file
331    tfname = pickle_temp(genesets)
[1761]332
[1106]333    try:
[1108]334        taxname = obiTaxonomy.name(org)
[1106]335        title = "Gene sets: " + ", ".join(hierarchy) + \
[1108]336            ((" (" + taxname + ")") if org != None else "")
[1828]337        tags = list(hierarchy) + [ "gene sets", taxname ] + obiTaxonomy.shortname(org) +\
[1108]338            ([ "essential" ] if org in obiTaxonomy.essential_taxids() else [] )
[1106]339        serverFiles.upload(sfdomain, fn, tfname, title, tags)
340        serverFiles.unprotect(sfdomain, fn)
341    finally:
342        os.remove(tfname)
343
344    update_server_list(serverFiles)
345
346def register(genesets, serverFiles=None):
347    """
348    Hierarchy is induced from the gene set names.
349    """
350    if serverFiles == None:
[1712]351        _register_local(genesets)
[1106]352    else:
[1712]353        _register_serverfiles(genesets, serverFiles)
[1106]354
[1108]355def build_hierarchy_dict(files):
356    hierd = defaultdict(list)
357    for ind,f in enumerate(files):
358        hier, org = f
359        for i in range(len(hier)+1):
360            hierd[(hier[:i], org)].append(ind)
361    return hierd
362
[1836]363def load_local(hierarchy, organism):
[1860]364    return load_fn(hierarchy, organism, list_local, 
365        lambda h,o: os.path.join(local_path(), filename(h, o)))
[1108]366
367def load_serverfiles(hierarchy, organism):
[1860]368    return load_fn(hierarchy, organism, list_serverfiles, 
369        lambda h,o: orngServerFiles.localpath_download(sfdomain, filename(h, o)))
370
371def load_fn(hierarchy, organism, fnlist, fnget):
372    files = map(lambda x: x[:2], fnlist())
[1108]373    hierd = build_hierarchy_dict(files)
374    out = GeneSets()
[1860]375    matches = hierd[(hierarchy, organism)]
376    if not matches:
377        exstr = "No gene sets for " + str(hierarchy) + \
378                " (org " + str(organism) + ")"
379        raise NoGenesetsException(exstr)
[1108]380    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
[1860]381        fname = fnget(h, o)
[1108]382        out.update(pickle.load(open(fname, 'r')))
383    return out
384
385def load(hierarchy, organism):
[1712]386    """ First try to load from the local registred folder. If the file
387    is not available, load it from the server files. """
[1860]388    try:
389        return load_local(hierarchy, organism)
390    except NoGenesetsException:
391        return load_serverfiles(hierarchy, organism)
[1108]392
393def collections(*args):
[485]394    """
395    Input is a list of collections.
[1108]396    Collection can either be a tuple (hierarchy, orgranism), where
397    hierarchy is a tuple also.
[485]398    """
[1712]399    result = GeneSets()
[485]400
[1108]401    for collection in args:
[1569]402        try:
[1108]403            result.update(collection)
[1713]404        except (ValueError, TypeError):
[1569]405            if issequencens(collection): #have a hierarchy, organism specification
406                new = load(*collection)
407                result.update(new)
[1108]408            else:
[1569]409                if collection.lower()[-4:] == ".gmt": #format from webpage
410                    result.update(loadGMT(open(collection,"rt").read(), collection))
411                else:
412                    raise Exception("collection() accepts files in .gmt format only.")
413
[1108]414    return result
[485]415
[1108]416def issequencens(x):
417    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and it is not an instance of basestring."
418    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
[485]419
[1214]420class TException(Exception): pass
421
[1108]422def upload_genesets(rsf):
423    """
[1761]424    Builds the default gene sets and
[1108]425    """
426    orngServerFiles.update_local_files()
[485]427
[1713]428    genesetsfn = [ keggGeneSets, goGeneSets, miRNAGeneSets]
[1108]429    organisms = obiTaxonomy.common_taxids()
430    for fn in genesetsfn:
431        for org in organisms:
432            try:
[1712]433                print "Uploading ORG", org, fn
[1108]434                genesets = fn(org).split_by_hierarchy()
435                for gs in genesets:
436                    print "registering", gs.common_hierarchy()
[1713]437                    register(gs, rsf) #server files
438                    #register(gs)
[1214]439                    print "successful", gs.common_hierarchy()
[1835]440            except obiTaxonomy.UnknownSpeciesIdentifier:
441                print "Organism ontology not available", org
442            except GenesetRegException:
443                print "Empty gene sets.", org
[1712]444
[461]445if __name__ == "__main__":
[1214]446    rsf = orngServerFiles.ServerFiles(username=sys.argv[1], password=sys.argv[2])
447    upload_genesets(rsf)
[1712]448    pass
[1761]449
450
Note: See TracBrowser for help on using the repository browser.