source: orange-bioinformatics/_bioinformatics/obiGeneSets.py @ 1836:b14df26ba71c

Revision 1836:b14df26ba71c, 15.2 KB checked in by Ales Erjavec <ales.erjavec@…>, 9 months ago (diff)

Reverted part of changeset 857938f13079 ('load_local' definition deletion).

RevLine 
[461]1"""
2Maintainer: Marko Toplak
3"""
[1043]4
[1632]5from __future__ import absolute_import, with_statement
6
[1707]7if __name__ == "__main__":
8    __package__ = "Orange.bio"
9
[1632]10import cPickle as pickle, os, tempfile, sys
[1108]11from collections import defaultdict
[1807]12import datetime
[1106]13
[1707]14import Orange.core as orange
[1632]15from Orange.orng import orngServerFiles
16
[1643]17from . import obiGO, obiKEGG, obiTaxonomy
[1632]18
[1106]19sfdomain = "gene_sets"
[461]20
[485]21def nth(l,n):
22    return [ a[n] for a in l]
23
[1712]24from Orange.bio.geneset import GeneSet, GeneSets, GenesetRegException
[1107]25
[1094]26def goGeneSets(org):
27    """Returns gene sets from GO."""
[1141]28    ontology = obiGO.Ontology()
29    annotations = obiGO.Annotations(org, ontology=ontology)
[626]30
[1094]31    genesets = []
[1141]32    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
[1094]33    for termn, term in ontology.terms.items():
34        genes = annotations.GetAllGenes(termn)
35        hier = ("GO", term.namespace)
[1108]36        if len(genes) > 0:
[1761]37            gs = GeneSet(id=termn, name=term.name, genes=genes, hierarchy=hier, organism=org, link=link_fmt % termn)
[1108]38            genesets.append(gs)
[461]39
[1712]40    return GeneSets(genesets)
[461]41
[1094]42def keggGeneSets(org):
43    """
44    Returns gene sets from KEGG pathways.
45    """
[1761]46
[1094]47    kegg = obiKEGG.KEGGOrganism(org)
[461]48
[1094]49    genesets = []
50    for id in kegg.pathways():
[1712]51        print id
[1094]52        pway = obiKEGG.KEGGPathway(id)
[1549]53        hier = ("KEGG","pathways")
[1712]54        gs = GeneSet(id=id,
[1549]55                                 name=pway.title,
56                                 genes=kegg.get_genes_by_pathway(id),
57                                 hierarchy=hier,
58                                 organism=org,
59                                 link=pway.link)
[1094]60        genesets.append(gs)
[461]61
[1712]62    return GeneSets(genesets)
[461]63
[1761]64def dictyMutantSets():
65    """
66    Return dicty mutant phenotype gene sets from Dictybase
67    """
68    from . import obiDictyMutants
[1780]69    link_fmt = "http://dictybase.org/db/cgi-bin/dictyBase/SC/scsearch.pl?searchdb=strains&search_term=%s&column=all&B1=Submit" 
[1782]70    #genesets = [GeneSet(id=mutant.name, name=mutant.descriptor, genes=obiDictyMutants.mutant_genes(mutant), hierarchy=("Dictybase", "Mutants"), organism="352472", # 352472 gathered from obiGO.py code_map -> Dicty identifier
[1781]71    #                    link=(link_fmt % mutant.name if mutant.name else None)) \
72    #                    for mutant in obiDictyMutants.mutants()]
73 
[1782]74    genesets = [GeneSet(id=phenotype, name=phenotype, genes=[obiDictyMutants.mutant_genes(mutant)[0] for mutant in mutants], hierarchy=("Dictybase", "Phenotypes"), organism="352472", # 352472 gathered from obiGO.py code_map -> Dicty identifier
[1781]75                        link="") \
76                        for phenotype, mutants in obiDictyMutants.phenotype_mutants().items()]
[1780]77
[1761]78    return GeneSets(genesets)
79
[1794]80def cytobandGeneSets():
81    """
[1795]82    Create cytoband gene sets from Stanford Microarray Database
[1794]83    """
[1795]84    import urllib2
85
86    url = "http://www-stat.stanford.edu/~tibs/GSA/cytobands-stanford.gmt"
87    stream = urllib2.urlopen(url)
88    data = stream.read().splitlines()
89
90    genesets = []
91    for band in data:
92        b = band.split("\t")
93        genesets.append(GeneSet(id=b[0], name=b[1], genes=b[2:] if b[2:] else [], hierarchy=("Cytobands",), organism="9606", link=""))         
94
[1794]95    return GeneSets(genesets)
96
[1798]97def reactomePathwaysGeneSets():
98    """
99    Prepare human pathways gene sets from reactome pathways
100    """
101    import urllib
102    import io
103    from zipfile import ZipFile
104
105    url = urllib.urlopen("http://www.reactome.org/download/current/ReactomePathways.gmt.zip")
106    memfile = io.BytesIO(url.read())
107    with ZipFile(memfile, "r") as myzip:
108        f = myzip.open("ReactomePathways.gmt")
109        content = f.read().splitlines()     
110
111    genesets = [GeneSet(id=path.split("\t")[0], name=path.split("\t")[0], genes=path.split("\t")[2:] if path.split("\t")[2:] else [], hierarchy=("Reactome", "Pathways"), organism="9606", link="") for path in content]
112    return GeneSets(genesets)
113
114
[1141]115def omimGeneSets():
116    """
117    Return gene sets from OMIM (Online Mendelian Inheritance in Man) diseses
118    """
[1778]119    from . import obiOMIM   
[1141]120    genesets = [GeneSet(id=disease.id, name=disease.name, genes=obiOMIM.disease_genes(disease), hierarchy=("OMIM",), organism="9606",
[1778]121                    link=("http://www.omim.org/entry/%s" % disease.id if disease.id else None)) \
[1141]122                    for disease in obiOMIM.diseases()]
123    return GeneSets(genesets)
124
[1144]125def miRNAGeneSets(org):
126    """
127    Return gene sets from miRNA targets
128    """
[1716]129    from . import obimiRNA
130    org_code = obiKEGG.from_taxid(org)
[1144]131    link_fmt = "http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s"
132    mirnas = [(id, obimiRNA.get_info(id)) for id in obimiRNA.ids(org_code)]
[1549]133    genesets = [GeneSet(id=mirna.matACC, name=mirna.matID, genes=mirna.targets.split(","), hierarchy=("miRNA", "Targets"),
[1144]134                        organism=org, link=link_fmt % mirna.matID) for id, mirna in mirnas]
135    return GeneSets(genesets)
136
[1549]137def go_miRNASets(org, ontology=None, enrichment=True, pval=0.05, treshold=0.04):
[1632]138    from . import obimiRNA, obiGO
[1549]139    mirnas = obimiRNA.ids(int(org))
140    if ontology is None:
141        ontology = obiGO.Ontology()
[1761]142
[1549]143    annotations = obiGO.Annotations(org, ontology=ontology)
[1761]144
[1549]145    go_sets = obimiRNA.get_GO(mirnas, annotations, enrichment=enrichment, pval=pval, goSwitch=False)
[1761]146
[1549]147    go_sets = obimiRNA.filter_GO(go_sets, annotations, treshold=treshold)
[1761]148
[1549]149    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
[1712]150    gsets = [GeneSet(id=key, name=ontology[key].name, genes=value, hierarchy=("miRNA", "go_sets",),
[1549]151                        organism=org, link=link_fmt % key) for key, value in go_sets.items()]
[1712]152    gset = GeneSets(gsets)
[1549]153    return gset
154
[1095]155def loadGMT(contents, name):
[485]156    """
157    Eech line consists of tab separated elements. First is
[1608]158    the geneset name, next is it's description.
[1761]159
[485]160    For now the description is skipped.
[1818]161
162    Example Gene Set (.gmt) file:
163    anti-liver_sw   anti-liver_sw   BMPR1A  APOD    WSB1    BMI1    SLC2A1  ...
164    B-cells_sw  B-cells_sw  E2F5    NCF1    PALM2-AKAP2 IRF4    SLC2A1  ...
165    Bladder_sw  Bladder_sw  PLCD4   ANGPTL1 LOC286191   ST0N1   LOC283904   ...
166    cerebellum_sw   cerebellum_sw   C19orf43    LOC653464   KI110802    ...
167    Cervix_sw   Cervix_sw   LAMA4   GSTM5   SNX19   DKK1    NT5E    ...
[485]168    """
[1643]169
[485]170    def hline(s):
[1608]171        tabs = [tab.strip() for tab in s.split("\t")]
[1712]172        return GeneSet(id=tabs[0], description=tabs[1],
[1816]173                                   hierarchy=("Custom Gene Sets",name), genes=tabs[2:])
[485]174
[1106]175    def handleNELines(s, fn):
176        """
177        Run function on nonempty lines of a string.
178        Return a list of results for each line.
179        """
[1608]180        lines = (l.strip() for l in s.splitlines())
181        return [fn(l) for l in lines if l]
[1106]182
[1712]183    return GeneSets(handleNELines(contents, hline))
[485]184
[1814]185def getGenesetsStats(genesets):
186    num_sets = len(genesets)
187    unique_genes = len(set([gene for geneset in genesets for gene in geneset.genes]))
188    genes_per_geneset = sum([len(geneset.genes) for geneset in genesets])/num_sets
189    return num_sets, unique_genes, genes_per_geneset
[1106]190"""
191We have multiple paths for gene set data:
192buffer/bigfiles/gene_sets
193and
194buffer/gene_sets_local
195both have available.txt
196"""
197
198def omakedirs(dir):
199    try:
200        os.makedirs(dir)
201    except OSError:
202        pass
203
204def local_path():
205    """ Returns local path for gene sets. Creates it if it does not exists
206    yet. """
[1632]207    from Orange.orng import orngEnviron
[1108]208    pth = os.path.join(orngEnviron.directoryNames["bufferDir"], "gene_sets_local")
[1106]209    omakedirs(pth)
210    return pth
211
212def build_index(dir):
213    """ Returns gene set availability index for some folder. """
214    pass
215
216def filename(hierarchy, organism):
217    """ Obtain a filename for given hierarchy and organism. """
218    return "gs_" + "_._".join(hierarchy + \
219        (organism if organism != None else "",)) + ".pck"
220
221def filename_parse(fn):
222    """ Returns a hierarchy and the organism from the filename."""
223    fn = fn[3:-4]
224    parts = fn.split("_._")
225    hierarchy = tuple(parts[:-1])
226    org = parts[-1] if parts[-1] != "" else None
227    return hierarchy, org
228
229def is_genesets_file(fn):
230    return fn.startswith("gs_") and fn.endswith(".pck")
231
232def list_local():
233    """ Returns available gene sets from the local repository:
234    a list of (hierarchy, organism, on_local) """
235    pth = local_path()
236    gs_files = filter(is_genesets_file, os.listdir(pth))
[1108]237    return [ filename_parse(fn) + (True,) for fn in gs_files ]
[1761]238
[1805]239def remove_local(gene_set):
240    """ Removes a given gene set from the local repository. """
241    pth = local_path()
242    gs_files = filter(is_genesets_file, os.listdir(pth)) 
243    for setfile in gs_files:
244        if setfile.__contains__(gene_set):
245            setBgone = os.path.join(pth, setfile)
246            os.remove(setBgone) 
247
[1807]248def modification_date(file):
249    t = os.path.getmtime(file)
250    return datetime.datetime.fromtimestamp(t)
251
[1108]252def list_serverfiles_from_flist(flist):
253    gs_files = filter(is_genesets_file, flist)
254    localfiles = set(orngServerFiles.listfiles(sfdomain))
255    return [ filename_parse(fn) + \
256        ((True,) if fn in localfiles else (False,)) for fn in gs_files ]
257
[1106]258def list_serverfiles_conn(serverfiles=None):
259    """ Returns available gene sets from the server files
260    repository: a list of (hierarchy, organism, on_local) """
261    if serverfiles == None:
262        serverfiles = orngServerFiles.ServerFiles()
[1108]263    flist = serverfiles.listfiles(sfdomain)
264    return list_serverfiles_from_flist(flist)
265
266def list_serverfiles():
267    fname = orngServerFiles.localpath_download(sfdomain, "index.pck")
268    flist = pickle.load(open(fname, 'r'))
269    return list_serverfiles_from_flist(flist)
270
271def list_all():
272    """
273    return a list of (hier, org, avalable_locally)
274    If something for a specific (hier, org) is not downloaded
275    yet, show it as not-local. """
276    flist = list_local() + list_serverfiles()
277    d = {}
278    for h,o,local in flist:
279        d[h,o] = min(local, d.get((h,o),True))
280    return [ (h,o,local) for (h,o),local in d.items() ]
[1106]281
282def update_server_list(serverfiles_upload, serverfiles_list=None):
283    if serverfiles_list == None:
284        serverfiles_list = orngServerFiles.ServerFiles()
[1108]285
286    flist = map(lambda x: filename(*x[:2]), list_serverfiles_conn(serverfiles_list))
[1106]287
288    tfname = pickle_temp(flist)
[1761]289
[1106]290    try:
291        fn = "index.pck"
292        title = "Gene sets: index"
293        tags = [ "gene sets", "index", "essential" ]
294        serverfiles_upload.upload(sfdomain, fn, tfname, title, tags)
295        serverfiles_upload.unprotect(sfdomain, fn)
[1111]296    except Exception,e:
[1106]297        raise e
298    finally:
299        os.remove(tfname)
300
[1712]301def _register_local(genesets):
[1106]302    """ Registers using the common hierarchy and organism. """
303    pth = local_path()
304    org = genesets.common_org()
305    hierarchy = genesets.common_hierarchy()
[1108]306    fn = filename(hierarchy, org)
[1106]307
308    with open(os.path.join(pth, fn), "w") as f:
309        pickle.dump(genesets, f)
310
311    return fn
312
313def pickle_temp(obj):
[1712]314    """ Pickle a file to a temporary file and returns its name """
[1106]315    fd,tfname = tempfile.mkstemp()
316    os.close(fd)
317    f = open(tfname, 'wb')
318    pickle.dump(obj, f)
319    f.close()
320    return tfname
321
[1712]322def _register_serverfiles(genesets, serverFiles):
[1106]323    """ Registers using the common hierarchy and organism. """
324    org = genesets.common_org()
325    hierarchy = genesets.common_hierarchy()
326    fn = filename(hierarchy, org)
327
328    #save to temporary file
329    tfname = pickle_temp(genesets)
[1761]330
[1106]331    try:
[1108]332        taxname = obiTaxonomy.name(org)
[1106]333        title = "Gene sets: " + ", ".join(hierarchy) + \
[1108]334            ((" (" + taxname + ")") if org != None else "")
[1828]335        tags = list(hierarchy) + [ "gene sets", taxname ] + obiTaxonomy.shortname(org) +\
[1108]336            ([ "essential" ] if org in obiTaxonomy.essential_taxids() else [] )
[1106]337        serverFiles.upload(sfdomain, fn, tfname, title, tags)
338        serverFiles.unprotect(sfdomain, fn)
339    finally:
340        os.remove(tfname)
341
342    update_server_list(serverFiles)
343
344def register(genesets, serverFiles=None):
345    """
346    Hierarchy is induced from the gene set names.
347    """
348    if serverFiles == None:
[1712]349        _register_local(genesets)
[1106]350    else:
[1712]351        _register_serverfiles(genesets, serverFiles)
[1106]352
[1108]353def build_hierarchy_dict(files):
354    hierd = defaultdict(list)
355    for ind,f in enumerate(files):
356        hier, org = f
357        for i in range(len(hier)+1):
358            hierd[(hier[:i], org)].append(ind)
359    return hierd
360
[1836]361def load_local(hierarchy, organism):
[1108]362    files = map(lambda x: x[:2], list_local())
363    hierd = build_hierarchy_dict(files)
364
365    out = GeneSets()
366    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
367        fname = os.path.join(local_path(), filename(h, o))
368        out.update(pickle.load(open(fname, 'r')))
369    return out
370
371def load_serverfiles(hierarchy, organism):
372    files = map(lambda x: x[:2], list_serverfiles())
373    hierd = build_hierarchy_dict(files)
374    out = GeneSets()
375    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
[1761]376        fname = orngServerFiles.localpath_download(sfdomain,
[1108]377            filename(h, o))
378        out.update(pickle.load(open(fname, 'r')))
379    return out
380
381def load(hierarchy, organism):
[1712]382    """ First try to load from the local registred folder. If the file
383    is not available, load it from the server files. """
[1108]384    ret = load_local(hierarchy, organism)
[1712]385    if len(ret) == 0:
386        ret.update(load_serverfiles(hierarchy, organism))
[1108]387    return ret
388
389def collections(*args):
[485]390    """
391    Input is a list of collections.
[1108]392    Collection can either be a tuple (hierarchy, orgranism), where
393    hierarchy is a tuple also.
[485]394    """
[1712]395    result = GeneSets()
[485]396
[1108]397    for collection in args:
[1569]398        try:
[1108]399            result.update(collection)
[1713]400        except (ValueError, TypeError):
[1569]401            if issequencens(collection): #have a hierarchy, organism specification
402                new = load(*collection)
403                result.update(new)
[1108]404            else:
[1569]405                if collection.lower()[-4:] == ".gmt": #format from webpage
406                    result.update(loadGMT(open(collection,"rt").read(), collection))
407                else:
408                    raise Exception("collection() accepts files in .gmt format only.")
409
[1108]410    return result
[485]411
[1108]412def issequencens(x):
413    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and it is not an instance of basestring."
414    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
[485]415
[1214]416class TException(Exception): pass
417
[1108]418def upload_genesets(rsf):
419    """
[1761]420    Builds the default gene sets and
[1108]421    """
422    orngServerFiles.update_local_files()
[485]423
[1713]424    genesetsfn = [ keggGeneSets, goGeneSets, miRNAGeneSets]
[1108]425    organisms = obiTaxonomy.common_taxids()
426    for fn in genesetsfn:
427        for org in organisms:
428            try:
[1712]429                print "Uploading ORG", org, fn
[1108]430                genesets = fn(org).split_by_hierarchy()
431                for gs in genesets:
432                    print "registering", gs.common_hierarchy()
[1713]433                    register(gs, rsf) #server files
434                    #register(gs)
[1214]435                    print "successful", gs.common_hierarchy()
[1835]436            except obiTaxonomy.UnknownSpeciesIdentifier:
437                print "Organism ontology not available", org
438            except GenesetRegException:
439                print "Empty gene sets.", org
[1712]440
[461]441if __name__ == "__main__":
[1214]442    rsf = orngServerFiles.ServerFiles(username=sys.argv[1], password=sys.argv[2])
443    upload_genesets(rsf)
[1712]444    pass
[1761]445
446
Note: See TracBrowser for help on using the repository browser.