source: orange-bioinformatics/_bioinformatics/obiGeneSets.py @ 1814:c7d693f650af

Revision 1814:c7d693f650af, 14.7 KB checked in by Flashpoint <vid.flashpoint@…>, 11 months ago (diff)

Added a genesets statistics function to obiGeneSets.py. Added additional columns in OWCustomSets.py

Line 
1"""
2Maintainer: Marko Toplak
3"""
4
5from __future__ import absolute_import, with_statement
6
7if __name__ == "__main__":
8    __package__ = "Orange.bio"
9
10import cPickle as pickle, os, tempfile, sys
11from collections import defaultdict
12import datetime
13
14import Orange.core as orange
15from Orange.orng import orngServerFiles
16
17from . import obiGO, obiKEGG, obiTaxonomy
18
19sfdomain = "gene_sets"
20
21def nth(l,n):
22    return [ a[n] for a in l]
23
24from Orange.bio.geneset import GeneSet, GeneSets, GenesetRegException
25
26def goGeneSets(org):
27    """Returns gene sets from GO."""
28    ontology = obiGO.Ontology()
29    annotations = obiGO.Annotations(org, ontology=ontology)
30
31    genesets = []
32    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
33    for termn, term in ontology.terms.items():
34        genes = annotations.GetAllGenes(termn)
35        hier = ("GO", term.namespace)
36        if len(genes) > 0:
37            gs = GeneSet(id=termn, name=term.name, genes=genes, hierarchy=hier, organism=org, link=link_fmt % termn)
38            genesets.append(gs)
39
40    return GeneSets(genesets)
41
42def keggGeneSets(org):
43    """
44    Returns gene sets from KEGG pathways.
45    """
46
47    kegg = obiKEGG.KEGGOrganism(org)
48
49    genesets = []
50    for id in kegg.pathways():
51        print id
52        pway = obiKEGG.KEGGPathway(id)
53        hier = ("KEGG","pathways")
54        gs = GeneSet(id=id,
55                                 name=pway.title,
56                                 genes=kegg.get_genes_by_pathway(id),
57                                 hierarchy=hier,
58                                 organism=org,
59                                 link=pway.link)
60        genesets.append(gs)
61
62    return GeneSets(genesets)
63
64def dictyMutantSets():
65    """
66    Return dicty mutant phenotype gene sets from Dictybase
67    """
68    from . import obiDictyMutants
69    link_fmt = "http://dictybase.org/db/cgi-bin/dictyBase/SC/scsearch.pl?searchdb=strains&search_term=%s&column=all&B1=Submit" 
70    #genesets = [GeneSet(id=mutant.name, name=mutant.descriptor, genes=obiDictyMutants.mutant_genes(mutant), hierarchy=("Dictybase", "Mutants"), organism="352472", # 352472 gathered from obiGO.py code_map -> Dicty identifier
71    #                    link=(link_fmt % mutant.name if mutant.name else None)) \
72    #                    for mutant in obiDictyMutants.mutants()]
73 
74    genesets = [GeneSet(id=phenotype, name=phenotype, genes=[obiDictyMutants.mutant_genes(mutant)[0] for mutant in mutants], hierarchy=("Dictybase", "Phenotypes"), organism="352472", # 352472 gathered from obiGO.py code_map -> Dicty identifier
75                        link="") \
76                        for phenotype, mutants in obiDictyMutants.phenotype_mutants().items()]
77
78    return GeneSets(genesets)
79
80def cytobandGeneSets():
81    """
82    Create cytoband gene sets from Stanford Microarray Database
83    """
84    import urllib2
85
86    url = "http://www-stat.stanford.edu/~tibs/GSA/cytobands-stanford.gmt"
87    stream = urllib2.urlopen(url)
88    data = stream.read().splitlines()
89
90    genesets = []
91    for band in data:
92        b = band.split("\t")
93        genesets.append(GeneSet(id=b[0], name=b[1], genes=b[2:] if b[2:] else [], hierarchy=("Cytobands",), organism="9606", link=""))         
94
95    return GeneSets(genesets)
96
97def reactomePathwaysGeneSets():
98    """
99    Prepare human pathways gene sets from reactome pathways
100    """
101    import urllib
102    import io
103    from zipfile import ZipFile
104
105    url = urllib.urlopen("http://www.reactome.org/download/current/ReactomePathways.gmt.zip")
106    memfile = io.BytesIO(url.read())
107    with ZipFile(memfile, "r") as myzip:
108        f = myzip.open("ReactomePathways.gmt")
109        content = f.read().splitlines()     
110
111    genesets = [GeneSet(id=path.split("\t")[0], name=path.split("\t")[0], genes=path.split("\t")[2:] if path.split("\t")[2:] else [], hierarchy=("Reactome", "Pathways"), organism="9606", link="") for path in content]
112    return GeneSets(genesets)
113
114
115def omimGeneSets():
116    """
117    Return gene sets from OMIM (Online Mendelian Inheritance in Man) diseses
118    """
119    from . import obiOMIM   
120    genesets = [GeneSet(id=disease.id, name=disease.name, genes=obiOMIM.disease_genes(disease), hierarchy=("OMIM",), organism="9606",
121                    link=("http://www.omim.org/entry/%s" % disease.id if disease.id else None)) \
122                    for disease in obiOMIM.diseases()]
123    return GeneSets(genesets)
124
125def miRNAGeneSets(org):
126    """
127    Return gene sets from miRNA targets
128    """
129    from . import obimiRNA
130    org_code = obiKEGG.from_taxid(org)
131    link_fmt = "http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s"
132    mirnas = [(id, obimiRNA.get_info(id)) for id in obimiRNA.ids(org_code)]
133    genesets = [GeneSet(id=mirna.matACC, name=mirna.matID, genes=mirna.targets.split(","), hierarchy=("miRNA", "Targets"),
134                        organism=org, link=link_fmt % mirna.matID) for id, mirna in mirnas]
135    return GeneSets(genesets)
136
137def go_miRNASets(org, ontology=None, enrichment=True, pval=0.05, treshold=0.04):
138    from . import obimiRNA, obiGO
139    mirnas = obimiRNA.ids(int(org))
140    if ontology is None:
141        ontology = obiGO.Ontology()
142
143    annotations = obiGO.Annotations(org, ontology=ontology)
144
145    go_sets = obimiRNA.get_GO(mirnas, annotations, enrichment=enrichment, pval=pval, goSwitch=False)
146
147    go_sets = obimiRNA.filter_GO(go_sets, annotations, treshold=treshold)
148
149    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
150    gsets = [GeneSet(id=key, name=ontology[key].name, genes=value, hierarchy=("miRNA", "go_sets",),
151                        organism=org, link=link_fmt % key) for key, value in go_sets.items()]
152    gset = GeneSets(gsets)
153    return gset
154
155def loadGMT(contents, name):
156    """
157    Eech line consists of tab separated elements. First is
158    the geneset name, next is it's description.
159
160    For now the description is skipped.
161    """
162
163    def hline(s):
164        tabs = [tab.strip() for tab in s.split("\t")]
165        return GeneSet(id=tabs[0], description=tabs[1],
166                                   hierarchy=("Custom Genesets",name), genes=tabs[2:])
167
168    def handleNELines(s, fn):
169        """
170        Run function on nonempty lines of a string.
171        Return a list of results for each line.
172        """
173        lines = (l.strip() for l in s.splitlines())
174        return [fn(l) for l in lines if l]
175
176    return GeneSets(handleNELines(contents, hline))
177
178def getGenesetsStats(genesets):
179    num_sets = len(genesets)
180    unique_genes = len(set([gene for geneset in genesets for gene in geneset.genes]))
181    genes_per_geneset = sum([len(geneset.genes) for geneset in genesets])/num_sets
182    return num_sets, unique_genes, genes_per_geneset
183"""
184We have multiple paths for gene set data:
185buffer/bigfiles/gene_sets
186and
187buffer/gene_sets_local
188both have available.txt
189"""
190
191def omakedirs(dir):
192    try:
193        os.makedirs(dir)
194    except OSError:
195        pass
196
197def local_path():
198    """ Returns local path for gene sets. Creates it if it does not exists
199    yet. """
200    from Orange.orng import orngEnviron
201    pth = os.path.join(orngEnviron.directoryNames["bufferDir"], "gene_sets_local")
202    omakedirs(pth)
203    return pth
204
205def build_index(dir):
206    """ Returns gene set availability index for some folder. """
207    pass
208
209def filename(hierarchy, organism):
210    """ Obtain a filename for given hierarchy and organism. """
211    return "gs_" + "_._".join(hierarchy + \
212        (organism if organism != None else "",)) + ".pck"
213
214def filename_parse(fn):
215    """ Returns a hierarchy and the organism from the filename."""
216    fn = fn[3:-4]
217    parts = fn.split("_._")
218    hierarchy = tuple(parts[:-1])
219    org = parts[-1] if parts[-1] != "" else None
220    return hierarchy, org
221
222def is_genesets_file(fn):
223    return fn.startswith("gs_") and fn.endswith(".pck")
224
225def list_local():
226    """ Returns available gene sets from the local repository:
227    a list of (hierarchy, organism, on_local) """
228    pth = local_path()
229    gs_files = filter(is_genesets_file, os.listdir(pth))
230    return [ filename_parse(fn) + (True,) for fn in gs_files ]
231
232def remove_local(gene_set):
233    """ Removes a given gene set from the local repository. """
234    pth = local_path()
235    gs_files = filter(is_genesets_file, os.listdir(pth)) 
236    for setfile in gs_files:
237        if setfile.__contains__(gene_set):
238            setBgone = os.path.join(pth, setfile)
239            os.remove(setBgone) 
240
241def modification_date(file):
242    t = os.path.getmtime(file)
243    return datetime.datetime.fromtimestamp(t)
244
245def list_serverfiles_from_flist(flist):
246    gs_files = filter(is_genesets_file, flist)
247    localfiles = set(orngServerFiles.listfiles(sfdomain))
248    return [ filename_parse(fn) + \
249        ((True,) if fn in localfiles else (False,)) for fn in gs_files ]
250
251def list_serverfiles_conn(serverfiles=None):
252    """ Returns available gene sets from the server files
253    repository: a list of (hierarchy, organism, on_local) """
254    if serverfiles == None:
255        serverfiles = orngServerFiles.ServerFiles()
256    flist = serverfiles.listfiles(sfdomain)
257    return list_serverfiles_from_flist(flist)
258
259def list_serverfiles():
260    fname = orngServerFiles.localpath_download(sfdomain, "index.pck")
261    flist = pickle.load(open(fname, 'r'))
262    return list_serverfiles_from_flist(flist)
263
264def list_all():
265    """
266    return a list of (hier, org, avalable_locally)
267    If something for a specific (hier, org) is not downloaded
268    yet, show it as not-local. """
269    flist = list_local() + list_serverfiles()
270    d = {}
271    for h,o,local in flist:
272        d[h,o] = min(local, d.get((h,o),True))
273    return [ (h,o,local) for (h,o),local in d.items() ]
274
275def update_server_list(serverfiles_upload, serverfiles_list=None):
276    if serverfiles_list == None:
277        serverfiles_list = orngServerFiles.ServerFiles()
278
279    flist = map(lambda x: filename(*x[:2]), list_serverfiles_conn(serverfiles_list))
280
281    tfname = pickle_temp(flist)
282
283    try:
284        fn = "index.pck"
285        title = "Gene sets: index"
286        tags = [ "gene sets", "index", "essential" ]
287        serverfiles_upload.upload(sfdomain, fn, tfname, title, tags)
288        serverfiles_upload.unprotect(sfdomain, fn)
289    except Exception,e:
290        raise e
291    finally:
292        os.remove(tfname)
293
294def _register_local(genesets):
295    """ Registers using the common hierarchy and organism. """
296    pth = local_path()
297
298    org = genesets.common_org()
299    hierarchy = genesets.common_hierarchy()
300    fn = filename(hierarchy, org)
301
302    with open(os.path.join(pth, fn), "w") as f:
303        pickle.dump(genesets, f)
304
305    return fn
306
307def pickle_temp(obj):
308    """ Pickle a file to a temporary file and returns its name """
309    fd,tfname = tempfile.mkstemp()
310    os.close(fd)
311    f = open(tfname, 'wb')
312    pickle.dump(obj, f)
313    f.close()
314    return tfname
315
316def _register_serverfiles(genesets, serverFiles):
317    """ Registers using the common hierarchy and organism. """
318    org = genesets.common_org()
319    hierarchy = genesets.common_hierarchy()
320    fn = filename(hierarchy, org)
321
322    #save to temporary file
323    tfname = pickle_temp(genesets)
324
325    try:
326        taxname = obiTaxonomy.name(org)
327        title = "Gene sets: " + ", ".join(hierarchy) + \
328            ((" (" + taxname + ")") if org != None else "")
329        tags = list(hierarchy) + [ "gene sets", taxname ] + \
330            ([ "essential" ] if org in obiTaxonomy.essential_taxids() else [] )
331        serverFiles.upload(sfdomain, fn, tfname, title, tags)
332        serverFiles.unprotect(sfdomain, fn)
333    finally:
334        os.remove(tfname)
335
336    update_server_list(serverFiles)
337
338def register(genesets, serverFiles=None):
339    """
340    Hierarchy is induced from the gene set names.
341    """
342    if serverFiles == None:
343        _register_local(genesets)
344    else:
345        _register_serverfiles(genesets, serverFiles)
346
347def build_hierarchy_dict(files):
348    hierd = defaultdict(list)
349    for ind,f in enumerate(files):
350        hier, org = f
351        for i in range(len(hier)+1):
352            hierd[(hier[:i], org)].append(ind)
353    return hierd
354
355def load_local(hierarchy, organism):
356    files = map(lambda x: x[:2], list_local())
357    hierd = build_hierarchy_dict(files)
358
359    out = GeneSets()
360    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
361        fname = os.path.join(local_path(), filename(h, o))
362        out.update(pickle.load(open(fname, 'r')))
363    return out
364
365def load_serverfiles(hierarchy, organism):
366    files = map(lambda x: x[:2], list_serverfiles())
367    hierd = build_hierarchy_dict(files)
368    out = GeneSets()
369    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
370        fname = orngServerFiles.localpath_download(sfdomain,
371            filename(h, o))
372        out.update(pickle.load(open(fname, 'r')))
373    return out
374
375def load(hierarchy, organism):
376    """ First try to load from the local registred folder. If the file
377    is not available, load it from the server files. """
378    ret = load_local(hierarchy, organism)
379    if len(ret) == 0:
380        ret.update(load_serverfiles(hierarchy, organism))
381    return ret
382
383def collections(*args):
384    """
385    Input is a list of collections.
386    Collection can either be a tuple (hierarchy, orgranism), where
387    hierarchy is a tuple also.
388    """
389    result = GeneSets()
390
391    for collection in args:
392        try:
393            result.update(collection)
394        except (ValueError, TypeError):
395            if issequencens(collection): #have a hierarchy, organism specification
396                new = load(*collection)
397                result.update(new)
398            else:
399                if collection.lower()[-4:] == ".gmt": #format from webpage
400                    result.update(loadGMT(open(collection,"rt").read(), collection))
401                else:
402                    raise Exception("collection() accepts files in .gmt format only.")
403
404    return result
405
406def issequencens(x):
407    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and it is not an instance of basestring."
408    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
409
410class TException(Exception): pass
411
412def upload_genesets(rsf):
413    """
414    Builds the default gene sets and
415    """
416    orngServerFiles.update_local_files()
417
418    genesetsfn = [ keggGeneSets, goGeneSets, miRNAGeneSets]
419    organisms = obiTaxonomy.common_taxids()
420    for fn in genesetsfn:
421        for org in organisms:
422            try:
423                print "Uploading ORG", org, fn
424                genesets = fn(org).split_by_hierarchy()
425                for gs in genesets:
426                    print "registering", gs.common_hierarchy()
427                    register(gs, rsf) #server files
428                    #register(gs)
429                    print "successful", gs.common_hierarchy()
430            except (obiKEGG.OrganismNotFoundError, GenesetRegException):
431                print "organism not found", org
432
433if __name__ == "__main__":
434    rsf = orngServerFiles.ServerFiles(username=sys.argv[1], password=sys.argv[2])
435    upload_genesets(rsf)
436    pass
437
438
Note: See TracBrowser for help on using the repository browser.