source: orange-bioinformatics/_bioinformatics/obiGeneSets.py @ 1798:f1e73901a1d3

Revision 1798:f1e73901a1d3, 14.0 KB checked in by Flashpoint <vid.flashpoint@…>, 11 months ago (diff)

Added Reactome Pathways genesets for humans

Line 
1"""
2Maintainer: Marko Toplak
3"""
4
5from __future__ import absolute_import, with_statement
6
7if __name__ == "__main__":
8    __package__ = "Orange.bio"
9
10import cPickle as pickle, os, tempfile, sys
11from collections import defaultdict
12
13import Orange.core as orange
14from Orange.orng import orngServerFiles
15
16from . import obiGO, obiKEGG, obiTaxonomy
17
18sfdomain = "gene_sets"
19
20def nth(l,n):
21    return [ a[n] for a in l]
22
23from Orange.bio.geneset import GeneSet, GeneSets, GenesetRegException
24
25def goGeneSets(org):
26    """Returns gene sets from GO."""
27    ontology = obiGO.Ontology()
28    annotations = obiGO.Annotations(org, ontology=ontology)
29
30    genesets = []
31    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
32    for termn, term in ontology.terms.items():
33        genes = annotations.GetAllGenes(termn)
34        hier = ("GO", term.namespace)
35        if len(genes) > 0:
36            gs = GeneSet(id=termn, name=term.name, genes=genes, hierarchy=hier, organism=org, link=link_fmt % termn)
37            genesets.append(gs)
38
39    return GeneSets(genesets)
40
41def keggGeneSets(org):
42    """
43    Returns gene sets from KEGG pathways.
44    """
45
46    kegg = obiKEGG.KEGGOrganism(org)
47
48    genesets = []
49    for id in kegg.pathways():
50        print id
51        pway = obiKEGG.KEGGPathway(id)
52        hier = ("KEGG","pathways")
53        gs = GeneSet(id=id,
54                                 name=pway.title,
55                                 genes=kegg.get_genes_by_pathway(id),
56                                 hierarchy=hier,
57                                 organism=org,
58                                 link=pway.link)
59        genesets.append(gs)
60
61    return GeneSets(genesets)
62
63def dictyMutantSets():
64    """
65    Return dicty mutant phenotype gene sets from Dictybase
66    """
67    from . import obiDictyMutants
68    link_fmt = "http://dictybase.org/db/cgi-bin/dictyBase/SC/scsearch.pl?searchdb=strains&search_term=%s&column=all&B1=Submit" 
69    #genesets = [GeneSet(id=mutant.name, name=mutant.descriptor, genes=obiDictyMutants.mutant_genes(mutant), hierarchy=("Dictybase", "Mutants"), organism="352472", # 352472 gathered from obiGO.py code_map -> Dicty identifier
70    #                    link=(link_fmt % mutant.name if mutant.name else None)) \
71    #                    for mutant in obiDictyMutants.mutants()]
72 
73    genesets = [GeneSet(id=phenotype, name=phenotype, genes=[obiDictyMutants.mutant_genes(mutant)[0] for mutant in mutants], hierarchy=("Dictybase", "Phenotypes"), organism="352472", # 352472 gathered from obiGO.py code_map -> Dicty identifier
74                        link="") \
75                        for phenotype, mutants in obiDictyMutants.phenotype_mutants().items()]
76
77    return GeneSets(genesets)
78
79def cytobandGeneSets():
80    """
81    Create cytoband gene sets from Stanford Microarray Database
82    """
83    import urllib2
84
85    url = "http://www-stat.stanford.edu/~tibs/GSA/cytobands-stanford.gmt"
86    stream = urllib2.urlopen(url)
87    data = stream.read().splitlines()
88
89    genesets = []
90    for band in data:
91        b = band.split("\t")
92        genesets.append(GeneSet(id=b[0], name=b[1], genes=b[2:] if b[2:] else [], hierarchy=("Cytobands",), organism="9606", link=""))         
93
94    return GeneSets(genesets)
95
96def reactomePathwaysGeneSets():
97    """
98    Prepare human pathways gene sets from reactome pathways
99    """
100    import urllib
101    import io
102    from zipfile import ZipFile
103
104    url = urllib.urlopen("http://www.reactome.org/download/current/ReactomePathways.gmt.zip")
105    memfile = io.BytesIO(url.read())
106    with ZipFile(memfile, "r") as myzip:
107        f = myzip.open("ReactomePathways.gmt")
108        content = f.read().splitlines()     
109
110    genesets = [GeneSet(id=path.split("\t")[0], name=path.split("\t")[0], genes=path.split("\t")[2:] if path.split("\t")[2:] else [], hierarchy=("Reactome", "Pathways"), organism="9606", link="") for path in content]
111    return GeneSets(genesets)
112
113
114def omimGeneSets():
115    """
116    Return gene sets from OMIM (Online Mendelian Inheritance in Man) diseses
117    """
118    from . import obiOMIM   
119    genesets = [GeneSet(id=disease.id, name=disease.name, genes=obiOMIM.disease_genes(disease), hierarchy=("OMIM",), organism="9606",
120                    link=("http://www.omim.org/entry/%s" % disease.id if disease.id else None)) \
121                    for disease in obiOMIM.diseases()]
122    return GeneSets(genesets)
123
124def miRNAGeneSets(org):
125    """
126    Return gene sets from miRNA targets
127    """
128    from . import obimiRNA
129    org_code = obiKEGG.from_taxid(org)
130    link_fmt = "http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s"
131    mirnas = [(id, obimiRNA.get_info(id)) for id in obimiRNA.ids(org_code)]
132    genesets = [GeneSet(id=mirna.matACC, name=mirna.matID, genes=mirna.targets.split(","), hierarchy=("miRNA", "Targets"),
133                        organism=org, link=link_fmt % mirna.matID) for id, mirna in mirnas]
134    return GeneSets(genesets)
135
136def go_miRNASets(org, ontology=None, enrichment=True, pval=0.05, treshold=0.04):
137    from . import obimiRNA, obiGO
138    mirnas = obimiRNA.ids(int(org))
139    if ontology is None:
140        ontology = obiGO.Ontology()
141
142    annotations = obiGO.Annotations(org, ontology=ontology)
143
144    go_sets = obimiRNA.get_GO(mirnas, annotations, enrichment=enrichment, pval=pval, goSwitch=False)
145    print go_sets
146
147    go_sets = obimiRNA.filter_GO(go_sets, annotations, treshold=treshold)
148
149    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
150    gsets = [GeneSet(id=key, name=ontology[key].name, genes=value, hierarchy=("miRNA", "go_sets",),
151                        organism=org, link=link_fmt % key) for key, value in go_sets.items()]
152    gset = GeneSets(gsets)
153    return gset
154
155def loadGMT(contents, name):
156    """
157    Eech line consists of tab separated elements. First is
158    the geneset name, next is it's description.
159
160    For now the description is skipped.
161    """
162
163    def hline(s):
164        tabs = [tab.strip() for tab in s.split("\t")]
165        return GeneSet(id=tabs[0], description=tabs[1],
166                                   hierarchy=(name,), genes=tabs[2:])
167
168    def handleNELines(s, fn):
169        """
170        Run function on nonempty lines of a string.
171        Return a list of results for each line.
172        """
173        lines = (l.strip() for l in s.splitlines())
174        return [fn(l) for l in lines if l]
175
176    return GeneSets(handleNELines(contents, hline))
177
178"""
179We have multiple paths for gene set data:
180buffer/bigfiles/gene_sets
181and
182buffer/gene_sets_local
183both have available.txt
184"""
185
186def omakedirs(dir):
187    try:
188        os.makedirs(dir)
189    except OSError:
190        pass
191
192def local_path():
193    """ Returns local path for gene sets. Creates it if it does not exists
194    yet. """
195    from Orange.orng import orngEnviron
196    pth = os.path.join(orngEnviron.directoryNames["bufferDir"], "gene_sets_local")
197    omakedirs(pth)
198    return pth
199
200def build_index(dir):
201    """ Returns gene set availability index for some folder. """
202    pass
203
204def filename(hierarchy, organism):
205    """ Obtain a filename for given hierarchy and organism. """
206    return "gs_" + "_._".join(hierarchy + \
207        (organism if organism != None else "",)) + ".pck"
208
209def filename_parse(fn):
210    """ Returns a hierarchy and the organism from the filename."""
211    fn = fn[3:-4]
212    parts = fn.split("_._")
213    hierarchy = tuple(parts[:-1])
214    org = parts[-1] if parts[-1] != "" else None
215    return hierarchy, org
216
217def is_genesets_file(fn):
218    return fn.startswith("gs_") and fn.endswith(".pck")
219
220def list_local():
221    """ Returns available gene sets from the local repository:
222    a list of (hierarchy, organism, on_local) """
223    pth = local_path()
224    gs_files = filter(is_genesets_file, os.listdir(pth))
225    return [ filename_parse(fn) + (True,) for fn in gs_files ]
226
227def list_serverfiles_from_flist(flist):
228    gs_files = filter(is_genesets_file, flist)
229    localfiles = set(orngServerFiles.listfiles(sfdomain))
230    return [ filename_parse(fn) + \
231        ((True,) if fn in localfiles else (False,)) for fn in gs_files ]
232
233def list_serverfiles_conn(serverfiles=None):
234    """ Returns available gene sets from the server files
235    repository: a list of (hierarchy, organism, on_local) """
236    if serverfiles == None:
237        serverfiles = orngServerFiles.ServerFiles()
238    flist = serverfiles.listfiles(sfdomain)
239    return list_serverfiles_from_flist(flist)
240
241def list_serverfiles():
242    fname = orngServerFiles.localpath_download(sfdomain, "index.pck")
243    flist = pickle.load(open(fname, 'r'))
244    return list_serverfiles_from_flist(flist)
245
246def list_all():
247    """
248    return a list of (hier, org, avalable_locally)
249    If something for a specific (hier, org) is not downloaded
250    yet, show it as not-local. """
251    flist = list_local() + list_serverfiles()
252    d = {}
253    for h,o,local in flist:
254        d[h,o] = min(local, d.get((h,o),True))
255    return [ (h,o,local) for (h,o),local in d.items() ]
256
257def update_server_list(serverfiles_upload, serverfiles_list=None):
258    if serverfiles_list == None:
259        serverfiles_list = orngServerFiles.ServerFiles()
260
261    flist = map(lambda x: filename(*x[:2]), list_serverfiles_conn(serverfiles_list))
262
263    tfname = pickle_temp(flist)
264
265    try:
266        fn = "index.pck"
267        title = "Gene sets: index"
268        tags = [ "gene sets", "index", "essential" ]
269        serverfiles_upload.upload(sfdomain, fn, tfname, title, tags)
270        serverfiles_upload.unprotect(sfdomain, fn)
271    except Exception,e:
272        raise e
273    finally:
274        os.remove(tfname)
275
276def _register_local(genesets):
277    """ Registers using the common hierarchy and organism. """
278    pth = local_path()
279
280    org = genesets.common_org()
281    hierarchy = genesets.common_hierarchy()
282    fn = filename(hierarchy, org)
283
284    with open(os.path.join(pth, fn), "w") as f:
285        pickle.dump(genesets, f)
286
287    return fn
288
289def pickle_temp(obj):
290    """ Pickle a file to a temporary file and returns its name """
291    fd,tfname = tempfile.mkstemp()
292    os.close(fd)
293    f = open(tfname, 'wb')
294    pickle.dump(obj, f)
295    f.close()
296    return tfname
297
298def _register_serverfiles(genesets, serverFiles):
299    """ Registers using the common hierarchy and organism. """
300    org = genesets.common_org()
301    hierarchy = genesets.common_hierarchy()
302    fn = filename(hierarchy, org)
303
304    #save to temporary file
305    tfname = pickle_temp(genesets)
306
307    try:
308        taxname = obiTaxonomy.name(org)
309        title = "Gene sets: " + ", ".join(hierarchy) + \
310            ((" (" + taxname + ")") if org != None else "")
311        tags = list(hierarchy) + [ "gene sets", taxname ] + \
312            ([ "essential" ] if org in obiTaxonomy.essential_taxids() else [] )
313        serverFiles.upload(sfdomain, fn, tfname, title, tags)
314        serverFiles.unprotect(sfdomain, fn)
315    finally:
316        os.remove(tfname)
317
318    update_server_list(serverFiles)
319
320def register(genesets, serverFiles=None):
321    """
322    Hierarchy is induced from the gene set names.
323    """
324    if serverFiles == None:
325        _register_local(genesets)
326    else:
327        _register_serverfiles(genesets, serverFiles)
328
329def build_hierarchy_dict(files):
330    hierd = defaultdict(list)
331    for ind,f in enumerate(files):
332        hier, org = f
333        for i in range(len(hier)+1):
334            hierd[(hier[:i], org)].append(ind)
335    return hierd
336
337def load_local(hierarchy, organism):
338    files = map(lambda x: x[:2], list_local())
339    hierd = build_hierarchy_dict(files)
340
341    out = GeneSets()
342    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
343        fname = os.path.join(local_path(), filename(h, o))
344        out.update(pickle.load(open(fname, 'r')))
345    return out
346
347def load_serverfiles(hierarchy, organism):
348    files = map(lambda x: x[:2], list_serverfiles())
349    hierd = build_hierarchy_dict(files)
350    out = GeneSets()
351    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
352        fname = orngServerFiles.localpath_download(sfdomain,
353            filename(h, o))
354        out.update(pickle.load(open(fname, 'r')))
355    return out
356
357def load(hierarchy, organism):
358    """ First try to load from the local registred folder. If the file
359    is not available, load it from the server files. """
360    ret = load_local(hierarchy, organism)
361    if len(ret) == 0:
362        ret.update(load_serverfiles(hierarchy, organism))
363    return ret
364
365def collections(*args):
366    """
367    Input is a list of collections.
368    Collection can either be a tuple (hierarchy, orgranism), where
369    hierarchy is a tuple also.
370    """
371    result = GeneSets()
372
373    for collection in args:
374        try:
375            result.update(collection)
376        except (ValueError, TypeError):
377            if issequencens(collection): #have a hierarchy, organism specification
378                new = load(*collection)
379                result.update(new)
380            else:
381                if collection.lower()[-4:] == ".gmt": #format from webpage
382                    result.update(loadGMT(open(collection,"rt").read(), collection))
383                else:
384                    raise Exception("collection() accepts files in .gmt format only.")
385
386    return result
387
388def issequencens(x):
389    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and it is not an instance of basestring."
390    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
391
392class TException(Exception): pass
393
394def upload_genesets(rsf):
395    """
396    Builds the default gene sets and
397    """
398    orngServerFiles.update_local_files()
399
400    genesetsfn = [ keggGeneSets, goGeneSets, miRNAGeneSets]
401    organisms = obiTaxonomy.common_taxids()
402    for fn in genesetsfn:
403        for org in organisms:
404            try:
405                print "Uploading ORG", org, fn
406                genesets = fn(org).split_by_hierarchy()
407                for gs in genesets:
408                    print "registering", gs.common_hierarchy()
409                    register(gs, rsf) #server files
410                    #register(gs)
411                    print "successful", gs.common_hierarchy()
412            except (obiKEGG.OrganismNotFoundError, GenesetRegException):
413                print "organism not found", org
414
415if __name__ == "__main__":
416    print reactomePathwaysGeneSets()
417    exit()
418    rsf = orngServerFiles.ServerFiles(username=sys.argv[1], password=sys.argv[2])
419    upload_genesets(rsf)
420    pass
421
422
Note: See TracBrowser for help on using the repository browser.