source: orange-bioinformatics/_bioinformatics/obiGeneSets.py @ 1780:5cc362d703bb

Revision 1780:5cc362d703bb, 12.7 KB checked in by Flashpoint <vid.flashpoint@…>, 11 months ago (diff)

DictyMutants geneset working

Line 
1"""
2Maintainer: Marko Toplak
3"""
4
5from __future__ import absolute_import, with_statement
6
7if __name__ == "__main__":
8    __package__ = "Orange.bio"
9
10import cPickle as pickle, os, tempfile, sys
11from collections import defaultdict
12
13import Orange.core as orange
14from Orange.orng import orngServerFiles
15
16from . import obiGO, obiKEGG, obiTaxonomy
17
18sfdomain = "gene_sets"
19
20def nth(l,n):
21    return [ a[n] for a in l]
22
23from Orange.bio.geneset import GeneSet, GeneSets, GenesetRegException
24
25def goGeneSets(org):
26    """Returns gene sets from GO."""
27    ontology = obiGO.Ontology()
28    annotations = obiGO.Annotations(org, ontology=ontology)
29
30    genesets = []
31    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
32    for termn, term in ontology.terms.items():
33        genes = annotations.GetAllGenes(termn)
34        hier = ("GO", term.namespace)
35        if len(genes) > 0:
36            gs = GeneSet(id=termn, name=term.name, genes=genes, hierarchy=hier, organism=org, link=link_fmt % termn)
37            genesets.append(gs)
38
39    return GeneSets(genesets)
40
41def keggGeneSets(org):
42    """
43    Returns gene sets from KEGG pathways.
44    """
45
46    kegg = obiKEGG.KEGGOrganism(org)
47
48    genesets = []
49    for id in kegg.pathways():
50        print id
51        pway = obiKEGG.KEGGPathway(id)
52        hier = ("KEGG","pathways")
53        gs = GeneSet(id=id,
54                                 name=pway.title,
55                                 genes=kegg.get_genes_by_pathway(id),
56                                 hierarchy=hier,
57                                 organism=org,
58                                 link=pway.link)
59        genesets.append(gs)
60
61    return GeneSets(genesets)
62
63def dictyMutantSets():
64    """
65    Return dicty mutant phenotype gene sets from Dictybase
66    """
67    from . import obiDictyMutants
68    link_fmt = "http://dictybase.org/db/cgi-bin/dictyBase/SC/scsearch.pl?searchdb=strains&search_term=%s&column=all&B1=Submit" 
69    genesets = [GeneSet(id=mutant.name, name=mutant.descriptor, genes=obiDictyMutants.mutant_genes(mutant), hierarchy=("Dictybase", ), organism="352472", # 352472 gathered from obiGO.py code_map -> Dicty identifier
70                        link=(link_fmt % mutant.name if mutant.name else None)) \
71                        for mutant in obiDictyMutants.mutants()]
72    genesets.extend([GeneSet(id=mutant.name, name=mutant.descriptor, genes=obiDictyMutants.mutant_phenotypes(mutant), hierarchy=("Dictybase", "phenotype_mutants"), organism="352472",
73                        link=(link_fmt % mutant.name if mutant.name else None)) \
74                        for mutant in obiDictyMutants.mutants()])
75
76    return GeneSets(genesets)
77
78def omimGeneSets():
79    """
80    Return gene sets from OMIM (Online Mendelian Inheritance in Man) diseses
81    """
82    from . import obiOMIM   
83    genesets = [GeneSet(id=disease.id, name=disease.name, genes=obiOMIM.disease_genes(disease), hierarchy=("OMIM",), organism="9606",
84                    link=("http://www.omim.org/entry/%s" % disease.id if disease.id else None)) \
85                    for disease in obiOMIM.diseases()]
86    return GeneSets(genesets)
87
88def miRNAGeneSets(org):
89    """
90    Return gene sets from miRNA targets
91    """
92    from . import obimiRNA
93    org_code = obiKEGG.from_taxid(org)
94    link_fmt = "http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s"
95    mirnas = [(id, obimiRNA.get_info(id)) for id in obimiRNA.ids(org_code)]
96    genesets = [GeneSet(id=mirna.matACC, name=mirna.matID, genes=mirna.targets.split(","), hierarchy=("miRNA", "Targets"),
97                        organism=org, link=link_fmt % mirna.matID) for id, mirna in mirnas]
98    return GeneSets(genesets)
99
100def go_miRNASets(org, ontology=None, enrichment=True, pval=0.05, treshold=0.04):
101    from . import obimiRNA, obiGO
102    mirnas = obimiRNA.ids(int(org))
103    if ontology is None:
104        ontology = obiGO.Ontology()
105
106    annotations = obiGO.Annotations(org, ontology=ontology)
107
108    go_sets = obimiRNA.get_GO(mirnas, annotations, enrichment=enrichment, pval=pval, goSwitch=False)
109    print go_sets
110
111    go_sets = obimiRNA.filter_GO(go_sets, annotations, treshold=treshold)
112
113    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
114    gsets = [GeneSet(id=key, name=ontology[key].name, genes=value, hierarchy=("miRNA", "go_sets",),
115                        organism=org, link=link_fmt % key) for key, value in go_sets.items()]
116    gset = GeneSets(gsets)
117    return gset
118
119def loadGMT(contents, name):
120    """
121    Eech line consists of tab separated elements. First is
122    the geneset name, next is it's description.
123
124    For now the description is skipped.
125    """
126
127    def hline(s):
128        tabs = [tab.strip() for tab in s.split("\t")]
129        return GeneSet(id=tabs[0], description=tabs[1],
130                                   hierarchy=(name,), genes=tabs[2:])
131
132    def handleNELines(s, fn):
133        """
134        Run function on nonempty lines of a string.
135        Return a list of results for each line.
136        """
137        lines = (l.strip() for l in s.splitlines())
138        return [fn(l) for l in lines if l]
139
140    return GeneSets(handleNELines(contents, hline))
141
142"""
143We have multiple paths for gene set data:
144buffer/bigfiles/gene_sets
145and
146buffer/gene_sets_local
147both have available.txt
148"""
149
150def omakedirs(dir):
151    try:
152        os.makedirs(dir)
153    except OSError:
154        pass
155
156def local_path():
157    """ Returns local path for gene sets. Creates it if it does not exists
158    yet. """
159    from Orange.orng import orngEnviron
160    pth = os.path.join(orngEnviron.directoryNames["bufferDir"], "gene_sets_local")
161    omakedirs(pth)
162    return pth
163
164def build_index(dir):
165    """ Returns gene set availability index for some folder. """
166    pass
167
168def filename(hierarchy, organism):
169    """ Obtain a filename for given hierarchy and organism. """
170    return "gs_" + "_._".join(hierarchy + \
171        (organism if organism != None else "",)) + ".pck"
172
173def filename_parse(fn):
174    """ Returns a hierarchy and the organism from the filename."""
175    fn = fn[3:-4]
176    parts = fn.split("_._")
177    hierarchy = tuple(parts[:-1])
178    org = parts[-1] if parts[-1] != "" else None
179    return hierarchy, org
180
181def is_genesets_file(fn):
182    return fn.startswith("gs_") and fn.endswith(".pck")
183
184def list_local():
185    """ Returns available gene sets from the local repository:
186    a list of (hierarchy, organism, on_local) """
187    pth = local_path()
188    gs_files = filter(is_genesets_file, os.listdir(pth))
189    return [ filename_parse(fn) + (True,) for fn in gs_files ]
190
191def list_serverfiles_from_flist(flist):
192    gs_files = filter(is_genesets_file, flist)
193    localfiles = set(orngServerFiles.listfiles(sfdomain))
194    return [ filename_parse(fn) + \
195        ((True,) if fn in localfiles else (False,)) for fn in gs_files ]
196
197def list_serverfiles_conn(serverfiles=None):
198    """ Returns available gene sets from the server files
199    repository: a list of (hierarchy, organism, on_local) """
200    if serverfiles == None:
201        serverfiles = orngServerFiles.ServerFiles()
202    flist = serverfiles.listfiles(sfdomain)
203    return list_serverfiles_from_flist(flist)
204
205def list_serverfiles():
206    fname = orngServerFiles.localpath_download(sfdomain, "index.pck")
207    flist = pickle.load(open(fname, 'r'))
208    return list_serverfiles_from_flist(flist)
209
210def list_all():
211    """
212    return a list of (hier, org, avalable_locally)
213    If something for a specific (hier, org) is not downloaded
214    yet, show it as not-local. """
215    flist = list_local() + list_serverfiles()
216    d = {}
217    for h,o,local in flist:
218        d[h,o] = min(local, d.get((h,o),True))
219    return [ (h,o,local) for (h,o),local in d.items() ]
220
221def update_server_list(serverfiles_upload, serverfiles_list=None):
222    if serverfiles_list == None:
223        serverfiles_list = orngServerFiles.ServerFiles()
224
225    flist = map(lambda x: filename(*x[:2]), list_serverfiles_conn(serverfiles_list))
226
227    tfname = pickle_temp(flist)
228
229    try:
230        fn = "index.pck"
231        title = "Gene sets: index"
232        tags = [ "gene sets", "index", "essential" ]
233        serverfiles_upload.upload(sfdomain, fn, tfname, title, tags)
234        serverfiles_upload.unprotect(sfdomain, fn)
235    except Exception,e:
236        raise e
237    finally:
238        os.remove(tfname)
239
240def _register_local(genesets):
241    """ Registers using the common hierarchy and organism. """
242    pth = local_path()
243
244    org = genesets.common_org()
245    hierarchy = genesets.common_hierarchy()
246    fn = filename(hierarchy, org)
247
248    with open(os.path.join(pth, fn), "w") as f:
249        pickle.dump(genesets, f)
250
251    return fn
252
253def pickle_temp(obj):
254    """ Pickle a file to a temporary file and returns its name """
255    fd,tfname = tempfile.mkstemp()
256    os.close(fd)
257    f = open(tfname, 'wb')
258    pickle.dump(obj, f)
259    f.close()
260    return tfname
261
262def _register_serverfiles(genesets, serverFiles):
263    """ Registers using the common hierarchy and organism. """
264    org = genesets.common_org()
265    hierarchy = genesets.common_hierarchy()
266    fn = filename(hierarchy, org)
267
268    #save to temporary file
269    tfname = pickle_temp(genesets)
270
271    try:
272        taxname = obiTaxonomy.name(org)
273        title = "Gene sets: " + ", ".join(hierarchy) + \
274            ((" (" + taxname + ")") if org != None else "")
275        tags = list(hierarchy) + [ "gene sets", taxname ] + \
276            ([ "essential" ] if org in obiTaxonomy.essential_taxids() else [] )
277        serverFiles.upload(sfdomain, fn, tfname, title, tags)
278        serverFiles.unprotect(sfdomain, fn)
279    finally:
280        os.remove(tfname)
281
282    update_server_list(serverFiles)
283
284def register(genesets, serverFiles=None):
285    """
286    Hierarchy is induced from the gene set names.
287    """
288    if serverFiles == None:
289        _register_local(genesets)
290    else:
291        _register_serverfiles(genesets, serverFiles)
292
293def build_hierarchy_dict(files):
294    hierd = defaultdict(list)
295    for ind,f in enumerate(files):
296        hier, org = f
297        for i in range(len(hier)+1):
298            hierd[(hier[:i], org)].append(ind)
299    return hierd
300
301def load_local(hierarchy, organism):
302    files = map(lambda x: x[:2], list_local())
303    hierd = build_hierarchy_dict(files)
304
305    out = GeneSets()
306    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
307        fname = os.path.join(local_path(), filename(h, o))
308        out.update(pickle.load(open(fname, 'r')))
309    return out
310
311def load_serverfiles(hierarchy, organism):
312    files = map(lambda x: x[:2], list_serverfiles())
313    hierd = build_hierarchy_dict(files)
314    out = GeneSets()
315    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
316        fname = orngServerFiles.localpath_download(sfdomain,
317            filename(h, o))
318        out.update(pickle.load(open(fname, 'r')))
319    return out
320
321def load(hierarchy, organism):
322    """ First try to load from the local registred folder. If the file
323    is not available, load it from the server files. """
324    ret = load_local(hierarchy, organism)
325    if len(ret) == 0:
326        ret.update(load_serverfiles(hierarchy, organism))
327    return ret
328
329def collections(*args):
330    """
331    Input is a list of collections.
332    Collection can either be a tuple (hierarchy, orgranism), where
333    hierarchy is a tuple also.
334    """
335    result = GeneSets()
336
337    for collection in args:
338        try:
339            result.update(collection)
340        except (ValueError, TypeError):
341            if issequencens(collection): #have a hierarchy, organism specification
342                new = load(*collection)
343                result.update(new)
344            else:
345                if collection.lower()[-4:] == ".gmt": #format from webpage
346                    result.update(loadGMT(open(collection,"rt").read(), collection))
347                else:
348                    raise Exception("collection() accepts files in .gmt format only.")
349
350    return result
351
352def issequencens(x):
353    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and it is not an instance of basestring."
354    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
355
356class TException(Exception): pass
357
358def upload_genesets(rsf):
359    """
360    Builds the default gene sets and
361    """
362    orngServerFiles.update_local_files()
363
364    genesetsfn = [ keggGeneSets, goGeneSets, miRNAGeneSets]
365    organisms = obiTaxonomy.common_taxids()
366    for fn in genesetsfn:
367        for org in organisms:
368            try:
369                print "Uploading ORG", org, fn
370                genesets = fn(org).split_by_hierarchy()
371                for gs in genesets:
372                    print "registering", gs.common_hierarchy()
373                    register(gs, rsf) #server files
374                    #register(gs)
375                    print "successful", gs.common_hierarchy()
376            except (obiKEGG.OrganismNotFoundError, GenesetRegException):
377                print "organism not found", org
378
379if __name__ == "__main__":
380    rsf = orngServerFiles.ServerFiles(username=sys.argv[1], password=sys.argv[2])
381    upload_genesets(rsf)
382    pass
383
384
Note: See TracBrowser for help on using the repository browser.