source: orange-bioinformatics/_bioinformatics/obiGeneSets.py @ 1761:d8f3d4bb64fa

Revision 1761:d8f3d4bb64fa, 12.5 KB checked in by Flashpoint <vid.flashpoint@…>, 12 months ago (diff)

Added obiDictyMutants.py

Line 
1"""
2Maintainer: Marko Toplak
3"""
4
5from __future__ import absolute_import, with_statement
6
7if __name__ == "__main__":
8    __package__ = "Orange.bio"
9
10import cPickle as pickle, os, tempfile, sys
11from collections import defaultdict
12
13import Orange.core as orange
14from Orange.orng import orngServerFiles
15
16from . import obiGO, obiKEGG, obiTaxonomy
17
18sfdomain = "gene_sets"
19
20def nth(l,n):
21    return [ a[n] for a in l]
22
23from Orange.bio.geneset import GeneSet, GeneSets, GenesetRegException
24
25def goGeneSets(org):
26    """Returns gene sets from GO."""
27    ontology = obiGO.Ontology()
28    annotations = obiGO.Annotations(org, ontology=ontology)
29
30    genesets = []
31    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
32    for termn, term in ontology.terms.items():
33        genes = annotations.GetAllGenes(termn)
34        hier = ("GO", term.namespace)
35        if len(genes) > 0:
36            gs = GeneSet(id=termn, name=term.name, genes=genes, hierarchy=hier, organism=org, link=link_fmt % termn)
37            genesets.append(gs)
38
39    return GeneSets(genesets)
40
41def keggGeneSets(org):
42    """
43    Returns gene sets from KEGG pathways.
44    """
45
46    kegg = obiKEGG.KEGGOrganism(org)
47
48    genesets = []
49    for id in kegg.pathways():
50        print id
51        pway = obiKEGG.KEGGPathway(id)
52        hier = ("KEGG","pathways")
53        gs = GeneSet(id=id,
54                                 name=pway.title,
55                                 genes=kegg.get_genes_by_pathway(id),
56                                 hierarchy=hier,
57                                 organism=org,
58                                 link=pway.link)
59        genesets.append(gs)
60
61    return GeneSets(genesets)
62
63def dictyMutantSets():
64    """
65    Return dicty mutant phenotype gene sets from Dictybase
66    """
67    from . import obiDictyMutants
68    genesets = [GeneSet(id=dicty_mutant.id, name=dicty_mutant.descriptor, genes=obiDictyMutants.mutant_genes(mutant), hierarchy=("Dictybase", "mutant_phenotypes"), organism="352472", # 352742 gathered from obiGO.py code_map -> Dicty identifier
69                        link=("http://dictybase.org/db/cgi-bin/dictyBase/SC/scsearch.pl?searchdb=strains&search_term=%s&column=all&B1=Submit" % mutant.id if mutant.id else None)) \
70                        for mutant in obiDictyMutants.mutants()]
71    return GeneSets(genesets)
72
73def omimGeneSets():
74    """
75    Return gene sets from OMIM (Online Mendelian Inheritance in Man) diseses
76    """
77    from . import obiOMIM    # The link here leads to a redirection... should replace it with the new URL
78    genesets = [GeneSet(id=disease.id, name=disease.name, genes=obiOMIM.disease_genes(disease), hierarchy=("OMIM",), organism="9606",
79                    link=("http://www.ncbi.nlm.nih.gov/entrez/dispomim.cgi?id=" % disease.id if disease.id else None)) \
80                    for disease in obiOMIM.diseases()]
81    return GeneSets(genesets)
82
83def miRNAGeneSets(org):
84    """
85    Return gene sets from miRNA targets
86    """
87    from . import obimiRNA
88    org_code = obiKEGG.from_taxid(org)
89    link_fmt = "http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s"
90    mirnas = [(id, obimiRNA.get_info(id)) for id in obimiRNA.ids(org_code)]
91    genesets = [GeneSet(id=mirna.matACC, name=mirna.matID, genes=mirna.targets.split(","), hierarchy=("miRNA", "Targets"),
92                        organism=org, link=link_fmt % mirna.matID) for id, mirna in mirnas]
93    return GeneSets(genesets)
94
95def go_miRNASets(org, ontology=None, enrichment=True, pval=0.05, treshold=0.04):
96    from . import obimiRNA, obiGO
97    mirnas = obimiRNA.ids(int(org))
98    if ontology is None:
99        ontology = obiGO.Ontology()
100
101    annotations = obiGO.Annotations(org, ontology=ontology)
102
103    go_sets = obimiRNA.get_GO(mirnas, annotations, enrichment=enrichment, pval=pval, goSwitch=False)
104    print go_sets
105
106    go_sets = obimiRNA.filter_GO(go_sets, annotations, treshold=treshold)
107
108    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
109    gsets = [GeneSet(id=key, name=ontology[key].name, genes=value, hierarchy=("miRNA", "go_sets",),
110                        organism=org, link=link_fmt % key) for key, value in go_sets.items()]
111    gset = GeneSets(gsets)
112    return gset
113
114def loadGMT(contents, name):
115    """
116    Eech line consists of tab separated elements. First is
117    the geneset name, next is it's description.
118
119    For now the description is skipped.
120    """
121
122    def hline(s):
123        tabs = [tab.strip() for tab in s.split("\t")]
124        return GeneSet(id=tabs[0], description=tabs[1],
125                                   hierarchy=(name,), genes=tabs[2:])
126
127    def handleNELines(s, fn):
128        """
129        Run function on nonempty lines of a string.
130        Return a list of results for each line.
131        """
132        lines = (l.strip() for l in s.splitlines())
133        return [fn(l) for l in lines if l]
134
135    return GeneSets(handleNELines(contents, hline))
136
137"""
138We have multiple paths for gene set data:
139buffer/bigfiles/gene_sets
140and
141buffer/gene_sets_local
142both have available.txt
143"""
144
145def omakedirs(dir):
146    try:
147        os.makedirs(dir)
148    except OSError:
149        pass
150
151def local_path():
152    """ Returns local path for gene sets. Creates it if it does not exists
153    yet. """
154    from Orange.orng import orngEnviron
155    pth = os.path.join(orngEnviron.directoryNames["bufferDir"], "gene_sets_local")
156    omakedirs(pth)
157    return pth
158
159def build_index(dir):
160    """ Returns gene set availability index for some folder. """
161    pass
162
163def filename(hierarchy, organism):
164    """ Obtain a filename for given hierarchy and organism. """
165    return "gs_" + "_._".join(hierarchy + \
166        (organism if organism != None else "",)) + ".pck"
167
168def filename_parse(fn):
169    """ Returns a hierarchy and the organism from the filename."""
170    fn = fn[3:-4]
171    parts = fn.split("_._")
172    hierarchy = tuple(parts[:-1])
173    org = parts[-1] if parts[-1] != "" else None
174    return hierarchy, org
175
176def is_genesets_file(fn):
177    return fn.startswith("gs_") and fn.endswith(".pck")
178
179def list_local():
180    """ Returns available gene sets from the local repository:
181    a list of (hierarchy, organism, on_local) """
182    pth = local_path()
183    gs_files = filter(is_genesets_file, os.listdir(pth))
184    return [ filename_parse(fn) + (True,) for fn in gs_files ]
185
186def list_serverfiles_from_flist(flist):
187    gs_files = filter(is_genesets_file, flist)
188    localfiles = set(orngServerFiles.listfiles(sfdomain))
189    return [ filename_parse(fn) + \
190        ((True,) if fn in localfiles else (False,)) for fn in gs_files ]
191
192def list_serverfiles_conn(serverfiles=None):
193    """ Returns available gene sets from the server files
194    repository: a list of (hierarchy, organism, on_local) """
195    if serverfiles == None:
196        serverfiles = orngServerFiles.ServerFiles()
197    flist = serverfiles.listfiles(sfdomain)
198    return list_serverfiles_from_flist(flist)
199
200def list_serverfiles():
201    fname = orngServerFiles.localpath_download(sfdomain, "index.pck")
202    flist = pickle.load(open(fname, 'r'))
203    return list_serverfiles_from_flist(flist)
204
205def list_all():
206    """
207    return a list of (hier, org, avalable_locally)
208    If something for a specific (hier, org) is not downloaded
209    yet, show it as not-local. """
210    flist = list_local() + list_serverfiles()
211    d = {}
212    for h,o,local in flist:
213        d[h,o] = min(local, d.get((h,o),True))
214    return [ (h,o,local) for (h,o),local in d.items() ]
215
216def update_server_list(serverfiles_upload, serverfiles_list=None):
217    if serverfiles_list == None:
218        serverfiles_list = orngServerFiles.ServerFiles()
219
220    flist = map(lambda x: filename(*x[:2]), list_serverfiles_conn(serverfiles_list))
221
222    tfname = pickle_temp(flist)
223
224    try:
225        fn = "index.pck"
226        title = "Gene sets: index"
227        tags = [ "gene sets", "index", "essential" ]
228        serverfiles_upload.upload(sfdomain, fn, tfname, title, tags)
229        serverfiles_upload.unprotect(sfdomain, fn)
230    except Exception,e:
231        raise e
232    finally:
233        os.remove(tfname)
234
235def _register_local(genesets):
236    """ Registers using the common hierarchy and organism. """
237    pth = local_path()
238
239    org = genesets.common_org()
240    hierarchy = genesets.common_hierarchy()
241    fn = filename(hierarchy, org)
242
243    with open(os.path.join(pth, fn), "w") as f:
244        pickle.dump(genesets, f)
245
246    return fn
247
248def pickle_temp(obj):
249    """ Pickle a file to a temporary file and returns its name """
250    fd,tfname = tempfile.mkstemp()
251    os.close(fd)
252    f = open(tfname, 'wb')
253    pickle.dump(obj, f)
254    f.close()
255    return tfname
256
257def _register_serverfiles(genesets, serverFiles):
258    """ Registers using the common hierarchy and organism. """
259    org = genesets.common_org()
260    hierarchy = genesets.common_hierarchy()
261    fn = filename(hierarchy, org)
262
263    #save to temporary file
264    tfname = pickle_temp(genesets)
265
266    try:
267        taxname = obiTaxonomy.name(org)
268        title = "Gene sets: " + ", ".join(hierarchy) + \
269            ((" (" + taxname + ")") if org != None else "")
270        tags = list(hierarchy) + [ "gene sets", taxname ] + \
271            ([ "essential" ] if org in obiTaxonomy.essential_taxids() else [] )
272        serverFiles.upload(sfdomain, fn, tfname, title, tags)
273        serverFiles.unprotect(sfdomain, fn)
274    finally:
275        os.remove(tfname)
276
277    update_server_list(serverFiles)
278
279def register(genesets, serverFiles=None):
280    """
281    Hierarchy is induced from the gene set names.
282    """
283    if serverFiles == None:
284        _register_local(genesets)
285    else:
286        _register_serverfiles(genesets, serverFiles)
287
288def build_hierarchy_dict(files):
289    hierd = defaultdict(list)
290    for ind,f in enumerate(files):
291        hier, org = f
292        for i in range(len(hier)+1):
293            hierd[(hier[:i], org)].append(ind)
294    return hierd
295
296def load_local(hierarchy, organism):
297    files = map(lambda x: x[:2], list_local())
298    hierd = build_hierarchy_dict(files)
299
300    out = GeneSets()
301    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
302        fname = os.path.join(local_path(), filename(h, o))
303        out.update(pickle.load(open(fname, 'r')))
304    return out
305
306def load_serverfiles(hierarchy, organism):
307    files = map(lambda x: x[:2], list_serverfiles())
308    hierd = build_hierarchy_dict(files)
309    out = GeneSets()
310    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
311        fname = orngServerFiles.localpath_download(sfdomain,
312            filename(h, o))
313        out.update(pickle.load(open(fname, 'r')))
314    return out
315
316def load(hierarchy, organism):
317    """ First try to load from the local registred folder. If the file
318    is not available, load it from the server files. """
319    ret = load_local(hierarchy, organism)
320    if len(ret) == 0:
321        ret.update(load_serverfiles(hierarchy, organism))
322    return ret
323
324def collections(*args):
325    """
326    Input is a list of collections.
327    Collection can either be a tuple (hierarchy, orgranism), where
328    hierarchy is a tuple also.
329    """
330    result = GeneSets()
331
332    for collection in args:
333        try:
334            result.update(collection)
335        except (ValueError, TypeError):
336            if issequencens(collection): #have a hierarchy, organism specification
337                new = load(*collection)
338                result.update(new)
339            else:
340                if collection.lower()[-4:] == ".gmt": #format from webpage
341                    result.update(loadGMT(open(collection,"rt").read(), collection))
342                else:
343                    raise Exception("collection() accepts files in .gmt format only.")
344
345    return result
346
347def issequencens(x):
348    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and it is not an instance of basestring."
349    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
350
351class TException(Exception): pass
352
353def upload_genesets(rsf):
354    """
355    Builds the default gene sets and
356    """
357    orngServerFiles.update_local_files()
358
359    genesetsfn = [ keggGeneSets, goGeneSets, miRNAGeneSets]
360    organisms = obiTaxonomy.common_taxids()
361    for fn in genesetsfn:
362        for org in organisms:
363            try:
364                print "Uploading ORG", org, fn
365                genesets = fn(org).split_by_hierarchy()
366                for gs in genesets:
367                    print "registering", gs.common_hierarchy()
368                    register(gs, rsf) #server files
369                    #register(gs)
370                    print "successful", gs.common_hierarchy()
371            except (obiKEGG.OrganismNotFoundError, GenesetRegException):
372                print "organism not found", org
373
374if __name__ == "__main__":
375    rsf = orngServerFiles.ServerFiles(username=sys.argv[1], password=sys.argv[2])
376    upload_genesets(rsf)
377    pass
378
379
Note: See TracBrowser for help on using the repository browser.