source: orange-bioinformatics/_bioinformatics/obiGeneSets.py @ 1716:4c4266f7c5a5

Revision 1716:4c4266f7c5a5, 11.9 KB checked in by markotoplak, 20 months ago (diff)

Removed the old version of obiKEGG. Renamed obiKEGG2 -> obiKEGG.

Line 
1"""
2Getting genesets from KEGG and GO.
3
4Maintainer: Marko Toplak
5"""
6
7from __future__ import absolute_import, with_statement
8
9if __name__ == "__main__":
10    __package__ = "Orange.bio"
11
12import cPickle as pickle, os, tempfile, sys
13from collections import defaultdict
14
15import Orange.core as orange
16from Orange.orng import orngServerFiles
17
18from . import obiGO, obiKEGG, obiTaxonomy
19
20sfdomain = "gene_sets"
21
22def nth(l,n):
23    return [ a[n] for a in l]
24
25from Orange.bio.geneset import GeneSet, GeneSets, GenesetRegException
26
27def goGeneSets(org):
28    """Returns gene sets from GO."""
29    ontology = obiGO.Ontology()
30    annotations = obiGO.Annotations(org, ontology=ontology)
31
32    genesets = []
33    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
34    for termn, term in ontology.terms.items():
35        genes = annotations.GetAllGenes(termn)
36        hier = ("GO", term.namespace)
37        if len(genes) > 0:
38            gs = GeneSet(id=termn, name=term.name, genes=genes, hierarchy=hier, organism=org, link=link_fmt % termn) 
39            genesets.append(gs)
40
41    return GeneSets(genesets)
42
43def keggGeneSets(org):
44    """
45    Returns gene sets from KEGG pathways.
46    """
47   
48    kegg = obiKEGG.KEGGOrganism(org)
49
50    genesets = []
51    for id in kegg.pathways():
52        print id
53        pway = obiKEGG.KEGGPathway(id)
54        hier = ("KEGG","pathways")
55        gs = GeneSet(id=id,
56                                 name=pway.title,
57                                 genes=kegg.get_genes_by_pathway(id),
58                                 hierarchy=hier,
59                                 organism=org,
60                                 link=pway.link)
61        genesets.append(gs)
62
63    return GeneSets(genesets)
64
65def omimGeneSets():
66    """
67    Return gene sets from OMIM (Online Mendelian Inheritance in Man) diseses
68    """
69    from . import obiOMIM
70    genesets = [GeneSet(id=disease.id, name=disease.name, genes=obiOMIM.disease_genes(disease), hierarchy=("OMIM",), organism="9606",
71                    link=("http://www.ncbi.nlm.nih.gov/entrez/dispomim.cgi?id=" % disease.id if disease.id else None)) \
72                    for disease in obiOMIM.diseases()]
73    return GeneSets(genesets)
74
75def miRNAGeneSets(org):
76    """
77    Return gene sets from miRNA targets
78    """
79    from . import obimiRNA
80    org_code = obiKEGG.from_taxid(org)
81    link_fmt = "http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s"
82    mirnas = [(id, obimiRNA.get_info(id)) for id in obimiRNA.ids(org_code)]
83    genesets = [GeneSet(id=mirna.matACC, name=mirna.matID, genes=mirna.targets.split(","), hierarchy=("miRNA", "Targets"),
84                        organism=org, link=link_fmt % mirna.matID) for id, mirna in mirnas]
85    return GeneSets(genesets)
86
87def go_miRNASets(org, ontology=None, enrichment=True, pval=0.05, treshold=0.04):
88    from . import obimiRNA, obiGO
89    mirnas = obimiRNA.ids(int(org))
90    if ontology is None:
91        ontology = obiGO.Ontology()
92         
93    annotations = obiGO.Annotations(org, ontology=ontology)
94   
95    go_sets = obimiRNA.get_GO(mirnas, annotations, enrichment=enrichment, pval=pval, goSwitch=False)
96    print go_sets
97   
98    go_sets = obimiRNA.filter_GO(go_sets, annotations, treshold=treshold)
99   
100    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
101    gsets = [GeneSet(id=key, name=ontology[key].name, genes=value, hierarchy=("miRNA", "go_sets",),
102                        organism=org, link=link_fmt % key) for key, value in go_sets.items()]
103    gset = GeneSets(gsets)
104    return gset
105
106
107def loadGMT(contents, name):
108    """
109    Eech line consists of tab separated elements. First is
110    the geneset name, next is it's description.
111   
112    For now the description is skipped.
113    """
114
115    def hline(s):
116        tabs = [tab.strip() for tab in s.split("\t")]
117        return GeneSet(id=tabs[0], description=tabs[1],
118                                   hierarchy=(name,), genes=tabs[2:])
119
120    def handleNELines(s, fn):
121        """
122        Run function on nonempty lines of a string.
123        Return a list of results for each line.
124        """
125        lines = (l.strip() for l in s.splitlines())
126        return [fn(l) for l in lines if l]
127
128    return GeneSets(handleNELines(contents, hline))
129
130"""
131We have multiple paths for gene set data:
132buffer/bigfiles/gene_sets
133and
134buffer/gene_sets_local
135both have available.txt
136"""
137
138def omakedirs(dir):
139    try:
140        os.makedirs(dir)
141    except OSError:
142        pass
143
144def local_path():
145    """ Returns local path for gene sets. Creates it if it does not exists
146    yet. """
147    from Orange.orng import orngEnviron
148    pth = os.path.join(orngEnviron.directoryNames["bufferDir"], "gene_sets_local")
149    omakedirs(pth)
150    return pth
151
152def build_index(dir):
153    """ Returns gene set availability index for some folder. """
154    pass
155
156def filename(hierarchy, organism):
157    """ Obtain a filename for given hierarchy and organism. """
158    return "gs_" + "_._".join(hierarchy + \
159        (organism if organism != None else "",)) + ".pck"
160
161def filename_parse(fn):
162    """ Returns a hierarchy and the organism from the filename."""
163    fn = fn[3:-4]
164    parts = fn.split("_._")
165    hierarchy = tuple(parts[:-1])
166    org = parts[-1] if parts[-1] != "" else None
167    return hierarchy, org
168
169def is_genesets_file(fn):
170    return fn.startswith("gs_") and fn.endswith(".pck")
171
172def list_local():
173    """ Returns available gene sets from the local repository:
174    a list of (hierarchy, organism, on_local) """
175    pth = local_path()
176    gs_files = filter(is_genesets_file, os.listdir(pth))
177    return [ filename_parse(fn) + (True,) for fn in gs_files ]
178   
179def list_serverfiles_from_flist(flist):
180    gs_files = filter(is_genesets_file, flist)
181    localfiles = set(orngServerFiles.listfiles(sfdomain))
182    return [ filename_parse(fn) + \
183        ((True,) if fn in localfiles else (False,)) for fn in gs_files ]
184
185def list_serverfiles_conn(serverfiles=None):
186    """ Returns available gene sets from the server files
187    repository: a list of (hierarchy, organism, on_local) """
188    if serverfiles == None:
189        serverfiles = orngServerFiles.ServerFiles()
190    flist = serverfiles.listfiles(sfdomain)
191    return list_serverfiles_from_flist(flist)
192
193def list_serverfiles():
194    fname = orngServerFiles.localpath_download(sfdomain, "index.pck")
195    flist = pickle.load(open(fname, 'r'))
196    return list_serverfiles_from_flist(flist)
197
198def list_all():
199    """
200    return a list of (hier, org, avalable_locally)
201    If something for a specific (hier, org) is not downloaded
202    yet, show it as not-local. """
203    flist = list_local() + list_serverfiles()
204    d = {}
205    for h,o,local in flist:
206        d[h,o] = min(local, d.get((h,o),True))
207    return [ (h,o,local) for (h,o),local in d.items() ]
208
209def update_server_list(serverfiles_upload, serverfiles_list=None):
210    if serverfiles_list == None:
211        serverfiles_list = orngServerFiles.ServerFiles()
212
213    flist = map(lambda x: filename(*x[:2]), list_serverfiles_conn(serverfiles_list))
214
215    tfname = pickle_temp(flist)
216   
217    try:
218        fn = "index.pck"
219        title = "Gene sets: index"
220        tags = [ "gene sets", "index", "essential" ]
221        serverfiles_upload.upload(sfdomain, fn, tfname, title, tags)
222        serverfiles_upload.unprotect(sfdomain, fn)
223    except Exception,e:
224        raise e
225    finally:
226        os.remove(tfname)
227
228def _register_local(genesets):
229    """ Registers using the common hierarchy and organism. """
230    pth = local_path()
231
232    org = genesets.common_org()
233    hierarchy = genesets.common_hierarchy()
234    fn = filename(hierarchy, org)
235
236    with open(os.path.join(pth, fn), "w") as f:
237        pickle.dump(genesets, f)
238
239    return fn
240
241def pickle_temp(obj):
242    """ Pickle a file to a temporary file and returns its name """
243    fd,tfname = tempfile.mkstemp()
244    os.close(fd)
245    f = open(tfname, 'wb')
246    pickle.dump(obj, f)
247    f.close()
248    return tfname
249
250def _register_serverfiles(genesets, serverFiles):
251    """ Registers using the common hierarchy and organism. """
252    org = genesets.common_org()
253    hierarchy = genesets.common_hierarchy()
254    fn = filename(hierarchy, org)
255
256    #save to temporary file
257    tfname = pickle_temp(genesets)
258   
259    try:
260        taxname = obiTaxonomy.name(org)
261        title = "Gene sets: " + ", ".join(hierarchy) + \
262            ((" (" + taxname + ")") if org != None else "")
263        tags = list(hierarchy) + [ "gene sets", taxname ] + \
264            ([ "essential" ] if org in obiTaxonomy.essential_taxids() else [] )
265        serverFiles.upload(sfdomain, fn, tfname, title, tags)
266        serverFiles.unprotect(sfdomain, fn)
267    finally:
268        os.remove(tfname)
269
270    update_server_list(serverFiles)
271
272def register(genesets, serverFiles=None):
273    """
274    Hierarchy is induced from the gene set names.
275    """
276    if serverFiles == None:
277        _register_local(genesets)
278    else:
279        _register_serverfiles(genesets, serverFiles)
280
281def build_hierarchy_dict(files):
282    hierd = defaultdict(list)
283    for ind,f in enumerate(files):
284        hier, org = f
285        for i in range(len(hier)+1):
286            hierd[(hier[:i], org)].append(ind)
287    return hierd
288
289def load_local(hierarchy, organism):
290    files = map(lambda x: x[:2], list_local())
291    hierd = build_hierarchy_dict(files)
292
293    out = GeneSets()
294    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
295        fname = os.path.join(local_path(), filename(h, o))
296        out.update(pickle.load(open(fname, 'r')))
297    return out
298
299def load_serverfiles(hierarchy, organism):
300    files = map(lambda x: x[:2], list_serverfiles())
301    hierd = build_hierarchy_dict(files)
302    out = GeneSets()
303    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
304        fname = orngServerFiles.localpath_download(sfdomain, 
305            filename(h, o))
306        out.update(pickle.load(open(fname, 'r')))
307    return out
308
309def load(hierarchy, organism):
310    """ First try to load from the local registred folder. If the file
311    is not available, load it from the server files. """
312    ret = load_local(hierarchy, organism)
313    if len(ret) == 0:
314        ret.update(load_serverfiles(hierarchy, organism))
315    return ret
316
317def collections(*args):
318    """
319    Input is a list of collections.
320    Collection can either be a tuple (hierarchy, orgranism), where
321    hierarchy is a tuple also.
322    """
323    result = GeneSets()
324
325    for collection in args:
326        try:
327            result.update(collection)
328        except (ValueError, TypeError):
329            if issequencens(collection): #have a hierarchy, organism specification
330                new = load(*collection)
331                result.update(new)
332            else:
333                if collection.lower()[-4:] == ".gmt": #format from webpage
334                    result.update(loadGMT(open(collection,"rt").read(), collection))
335                else:
336                    raise Exception("collection() accepts files in .gmt format only.")
337
338    return result
339
340def issequencens(x):
341    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and it is not an instance of basestring."
342    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
343
344class TException(Exception): pass
345
346def upload_genesets(rsf):
347    """
348    Builds the default gene sets and
349    """
350    orngServerFiles.update_local_files()
351
352    genesetsfn = [ keggGeneSets, goGeneSets, miRNAGeneSets]
353    organisms = obiTaxonomy.common_taxids()
354    for fn in genesetsfn:
355        for org in organisms:
356            try:
357                print "Uploading ORG", org, fn
358                genesets = fn(org).split_by_hierarchy()
359                for gs in genesets:
360                    print "registering", gs.common_hierarchy()
361                    register(gs, rsf) #server files
362                    #register(gs)
363                    print "successful", gs.common_hierarchy()
364            except (obiKEGG.OrganismNotFoundError, GenesetRegException):
365                print "organism not found", org
366
367if __name__ == "__main__":
368    rsf = orngServerFiles.ServerFiles(username=sys.argv[1], password=sys.argv[2])
369    upload_genesets(rsf)
370    pass
Note: See TracBrowser for help on using the repository browser.