source: orange-bioinformatics/_bioinformatics/obiGeneSets.py @ 1712:bcd52ead3d93

Revision 1712:bcd52ead3d93, 12.0 KB checked in by markotoplak, 20 months ago (diff)

Classes for storing gene set data moved to Orange.bio.geneset. obiGeneSets adapted to make new pickled classes.

Line 
1"""
2Getting genesets from KEGG and GO.
3
4Maintainer: Marko Toplak
5"""
6
7from __future__ import absolute_import, with_statement
8
9if __name__ == "__main__":
10    __package__ = "Orange.bio"
11
12import cPickle as pickle, os, tempfile, sys
13from collections import defaultdict
14
15import Orange.core as orange
16from Orange.orng import orngServerFiles
17
18from . import obiGO, obiKEGG, obiTaxonomy
19
20sfdomain = "gene_sets"
21
22def nth(l,n):
23    return [ a[n] for a in l]
24
25from Orange.bio.geneset import GeneSet, GeneSets, GenesetRegException
26
27def goGeneSets(org):
28    """Returns gene sets from GO."""
29    ontology = obiGO.Ontology()
30    annotations = obiGO.Annotations(org, ontology=ontology)
31
32    genesets = []
33    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
34    for termn, term in ontology.terms.items():
35        genes = annotations.GetAllGenes(termn)
36        hier = ("GO", term.namespace)
37        if len(genes) > 0:
38            gs = GeneSet(id=termn, name=term.name, genes=genes, hierarchy=hier, organism=org, link=link_fmt % termn) 
39            genesets.append(gs)
40
41    return GeneSets(genesets)
42
43def keggGeneSets(org):
44    """
45    Returns gene sets from KEGG pathways.
46    """
47    from . import obiKEGG2 as obiKEGG
48   
49    kegg = obiKEGG.KEGGOrganism(org)
50
51    genesets = []
52    for id in kegg.pathways():
53        print id
54        pway = obiKEGG.KEGGPathway(id)
55        hier = ("KEGG","pathways")
56        gs = GeneSet(id=id,
57                                 name=pway.title,
58                                 genes=kegg.get_genes_by_pathway(id),
59                                 hierarchy=hier,
60                                 organism=org,
61                                 link=pway.link)
62        genesets.append(gs)
63
64    return GeneSets(genesets)
65
66def omimGeneSets():
67    """
68    Return gene sets from OMIM (Online Mendelian Inheritance in Man) diseses
69    """
70    from . import obiOMIM
71    genesets = [GeneSet(id=disease.id, name=disease.name, genes=obiOMIM.disease_genes(disease), hierarchy=("OMIM",), organism="9606",
72                    link=("http://www.ncbi.nlm.nih.gov/entrez/dispomim.cgi?id=" % disease.id if disease.id else None)) \
73                    for disease in obiOMIM.diseases()]
74    return GeneSets(genesets)
75
76def miRNAGeneSets(org):
77    """
78    Return gene sets from miRNA targets
79    """
80    from . import obimiRNA, obiKEGG
81    org_code = obiKEGG.from_taxid(org)
82    link_fmt = "http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s"
83    mirnas = [(id, obimiRNA.get_info(id)) for id in obimiRNA.ids(org_code)]
84    genesets = [GeneSet(id=mirna.matACC, name=mirna.matID, genes=mirna.targets.split(","), hierarchy=("miRNA", "Targets"),
85                        organism=org, link=link_fmt % mirna.matID) for id, mirna in mirnas]
86    return GeneSets(genesets)
87
88def go_miRNASets(org, ontology=None, enrichment=True, pval=0.05, treshold=0.04):
89    from . import obimiRNA, obiGO
90    mirnas = obimiRNA.ids(int(org))
91    if ontology is None:
92        ontology = obiGO.Ontology()
93         
94    annotations = obiGO.Annotations(org, ontology=ontology)
95   
96    go_sets = obimiRNA.get_GO(mirnas, annotations, enrichment=enrichment, pval=pval, goSwitch=False)
97    print go_sets
98   
99    go_sets = obimiRNA.filter_GO(go_sets, annotations, treshold=treshold)
100   
101    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
102    gsets = [GeneSet(id=key, name=ontology[key].name, genes=value, hierarchy=("miRNA", "go_sets",),
103                        organism=org, link=link_fmt % key) for key, value in go_sets.items()]
104    gset = GeneSets(gsets)
105    return gset
106
107
108def loadGMT(contents, name):
109    """
110    Eech line consists of tab separated elements. First is
111    the geneset name, next is it's description.
112   
113    For now the description is skipped.
114    """
115
116    def hline(s):
117        tabs = [tab.strip() for tab in s.split("\t")]
118        return GeneSet(id=tabs[0], description=tabs[1],
119                                   hierarchy=(name,), genes=tabs[2:])
120
121    def handleNELines(s, fn):
122        """
123        Run function on nonempty lines of a string.
124        Return a list of results for each line.
125        """
126        lines = (l.strip() for l in s.splitlines())
127        return [fn(l) for l in lines if l]
128
129    return GeneSets(handleNELines(contents, hline))
130
131"""
132We have multiple paths for gene set data:
133buffer/bigfiles/gene_sets
134and
135buffer/gene_sets_local
136both have available.txt
137"""
138
139def omakedirs(dir):
140    try:
141        os.makedirs(dir)
142    except OSError:
143        pass
144
145def local_path():
146    """ Returns local path for gene sets. Creates it if it does not exists
147    yet. """
148    from Orange.orng import orngEnviron
149    pth = os.path.join(orngEnviron.directoryNames["bufferDir"], "gene_sets_local")
150    omakedirs(pth)
151    return pth
152
153def build_index(dir):
154    """ Returns gene set availability index for some folder. """
155    pass
156
157def filename(hierarchy, organism):
158    """ Obtain a filename for given hierarchy and organism. """
159    return "gs_" + "_._".join(hierarchy + \
160        (organism if organism != None else "",)) + ".pck"
161
162def filename_parse(fn):
163    """ Returns a hierarchy and the organism from the filename."""
164    fn = fn[3:-4]
165    parts = fn.split("_._")
166    hierarchy = tuple(parts[:-1])
167    org = parts[-1] if parts[-1] != "" else None
168    return hierarchy, org
169
170def is_genesets_file(fn):
171    return fn.startswith("gs_") and fn.endswith(".pck")
172
173def list_local():
174    """ Returns available gene sets from the local repository:
175    a list of (hierarchy, organism, on_local) """
176    pth = local_path()
177    gs_files = filter(is_genesets_file, os.listdir(pth))
178    return [ filename_parse(fn) + (True,) for fn in gs_files ]
179   
180def list_serverfiles_from_flist(flist):
181    gs_files = filter(is_genesets_file, flist)
182    localfiles = set(orngServerFiles.listfiles(sfdomain))
183    return [ filename_parse(fn) + \
184        ((True,) if fn in localfiles else (False,)) for fn in gs_files ]
185
186def list_serverfiles_conn(serverfiles=None):
187    """ Returns available gene sets from the server files
188    repository: a list of (hierarchy, organism, on_local) """
189    if serverfiles == None:
190        serverfiles = orngServerFiles.ServerFiles()
191    flist = serverfiles.listfiles(sfdomain)
192    return list_serverfiles_from_flist(flist)
193
194def list_serverfiles():
195    fname = orngServerFiles.localpath_download(sfdomain, "index.pck")
196    flist = pickle.load(open(fname, 'r'))
197    return list_serverfiles_from_flist(flist)
198
199def list_all():
200    """
201    return a list of (hier, org, avalable_locally)
202    If something for a specific (hier, org) is not downloaded
203    yet, show it as not-local. """
204    flist = list_local() + list_serverfiles()
205    d = {}
206    for h,o,local in flist:
207        d[h,o] = min(local, d.get((h,o),True))
208    return [ (h,o,local) for (h,o),local in d.items() ]
209
210def update_server_list(serverfiles_upload, serverfiles_list=None):
211    if serverfiles_list == None:
212        serverfiles_list = orngServerFiles.ServerFiles()
213
214    flist = map(lambda x: filename(*x[:2]), list_serverfiles_conn(serverfiles_list))
215
216    tfname = pickle_temp(flist)
217   
218    try:
219        fn = "index.pck"
220        title = "Gene sets: index"
221        tags = [ "gene sets", "index", "essential" ]
222        serverfiles_upload.upload(sfdomain, fn, tfname, title, tags)
223        serverfiles_upload.unprotect(sfdomain, fn)
224    except Exception,e:
225        raise e
226    finally:
227        os.remove(tfname)
228
229def _register_local(genesets):
230    """ Registers using the common hierarchy and organism. """
231    pth = local_path()
232
233    org = genesets.common_org()
234    hierarchy = genesets.common_hierarchy()
235    fn = filename(hierarchy, org)
236
237    with open(os.path.join(pth, fn), "w") as f:
238        pickle.dump(genesets, f)
239
240    return fn
241
242def pickle_temp(obj):
243    """ Pickle a file to a temporary file and returns its name """
244    fd,tfname = tempfile.mkstemp()
245    os.close(fd)
246    f = open(tfname, 'wb')
247    pickle.dump(obj, f)
248    f.close()
249    return tfname
250
251def _register_serverfiles(genesets, serverFiles):
252    """ Registers using the common hierarchy and organism. """
253    org = genesets.common_org()
254    hierarchy = genesets.common_hierarchy()
255    fn = filename(hierarchy, org)
256
257    #save to temporary file
258    tfname = pickle_temp(genesets)
259   
260    try:
261        taxname = obiTaxonomy.name(org)
262        title = "Gene sets: " + ", ".join(hierarchy) + \
263            ((" (" + taxname + ")") if org != None else "")
264        tags = list(hierarchy) + [ "gene sets", taxname ] + \
265            ([ "essential" ] if org in obiTaxonomy.essential_taxids() else [] )
266        serverFiles.upload(sfdomain, fn, tfname, title, tags)
267        serverFiles.unprotect(sfdomain, fn)
268    finally:
269        os.remove(tfname)
270
271    update_server_list(serverFiles)
272
273def register(genesets, serverFiles=None):
274    """
275    Hierarchy is induced from the gene set names.
276    """
277    if serverFiles == None:
278        _register_local(genesets)
279    else:
280        _register_serverfiles(genesets, serverFiles)
281
282def build_hierarchy_dict(files):
283    hierd = defaultdict(list)
284    for ind,f in enumerate(files):
285        hier, org = f
286        for i in range(len(hier)+1):
287            hierd[(hier[:i], org)].append(ind)
288    return hierd
289
290def load_local(hierarchy, organism):
291    files = map(lambda x: x[:2], list_local())
292    hierd = build_hierarchy_dict(files)
293
294    out = GeneSets()
295    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
296        fname = os.path.join(local_path(), filename(h, o))
297        out.update(pickle.load(open(fname, 'r')))
298    return out
299
300def load_serverfiles(hierarchy, organism):
301    files = map(lambda x: x[:2], list_serverfiles())
302    hierd = build_hierarchy_dict(files)
303    out = GeneSets()
304    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
305        fname = orngServerFiles.localpath_download(sfdomain, 
306            filename(h, o))
307        out.update(pickle.load(open(fname, 'r')))
308    return out
309
310def load(hierarchy, organism):
311    """ First try to load from the local registred folder. If the file
312    is not available, load it from the server files. """
313    ret = load_local(hierarchy, organism)
314    if len(ret) == 0:
315        ret.update(load_serverfiles(hierarchy, organism))
316    return ret
317
318def collections(*args):
319    """
320    Input is a list of collections.
321    Collection can either be a tuple (hierarchy, orgranism), where
322    hierarchy is a tuple also.
323    """
324    result = GeneSets()
325
326    for collection in args:
327        try:
328            result.update(collection)
329        except ValueError:
330            if issequencens(collection): #have a hierarchy, organism specification
331                new = load(*collection)
332                result.update(new)
333            else:
334                if collection.lower()[-4:] == ".gmt": #format from webpage
335                    result.update(loadGMT(open(collection,"rt").read(), collection))
336                else:
337                    raise Exception("collection() accepts files in .gmt format only.")
338
339    return result
340
341def issequencens(x):
342    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and it is not an instance of basestring."
343    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
344
345class TException(Exception): pass
346
347def upload_genesets(rsf):
348    """
349    Builds the default gene sets and
350    """
351    orngServerFiles.update_local_files()
352
353    from . import obiKEGG2 as obiKEGG
354
355    #genesetsfn = [ keggGeneSets, goGeneSets, miRNAGeneSets]
356    genesetsfn = [ goGeneSets, miRNAGeneSets]
357    organisms = obiTaxonomy.common_taxids()
358    for fn in genesetsfn:
359        for org in organisms:
360            try:
361                print "Uploading ORG", org, fn
362                genesets = fn(org).split_by_hierarchy()
363                for gs in genesets:
364                    print "registering", gs.common_hierarchy()
365                    #register(gs, rsf) #server files
366                    register(gs)
367                    print "successful", gs.common_hierarchy()
368            except (obiKEGG.OrganismNotFoundError, GenesetRegException):
369                print "organism not found", org
370
371
372if __name__ == "__main__":
373    rsf = orngServerFiles.ServerFiles(username=sys.argv[1], password=sys.argv[2])
374    upload_genesets(rsf)
375    pass
Note: See TracBrowser for help on using the repository browser.