source: orange-bioinformatics/_bioinformatics/obiGeneSets.py @ 1719:6e8861564778

Revision 1719:6e8861564778, 11.8 KB checked in by markotoplak, 20 months ago (diff)

Fixed update scripts for MeSH, GO, HomoloGene, NCBI_geneinfo, OMIM, PPI. Moved wget to Orange.utils.

Line 
1"""
2Maintainer: Marko Toplak
3"""
4
5from __future__ import absolute_import, with_statement
6
7if __name__ == "__main__":
8    __package__ = "Orange.bio"
9
10import cPickle as pickle, os, tempfile, sys
11from collections import defaultdict
12
13import Orange.core as orange
14from Orange.orng import orngServerFiles
15
16from . import obiGO, obiKEGG, obiTaxonomy
17
18sfdomain = "gene_sets"
19
20def nth(l,n):
21    return [ a[n] for a in l]
22
23from Orange.bio.geneset import GeneSet, GeneSets, GenesetRegException
24
25def goGeneSets(org):
26    """Returns gene sets from GO."""
27    ontology = obiGO.Ontology()
28    annotations = obiGO.Annotations(org, ontology=ontology)
29
30    genesets = []
31    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
32    for termn, term in ontology.terms.items():
33        genes = annotations.GetAllGenes(termn)
34        hier = ("GO", term.namespace)
35        if len(genes) > 0:
36            gs = GeneSet(id=termn, name=term.name, genes=genes, hierarchy=hier, organism=org, link=link_fmt % termn) 
37            genesets.append(gs)
38
39    return GeneSets(genesets)
40
41def keggGeneSets(org):
42    """
43    Returns gene sets from KEGG pathways.
44    """
45   
46    kegg = obiKEGG.KEGGOrganism(org)
47
48    genesets = []
49    for id in kegg.pathways():
50        print id
51        pway = obiKEGG.KEGGPathway(id)
52        hier = ("KEGG","pathways")
53        gs = GeneSet(id=id,
54                                 name=pway.title,
55                                 genes=kegg.get_genes_by_pathway(id),
56                                 hierarchy=hier,
57                                 organism=org,
58                                 link=pway.link)
59        genesets.append(gs)
60
61    return GeneSets(genesets)
62
63def omimGeneSets():
64    """
65    Return gene sets from OMIM (Online Mendelian Inheritance in Man) diseses
66    """
67    from . import obiOMIM
68    genesets = [GeneSet(id=disease.id, name=disease.name, genes=obiOMIM.disease_genes(disease), hierarchy=("OMIM",), organism="9606",
69                    link=("http://www.ncbi.nlm.nih.gov/entrez/dispomim.cgi?id=" % disease.id if disease.id else None)) \
70                    for disease in obiOMIM.diseases()]
71    return GeneSets(genesets)
72
73def miRNAGeneSets(org):
74    """
75    Return gene sets from miRNA targets
76    """
77    from . import obimiRNA
78    org_code = obiKEGG.from_taxid(org)
79    link_fmt = "http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s"
80    mirnas = [(id, obimiRNA.get_info(id)) for id in obimiRNA.ids(org_code)]
81    genesets = [GeneSet(id=mirna.matACC, name=mirna.matID, genes=mirna.targets.split(","), hierarchy=("miRNA", "Targets"),
82                        organism=org, link=link_fmt % mirna.matID) for id, mirna in mirnas]
83    return GeneSets(genesets)
84
85def go_miRNASets(org, ontology=None, enrichment=True, pval=0.05, treshold=0.04):
86    from . import obimiRNA, obiGO
87    mirnas = obimiRNA.ids(int(org))
88    if ontology is None:
89        ontology = obiGO.Ontology()
90         
91    annotations = obiGO.Annotations(org, ontology=ontology)
92   
93    go_sets = obimiRNA.get_GO(mirnas, annotations, enrichment=enrichment, pval=pval, goSwitch=False)
94    print go_sets
95   
96    go_sets = obimiRNA.filter_GO(go_sets, annotations, treshold=treshold)
97   
98    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
99    gsets = [GeneSet(id=key, name=ontology[key].name, genes=value, hierarchy=("miRNA", "go_sets",),
100                        organism=org, link=link_fmt % key) for key, value in go_sets.items()]
101    gset = GeneSets(gsets)
102    return gset
103
104
105def loadGMT(contents, name):
106    """
107    Eech line consists of tab separated elements. First is
108    the geneset name, next is it's description.
109   
110    For now the description is skipped.
111    """
112
113    def hline(s):
114        tabs = [tab.strip() for tab in s.split("\t")]
115        return GeneSet(id=tabs[0], description=tabs[1],
116                                   hierarchy=(name,), genes=tabs[2:])
117
118    def handleNELines(s, fn):
119        """
120        Run function on nonempty lines of a string.
121        Return a list of results for each line.
122        """
123        lines = (l.strip() for l in s.splitlines())
124        return [fn(l) for l in lines if l]
125
126    return GeneSets(handleNELines(contents, hline))
127
128"""
129We have multiple paths for gene set data:
130buffer/bigfiles/gene_sets
131and
132buffer/gene_sets_local
133both have available.txt
134"""
135
136def omakedirs(dir):
137    try:
138        os.makedirs(dir)
139    except OSError:
140        pass
141
142def local_path():
143    """ Returns local path for gene sets. Creates it if it does not exists
144    yet. """
145    from Orange.orng import orngEnviron
146    pth = os.path.join(orngEnviron.directoryNames["bufferDir"], "gene_sets_local")
147    omakedirs(pth)
148    return pth
149
150def build_index(dir):
151    """ Returns gene set availability index for some folder. """
152    pass
153
154def filename(hierarchy, organism):
155    """ Obtain a filename for given hierarchy and organism. """
156    return "gs_" + "_._".join(hierarchy + \
157        (organism if organism != None else "",)) + ".pck"
158
159def filename_parse(fn):
160    """ Returns a hierarchy and the organism from the filename."""
161    fn = fn[3:-4]
162    parts = fn.split("_._")
163    hierarchy = tuple(parts[:-1])
164    org = parts[-1] if parts[-1] != "" else None
165    return hierarchy, org
166
167def is_genesets_file(fn):
168    return fn.startswith("gs_") and fn.endswith(".pck")
169
170def list_local():
171    """ Returns available gene sets from the local repository:
172    a list of (hierarchy, organism, on_local) """
173    pth = local_path()
174    gs_files = filter(is_genesets_file, os.listdir(pth))
175    return [ filename_parse(fn) + (True,) for fn in gs_files ]
176   
177def list_serverfiles_from_flist(flist):
178    gs_files = filter(is_genesets_file, flist)
179    localfiles = set(orngServerFiles.listfiles(sfdomain))
180    return [ filename_parse(fn) + \
181        ((True,) if fn in localfiles else (False,)) for fn in gs_files ]
182
183def list_serverfiles_conn(serverfiles=None):
184    """ Returns available gene sets from the server files
185    repository: a list of (hierarchy, organism, on_local) """
186    if serverfiles == None:
187        serverfiles = orngServerFiles.ServerFiles()
188    flist = serverfiles.listfiles(sfdomain)
189    return list_serverfiles_from_flist(flist)
190
191def list_serverfiles():
192    fname = orngServerFiles.localpath_download(sfdomain, "index.pck")
193    flist = pickle.load(open(fname, 'r'))
194    return list_serverfiles_from_flist(flist)
195
196def list_all():
197    """
198    return a list of (hier, org, avalable_locally)
199    If something for a specific (hier, org) is not downloaded
200    yet, show it as not-local. """
201    flist = list_local() + list_serverfiles()
202    d = {}
203    for h,o,local in flist:
204        d[h,o] = min(local, d.get((h,o),True))
205    return [ (h,o,local) for (h,o),local in d.items() ]
206
207def update_server_list(serverfiles_upload, serverfiles_list=None):
208    if serverfiles_list == None:
209        serverfiles_list = orngServerFiles.ServerFiles()
210
211    flist = map(lambda x: filename(*x[:2]), list_serverfiles_conn(serverfiles_list))
212
213    tfname = pickle_temp(flist)
214   
215    try:
216        fn = "index.pck"
217        title = "Gene sets: index"
218        tags = [ "gene sets", "index", "essential" ]
219        serverfiles_upload.upload(sfdomain, fn, tfname, title, tags)
220        serverfiles_upload.unprotect(sfdomain, fn)
221    except Exception,e:
222        raise e
223    finally:
224        os.remove(tfname)
225
226def _register_local(genesets):
227    """ Registers using the common hierarchy and organism. """
228    pth = local_path()
229
230    org = genesets.common_org()
231    hierarchy = genesets.common_hierarchy()
232    fn = filename(hierarchy, org)
233
234    with open(os.path.join(pth, fn), "w") as f:
235        pickle.dump(genesets, f)
236
237    return fn
238
239def pickle_temp(obj):
240    """ Pickle a file to a temporary file and returns its name """
241    fd,tfname = tempfile.mkstemp()
242    os.close(fd)
243    f = open(tfname, 'wb')
244    pickle.dump(obj, f)
245    f.close()
246    return tfname
247
248def _register_serverfiles(genesets, serverFiles):
249    """ Registers using the common hierarchy and organism. """
250    org = genesets.common_org()
251    hierarchy = genesets.common_hierarchy()
252    fn = filename(hierarchy, org)
253
254    #save to temporary file
255    tfname = pickle_temp(genesets)
256   
257    try:
258        taxname = obiTaxonomy.name(org)
259        title = "Gene sets: " + ", ".join(hierarchy) + \
260            ((" (" + taxname + ")") if org != None else "")
261        tags = list(hierarchy) + [ "gene sets", taxname ] + \
262            ([ "essential" ] if org in obiTaxonomy.essential_taxids() else [] )
263        serverFiles.upload(sfdomain, fn, tfname, title, tags)
264        serverFiles.unprotect(sfdomain, fn)
265    finally:
266        os.remove(tfname)
267
268    update_server_list(serverFiles)
269
270def register(genesets, serverFiles=None):
271    """
272    Hierarchy is induced from the gene set names.
273    """
274    if serverFiles == None:
275        _register_local(genesets)
276    else:
277        _register_serverfiles(genesets, serverFiles)
278
279def build_hierarchy_dict(files):
280    hierd = defaultdict(list)
281    for ind,f in enumerate(files):
282        hier, org = f
283        for i in range(len(hier)+1):
284            hierd[(hier[:i], org)].append(ind)
285    return hierd
286
287def load_local(hierarchy, organism):
288    files = map(lambda x: x[:2], list_local())
289    hierd = build_hierarchy_dict(files)
290
291    out = GeneSets()
292    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
293        fname = os.path.join(local_path(), filename(h, o))
294        out.update(pickle.load(open(fname, 'r')))
295    return out
296
297def load_serverfiles(hierarchy, organism):
298    files = map(lambda x: x[:2], list_serverfiles())
299    hierd = build_hierarchy_dict(files)
300    out = GeneSets()
301    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
302        fname = orngServerFiles.localpath_download(sfdomain, 
303            filename(h, o))
304        out.update(pickle.load(open(fname, 'r')))
305    return out
306
307def load(hierarchy, organism):
308    """ First try to load from the local registred folder. If the file
309    is not available, load it from the server files. """
310    ret = load_local(hierarchy, organism)
311    if len(ret) == 0:
312        ret.update(load_serverfiles(hierarchy, organism))
313    return ret
314
315def collections(*args):
316    """
317    Input is a list of collections.
318    Collection can either be a tuple (hierarchy, orgranism), where
319    hierarchy is a tuple also.
320    """
321    result = GeneSets()
322
323    for collection in args:
324        try:
325            result.update(collection)
326        except (ValueError, TypeError):
327            if issequencens(collection): #have a hierarchy, organism specification
328                new = load(*collection)
329                result.update(new)
330            else:
331                if collection.lower()[-4:] == ".gmt": #format from webpage
332                    result.update(loadGMT(open(collection,"rt").read(), collection))
333                else:
334                    raise Exception("collection() accepts files in .gmt format only.")
335
336    return result
337
338def issequencens(x):
339    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and it is not an instance of basestring."
340    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
341
342class TException(Exception): pass
343
344def upload_genesets(rsf):
345    """
346    Builds the default gene sets and
347    """
348    orngServerFiles.update_local_files()
349
350    genesetsfn = [ keggGeneSets, goGeneSets, miRNAGeneSets]
351    organisms = obiTaxonomy.common_taxids()
352    for fn in genesetsfn:
353        for org in organisms:
354            try:
355                print "Uploading ORG", org, fn
356                genesets = fn(org).split_by_hierarchy()
357                for gs in genesets:
358                    print "registering", gs.common_hierarchy()
359                    register(gs, rsf) #server files
360                    #register(gs)
361                    print "successful", gs.common_hierarchy()
362            except (obiKEGG.OrganismNotFoundError, GenesetRegException):
363                print "organism not found", org
364
365if __name__ == "__main__":
366    rsf = orngServerFiles.ServerFiles(username=sys.argv[1], password=sys.argv[2])
367    upload_genesets(rsf)
368    pass
Note: See TracBrowser for help on using the repository browser.