source: orange-bioinformatics/obiGeneSets.py @ 1216:167e6a623644

Revision 1216:167e6a623644, 15.1 KB checked in by markotoplak, 4 years ago (diff)

obiGeneSets: uploading gene sets - do not quit on exception

Line 
1"""
2Getting genesets from KEGG and GO.
3
4Maintainer: Marko Toplak
5"""
6from __future__ import with_statement
7
8import obiKEGG, orange
9import os
10import obiGO
11import cPickle as pickle
12#import pickle
13import orngServerFiles
14import obiTaxonomy
15import tempfile
16import obiGeneSets
17import sys
18from collections import defaultdict
19
20sfdomain = "gene_sets"
21
22def nth(l,n):
23    return [ a[n] for a in l]
24
25class GeneSet(object):
26
27    def __init__(self, genes=None, name=None, id=None, \
28        description=None, link=None, organism=None, hierarchy=None, pair=None):
29        """
30        pair can be (id, listofgenes) - it is used before anything else.
31        """
32        if genes == None:
33            genes = []
34
35        self.hierarchy = hierarchy
36        self.genes = set(genes)
37        self.name = name
38        self.id = id
39        self.description = description
40        self.link = link
41        self.organism = organism
42
43        if pair:
44            self.id, self.genes = pair[0], set(pair[1])
45
46    """
47    the following functions are needed for sets of gene sets to be able
48    to assess equality
49    """
50
51    def __hash__(self):
52        return self.id.__hash__() + self.name.__hash__()
53
54    def __eq__(self, other):
55        if isinstance(other, self.__class__):
56            return self.__dict__ == other.__dict__
57        else:
58            return False
59
60    def __ne__(self, other):
61        return not self.__eq__(other)
62
63    def size(self):
64        return len(self.genes)
65
66    def cname(self, source=True, name=True):
67        """ Constructs a gene set name with the hierarchy. """
68        oname = self.id
69        if source and self.hierarchy:
70            oname = "[ " + ", ".join(self.hierarchy) + " ] " + oname
71        if name and self.name:
72            oname = oname + " " + self.name
73        return oname
74
75    def to_odict(self, source=True, name=True):
76        """
77        Returns a pair (id, listofgenes), like in old format.
78        """
79        return self.cname(source=source, name=name), self.genes
80
81    def __repr__(self):
82        return "GeneSet(" + ", ".join( [ 
83            "id=" + str(self.id),
84            "genes=" + str(self.genes),
85            "name=" + str(self.name),
86            "link=" + str(self.link),
87            "hierarchy=" + str(self.hierarchy)
88        ]) + ")"
89
90class GeneSetIDException(Exception):
91    pass
92
93class GeneSets(set):
94   
95    def __init__(self, input=None):
96        """
97        odict are genesets in old dict format.
98        gs are genesets in new format
99        """
100        if input != None and len(input) > 0:
101            if hasattr(input, "items"):
102                for i,g in input.items():
103                    self.add(obiGeneSets.GeneSet(pair=(i,g)))
104            else:
105                self.update(input)
106
107    def to_odict(self):
108        """ Return gene sets in old dictionary format. """
109        return dict(gs.to_odict() for gs in self)
110
111    def set_hierarchy(self, hierarchy):
112        """ Sets hierarchy for all gene sets """
113        for gs in self:
114            gs.hierarchy = hierarchy
115
116    def __repr__(self):
117        return "GeneSets(" + set.__repr__(self) + ")"
118
119    def common_org(self):
120        """ Returns the common organism. """
121        if len(self) == 0:
122            raise GenesetRegException("Empty gene sets.")
123
124        organisms = set(a.organism for a in self)
125
126        try:
127            return only_option(organisms)
128        except:
129            raise GenesetRegException("multiple organisms: " + str(organisms))
130
131    def hierarchies(self):
132        """ Returns all hierachies """
133        if len(self) == 0:
134            raise GenesetRegException("Empty gene sets.")
135        return set(a.hierarchy for a in self)
136
137    def common_hierarchy(self):
138        hierarchies = self.hierarchies()
139
140        def common_hierarchy1(hierarchies):
141            def hier(l): return set(map(lambda x: x[:currentl], hierarchies))
142            currentl = max(map(len, hierarchies))
143            while len(hier(currentl)) > 1:
144                currentl -= 1
145            return only_option(hier(currentl))
146
147        return common_hierarchy1(hierarchies)
148
149    def split_by_hierarchy(self):
150        """ Splits gene sets by hierarchies. """
151        hd = dict((h,obiGeneSets.GeneSets()) for h in  self.hierarchies())
152        for gs in self:
153            hd[gs.hierarchy].add(gs)
154        return hd.values()
155
156def goGeneSets(org):
157    """Returns gene sets from GO."""
158
159    ontology = obiGO.Ontology()
160    annotations = obiGO.Annotations(org, ontology=ontology)
161
162    genesets = []
163    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
164    for termn, term in ontology.terms.items():
165        genes = annotations.GetAllGenes(termn)
166        hier = ("GO", term.namespace)
167        if len(genes) > 0:
168            gs = obiGeneSets.GeneSet(id=termn, name=term.name, genes=genes, hierarchy=hier, organism=org, link=link_fmt % termn) 
169            genesets.append(gs)
170
171    return obiGeneSets.GeneSets(genesets)
172
173def keggGeneSets(org):
174    """
175    Returns gene sets from KEGG pathways.
176    """
177    kegg = obiKEGG.KEGGOrganism(org)
178
179    genesets = []
180    for id in kegg.pathways():
181        pway = obiKEGG.KEGGPathway(id)
182        hier = ("KEGG",)
183        gs = obiGeneSets.GeneSet(id=id, name=pway.title, genes=kegg.get_genes_by_pathway(id), hierarchy=hier, organism=org, link=pway.link)
184        genesets.append(gs)
185
186    return obiGeneSets.GeneSets(genesets)
187
188def omimGeneSets():
189    """
190    Return gene sets from OMIM (Online Mendelian Inheritance in Man) diseses
191    """
192    import obiOMIM
193    genesets = [GeneSet(id=disease.id, name=disease.name, genes=obiOMIM.disease_genes(disease), hierarchy=("OMIM",), organism="9606",
194                    link=("http://www.ncbi.nlm.nih.gov/entrez/dispomim.cgi?id=" % disease.id if disease.id else None)) \
195                    for disease in obiOMIM.diseases()]
196    return GeneSets(genesets)
197
198def miRNAGeneSets(org):
199    """
200    Return gene sets from miRNA targets
201    """
202    import obimiRNA, obiKEGG
203    org_code = obiKEGG.from_taxid(org)
204    link_fmt = "http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s"
205    mirnas = [(id, obimiRNA.get_info(id)) for id in obimiRNA.ids(org_code)]
206    genesets = [GeneSet(id=mirna.matACC, name=mirna.matID, genes=mirna.targets.split(","), hierarchy=("miRNA",),
207                        organism=org, link=link_fmt % mirna.matID) for id, mirna in mirnas]
208    return GeneSets(genesets)
209
210def loadGMT(contents, name):
211    """
212    Eech line consists of tab separated elements. First is
213    the geneset name, next is it's description.
214   
215    For now the description is skipped.
216    """
217    def hline(s):
218        tabs = [ tab.strip() for tab in s.split("\t") ]
219        return  obiGeneSets.GeneSet(id=tabs[0], description=tabs[1], hierarchy=(name,), genes=tabs[2:])
220
221    def handleNELines(s, fn):
222        """
223        Run function on nonempty lines of a string.
224        Return a list of results for each line.
225        """
226        lines = s.split("\n")
227        lines = [ l.strip() for l in lines ]
228        lines = filter(lambda x: x != "", lines)
229        return [ fn(l) for l in lines ]
230
231    return obiGeneSets.GeneSets(handleNELines(contents, hline))
232
233"""
234We have multiple paths for gene set data:
235buffer/bigfiles/gene_sets
236and
237buffer/gene_sets_local
238both have available.txt
239"""
240
241def omakedirs(dir):
242    try:
243        os.makedirs(dir)
244    except OSError:
245        pass
246
247def local_path():
248    """ Returns local path for gene sets. Creates it if it does not exists
249    yet. """
250    import orngEnviron
251    pth = os.path.join(orngEnviron.directoryNames["bufferDir"], "gene_sets_local")
252    omakedirs(pth)
253    return pth
254
255def build_index(dir):
256    """ Returns gene set availability index for some folder. """
257    pass
258
259class GenesetRegException(Exception): pass
260
261def only_option(a):
262    if len(a) == 1:
263        return list(a)[0]
264    else:
265        raise Exception()
266
267def filename(hierarchy, organism):
268    """ Obtain a filename for given hierarchy and organism. """
269    return "gs_" + "_._".join(hierarchy + \
270        (organism if organism != None else "",)) + ".pck"
271
272def filename_parse(fn):
273    """ Returns a hierarchy and the organism from the filename."""
274    fn = fn[3:-4]
275    parts = fn.split("_._")
276    hierarchy = tuple(parts[:-1])
277    org = parts[-1] if parts[-1] != "" else None
278    return hierarchy, org
279
280def is_genesets_file(fn):
281    return fn.startswith("gs_") and fn.endswith(".pck")
282
283def list_local():
284    """ Returns available gene sets from the local repository:
285    a list of (hierarchy, organism, on_local) """
286    pth = local_path()
287    gs_files = filter(is_genesets_file, os.listdir(pth))
288    return [ filename_parse(fn) + (True,) for fn in gs_files ]
289   
290def list_serverfiles_from_flist(flist):
291    gs_files = filter(is_genesets_file, flist)
292    localfiles = set(orngServerFiles.listfiles(sfdomain))
293    return [ filename_parse(fn) + \
294        ((True,) if fn in localfiles else (False,)) for fn in gs_files ]
295
296def list_serverfiles_conn(serverfiles=None):
297    """ Returns available gene sets from the server files
298    repository: a list of (hierarchy, organism, on_local) """
299    if serverfiles == None:
300        serverfiles = orngServerFiles.ServerFiles()
301    flist = serverfiles.listfiles(sfdomain)
302    return list_serverfiles_from_flist(flist)
303
304def list_serverfiles():
305    fname = orngServerFiles.localpath_download(sfdomain, "index.pck")
306    flist = pickle.load(open(fname, 'r'))
307    return list_serverfiles_from_flist(flist)
308
309def list_all():
310    """
311    return a list of (hier, org, avalable_locally)
312    If something for a specific (hier, org) is not downloaded
313    yet, show it as not-local. """
314    flist = list_local() + list_serverfiles()
315    d = {}
316    for h,o,local in flist:
317        d[h,o] = min(local, d.get((h,o),True))
318    return [ (h,o,local) for (h,o),local in d.items() ]
319
320def update_server_list(serverfiles_upload, serverfiles_list=None):
321    if serverfiles_list == None:
322        serverfiles_list = orngServerFiles.ServerFiles()
323
324    flist = map(lambda x: filename(*x[:2]), list_serverfiles_conn(serverfiles_list))
325
326    tfname = pickle_temp(flist)
327   
328    try:
329        fn = "index.pck"
330        title = "Gene sets: index"
331        tags = [ "gene sets", "index", "essential" ]
332        serverfiles_upload.upload(sfdomain, fn, tfname, title, tags)
333        serverfiles_upload.unprotect(sfdomain, fn)
334    except Exception,e:
335        raise e
336    finally:
337        os.remove(tfname)
338
339def register_local(genesets):
340    """ Registers using the common hierarchy and organism. """
341    pth = local_path()
342
343    org = genesets.common_org()
344    hierarchy = genesets.common_hierarchy()
345    fn = filename(hierarchy, org)
346
347    with open(os.path.join(pth, fn), "w") as f:
348        pickle.dump(genesets, f)
349
350    return fn
351
352def pickle_temp(obj):
353    """ Pickle a file to a temporary file returns its name """
354    fd,tfname = tempfile.mkstemp()
355    os.close(fd)
356    f = open(tfname, 'wb')
357    pickle.dump(obj, f)
358    f.close()
359    return tfname
360
361def register_serverfiles(genesets, serverFiles):
362    """ Registers using the common hierarchy and organism. """
363    org = genesets.common_org()
364    hierarchy = genesets.common_hierarchy()
365    fn = filename(hierarchy, org)
366
367    #save to temporary file
368    tfname = pickle_temp(genesets)
369   
370    try:
371        taxname = obiTaxonomy.name(org)
372        title = "Gene sets: " + ", ".join(hierarchy) + \
373            ((" (" + taxname + ")") if org != None else "")
374        tags = list(hierarchy) + [ "gene sets", taxname ] + \
375            ([ "essential" ] if org in obiTaxonomy.essential_taxids() else [] )
376        serverFiles.upload(sfdomain, fn, tfname, title, tags)
377        serverFiles.unprotect(sfdomain, fn)
378    finally:
379        os.remove(tfname)
380
381    update_server_list(serverFiles)
382
383def register(genesets, serverFiles=None):
384    """
385    Hierarchy is induced from the gene set names.
386    """
387    if serverFiles == None:
388        register_local(genesets)
389    else:
390        register_serverfiles(genesets, serverFiles)
391
392def build_hierarchy_dict(files):
393    hierd = defaultdict(list)
394    for ind,f in enumerate(files):
395        hier, org = f
396        for i in range(len(hier)+1):
397            hierd[(hier[:i], org)].append(ind)
398    return hierd
399
400def load_local(hierarchy, organism):
401    files = map(lambda x: x[:2], list_local())
402
403    hierd = build_hierarchy_dict(files)
404
405    out = GeneSets()
406    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
407        fname = os.path.join(local_path(), filename(h, o))
408        out.update(pickle.load(open(fname, 'r')))
409    return out
410
411def load_serverfiles(hierarchy, organism):
412    files = map(lambda x: x[:2], list_serverfiles())
413    hierd = build_hierarchy_dict(files)
414
415    out = GeneSets()
416    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
417        fname = orngServerFiles.localpath_download(sfdomain, 
418            filename(h, o))
419        out.update(pickle.load(open(fname, 'r')))
420    return out
421
422def load(hierarchy, organism):
423    """ First try to load from the local registred folder, then
424    from the server files. """
425    ret = load_local(hierarchy, organism)
426    ret.update(load_serverfiles(hierarchy, organism))
427    return ret
428
429def collections(*args):
430    """
431    Input is a list of collections.
432    Collection can either be a tuple (hierarchy, orgranism), where
433    hierarchy is a tuple also.
434    """
435    result = obiGeneSets.GeneSets()
436
437    for collection in args:
438        if isinstance(collection, obiGeneSets.GeneSets):
439            result.update(collection)
440        elif issequencens(collection): #have a hierarchy, organism specification
441            new = load(*collection)
442            result.update(new)
443        else:
444            if collection.lower()[-4:] == ".gmt": #format from webpage
445                result.update(loadGMT(open(collection,"rt").read(), collection))
446            else:
447                raise Exception("collection() accepts files in .gmt format only.")
448 
449    return result
450
451def issequencens(x):
452    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and it is not an instance of basestring."
453    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
454
455class TException(Exception): pass
456
457def upload_genesets(rsf):
458    """
459    Builds the default gene sets and
460    """
461    orngServerFiles.update_local_files()
462
463    genesetsfn = [ keggGeneSets, goGeneSets, miRNAGeneSets]
464    organisms = obiTaxonomy.common_taxids()
465    for fn in genesetsfn:
466        for org in organisms:
467        #for org in [ "9606" ]:
468            print "Uploading ORG", org, fn
469            try:
470                genesets = fn(org).split_by_hierarchy()
471                for gs in genesets:
472                    print "registering", gs.common_hierarchy()
473                    register_serverfiles(gs, rsf)
474                    print "successful", gs.common_hierarchy()
475            except Exception, e:
476                print "Not successful"
477
478if __name__ == "__main__":
479    gs = keggGeneSets("9606")
480    #print len(collections(keggGeneSets("9606"),(("KEGG",),"9606"), "C5.BP.gmt"))
481    #print len(collections((("KEGG",),"9606"), "C5.BP.gmt"))
482    #print sorted(list_all())
483    #print len(collections((("KEGG",),"9606"), (("GO",), "9606"), "C5.BP.gmt"))
484    #register_local(keggGeneSets("9606"))
485    #register_local(goGeneSets("9606"))
486    #register_serverfiles(gs, rsf)
487    #print list_serverfiles_conn()
488    #print "Server list from index", list_serverfiles()
489    rsf = orngServerFiles.ServerFiles(username=sys.argv[1], password=sys.argv[2])
490    upload_genesets(rsf)
Note: See TracBrowser for help on using the repository browser.