source: orange-bioinformatics/obiGeneSets.py @ 1569:7898b2d279b2

Revision 1569:7898b2d279b2, 16.7 KB checked in by markotoplak, 2 years ago (diff)

obiGeneSets can now load gene sets from a dictionary or a list of pairs (name, genes).

Line 
1"""
2Getting genesets from KEGG and GO.
3
4Maintainer: Marko Toplak
5"""
6from __future__ import with_statement
7
8import obiKEGG, orange
9import os
10import obiGO
11import cPickle as pickle
12#import pickle
13import orngServerFiles
14import obiTaxonomy
15import tempfile
16import obiGeneSets
17import sys
18from collections import defaultdict
19
20sfdomain = "gene_sets"
21
22def nth(l,n):
23    return [ a[n] for a in l]
24
25class GeneSet(object):
26
27    def __init__(self, genes=None, name=None, id=None, \
28        description=None, link=None, organism=None, hierarchy=None, pair=None):
29        """
30        pair can be (id, listofgenes) - it is used before anything else.
31        """
32        if genes == None:
33            genes = []
34
35        self.hierarchy = hierarchy
36        self.genes = set(genes)
37        self.name = name
38        self.id = id
39        self.description = description
40        self.link = link
41        self.organism = organism
42
43        if pair:
44            self.id, self.genes = pair[0], set(pair[1])
45
46    """
47    the following functions are needed for sets of gene sets to be able
48    to assess equality
49    """
50
51    def __hash__(self):
52        return self.id.__hash__() + self.name.__hash__()
53
54    def __eq__(self, other):
55        if isinstance(other, self.__class__):
56            return self.__dict__ == other.__dict__
57        else:
58            return False
59
60    def __ne__(self, other):
61        return not self.__eq__(other)
62
63    def size(self):
64        return len(self.genes)
65
66    def cname(self, source=True, name=True):
67        """ Constructs a gene set name with the hierarchy. """
68        oname = self.id
69        if source and self.hierarchy:
70            oname = "[ " + ", ".join(self.hierarchy) + " ] " + oname
71        if name and self.name:
72            oname = oname + " " + self.name
73        return oname
74
75    def to_odict(self, source=True, name=True):
76        """
77        Returns a pair (id, listofgenes), like in old format.
78        """
79        return self.cname(source=source, name=name), self.genes
80
81    def __repr__(self):
82        return "GeneSet(" + ", ".join( [ 
83            "id=" + str(self.id),
84            "genes=" + str(self.genes),
85            "name=" + str(self.name),
86            "link=" + str(self.link),
87            "hierarchy=" + str(self.hierarchy)
88        ]) + ")"
89
90class GeneSetIDException(Exception):
91    pass
92
93class GeneSets(set):
94   
95    def __init__(self, input=None):
96        """
97        odict are genesets in old dict format.
98        gs are genesets in new format
99        """
100        if input != None and len(input) > 0:
101            self.update(input)
102
103    def update(self, input):
104        if isinstance(input, obiGeneSets.GeneSets):
105            super(GeneSets, self).update(input)
106        elif hasattr(input, "items"):
107            for i,g in input.items():
108                self.add(obiGeneSets.GeneSet(pair=(i,g)))
109        else:
110            for i in input:
111                if isinstance(i, obiGeneSets.GeneSet):
112                    self.add(i)
113                else:
114                    i,j = i
115                    self.add(obiGeneSets.GeneSet(pair=(i,g)))
116
117    def to_odict(self):
118        """ Return gene sets in old dictionary format. """
119        return dict(gs.to_odict() for gs in self)
120
121    def set_hierarchy(self, hierarchy):
122        """ Sets hierarchy for all gene sets """
123        for gs in self:
124            gs.hierarchy = hierarchy
125
126    def __repr__(self):
127        return "GeneSets(" + set.__repr__(self) + ")"
128
129    def common_org(self):
130        """ Returns the common organism. """
131        if len(self) == 0:
132            raise GenesetRegException("Empty gene sets.")
133
134        organisms = set(a.organism for a in self)
135
136        try:
137            return only_option(organisms)
138        except:
139            raise GenesetRegException("multiple organisms: " + str(organisms))
140
141    def hierarchies(self):
142        """ Returns all hierachies """
143        if len(self) == 0:
144            raise GenesetRegException("Empty gene sets.")
145        return set(a.hierarchy for a in self)
146
147    def common_hierarchy(self):
148        hierarchies = self.hierarchies()
149
150        def common_hierarchy1(hierarchies):
151            def hier(l): return set(map(lambda x: x[:currentl], hierarchies))
152            currentl = max(map(len, hierarchies))
153            while len(hier(currentl)) > 1:
154                currentl -= 1
155            return only_option(hier(currentl))
156
157        return common_hierarchy1(hierarchies)
158
159    def split_by_hierarchy(self):
160        """ Splits gene sets by hierarchies. """
161        hd = dict((h,obiGeneSets.GeneSets()) for h in  self.hierarchies())
162        for gs in self:
163            hd[gs.hierarchy].add(gs)
164        return hd.values()
165
166def goGeneSets(org):
167    """Returns gene sets from GO."""
168
169    ontology = obiGO.Ontology()
170    annotations = obiGO.Annotations(org, ontology=ontology)
171
172    genesets = []
173    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
174    for termn, term in ontology.terms.items():
175        genes = annotations.GetAllGenes(termn)
176        hier = ("GO", term.namespace)
177        if len(genes) > 0:
178            gs = obiGeneSets.GeneSet(id=termn, name=term.name, genes=genes, hierarchy=hier, organism=org, link=link_fmt % termn) 
179            genesets.append(gs)
180
181    return obiGeneSets.GeneSets(genesets)
182
183def keggGeneSets(org):
184    """
185    Returns gene sets from KEGG pathways.
186    """
187    import obiKEGG2 as obiKEGG
188   
189    kegg = obiKEGG.KEGGOrganism(org)
190
191    genesets = []
192    for id in kegg.pathways():
193        pway = obiKEGG.KEGGPathway(id)
194        hier = ("KEGG","pathways")
195        gs = obiGeneSets.GeneSet(id=id,
196                                 name=pway.title,
197                                 genes=kegg.get_genes_by_pathway(id),
198                                 hierarchy=hier,
199                                 organism=org,
200                                 link=pway.link)
201        genesets.append(gs)
202
203    return obiGeneSets.GeneSets(genesets)
204
205def omimGeneSets():
206    """
207    Return gene sets from OMIM (Online Mendelian Inheritance in Man) diseses
208    """
209    import obiOMIM
210    genesets = [GeneSet(id=disease.id, name=disease.name, genes=obiOMIM.disease_genes(disease), hierarchy=("OMIM",), organism="9606",
211                    link=("http://www.ncbi.nlm.nih.gov/entrez/dispomim.cgi?id=" % disease.id if disease.id else None)) \
212                    for disease in obiOMIM.diseases()]
213    return GeneSets(genesets)
214
215def miRNAGeneSets(org):
216    """
217    Return gene sets from miRNA targets
218    """
219    import obimiRNA, obiKEGG
220    org_code = obiKEGG.from_taxid(org)
221    link_fmt = "http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s"
222    mirnas = [(id, obimiRNA.get_info(id)) for id in obimiRNA.ids(org_code)]
223    genesets = [GeneSet(id=mirna.matACC, name=mirna.matID, genes=mirna.targets.split(","), hierarchy=("miRNA", "Targets"),
224                        organism=org, link=link_fmt % mirna.matID) for id, mirna in mirnas]
225    return GeneSets(genesets)
226
227def go_miRNASets(org, ontology=None, enrichment=True, pval=0.05, treshold=0.04):
228    import obimiRNA, obiGO
229    mirnas = obimiRNA.ids(int(org))
230    if ontology is None:
231        ontology = obiGO.Ontology()
232         
233    annotations = obiGO.Annotations(org, ontology=ontology)
234   
235    go_sets = obimiRNA.get_GO(mirnas, annotations, enrichment=enrichment, pval=pval, goSwitch=False)
236    print go_sets
237   
238    go_sets = obimiRNA.filter_GO(go_sets, annotations, treshold=treshold)
239   
240    import obiGeneSets as gs
241    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
242    gsets = [gs.GeneSet(id=key, name=ontology[key].name, genes=value, hierarchy=("miRNA", "go_sets",),
243                        organism=org, link=link_fmt % key) for key, value in go_sets.items()]
244    gset = gs.GeneSets(gsets)
245    return gset
246
247
248def loadGMT(contents, name):
249    """
250    Eech line consists of tab separated elements. First is
251    the geneset name, next is it's description.
252   
253    For now the description is skipped.
254    """
255    def hline(s):
256        tabs = [ tab.strip() for tab in s.split("\t") ]
257        return  obiGeneSets.GeneSet(id=tabs[0], description=tabs[1], hierarchy=(name,), genes=tabs[2:])
258
259    def handleNELines(s, fn):
260        """
261        Run function on nonempty lines of a string.
262        Return a list of results for each line.
263        """
264        lines = s.split("\n")
265        lines = [ l.strip() for l in lines ]
266        lines = filter(lambda x: x != "", lines)
267        return [ fn(l) for l in lines ]
268
269    return obiGeneSets.GeneSets(handleNELines(contents, hline))
270
271"""
272We have multiple paths for gene set data:
273buffer/bigfiles/gene_sets
274and
275buffer/gene_sets_local
276both have available.txt
277"""
278
279def omakedirs(dir):
280    try:
281        os.makedirs(dir)
282    except OSError:
283        pass
284
285def local_path():
286    """ Returns local path for gene sets. Creates it if it does not exists
287    yet. """
288    import orngEnviron
289    pth = os.path.join(orngEnviron.directoryNames["bufferDir"], "gene_sets_local")
290    omakedirs(pth)
291    return pth
292
293def build_index(dir):
294    """ Returns gene set availability index for some folder. """
295    pass
296
297class GenesetRegException(Exception): pass
298
299def only_option(a):
300    if len(a) == 1:
301        return list(a)[0]
302    else:
303        raise Exception()
304
305def filename(hierarchy, organism):
306    """ Obtain a filename for given hierarchy and organism. """
307    return "gs_" + "_._".join(hierarchy + \
308        (organism if organism != None else "",)) + ".pck"
309
310def filename_parse(fn):
311    """ Returns a hierarchy and the organism from the filename."""
312    fn = fn[3:-4]
313    parts = fn.split("_._")
314    hierarchy = tuple(parts[:-1])
315    org = parts[-1] if parts[-1] != "" else None
316    return hierarchy, org
317
318def is_genesets_file(fn):
319    return fn.startswith("gs_") and fn.endswith(".pck")
320
321def list_local():
322    """ Returns available gene sets from the local repository:
323    a list of (hierarchy, organism, on_local) """
324    pth = local_path()
325    gs_files = filter(is_genesets_file, os.listdir(pth))
326    return [ filename_parse(fn) + (True,) for fn in gs_files ]
327   
328def list_serverfiles_from_flist(flist):
329    gs_files = filter(is_genesets_file, flist)
330    localfiles = set(orngServerFiles.listfiles(sfdomain))
331    return [ filename_parse(fn) + \
332        ((True,) if fn in localfiles else (False,)) for fn in gs_files ]
333
334def list_serverfiles_conn(serverfiles=None):
335    """ Returns available gene sets from the server files
336    repository: a list of (hierarchy, organism, on_local) """
337    if serverfiles == None:
338        serverfiles = orngServerFiles.ServerFiles()
339    flist = serverfiles.listfiles(sfdomain)
340    return list_serverfiles_from_flist(flist)
341
342def list_serverfiles():
343    fname = orngServerFiles.localpath_download(sfdomain, "index.pck")
344    flist = pickle.load(open(fname, 'r'))
345    return list_serverfiles_from_flist(flist)
346
347def list_all():
348    """
349    return a list of (hier, org, avalable_locally)
350    If something for a specific (hier, org) is not downloaded
351    yet, show it as not-local. """
352    flist = list_local() + list_serverfiles()
353    d = {}
354    for h,o,local in flist:
355        d[h,o] = min(local, d.get((h,o),True))
356    return [ (h,o,local) for (h,o),local in d.items() ]
357
358def update_server_list(serverfiles_upload, serverfiles_list=None):
359    if serverfiles_list == None:
360        serverfiles_list = orngServerFiles.ServerFiles()
361
362    flist = map(lambda x: filename(*x[:2]), list_serverfiles_conn(serverfiles_list))
363
364    tfname = pickle_temp(flist)
365   
366    try:
367        fn = "index.pck"
368        title = "Gene sets: index"
369        tags = [ "gene sets", "index", "essential" ]
370        serverfiles_upload.upload(sfdomain, fn, tfname, title, tags)
371        serverfiles_upload.unprotect(sfdomain, fn)
372    except Exception,e:
373        raise e
374    finally:
375        os.remove(tfname)
376
377def register_local(genesets):
378    """ Registers using the common hierarchy and organism. """
379    pth = local_path()
380
381    org = genesets.common_org()
382    hierarchy = genesets.common_hierarchy()
383    fn = filename(hierarchy, org)
384
385    with open(os.path.join(pth, fn), "w") as f:
386        pickle.dump(genesets, f)
387
388    return fn
389
390def pickle_temp(obj):
391    """ Pickle a file to a temporary file returns its name """
392    fd,tfname = tempfile.mkstemp()
393    os.close(fd)
394    f = open(tfname, 'wb')
395    pickle.dump(obj, f)
396    f.close()
397    return tfname
398
399def register_serverfiles(genesets, serverFiles):
400    """ Registers using the common hierarchy and organism. """
401    org = genesets.common_org()
402    hierarchy = genesets.common_hierarchy()
403    fn = filename(hierarchy, org)
404
405    #save to temporary file
406    tfname = pickle_temp(genesets)
407   
408    try:
409        taxname = obiTaxonomy.name(org)
410        title = "Gene sets: " + ", ".join(hierarchy) + \
411            ((" (" + taxname + ")") if org != None else "")
412        tags = list(hierarchy) + [ "gene sets", taxname ] + \
413            ([ "essential" ] if org in obiTaxonomy.essential_taxids() else [] )
414        serverFiles.upload(sfdomain, fn, tfname, title, tags)
415        serverFiles.unprotect(sfdomain, fn)
416    finally:
417        os.remove(tfname)
418
419    update_server_list(serverFiles)
420
421def register(genesets, serverFiles=None):
422    """
423    Hierarchy is induced from the gene set names.
424    """
425    if serverFiles == None:
426        register_local(genesets)
427    else:
428        register_serverfiles(genesets, serverFiles)
429
430def build_hierarchy_dict(files):
431    hierd = defaultdict(list)
432    for ind,f in enumerate(files):
433        hier, org = f
434        for i in range(len(hier)+1):
435            hierd[(hier[:i], org)].append(ind)
436    return hierd
437
438def load_local(hierarchy, organism):
439    files = map(lambda x: x[:2], list_local())
440
441    hierd = build_hierarchy_dict(files)
442
443    out = GeneSets()
444    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
445        fname = os.path.join(local_path(), filename(h, o))
446        out.update(pickle.load(open(fname, 'r')))
447    return out
448
449def load_serverfiles(hierarchy, organism):
450    files = map(lambda x: x[:2], list_serverfiles())
451    hierd = build_hierarchy_dict(files)
452
453    out = GeneSets()
454    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
455        fname = orngServerFiles.localpath_download(sfdomain, 
456            filename(h, o))
457        out.update(pickle.load(open(fname, 'r')))
458    return out
459
460def load(hierarchy, organism):
461    """ First try to load from the local registred folder, then
462    from the server files. """
463    ret = load_local(hierarchy, organism)
464    ret.update(load_serverfiles(hierarchy, organism))
465    return ret
466
467def collections(*args):
468    """
469    Input is a list of collections.
470    Collection can either be a tuple (hierarchy, orgranism), where
471    hierarchy is a tuple also.
472    """
473    result = obiGeneSets.GeneSets()
474
475    for collection in args:
476        try:
477            result.update(collection)
478        except ValueError:
479            if issequencens(collection): #have a hierarchy, organism specification
480                new = load(*collection)
481                result.update(new)
482            else:
483                if collection.lower()[-4:] == ".gmt": #format from webpage
484                    result.update(loadGMT(open(collection,"rt").read(), collection))
485                else:
486                    raise Exception("collection() accepts files in .gmt format only.")
487
488    return result
489
490def issequencens(x):
491    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and it is not an instance of basestring."
492    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
493
494class TException(Exception): pass
495
496def upload_genesets(rsf):
497    """
498    Builds the default gene sets and
499    """
500    orngServerFiles.update_local_files()
501
502    genesetsfn = [ keggGeneSets, goGeneSets, miRNAGeneSets]
503    organisms = obiTaxonomy.common_taxids()
504    for fn in genesetsfn:
505        for org in organisms:
506        #for org in [ "9606" ]:
507            print "Uploading ORG", org, fn
508            try:
509                genesets = fn(org).split_by_hierarchy()
510                for gs in genesets:
511                    print "registering", gs.common_hierarchy()
512                    register_serverfiles(gs, rsf)
513                    print "successful", gs.common_hierarchy()
514            except Exception, e:
515                print "Not successful"
516
517if __name__ == "__main__":
518
519    #gs = keggGeneSets("9606")
520    akegg = collections((("KEGG",),"9606"))
521    a = collections({"PATH": ["g1","g2","g3"]}, "steroltalk.gmt", akegg)
522    for e in a:
523        print e
524    #print len(collections(keggGeneSets("9606"),(("KEGG",),"9606"), "C5.BP.gmt"))
525    #print len(collections((("KEGG",),"9606"), "C5.BP.gmt"))
526    #print sorted(list_all())
527    #print len(collections((("KEGG",),"9606"), (("GO",), "9606"), "C5.BP.gmt"))
528    #register_local(keggGeneSets("9606"))
529    #register_local(goGeneSets("9606"))
530    #register_serverfiles(gs, rsf)
531    #print list_serverfiles_conn()
532    #print "Server list from index", list_serverfiles()
533    rsf = orngServerFiles.ServerFiles(username=sys.argv[1], password=sys.argv[2])
534    upload_genesets(rsf)
Note: See TracBrowser for help on using the repository browser.