source: orange-bioinformatics/obiGeneSets.py @ 1549:bff566fa42dc

Revision 1549:bff566fa42dc, 16.2 KB checked in by ales_erjavec, 2 years ago (diff)

obiGeneSets.keggGeneSets now uses obiKEGG2.

Line 
1"""
2Getting genesets from KEGG and GO.
3
4Maintainer: Marko Toplak
5"""
6from __future__ import with_statement
7
8import obiKEGG, orange
9import os
10import obiGO
11import cPickle as pickle
12#import pickle
13import orngServerFiles
14import obiTaxonomy
15import tempfile
16import obiGeneSets
17import sys
18from collections import defaultdict
19
20sfdomain = "gene_sets"
21
22def nth(l,n):
23    return [ a[n] for a in l]
24
25class GeneSet(object):
26
27    def __init__(self, genes=None, name=None, id=None, \
28        description=None, link=None, organism=None, hierarchy=None, pair=None):
29        """
30        pair can be (id, listofgenes) - it is used before anything else.
31        """
32        if genes == None:
33            genes = []
34
35        self.hierarchy = hierarchy
36        self.genes = set(genes)
37        self.name = name
38        self.id = id
39        self.description = description
40        self.link = link
41        self.organism = organism
42
43        if pair:
44            self.id, self.genes = pair[0], set(pair[1])
45
46    """
47    the following functions are needed for sets of gene sets to be able
48    to assess equality
49    """
50
51    def __hash__(self):
52        return self.id.__hash__() + self.name.__hash__()
53
54    def __eq__(self, other):
55        if isinstance(other, self.__class__):
56            return self.__dict__ == other.__dict__
57        else:
58            return False
59
60    def __ne__(self, other):
61        return not self.__eq__(other)
62
63    def size(self):
64        return len(self.genes)
65
66    def cname(self, source=True, name=True):
67        """ Constructs a gene set name with the hierarchy. """
68        oname = self.id
69        if source and self.hierarchy:
70            oname = "[ " + ", ".join(self.hierarchy) + " ] " + oname
71        if name and self.name:
72            oname = oname + " " + self.name
73        return oname
74
75    def to_odict(self, source=True, name=True):
76        """
77        Returns a pair (id, listofgenes), like in old format.
78        """
79        return self.cname(source=source, name=name), self.genes
80
81    def __repr__(self):
82        return "GeneSet(" + ", ".join( [ 
83            "id=" + str(self.id),
84            "genes=" + str(self.genes),
85            "name=" + str(self.name),
86            "link=" + str(self.link),
87            "hierarchy=" + str(self.hierarchy)
88        ]) + ")"
89
90class GeneSetIDException(Exception):
91    pass
92
93class GeneSets(set):
94   
95    def __init__(self, input=None):
96        """
97        odict are genesets in old dict format.
98        gs are genesets in new format
99        """
100        if input != None and len(input) > 0:
101            if hasattr(input, "items"):
102                for i,g in input.items():
103                    self.add(obiGeneSets.GeneSet(pair=(i,g)))
104            else:
105                self.update(input)
106
107    def to_odict(self):
108        """ Return gene sets in old dictionary format. """
109        return dict(gs.to_odict() for gs in self)
110
111    def set_hierarchy(self, hierarchy):
112        """ Sets hierarchy for all gene sets """
113        for gs in self:
114            gs.hierarchy = hierarchy
115
116    def __repr__(self):
117        return "GeneSets(" + set.__repr__(self) + ")"
118
119    def common_org(self):
120        """ Returns the common organism. """
121        if len(self) == 0:
122            raise GenesetRegException("Empty gene sets.")
123
124        organisms = set(a.organism for a in self)
125
126        try:
127            return only_option(organisms)
128        except:
129            raise GenesetRegException("multiple organisms: " + str(organisms))
130
131    def hierarchies(self):
132        """ Returns all hierachies """
133        if len(self) == 0:
134            raise GenesetRegException("Empty gene sets.")
135        return set(a.hierarchy for a in self)
136
137    def common_hierarchy(self):
138        hierarchies = self.hierarchies()
139
140        def common_hierarchy1(hierarchies):
141            def hier(l): return set(map(lambda x: x[:currentl], hierarchies))
142            currentl = max(map(len, hierarchies))
143            while len(hier(currentl)) > 1:
144                currentl -= 1
145            return only_option(hier(currentl))
146
147        return common_hierarchy1(hierarchies)
148
149    def split_by_hierarchy(self):
150        """ Splits gene sets by hierarchies. """
151        hd = dict((h,obiGeneSets.GeneSets()) for h in  self.hierarchies())
152        for gs in self:
153            hd[gs.hierarchy].add(gs)
154        return hd.values()
155
156def goGeneSets(org):
157    """Returns gene sets from GO."""
158
159    ontology = obiGO.Ontology()
160    annotations = obiGO.Annotations(org, ontology=ontology)
161
162    genesets = []
163    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
164    for termn, term in ontology.terms.items():
165        genes = annotations.GetAllGenes(termn)
166        hier = ("GO", term.namespace)
167        if len(genes) > 0:
168            gs = obiGeneSets.GeneSet(id=termn, name=term.name, genes=genes, hierarchy=hier, organism=org, link=link_fmt % termn) 
169            genesets.append(gs)
170
171    return obiGeneSets.GeneSets(genesets)
172
173def keggGeneSets(org):
174    """
175    Returns gene sets from KEGG pathways.
176    """
177    import obiKEGG2 as obiKEGG
178   
179    kegg = obiKEGG.KEGGOrganism(org)
180
181    genesets = []
182    for id in kegg.pathways():
183        pway = obiKEGG.KEGGPathway(id)
184        hier = ("KEGG","pathways")
185        gs = obiGeneSets.GeneSet(id=id,
186                                 name=pway.title,
187                                 genes=kegg.get_genes_by_pathway(id),
188                                 hierarchy=hier,
189                                 organism=org,
190                                 link=pway.link)
191        genesets.append(gs)
192
193    return obiGeneSets.GeneSets(genesets)
194
195def omimGeneSets():
196    """
197    Return gene sets from OMIM (Online Mendelian Inheritance in Man) diseses
198    """
199    import obiOMIM
200    genesets = [GeneSet(id=disease.id, name=disease.name, genes=obiOMIM.disease_genes(disease), hierarchy=("OMIM",), organism="9606",
201                    link=("http://www.ncbi.nlm.nih.gov/entrez/dispomim.cgi?id=" % disease.id if disease.id else None)) \
202                    for disease in obiOMIM.diseases()]
203    return GeneSets(genesets)
204
205def miRNAGeneSets(org):
206    """
207    Return gene sets from miRNA targets
208    """
209    import obimiRNA, obiKEGG
210    org_code = obiKEGG.from_taxid(org)
211    link_fmt = "http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s"
212    mirnas = [(id, obimiRNA.get_info(id)) for id in obimiRNA.ids(org_code)]
213    genesets = [GeneSet(id=mirna.matACC, name=mirna.matID, genes=mirna.targets.split(","), hierarchy=("miRNA", "Targets"),
214                        organism=org, link=link_fmt % mirna.matID) for id, mirna in mirnas]
215    return GeneSets(genesets)
216
217def go_miRNASets(org, ontology=None, enrichment=True, pval=0.05, treshold=0.04):
218    import obimiRNA, obiGO
219    mirnas = obimiRNA.ids(int(org))
220    if ontology is None:
221        ontology = obiGO.Ontology()
222         
223    annotations = obiGO.Annotations(org, ontology=ontology)
224   
225    go_sets = obimiRNA.get_GO(mirnas, annotations, enrichment=enrichment, pval=pval, goSwitch=False)
226    print go_sets
227   
228    go_sets = obimiRNA.filter_GO(go_sets, annotations, treshold=treshold)
229   
230    import obiGeneSets as gs
231    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
232    gsets = [gs.GeneSet(id=key, name=ontology[key].name, genes=value, hierarchy=("miRNA", "go_sets",),
233                        organism=org, link=link_fmt % key) for key, value in go_sets.items()]
234    gset = gs.GeneSets(gsets)
235    return gset
236
237
238def loadGMT(contents, name):
239    """
240    Eech line consists of tab separated elements. First is
241    the geneset name, next is it's description.
242   
243    For now the description is skipped.
244    """
245    def hline(s):
246        tabs = [ tab.strip() for tab in s.split("\t") ]
247        return  obiGeneSets.GeneSet(id=tabs[0], description=tabs[1], hierarchy=(name,), genes=tabs[2:])
248
249    def handleNELines(s, fn):
250        """
251        Run function on nonempty lines of a string.
252        Return a list of results for each line.
253        """
254        lines = s.split("\n")
255        lines = [ l.strip() for l in lines ]
256        lines = filter(lambda x: x != "", lines)
257        return [ fn(l) for l in lines ]
258
259    return obiGeneSets.GeneSets(handleNELines(contents, hline))
260
261"""
262We have multiple paths for gene set data:
263buffer/bigfiles/gene_sets
264and
265buffer/gene_sets_local
266both have available.txt
267"""
268
269def omakedirs(dir):
270    try:
271        os.makedirs(dir)
272    except OSError:
273        pass
274
275def local_path():
276    """ Returns local path for gene sets. Creates it if it does not exists
277    yet. """
278    import orngEnviron
279    pth = os.path.join(orngEnviron.directoryNames["bufferDir"], "gene_sets_local")
280    omakedirs(pth)
281    return pth
282
283def build_index(dir):
284    """ Returns gene set availability index for some folder. """
285    pass
286
287class GenesetRegException(Exception): pass
288
289def only_option(a):
290    if len(a) == 1:
291        return list(a)[0]
292    else:
293        raise Exception()
294
295def filename(hierarchy, organism):
296    """ Obtain a filename for given hierarchy and organism. """
297    return "gs_" + "_._".join(hierarchy + \
298        (organism if organism != None else "",)) + ".pck"
299
300def filename_parse(fn):
301    """ Returns a hierarchy and the organism from the filename."""
302    fn = fn[3:-4]
303    parts = fn.split("_._")
304    hierarchy = tuple(parts[:-1])
305    org = parts[-1] if parts[-1] != "" else None
306    return hierarchy, org
307
308def is_genesets_file(fn):
309    return fn.startswith("gs_") and fn.endswith(".pck")
310
311def list_local():
312    """ Returns available gene sets from the local repository:
313    a list of (hierarchy, organism, on_local) """
314    pth = local_path()
315    gs_files = filter(is_genesets_file, os.listdir(pth))
316    return [ filename_parse(fn) + (True,) for fn in gs_files ]
317   
318def list_serverfiles_from_flist(flist):
319    gs_files = filter(is_genesets_file, flist)
320    localfiles = set(orngServerFiles.listfiles(sfdomain))
321    return [ filename_parse(fn) + \
322        ((True,) if fn in localfiles else (False,)) for fn in gs_files ]
323
324def list_serverfiles_conn(serverfiles=None):
325    """ Returns available gene sets from the server files
326    repository: a list of (hierarchy, organism, on_local) """
327    if serverfiles == None:
328        serverfiles = orngServerFiles.ServerFiles()
329    flist = serverfiles.listfiles(sfdomain)
330    return list_serverfiles_from_flist(flist)
331
332def list_serverfiles():
333    fname = orngServerFiles.localpath_download(sfdomain, "index.pck")
334    flist = pickle.load(open(fname, 'r'))
335    return list_serverfiles_from_flist(flist)
336
337def list_all():
338    """
339    return a list of (hier, org, avalable_locally)
340    If something for a specific (hier, org) is not downloaded
341    yet, show it as not-local. """
342    flist = list_local() + list_serverfiles()
343    d = {}
344    for h,o,local in flist:
345        d[h,o] = min(local, d.get((h,o),True))
346    return [ (h,o,local) for (h,o),local in d.items() ]
347
348def update_server_list(serverfiles_upload, serverfiles_list=None):
349    if serverfiles_list == None:
350        serverfiles_list = orngServerFiles.ServerFiles()
351
352    flist = map(lambda x: filename(*x[:2]), list_serverfiles_conn(serverfiles_list))
353
354    tfname = pickle_temp(flist)
355   
356    try:
357        fn = "index.pck"
358        title = "Gene sets: index"
359        tags = [ "gene sets", "index", "essential" ]
360        serverfiles_upload.upload(sfdomain, fn, tfname, title, tags)
361        serverfiles_upload.unprotect(sfdomain, fn)
362    except Exception,e:
363        raise e
364    finally:
365        os.remove(tfname)
366
367def register_local(genesets):
368    """ Registers using the common hierarchy and organism. """
369    pth = local_path()
370
371    org = genesets.common_org()
372    hierarchy = genesets.common_hierarchy()
373    fn = filename(hierarchy, org)
374
375    with open(os.path.join(pth, fn), "w") as f:
376        pickle.dump(genesets, f)
377
378    return fn
379
380def pickle_temp(obj):
381    """ Pickle a file to a temporary file returns its name """
382    fd,tfname = tempfile.mkstemp()
383    os.close(fd)
384    f = open(tfname, 'wb')
385    pickle.dump(obj, f)
386    f.close()
387    return tfname
388
389def register_serverfiles(genesets, serverFiles):
390    """ Registers using the common hierarchy and organism. """
391    org = genesets.common_org()
392    hierarchy = genesets.common_hierarchy()
393    fn = filename(hierarchy, org)
394
395    #save to temporary file
396    tfname = pickle_temp(genesets)
397   
398    try:
399        taxname = obiTaxonomy.name(org)
400        title = "Gene sets: " + ", ".join(hierarchy) + \
401            ((" (" + taxname + ")") if org != None else "")
402        tags = list(hierarchy) + [ "gene sets", taxname ] + \
403            ([ "essential" ] if org in obiTaxonomy.essential_taxids() else [] )
404        serverFiles.upload(sfdomain, fn, tfname, title, tags)
405        serverFiles.unprotect(sfdomain, fn)
406    finally:
407        os.remove(tfname)
408
409    update_server_list(serverFiles)
410
411def register(genesets, serverFiles=None):
412    """
413    Hierarchy is induced from the gene set names.
414    """
415    if serverFiles == None:
416        register_local(genesets)
417    else:
418        register_serverfiles(genesets, serverFiles)
419
420def build_hierarchy_dict(files):
421    hierd = defaultdict(list)
422    for ind,f in enumerate(files):
423        hier, org = f
424        for i in range(len(hier)+1):
425            hierd[(hier[:i], org)].append(ind)
426    return hierd
427
428def load_local(hierarchy, organism):
429    files = map(lambda x: x[:2], list_local())
430
431    hierd = build_hierarchy_dict(files)
432
433    out = GeneSets()
434    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
435        fname = os.path.join(local_path(), filename(h, o))
436        out.update(pickle.load(open(fname, 'r')))
437    return out
438
439def load_serverfiles(hierarchy, organism):
440    files = map(lambda x: x[:2], list_serverfiles())
441    hierd = build_hierarchy_dict(files)
442
443    out = GeneSets()
444    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
445        fname = orngServerFiles.localpath_download(sfdomain, 
446            filename(h, o))
447        out.update(pickle.load(open(fname, 'r')))
448    return out
449
450def load(hierarchy, organism):
451    """ First try to load from the local registred folder, then
452    from the server files. """
453    ret = load_local(hierarchy, organism)
454    ret.update(load_serverfiles(hierarchy, organism))
455    return ret
456
457def collections(*args):
458    """
459    Input is a list of collections.
460    Collection can either be a tuple (hierarchy, orgranism), where
461    hierarchy is a tuple also.
462    """
463    result = obiGeneSets.GeneSets()
464
465    for collection in args:
466        if isinstance(collection, obiGeneSets.GeneSets):
467            result.update(collection)
468        elif issequencens(collection): #have a hierarchy, organism specification
469            new = load(*collection)
470            result.update(new)
471        else:
472            if collection.lower()[-4:] == ".gmt": #format from webpage
473                result.update(loadGMT(open(collection,"rt").read(), collection))
474            else:
475                raise Exception("collection() accepts files in .gmt format only.")
476 
477    return result
478
479def issequencens(x):
480    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and it is not an instance of basestring."
481    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
482
483class TException(Exception): pass
484
485def upload_genesets(rsf):
486    """
487    Builds the default gene sets and
488    """
489    orngServerFiles.update_local_files()
490
491    genesetsfn = [ keggGeneSets, goGeneSets, miRNAGeneSets]
492    organisms = obiTaxonomy.common_taxids()
493    for fn in genesetsfn:
494        for org in organisms:
495        #for org in [ "9606" ]:
496            print "Uploading ORG", org, fn
497            try:
498                genesets = fn(org).split_by_hierarchy()
499                for gs in genesets:
500                    print "registering", gs.common_hierarchy()
501                    register_serverfiles(gs, rsf)
502                    print "successful", gs.common_hierarchy()
503            except Exception, e:
504                print "Not successful"
505
506if __name__ == "__main__":
507    gs = keggGeneSets("9606")
508    #print len(collections(keggGeneSets("9606"),(("KEGG",),"9606"), "C5.BP.gmt"))
509    #print len(collections((("KEGG",),"9606"), "C5.BP.gmt"))
510    #print sorted(list_all())
511    #print len(collections((("KEGG",),"9606"), (("GO",), "9606"), "C5.BP.gmt"))
512    #register_local(keggGeneSets("9606"))
513    #register_local(goGeneSets("9606"))
514    #register_serverfiles(gs, rsf)
515    #print list_serverfiles_conn()
516    #print "Server list from index", list_serverfiles()
517    rsf = orngServerFiles.ServerFiles(username=sys.argv[1], password=sys.argv[2])
518    upload_genesets(rsf)
Note: See TracBrowser for help on using the repository browser.