source: orange-bioinformatics/Orange/bioinformatics/obiGeneSets.py @ 1632:9cf919d0f343

Revision 1632:9cf919d0f343, 16.7 KB checked in by mitar, 2 years ago (diff)

Fixing imports.

Line 
1"""
2Getting genesets from KEGG and GO.
3
4Maintainer: Marko Toplak
5"""
6
7from __future__ import absolute_import, with_statement
8
9import cPickle as pickle, os, tempfile, sys
10from collections import defaultdict
11
12import orange
13from Orange.orng import orngServerFiles
14
15from . import obiGeneSets, obiGO, obiKEGG, obiTaxonomy
16
17sfdomain = "gene_sets"
18
19def nth(l,n):
20    return [ a[n] for a in l]
21
22class GeneSet(object):
23
24    def __init__(self, genes=None, name=None, id=None, \
25        description=None, link=None, organism=None, hierarchy=None, pair=None):
26        """
27        pair can be (id, listofgenes) - it is used before anything else.
28        """
29        if genes == None:
30            genes = []
31
32        self.hierarchy = hierarchy
33        self.genes = set(genes)
34        self.name = name
35        self.id = id
36        self.description = description
37        self.link = link
38        self.organism = organism
39
40        if pair:
41            self.id, self.genes = pair[0], set(pair[1])
42
43    """
44    the following functions are needed for sets of gene sets to be able
45    to assess equality
46    """
47
48    def __hash__(self):
49        return self.id.__hash__() + self.name.__hash__()
50
51    def __eq__(self, other):
52        if isinstance(other, self.__class__):
53            return self.__dict__ == other.__dict__
54        else:
55            return False
56
57    def __ne__(self, other):
58        return not self.__eq__(other)
59
60    def size(self):
61        return len(self.genes)
62
63    def cname(self, source=True, name=True):
64        """ Constructs a gene set name with the hierarchy. """
65        oname = self.id
66        if source and self.hierarchy:
67            oname = "[ " + ", ".join(self.hierarchy) + " ] " + oname
68        if name and self.name:
69            oname = oname + " " + self.name
70        return oname
71
72    def to_odict(self, source=True, name=True):
73        """
74        Returns a pair (id, listofgenes), like in old format.
75        """
76        return self.cname(source=source, name=name), self.genes
77
78    def __repr__(self):
79        return "GeneSet(" + ", ".join( [ 
80            "id=" + str(self.id),
81            "genes=" + str(self.genes),
82            "name=" + str(self.name),
83            "link=" + str(self.link),
84            "hierarchy=" + str(self.hierarchy)
85        ]) + ")"
86
87class GeneSetIDException(Exception):
88    pass
89
90class GeneSets(set):
91   
92    def __init__(self, input=None):
93        """
94        odict are genesets in old dict format.
95        gs are genesets in new format
96        """
97        if input != None and len(input) > 0:
98            self.update(input)
99
100    def update(self, input):
101        if isinstance(input, obiGeneSets.GeneSets):
102            super(GeneSets, self).update(input)
103        elif hasattr(input, "items"):
104            for i, g in input.items():
105                self.add(obiGeneSets.GeneSet(pair=(i, g)))
106        else:
107            for i in input:
108                if isinstance(i, obiGeneSets.GeneSet):
109                    self.add(i)
110                else:
111                    i, g = i
112                    self.add(obiGeneSets.GeneSet(pair=(i, g)))
113
114    def to_odict(self):
115        """ Return gene sets in old dictionary format. """
116        return dict(gs.to_odict() for gs in self)
117
118    def set_hierarchy(self, hierarchy):
119        """ Sets hierarchy for all gene sets """
120        for gs in self:
121            gs.hierarchy = hierarchy
122
123    def __repr__(self):
124        return "GeneSets(" + set.__repr__(self) + ")"
125
126    def common_org(self):
127        """ Returns the common organism. """
128        if len(self) == 0:
129            raise GenesetRegException("Empty gene sets.")
130
131        organisms = set(a.organism for a in self)
132
133        try:
134            return only_option(organisms)
135        except:
136            raise GenesetRegException("multiple organisms: " + str(organisms))
137
138    def hierarchies(self):
139        """ Returns all hierachies """
140        if len(self) == 0:
141            raise GenesetRegException("Empty gene sets.")
142        return set(a.hierarchy for a in self)
143
144    def common_hierarchy(self):
145        hierarchies = self.hierarchies()
146
147        def common_hierarchy1(hierarchies):
148            def hier(l): return set(map(lambda x: x[:currentl], hierarchies))
149            currentl = max(map(len, hierarchies))
150            while len(hier(currentl)) > 1:
151                currentl -= 1
152            return only_option(hier(currentl))
153
154        return common_hierarchy1(hierarchies)
155
156    def split_by_hierarchy(self):
157        """ Splits gene sets by hierarchies. """
158        hd = dict((h,obiGeneSets.GeneSets()) for h in  self.hierarchies())
159        for gs in self:
160            hd[gs.hierarchy].add(gs)
161        return hd.values()
162
163def goGeneSets(org):
164    """Returns gene sets from GO."""
165
166    ontology = obiGO.Ontology()
167    annotations = obiGO.Annotations(org, ontology=ontology)
168
169    genesets = []
170    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
171    for termn, term in ontology.terms.items():
172        genes = annotations.GetAllGenes(termn)
173        hier = ("GO", term.namespace)
174        if len(genes) > 0:
175            gs = obiGeneSets.GeneSet(id=termn, name=term.name, genes=genes, hierarchy=hier, organism=org, link=link_fmt % termn) 
176            genesets.append(gs)
177
178    return obiGeneSets.GeneSets(genesets)
179
180def keggGeneSets(org):
181    """
182    Returns gene sets from KEGG pathways.
183    """
184    from . import obiKEGG2 as obiKEGG
185   
186    kegg = obiKEGG.KEGGOrganism(org)
187
188    genesets = []
189    for id in kegg.pathways():
190        pway = obiKEGG.KEGGPathway(id)
191        hier = ("KEGG","pathways")
192        gs = obiGeneSets.GeneSet(id=id,
193                                 name=pway.title,
194                                 genes=kegg.get_genes_by_pathway(id),
195                                 hierarchy=hier,
196                                 organism=org,
197                                 link=pway.link)
198        genesets.append(gs)
199
200    return obiGeneSets.GeneSets(genesets)
201
202def omimGeneSets():
203    """
204    Return gene sets from OMIM (Online Mendelian Inheritance in Man) diseses
205    """
206    from . import obiOMIM
207    genesets = [GeneSet(id=disease.id, name=disease.name, genes=obiOMIM.disease_genes(disease), hierarchy=("OMIM",), organism="9606",
208                    link=("http://www.ncbi.nlm.nih.gov/entrez/dispomim.cgi?id=" % disease.id if disease.id else None)) \
209                    for disease in obiOMIM.diseases()]
210    return GeneSets(genesets)
211
212def miRNAGeneSets(org):
213    """
214    Return gene sets from miRNA targets
215    """
216    from . import obimiRNA, obiKEGG
217    org_code = obiKEGG.from_taxid(org)
218    link_fmt = "http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s"
219    mirnas = [(id, obimiRNA.get_info(id)) for id in obimiRNA.ids(org_code)]
220    genesets = [GeneSet(id=mirna.matACC, name=mirna.matID, genes=mirna.targets.split(","), hierarchy=("miRNA", "Targets"),
221                        organism=org, link=link_fmt % mirna.matID) for id, mirna in mirnas]
222    return GeneSets(genesets)
223
224def go_miRNASets(org, ontology=None, enrichment=True, pval=0.05, treshold=0.04):
225    from . import obimiRNA, obiGO
226    mirnas = obimiRNA.ids(int(org))
227    if ontology is None:
228        ontology = obiGO.Ontology()
229         
230    annotations = obiGO.Annotations(org, ontology=ontology)
231   
232    go_sets = obimiRNA.get_GO(mirnas, annotations, enrichment=enrichment, pval=pval, goSwitch=False)
233    print go_sets
234   
235    go_sets = obimiRNA.filter_GO(go_sets, annotations, treshold=treshold)
236   
237    from . import obiGeneSets as gs
238    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
239    gsets = [gs.GeneSet(id=key, name=ontology[key].name, genes=value, hierarchy=("miRNA", "go_sets",),
240                        organism=org, link=link_fmt % key) for key, value in go_sets.items()]
241    gset = gs.GeneSets(gsets)
242    return gset
243
244
245def loadGMT(contents, name):
246    """
247    Eech line consists of tab separated elements. First is
248    the geneset name, next is it's description.
249   
250    For now the description is skipped.
251    """
252    def hline(s):
253        tabs = [tab.strip() for tab in s.split("\t")]
254        return obiGeneSets.GeneSet(id=tabs[0], description=tabs[1],
255                                   hierarchy=(name,), genes=tabs[2:])
256
257    def handleNELines(s, fn):
258        """
259        Run function on nonempty lines of a string.
260        Return a list of results for each line.
261        """
262        lines = (l.strip() for l in s.splitlines())
263        return [fn(l) for l in lines if l]
264
265    return obiGeneSets.GeneSets(handleNELines(contents, hline))
266
267"""
268We have multiple paths for gene set data:
269buffer/bigfiles/gene_sets
270and
271buffer/gene_sets_local
272both have available.txt
273"""
274
275def omakedirs(dir):
276    try:
277        os.makedirs(dir)
278    except OSError:
279        pass
280
281def local_path():
282    """ Returns local path for gene sets. Creates it if it does not exists
283    yet. """
284    from Orange.orng import orngEnviron
285    pth = os.path.join(orngEnviron.directoryNames["bufferDir"], "gene_sets_local")
286    omakedirs(pth)
287    return pth
288
289def build_index(dir):
290    """ Returns gene set availability index for some folder. """
291    pass
292
293class GenesetRegException(Exception): pass
294
295def only_option(a):
296    if len(a) == 1:
297        return list(a)[0]
298    else:
299        raise Exception()
300
301def filename(hierarchy, organism):
302    """ Obtain a filename for given hierarchy and organism. """
303    return "gs_" + "_._".join(hierarchy + \
304        (organism if organism != None else "",)) + ".pck"
305
306def filename_parse(fn):
307    """ Returns a hierarchy and the organism from the filename."""
308    fn = fn[3:-4]
309    parts = fn.split("_._")
310    hierarchy = tuple(parts[:-1])
311    org = parts[-1] if parts[-1] != "" else None
312    return hierarchy, org
313
314def is_genesets_file(fn):
315    return fn.startswith("gs_") and fn.endswith(".pck")
316
317def list_local():
318    """ Returns available gene sets from the local repository:
319    a list of (hierarchy, organism, on_local) """
320    pth = local_path()
321    gs_files = filter(is_genesets_file, os.listdir(pth))
322    return [ filename_parse(fn) + (True,) for fn in gs_files ]
323   
324def list_serverfiles_from_flist(flist):
325    gs_files = filter(is_genesets_file, flist)
326    localfiles = set(orngServerFiles.listfiles(sfdomain))
327    return [ filename_parse(fn) + \
328        ((True,) if fn in localfiles else (False,)) for fn in gs_files ]
329
330def list_serverfiles_conn(serverfiles=None):
331    """ Returns available gene sets from the server files
332    repository: a list of (hierarchy, organism, on_local) """
333    if serverfiles == None:
334        serverfiles = orngServerFiles.ServerFiles()
335    flist = serverfiles.listfiles(sfdomain)
336    return list_serverfiles_from_flist(flist)
337
338def list_serverfiles():
339    fname = orngServerFiles.localpath_download(sfdomain, "index.pck")
340    flist = pickle.load(open(fname, 'r'))
341    return list_serverfiles_from_flist(flist)
342
343def list_all():
344    """
345    return a list of (hier, org, avalable_locally)
346    If something for a specific (hier, org) is not downloaded
347    yet, show it as not-local. """
348    flist = list_local() + list_serverfiles()
349    d = {}
350    for h,o,local in flist:
351        d[h,o] = min(local, d.get((h,o),True))
352    return [ (h,o,local) for (h,o),local in d.items() ]
353
354def update_server_list(serverfiles_upload, serverfiles_list=None):
355    if serverfiles_list == None:
356        serverfiles_list = orngServerFiles.ServerFiles()
357
358    flist = map(lambda x: filename(*x[:2]), list_serverfiles_conn(serverfiles_list))
359
360    tfname = pickle_temp(flist)
361   
362    try:
363        fn = "index.pck"
364        title = "Gene sets: index"
365        tags = [ "gene sets", "index", "essential" ]
366        serverfiles_upload.upload(sfdomain, fn, tfname, title, tags)
367        serverfiles_upload.unprotect(sfdomain, fn)
368    except Exception,e:
369        raise e
370    finally:
371        os.remove(tfname)
372
373def register_local(genesets):
374    """ Registers using the common hierarchy and organism. """
375    pth = local_path()
376
377    org = genesets.common_org()
378    hierarchy = genesets.common_hierarchy()
379    fn = filename(hierarchy, org)
380
381    with open(os.path.join(pth, fn), "w") as f:
382        pickle.dump(genesets, f)
383
384    return fn
385
386def pickle_temp(obj):
387    """ Pickle a file to a temporary file returns its name """
388    fd,tfname = tempfile.mkstemp()
389    os.close(fd)
390    f = open(tfname, 'wb')
391    pickle.dump(obj, f)
392    f.close()
393    return tfname
394
395def register_serverfiles(genesets, serverFiles):
396    """ Registers using the common hierarchy and organism. """
397    org = genesets.common_org()
398    hierarchy = genesets.common_hierarchy()
399    fn = filename(hierarchy, org)
400
401    #save to temporary file
402    tfname = pickle_temp(genesets)
403   
404    try:
405        taxname = obiTaxonomy.name(org)
406        title = "Gene sets: " + ", ".join(hierarchy) + \
407            ((" (" + taxname + ")") if org != None else "")
408        tags = list(hierarchy) + [ "gene sets", taxname ] + \
409            ([ "essential" ] if org in obiTaxonomy.essential_taxids() else [] )
410        serverFiles.upload(sfdomain, fn, tfname, title, tags)
411        serverFiles.unprotect(sfdomain, fn)
412    finally:
413        os.remove(tfname)
414
415    update_server_list(serverFiles)
416
417def register(genesets, serverFiles=None):
418    """
419    Hierarchy is induced from the gene set names.
420    """
421    if serverFiles == None:
422        register_local(genesets)
423    else:
424        register_serverfiles(genesets, serverFiles)
425
426def build_hierarchy_dict(files):
427    hierd = defaultdict(list)
428    for ind,f in enumerate(files):
429        hier, org = f
430        for i in range(len(hier)+1):
431            hierd[(hier[:i], org)].append(ind)
432    return hierd
433
434def load_local(hierarchy, organism):
435    files = map(lambda x: x[:2], list_local())
436
437    hierd = build_hierarchy_dict(files)
438
439    out = GeneSets()
440    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
441        fname = os.path.join(local_path(), filename(h, o))
442        out.update(pickle.load(open(fname, 'r')))
443    return out
444
445def load_serverfiles(hierarchy, organism):
446    files = map(lambda x: x[:2], list_serverfiles())
447    hierd = build_hierarchy_dict(files)
448
449    out = GeneSets()
450    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
451        fname = orngServerFiles.localpath_download(sfdomain, 
452            filename(h, o))
453        out.update(pickle.load(open(fname, 'r')))
454    return out
455
456def load(hierarchy, organism):
457    """ First try to load from the local registred folder, then
458    from the server files. """
459    ret = load_local(hierarchy, organism)
460    ret.update(load_serverfiles(hierarchy, organism))
461    return ret
462
463def collections(*args):
464    """
465    Input is a list of collections.
466    Collection can either be a tuple (hierarchy, orgranism), where
467    hierarchy is a tuple also.
468    """
469    result = obiGeneSets.GeneSets()
470
471    for collection in args:
472        try:
473            result.update(collection)
474        except ValueError:
475            if issequencens(collection): #have a hierarchy, organism specification
476                new = load(*collection)
477                result.update(new)
478            else:
479                if collection.lower()[-4:] == ".gmt": #format from webpage
480                    result.update(loadGMT(open(collection,"rt").read(), collection))
481                else:
482                    raise Exception("collection() accepts files in .gmt format only.")
483
484    return result
485
486def issequencens(x):
487    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and it is not an instance of basestring."
488    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
489
490class TException(Exception): pass
491
492def upload_genesets(rsf):
493    """
494    Builds the default gene sets and
495    """
496    orngServerFiles.update_local_files()
497
498    genesetsfn = [ keggGeneSets, goGeneSets, miRNAGeneSets]
499    organisms = obiTaxonomy.common_taxids()
500    for fn in genesetsfn:
501        for org in organisms:
502        #for org in [ "9606" ]:
503            print "Uploading ORG", org, fn
504            try:
505                genesets = fn(org).split_by_hierarchy()
506                for gs in genesets:
507                    print "registering", gs.common_hierarchy()
508                    register_serverfiles(gs, rsf)
509                    print "successful", gs.common_hierarchy()
510            except Exception, e:
511                print "Not successful"
512
513if __name__ == "__main__":
514
515    #gs = keggGeneSets("9606")
516    akegg = collections((("KEGG",),"9606"))
517    a = collections({"PATH": ["g1","g2","g3"]}, "steroltalk.gmt", akegg)
518    for e in a:
519        print e
520    #print len(collections(keggGeneSets("9606"),(("KEGG",),"9606"), "C5.BP.gmt"))
521    #print len(collections((("KEGG",),"9606"), "C5.BP.gmt"))
522    #print sorted(list_all())
523    #print len(collections((("KEGG",),"9606"), (("GO",), "9606"), "C5.BP.gmt"))
524    #register_local(keggGeneSets("9606"))
525    #register_local(goGeneSets("9606"))
526    #register_serverfiles(gs, rsf)
527    #print list_serverfiles_conn()
528    #print "Server list from index", list_serverfiles()
529    rsf = orngServerFiles.ServerFiles(username=sys.argv[1], password=sys.argv[2])
530    upload_genesets(rsf)
Note: See TracBrowser for help on using the repository browser.