source: orange-bioinformatics/obiGeneSets.py @ 1608:bb559485a499

Revision 1608:bb559485a499, 16.6 KB checked in by Lan Zagar <lan.zagar@…>, 2 years ago (diff)

Handle different line endings (see #1160).

Line 
1"""
2Getting genesets from KEGG and GO.
3
4Maintainer: Marko Toplak
5"""
6from __future__ import with_statement
7
8import obiKEGG, orange
9import os
10import obiGO
11import cPickle as pickle
12#import pickle
13import orngServerFiles
14import obiTaxonomy
15import tempfile
16import obiGeneSets
17import sys
18from collections import defaultdict
19
20sfdomain = "gene_sets"
21
22def nth(l,n):
23    return [ a[n] for a in l]
24
25class GeneSet(object):
26
27    def __init__(self, genes=None, name=None, id=None, \
28        description=None, link=None, organism=None, hierarchy=None, pair=None):
29        """
30        pair can be (id, listofgenes) - it is used before anything else.
31        """
32        if genes == None:
33            genes = []
34
35        self.hierarchy = hierarchy
36        self.genes = set(genes)
37        self.name = name
38        self.id = id
39        self.description = description
40        self.link = link
41        self.organism = organism
42
43        if pair:
44            self.id, self.genes = pair[0], set(pair[1])
45
46    """
47    the following functions are needed for sets of gene sets to be able
48    to assess equality
49    """
50
51    def __hash__(self):
52        return self.id.__hash__() + self.name.__hash__()
53
54    def __eq__(self, other):
55        if isinstance(other, self.__class__):
56            return self.__dict__ == other.__dict__
57        else:
58            return False
59
60    def __ne__(self, other):
61        return not self.__eq__(other)
62
63    def size(self):
64        return len(self.genes)
65
66    def cname(self, source=True, name=True):
67        """ Constructs a gene set name with the hierarchy. """
68        oname = self.id
69        if source and self.hierarchy:
70            oname = "[ " + ", ".join(self.hierarchy) + " ] " + oname
71        if name and self.name:
72            oname = oname + " " + self.name
73        return oname
74
75    def to_odict(self, source=True, name=True):
76        """
77        Returns a pair (id, listofgenes), like in old format.
78        """
79        return self.cname(source=source, name=name), self.genes
80
81    def __repr__(self):
82        return "GeneSet(" + ", ".join( [ 
83            "id=" + str(self.id),
84            "genes=" + str(self.genes),
85            "name=" + str(self.name),
86            "link=" + str(self.link),
87            "hierarchy=" + str(self.hierarchy)
88        ]) + ")"
89
90class GeneSetIDException(Exception):
91    pass
92
93class GeneSets(set):
94   
95    def __init__(self, input=None):
96        """
97        odict are genesets in old dict format.
98        gs are genesets in new format
99        """
100        if input != None and len(input) > 0:
101            self.update(input)
102
103    def update(self, input):
104        if isinstance(input, obiGeneSets.GeneSets):
105            super(GeneSets, self).update(input)
106        elif hasattr(input, "items"):
107            for i, g in input.items():
108                self.add(obiGeneSets.GeneSet(pair=(i, g)))
109        else:
110            for i in input:
111                if isinstance(i, obiGeneSets.GeneSet):
112                    self.add(i)
113                else:
114                    i, g = i
115                    self.add(obiGeneSets.GeneSet(pair=(i, g)))
116
117    def to_odict(self):
118        """ Return gene sets in old dictionary format. """
119        return dict(gs.to_odict() for gs in self)
120
121    def set_hierarchy(self, hierarchy):
122        """ Sets hierarchy for all gene sets """
123        for gs in self:
124            gs.hierarchy = hierarchy
125
126    def __repr__(self):
127        return "GeneSets(" + set.__repr__(self) + ")"
128
129    def common_org(self):
130        """ Returns the common organism. """
131        if len(self) == 0:
132            raise GenesetRegException("Empty gene sets.")
133
134        organisms = set(a.organism for a in self)
135
136        try:
137            return only_option(organisms)
138        except:
139            raise GenesetRegException("multiple organisms: " + str(organisms))
140
141    def hierarchies(self):
142        """ Returns all hierachies """
143        if len(self) == 0:
144            raise GenesetRegException("Empty gene sets.")
145        return set(a.hierarchy for a in self)
146
147    def common_hierarchy(self):
148        hierarchies = self.hierarchies()
149
150        def common_hierarchy1(hierarchies):
151            def hier(l): return set(map(lambda x: x[:currentl], hierarchies))
152            currentl = max(map(len, hierarchies))
153            while len(hier(currentl)) > 1:
154                currentl -= 1
155            return only_option(hier(currentl))
156
157        return common_hierarchy1(hierarchies)
158
159    def split_by_hierarchy(self):
160        """ Splits gene sets by hierarchies. """
161        hd = dict((h,obiGeneSets.GeneSets()) for h in  self.hierarchies())
162        for gs in self:
163            hd[gs.hierarchy].add(gs)
164        return hd.values()
165
166def goGeneSets(org):
167    """Returns gene sets from GO."""
168
169    ontology = obiGO.Ontology()
170    annotations = obiGO.Annotations(org, ontology=ontology)
171
172    genesets = []
173    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
174    for termn, term in ontology.terms.items():
175        genes = annotations.GetAllGenes(termn)
176        hier = ("GO", term.namespace)
177        if len(genes) > 0:
178            gs = obiGeneSets.GeneSet(id=termn, name=term.name, genes=genes, hierarchy=hier, organism=org, link=link_fmt % termn) 
179            genesets.append(gs)
180
181    return obiGeneSets.GeneSets(genesets)
182
183def keggGeneSets(org):
184    """
185    Returns gene sets from KEGG pathways.
186    """
187    import obiKEGG2 as obiKEGG
188   
189    kegg = obiKEGG.KEGGOrganism(org)
190
191    genesets = []
192    for id in kegg.pathways():
193        pway = obiKEGG.KEGGPathway(id)
194        hier = ("KEGG","pathways")
195        gs = obiGeneSets.GeneSet(id=id,
196                                 name=pway.title,
197                                 genes=kegg.get_genes_by_pathway(id),
198                                 hierarchy=hier,
199                                 organism=org,
200                                 link=pway.link)
201        genesets.append(gs)
202
203    return obiGeneSets.GeneSets(genesets)
204
205def omimGeneSets():
206    """
207    Return gene sets from OMIM (Online Mendelian Inheritance in Man) diseses
208    """
209    import obiOMIM
210    genesets = [GeneSet(id=disease.id, name=disease.name, genes=obiOMIM.disease_genes(disease), hierarchy=("OMIM",), organism="9606",
211                    link=("http://www.ncbi.nlm.nih.gov/entrez/dispomim.cgi?id=" % disease.id if disease.id else None)) \
212                    for disease in obiOMIM.diseases()]
213    return GeneSets(genesets)
214
215def miRNAGeneSets(org):
216    """
217    Return gene sets from miRNA targets
218    """
219    import obimiRNA, obiKEGG
220    org_code = obiKEGG.from_taxid(org)
221    link_fmt = "http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s"
222    mirnas = [(id, obimiRNA.get_info(id)) for id in obimiRNA.ids(org_code)]
223    genesets = [GeneSet(id=mirna.matACC, name=mirna.matID, genes=mirna.targets.split(","), hierarchy=("miRNA", "Targets"),
224                        organism=org, link=link_fmt % mirna.matID) for id, mirna in mirnas]
225    return GeneSets(genesets)
226
227def go_miRNASets(org, ontology=None, enrichment=True, pval=0.05, treshold=0.04):
228    import obimiRNA, obiGO
229    mirnas = obimiRNA.ids(int(org))
230    if ontology is None:
231        ontology = obiGO.Ontology()
232         
233    annotations = obiGO.Annotations(org, ontology=ontology)
234   
235    go_sets = obimiRNA.get_GO(mirnas, annotations, enrichment=enrichment, pval=pval, goSwitch=False)
236    print go_sets
237   
238    go_sets = obimiRNA.filter_GO(go_sets, annotations, treshold=treshold)
239   
240    import obiGeneSets as gs
241    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
242    gsets = [gs.GeneSet(id=key, name=ontology[key].name, genes=value, hierarchy=("miRNA", "go_sets",),
243                        organism=org, link=link_fmt % key) for key, value in go_sets.items()]
244    gset = gs.GeneSets(gsets)
245    return gset
246
247
248def loadGMT(contents, name):
249    """
250    Eech line consists of tab separated elements. First is
251    the geneset name, next is it's description.
252   
253    For now the description is skipped.
254    """
255    def hline(s):
256        tabs = [tab.strip() for tab in s.split("\t")]
257        return obiGeneSets.GeneSet(id=tabs[0], description=tabs[1],
258                                   hierarchy=(name,), genes=tabs[2:])
259
260    def handleNELines(s, fn):
261        """
262        Run function on nonempty lines of a string.
263        Return a list of results for each line.
264        """
265        lines = (l.strip() for l in s.splitlines())
266        return [fn(l) for l in lines if l]
267
268    return obiGeneSets.GeneSets(handleNELines(contents, hline))
269
270"""
271We have multiple paths for gene set data:
272buffer/bigfiles/gene_sets
273and
274buffer/gene_sets_local
275both have available.txt
276"""
277
278def omakedirs(dir):
279    try:
280        os.makedirs(dir)
281    except OSError:
282        pass
283
284def local_path():
285    """ Returns local path for gene sets. Creates it if it does not exists
286    yet. """
287    import orngEnviron
288    pth = os.path.join(orngEnviron.directoryNames["bufferDir"], "gene_sets_local")
289    omakedirs(pth)
290    return pth
291
292def build_index(dir):
293    """ Returns gene set availability index for some folder. """
294    pass
295
296class GenesetRegException(Exception): pass
297
298def only_option(a):
299    if len(a) == 1:
300        return list(a)[0]
301    else:
302        raise Exception()
303
304def filename(hierarchy, organism):
305    """ Obtain a filename for given hierarchy and organism. """
306    return "gs_" + "_._".join(hierarchy + \
307        (organism if organism != None else "",)) + ".pck"
308
309def filename_parse(fn):
310    """ Returns a hierarchy and the organism from the filename."""
311    fn = fn[3:-4]
312    parts = fn.split("_._")
313    hierarchy = tuple(parts[:-1])
314    org = parts[-1] if parts[-1] != "" else None
315    return hierarchy, org
316
317def is_genesets_file(fn):
318    return fn.startswith("gs_") and fn.endswith(".pck")
319
320def list_local():
321    """ Returns available gene sets from the local repository:
322    a list of (hierarchy, organism, on_local) """
323    pth = local_path()
324    gs_files = filter(is_genesets_file, os.listdir(pth))
325    return [ filename_parse(fn) + (True,) for fn in gs_files ]
326   
327def list_serverfiles_from_flist(flist):
328    gs_files = filter(is_genesets_file, flist)
329    localfiles = set(orngServerFiles.listfiles(sfdomain))
330    return [ filename_parse(fn) + \
331        ((True,) if fn in localfiles else (False,)) for fn in gs_files ]
332
333def list_serverfiles_conn(serverfiles=None):
334    """ Returns available gene sets from the server files
335    repository: a list of (hierarchy, organism, on_local) """
336    if serverfiles == None:
337        serverfiles = orngServerFiles.ServerFiles()
338    flist = serverfiles.listfiles(sfdomain)
339    return list_serverfiles_from_flist(flist)
340
341def list_serverfiles():
342    fname = orngServerFiles.localpath_download(sfdomain, "index.pck")
343    flist = pickle.load(open(fname, 'r'))
344    return list_serverfiles_from_flist(flist)
345
346def list_all():
347    """
348    return a list of (hier, org, avalable_locally)
349    If something for a specific (hier, org) is not downloaded
350    yet, show it as not-local. """
351    flist = list_local() + list_serverfiles()
352    d = {}
353    for h,o,local in flist:
354        d[h,o] = min(local, d.get((h,o),True))
355    return [ (h,o,local) for (h,o),local in d.items() ]
356
357def update_server_list(serverfiles_upload, serverfiles_list=None):
358    if serverfiles_list == None:
359        serverfiles_list = orngServerFiles.ServerFiles()
360
361    flist = map(lambda x: filename(*x[:2]), list_serverfiles_conn(serverfiles_list))
362
363    tfname = pickle_temp(flist)
364   
365    try:
366        fn = "index.pck"
367        title = "Gene sets: index"
368        tags = [ "gene sets", "index", "essential" ]
369        serverfiles_upload.upload(sfdomain, fn, tfname, title, tags)
370        serverfiles_upload.unprotect(sfdomain, fn)
371    except Exception,e:
372        raise e
373    finally:
374        os.remove(tfname)
375
376def register_local(genesets):
377    """ Registers using the common hierarchy and organism. """
378    pth = local_path()
379
380    org = genesets.common_org()
381    hierarchy = genesets.common_hierarchy()
382    fn = filename(hierarchy, org)
383
384    with open(os.path.join(pth, fn), "w") as f:
385        pickle.dump(genesets, f)
386
387    return fn
388
389def pickle_temp(obj):
390    """ Pickle a file to a temporary file returns its name """
391    fd,tfname = tempfile.mkstemp()
392    os.close(fd)
393    f = open(tfname, 'wb')
394    pickle.dump(obj, f)
395    f.close()
396    return tfname
397
398def register_serverfiles(genesets, serverFiles):
399    """ Registers using the common hierarchy and organism. """
400    org = genesets.common_org()
401    hierarchy = genesets.common_hierarchy()
402    fn = filename(hierarchy, org)
403
404    #save to temporary file
405    tfname = pickle_temp(genesets)
406   
407    try:
408        taxname = obiTaxonomy.name(org)
409        title = "Gene sets: " + ", ".join(hierarchy) + \
410            ((" (" + taxname + ")") if org != None else "")
411        tags = list(hierarchy) + [ "gene sets", taxname ] + \
412            ([ "essential" ] if org in obiTaxonomy.essential_taxids() else [] )
413        serverFiles.upload(sfdomain, fn, tfname, title, tags)
414        serverFiles.unprotect(sfdomain, fn)
415    finally:
416        os.remove(tfname)
417
418    update_server_list(serverFiles)
419
420def register(genesets, serverFiles=None):
421    """
422    Hierarchy is induced from the gene set names.
423    """
424    if serverFiles == None:
425        register_local(genesets)
426    else:
427        register_serverfiles(genesets, serverFiles)
428
429def build_hierarchy_dict(files):
430    hierd = defaultdict(list)
431    for ind,f in enumerate(files):
432        hier, org = f
433        for i in range(len(hier)+1):
434            hierd[(hier[:i], org)].append(ind)
435    return hierd
436
437def load_local(hierarchy, organism):
438    files = map(lambda x: x[:2], list_local())
439
440    hierd = build_hierarchy_dict(files)
441
442    out = GeneSets()
443    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
444        fname = os.path.join(local_path(), filename(h, o))
445        out.update(pickle.load(open(fname, 'r')))
446    return out
447
448def load_serverfiles(hierarchy, organism):
449    files = map(lambda x: x[:2], list_serverfiles())
450    hierd = build_hierarchy_dict(files)
451
452    out = GeneSets()
453    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
454        fname = orngServerFiles.localpath_download(sfdomain, 
455            filename(h, o))
456        out.update(pickle.load(open(fname, 'r')))
457    return out
458
459def load(hierarchy, organism):
460    """ First try to load from the local registred folder, then
461    from the server files. """
462    ret = load_local(hierarchy, organism)
463    ret.update(load_serverfiles(hierarchy, organism))
464    return ret
465
466def collections(*args):
467    """
468    Input is a list of collections.
469    Collection can either be a tuple (hierarchy, orgranism), where
470    hierarchy is a tuple also.
471    """
472    result = obiGeneSets.GeneSets()
473
474    for collection in args:
475        try:
476            result.update(collection)
477        except ValueError:
478            if issequencens(collection): #have a hierarchy, organism specification
479                new = load(*collection)
480                result.update(new)
481            else:
482                if collection.lower()[-4:] == ".gmt": #format from webpage
483                    result.update(loadGMT(open(collection,"rt").read(), collection))
484                else:
485                    raise Exception("collection() accepts files in .gmt format only.")
486
487    return result
488
489def issequencens(x):
490    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and it is not an instance of basestring."
491    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
492
493class TException(Exception): pass
494
495def upload_genesets(rsf):
496    """
497    Builds the default gene sets and
498    """
499    orngServerFiles.update_local_files()
500
501    genesetsfn = [ keggGeneSets, goGeneSets, miRNAGeneSets]
502    organisms = obiTaxonomy.common_taxids()
503    for fn in genesetsfn:
504        for org in organisms:
505        #for org in [ "9606" ]:
506            print "Uploading ORG", org, fn
507            try:
508                genesets = fn(org).split_by_hierarchy()
509                for gs in genesets:
510                    print "registering", gs.common_hierarchy()
511                    register_serverfiles(gs, rsf)
512                    print "successful", gs.common_hierarchy()
513            except Exception, e:
514                print "Not successful"
515
516if __name__ == "__main__":
517
518    #gs = keggGeneSets("9606")
519    akegg = collections((("KEGG",),"9606"))
520    a = collections({"PATH": ["g1","g2","g3"]}, "steroltalk.gmt", akegg)
521    for e in a:
522        print e
523    #print len(collections(keggGeneSets("9606"),(("KEGG",),"9606"), "C5.BP.gmt"))
524    #print len(collections((("KEGG",),"9606"), "C5.BP.gmt"))
525    #print sorted(list_all())
526    #print len(collections((("KEGG",),"9606"), (("GO",), "9606"), "C5.BP.gmt"))
527    #register_local(keggGeneSets("9606"))
528    #register_local(goGeneSets("9606"))
529    #register_serverfiles(gs, rsf)
530    #print list_serverfiles_conn()
531    #print "Server list from index", list_serverfiles()
532    rsf = orngServerFiles.ServerFiles(username=sys.argv[1], password=sys.argv[2])
533    upload_genesets(rsf)
Note: See TracBrowser for help on using the repository browser.