source: orange-bioinformatics/_bioinformatics/obiGeneSets.py @ 1818:ffa449bd9709

Revision 1818:ffa449bd9709, 15.1 KB checked in by Flashpoint <vid.flashpoint@…>, 10 months ago (diff)

Version 1 of the OWCustomSet.py widget

Line 
1"""
2Maintainer: Marko Toplak
3"""
4
5from __future__ import absolute_import, with_statement
6
7if __name__ == "__main__":
8    __package__ = "Orange.bio"
9
10import cPickle as pickle, os, tempfile, sys
11from collections import defaultdict
12import datetime
13
14import Orange.core as orange
15from Orange.orng import orngServerFiles
16
17from . import obiGO, obiKEGG, obiTaxonomy
18
19sfdomain = "gene_sets"
20
21def nth(l,n):
22    return [ a[n] for a in l]
23
24from Orange.bio.geneset import GeneSet, GeneSets, GenesetRegException
25
26def goGeneSets(org):
27    """Returns gene sets from GO."""
28    ontology = obiGO.Ontology()
29    annotations = obiGO.Annotations(org, ontology=ontology)
30
31    genesets = []
32    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
33    for termn, term in ontology.terms.items():
34        genes = annotations.GetAllGenes(termn)
35        hier = ("GO", term.namespace)
36        if len(genes) > 0:
37            gs = GeneSet(id=termn, name=term.name, genes=genes, hierarchy=hier, organism=org, link=link_fmt % termn)
38            genesets.append(gs)
39
40    return GeneSets(genesets)
41
42def keggGeneSets(org):
43    """
44    Returns gene sets from KEGG pathways.
45    """
46
47    kegg = obiKEGG.KEGGOrganism(org)
48
49    genesets = []
50    for id in kegg.pathways():
51        print id
52        pway = obiKEGG.KEGGPathway(id)
53        hier = ("KEGG","pathways")
54        gs = GeneSet(id=id,
55                                 name=pway.title,
56                                 genes=kegg.get_genes_by_pathway(id),
57                                 hierarchy=hier,
58                                 organism=org,
59                                 link=pway.link)
60        genesets.append(gs)
61
62    return GeneSets(genesets)
63
64def dictyMutantSets():
65    """
66    Return dicty mutant phenotype gene sets from Dictybase
67    """
68    from . import obiDictyMutants
69    link_fmt = "http://dictybase.org/db/cgi-bin/dictyBase/SC/scsearch.pl?searchdb=strains&search_term=%s&column=all&B1=Submit" 
70    #genesets = [GeneSet(id=mutant.name, name=mutant.descriptor, genes=obiDictyMutants.mutant_genes(mutant), hierarchy=("Dictybase", "Mutants"), organism="352472", # 352472 gathered from obiGO.py code_map -> Dicty identifier
71    #                    link=(link_fmt % mutant.name if mutant.name else None)) \
72    #                    for mutant in obiDictyMutants.mutants()]
73 
74    genesets = [GeneSet(id=phenotype, name=phenotype, genes=[obiDictyMutants.mutant_genes(mutant)[0] for mutant in mutants], hierarchy=("Dictybase", "Phenotypes"), organism="352472", # 352472 gathered from obiGO.py code_map -> Dicty identifier
75                        link="") \
76                        for phenotype, mutants in obiDictyMutants.phenotype_mutants().items()]
77
78    return GeneSets(genesets)
79
80def cytobandGeneSets():
81    """
82    Create cytoband gene sets from Stanford Microarray Database
83    """
84    import urllib2
85
86    url = "http://www-stat.stanford.edu/~tibs/GSA/cytobands-stanford.gmt"
87    stream = urllib2.urlopen(url)
88    data = stream.read().splitlines()
89
90    genesets = []
91    for band in data:
92        b = band.split("\t")
93        genesets.append(GeneSet(id=b[0], name=b[1], genes=b[2:] if b[2:] else [], hierarchy=("Cytobands",), organism="9606", link=""))         
94
95    return GeneSets(genesets)
96
97def reactomePathwaysGeneSets():
98    """
99    Prepare human pathways gene sets from reactome pathways
100    """
101    import urllib
102    import io
103    from zipfile import ZipFile
104
105    url = urllib.urlopen("http://www.reactome.org/download/current/ReactomePathways.gmt.zip")
106    memfile = io.BytesIO(url.read())
107    with ZipFile(memfile, "r") as myzip:
108        f = myzip.open("ReactomePathways.gmt")
109        content = f.read().splitlines()     
110
111    genesets = [GeneSet(id=path.split("\t")[0], name=path.split("\t")[0], genes=path.split("\t")[2:] if path.split("\t")[2:] else [], hierarchy=("Reactome", "Pathways"), organism="9606", link="") for path in content]
112    return GeneSets(genesets)
113
114
115def omimGeneSets():
116    """
117    Return gene sets from OMIM (Online Mendelian Inheritance in Man) diseses
118    """
119    from . import obiOMIM   
120    genesets = [GeneSet(id=disease.id, name=disease.name, genes=obiOMIM.disease_genes(disease), hierarchy=("OMIM",), organism="9606",
121                    link=("http://www.omim.org/entry/%s" % disease.id if disease.id else None)) \
122                    for disease in obiOMIM.diseases()]
123    return GeneSets(genesets)
124
125def miRNAGeneSets(org):
126    """
127    Return gene sets from miRNA targets
128    """
129    from . import obimiRNA
130    org_code = obiKEGG.from_taxid(org)
131    link_fmt = "http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s"
132    mirnas = [(id, obimiRNA.get_info(id)) for id in obimiRNA.ids(org_code)]
133    genesets = [GeneSet(id=mirna.matACC, name=mirna.matID, genes=mirna.targets.split(","), hierarchy=("miRNA", "Targets"),
134                        organism=org, link=link_fmt % mirna.matID) for id, mirna in mirnas]
135    return GeneSets(genesets)
136
137def go_miRNASets(org, ontology=None, enrichment=True, pval=0.05, treshold=0.04):
138    from . import obimiRNA, obiGO
139    mirnas = obimiRNA.ids(int(org))
140    if ontology is None:
141        ontology = obiGO.Ontology()
142
143    annotations = obiGO.Annotations(org, ontology=ontology)
144
145    go_sets = obimiRNA.get_GO(mirnas, annotations, enrichment=enrichment, pval=pval, goSwitch=False)
146
147    go_sets = obimiRNA.filter_GO(go_sets, annotations, treshold=treshold)
148
149    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
150    gsets = [GeneSet(id=key, name=ontology[key].name, genes=value, hierarchy=("miRNA", "go_sets",),
151                        organism=org, link=link_fmt % key) for key, value in go_sets.items()]
152    gset = GeneSets(gsets)
153    return gset
154
155def loadGMT(contents, name):
156    """
157    Eech line consists of tab separated elements. First is
158    the geneset name, next is it's description.
159
160    For now the description is skipped.
161
162    Example Gene Set (.gmt) file:
163    anti-liver_sw   anti-liver_sw   BMPR1A  APOD    WSB1    BMI1    SLC2A1  ...
164    B-cells_sw  B-cells_sw  E2F5    NCF1    PALM2-AKAP2 IRF4    SLC2A1  ...
165    Bladder_sw  Bladder_sw  PLCD4   ANGPTL1 LOC286191   ST0N1   LOC283904   ...
166    cerebellum_sw   cerebellum_sw   C19orf43    LOC653464   KI110802    ...
167    Cervix_sw   Cervix_sw   LAMA4   GSTM5   SNX19   DKK1    NT5E    ...
168    """
169
170    def hline(s):
171        tabs = [tab.strip() for tab in s.split("\t")]
172        return GeneSet(id=tabs[0], description=tabs[1],
173                                   hierarchy=("Custom Gene Sets",name), genes=tabs[2:])
174
175    def handleNELines(s, fn):
176        """
177        Run function on nonempty lines of a string.
178        Return a list of results for each line.
179        """
180        lines = (l.strip() for l in s.splitlines())
181        return [fn(l) for l in lines if l]
182
183    return GeneSets(handleNELines(contents, hline))
184
185def getGenesetsStats(genesets):
186    num_sets = len(genesets)
187    unique_genes = len(set([gene for geneset in genesets for gene in geneset.genes]))
188    genes_per_geneset = sum([len(geneset.genes) for geneset in genesets])/num_sets
189    return num_sets, unique_genes, genes_per_geneset
190"""
191We have multiple paths for gene set data:
192buffer/bigfiles/gene_sets
193and
194buffer/gene_sets_local
195both have available.txt
196"""
197
198def omakedirs(dir):
199    try:
200        os.makedirs(dir)
201    except OSError:
202        pass
203
204def local_path():
205    """ Returns local path for gene sets. Creates it if it does not exists
206    yet. """
207    from Orange.orng import orngEnviron
208    pth = os.path.join(orngEnviron.directoryNames["bufferDir"], "gene_sets_local")
209    omakedirs(pth)
210    return pth
211
212def build_index(dir):
213    """ Returns gene set availability index for some folder. """
214    pass
215
216def filename(hierarchy, organism):
217    """ Obtain a filename for given hierarchy and organism. """
218    return "gs_" + "_._".join(hierarchy + \
219        (organism if organism != None else "",)) + ".pck"
220
221def filename_parse(fn):
222    """ Returns a hierarchy and the organism from the filename."""
223    fn = fn[3:-4]
224    parts = fn.split("_._")
225    hierarchy = tuple(parts[:-1])
226    org = parts[-1] if parts[-1] != "" else None
227    return hierarchy, org
228
229def is_genesets_file(fn):
230    return fn.startswith("gs_") and fn.endswith(".pck")
231
232def list_local():
233    """ Returns available gene sets from the local repository:
234    a list of (hierarchy, organism, on_local) """
235    pth = local_path()
236    gs_files = filter(is_genesets_file, os.listdir(pth))
237    return [ filename_parse(fn) + (True,) for fn in gs_files ]
238
239def remove_local(gene_set):
240    """ Removes a given gene set from the local repository. """
241    pth = local_path()
242    gs_files = filter(is_genesets_file, os.listdir(pth)) 
243    for setfile in gs_files:
244        if setfile.__contains__(gene_set):
245            setBgone = os.path.join(pth, setfile)
246            os.remove(setBgone) 
247
248def modification_date(file):
249    t = os.path.getmtime(file)
250    return datetime.datetime.fromtimestamp(t)
251
252def list_serverfiles_from_flist(flist):
253    gs_files = filter(is_genesets_file, flist)
254    localfiles = set(orngServerFiles.listfiles(sfdomain))
255    return [ filename_parse(fn) + \
256        ((True,) if fn in localfiles else (False,)) for fn in gs_files ]
257
258def list_serverfiles_conn(serverfiles=None):
259    """ Returns available gene sets from the server files
260    repository: a list of (hierarchy, organism, on_local) """
261    if serverfiles == None:
262        serverfiles = orngServerFiles.ServerFiles()
263    flist = serverfiles.listfiles(sfdomain)
264    return list_serverfiles_from_flist(flist)
265
266def list_serverfiles():
267    fname = orngServerFiles.localpath_download(sfdomain, "index.pck")
268    flist = pickle.load(open(fname, 'r'))
269    return list_serverfiles_from_flist(flist)
270
271def list_all():
272    """
273    return a list of (hier, org, avalable_locally)
274    If something for a specific (hier, org) is not downloaded
275    yet, show it as not-local. """
276    flist = list_local() + list_serverfiles()
277    d = {}
278    for h,o,local in flist:
279        d[h,o] = min(local, d.get((h,o),True))
280    return [ (h,o,local) for (h,o),local in d.items() ]
281
282def update_server_list(serverfiles_upload, serverfiles_list=None):
283    if serverfiles_list == None:
284        serverfiles_list = orngServerFiles.ServerFiles()
285
286    flist = map(lambda x: filename(*x[:2]), list_serverfiles_conn(serverfiles_list))
287
288    tfname = pickle_temp(flist)
289
290    try:
291        fn = "index.pck"
292        title = "Gene sets: index"
293        tags = [ "gene sets", "index", "essential" ]
294        serverfiles_upload.upload(sfdomain, fn, tfname, title, tags)
295        serverfiles_upload.unprotect(sfdomain, fn)
296    except Exception,e:
297        raise e
298    finally:
299        os.remove(tfname)
300
301def _register_local(genesets):
302    """ Registers using the common hierarchy and organism. """
303    pth = local_path()
304
305    org = genesets.common_org()
306    hierarchy = genesets.common_hierarchy()
307    fn = filename(hierarchy, org)
308
309    with open(os.path.join(pth, fn), "w") as f:
310        pickle.dump(genesets, f)
311
312    return fn
313
314def pickle_temp(obj):
315    """ Pickle a file to a temporary file and returns its name """
316    fd,tfname = tempfile.mkstemp()
317    os.close(fd)
318    f = open(tfname, 'wb')
319    pickle.dump(obj, f)
320    f.close()
321    return tfname
322
323def _register_serverfiles(genesets, serverFiles):
324    """ Registers using the common hierarchy and organism. """
325    org = genesets.common_org()
326    hierarchy = genesets.common_hierarchy()
327    fn = filename(hierarchy, org)
328
329    #save to temporary file
330    tfname = pickle_temp(genesets)
331
332    try:
333        taxname = obiTaxonomy.name(org)
334        title = "Gene sets: " + ", ".join(hierarchy) + \
335            ((" (" + taxname + ")") if org != None else "")
336        tags = list(hierarchy) + [ "gene sets", taxname ] + \
337            ([ "essential" ] if org in obiTaxonomy.essential_taxids() else [] )
338        serverFiles.upload(sfdomain, fn, tfname, title, tags)
339        serverFiles.unprotect(sfdomain, fn)
340    finally:
341        os.remove(tfname)
342
343    update_server_list(serverFiles)
344
345def register(genesets, serverFiles=None):
346    """
347    Hierarchy is induced from the gene set names.
348    """
349    if serverFiles == None:
350        _register_local(genesets)
351    else:
352        _register_serverfiles(genesets, serverFiles)
353
354def build_hierarchy_dict(files):
355    hierd = defaultdict(list)
356    for ind,f in enumerate(files):
357        hier, org = f
358        for i in range(len(hier)+1):
359            hierd[(hier[:i], org)].append(ind)
360    return hierd
361
362def load_local(hierarchy, organism):
363    files = map(lambda x: x[:2], list_local())
364    hierd = build_hierarchy_dict(files)
365
366    out = GeneSets()
367    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
368        fname = os.path.join(local_path(), filename(h, o))
369        out.update(pickle.load(open(fname, 'r')))
370    return out
371
372def load_serverfiles(hierarchy, organism):
373    files = map(lambda x: x[:2], list_serverfiles())
374    hierd = build_hierarchy_dict(files)
375    out = GeneSets()
376    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
377        fname = orngServerFiles.localpath_download(sfdomain,
378            filename(h, o))
379        out.update(pickle.load(open(fname, 'r')))
380    return out
381
382def load(hierarchy, organism):
383    """ First try to load from the local registred folder. If the file
384    is not available, load it from the server files. """
385    ret = load_local(hierarchy, organism)
386    if len(ret) == 0:
387        ret.update(load_serverfiles(hierarchy, organism))
388    return ret
389
390def collections(*args):
391    """
392    Input is a list of collections.
393    Collection can either be a tuple (hierarchy, orgranism), where
394    hierarchy is a tuple also.
395    """
396    result = GeneSets()
397
398    for collection in args:
399        try:
400            result.update(collection)
401        except (ValueError, TypeError):
402            if issequencens(collection): #have a hierarchy, organism specification
403                new = load(*collection)
404                result.update(new)
405            else:
406                if collection.lower()[-4:] == ".gmt": #format from webpage
407                    result.update(loadGMT(open(collection,"rt").read(), collection))
408                else:
409                    raise Exception("collection() accepts files in .gmt format only.")
410
411    return result
412
413def issequencens(x):
414    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and it is not an instance of basestring."
415    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
416
417class TException(Exception): pass
418
419def upload_genesets(rsf):
420    """
421    Builds the default gene sets and
422    """
423    orngServerFiles.update_local_files()
424
425    genesetsfn = [ keggGeneSets, goGeneSets, miRNAGeneSets]
426    organisms = obiTaxonomy.common_taxids()
427    for fn in genesetsfn:
428        for org in organisms:
429            try:
430                print "Uploading ORG", org, fn
431                genesets = fn(org).split_by_hierarchy()
432                for gs in genesets:
433                    print "registering", gs.common_hierarchy()
434                    register(gs, rsf) #server files
435                    #register(gs)
436                    print "successful", gs.common_hierarchy()
437            except (obiKEGG.OrganismNotFoundError, GenesetRegException):
438                print "organism not found", org
439
440if __name__ == "__main__":
441    rsf = orngServerFiles.ServerFiles(username=sys.argv[1], password=sys.argv[2])
442    upload_genesets(rsf)
443    pass
444
445
Note: See TracBrowser for help on using the repository browser.