source: orange-bioinformatics/_bioinformatics/obiGeneSets.py @ 1835:fa00a0fd32a9

Revision 1835:fa00a0fd32a9, 15.2 KB checked in by Flashpoint <vid.flashpoint@…>, 10 months ago (diff)

Fixed the exception handling in obiGeneSets.py

Line 
1"""
2Maintainer: Marko Toplak
3"""
4
5from __future__ import absolute_import, with_statement
6
7if __name__ == "__main__":
8    __package__ = "Orange.bio"
9
10import cPickle as pickle, os, tempfile, sys
11from collections import defaultdict
12import datetime
13
14import Orange.core as orange
15from Orange.orng import orngServerFiles
16
17from . import obiGO, obiKEGG, obiTaxonomy
18
19sfdomain = "gene_sets"
20
21def nth(l,n):
22    return [ a[n] for a in l]
23
24from Orange.bio.geneset import GeneSet, GeneSets, GenesetRegException
25
26def goGeneSets(org):
27    """Returns gene sets from GO."""
28    ontology = obiGO.Ontology()
29    annotations = obiGO.Annotations(org, ontology=ontology)
30
31    genesets = []
32    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
33    for termn, term in ontology.terms.items():
34        genes = annotations.GetAllGenes(termn)
35        hier = ("GO", term.namespace)
36        if len(genes) > 0:
37            gs = GeneSet(id=termn, name=term.name, genes=genes, hierarchy=hier, organism=org, link=link_fmt % termn)
38            genesets.append(gs)
39
40    return GeneSets(genesets)
41
42def keggGeneSets(org):
43    """
44    Returns gene sets from KEGG pathways.
45    """
46
47    kegg = obiKEGG.KEGGOrganism(org)
48
49    genesets = []
50    for id in kegg.pathways():
51        print id
52        pway = obiKEGG.KEGGPathway(id)
53        hier = ("KEGG","pathways")
54        gs = GeneSet(id=id,
55                                 name=pway.title,
56                                 genes=kegg.get_genes_by_pathway(id),
57                                 hierarchy=hier,
58                                 organism=org,
59                                 link=pway.link)
60        genesets.append(gs)
61
62    return GeneSets(genesets)
63
64def dictyMutantSets():
65    """
66    Return dicty mutant phenotype gene sets from Dictybase
67    """
68    from . import obiDictyMutants
69    link_fmt = "http://dictybase.org/db/cgi-bin/dictyBase/SC/scsearch.pl?searchdb=strains&search_term=%s&column=all&B1=Submit" 
70    #genesets = [GeneSet(id=mutant.name, name=mutant.descriptor, genes=obiDictyMutants.mutant_genes(mutant), hierarchy=("Dictybase", "Mutants"), organism="352472", # 352472 gathered from obiGO.py code_map -> Dicty identifier
71    #                    link=(link_fmt % mutant.name if mutant.name else None)) \
72    #                    for mutant in obiDictyMutants.mutants()]
73 
74    genesets = [GeneSet(id=phenotype, name=phenotype, genes=[obiDictyMutants.mutant_genes(mutant)[0] for mutant in mutants], hierarchy=("Dictybase", "Phenotypes"), organism="352472", # 352472 gathered from obiGO.py code_map -> Dicty identifier
75                        link="") \
76                        for phenotype, mutants in obiDictyMutants.phenotype_mutants().items()]
77
78    return GeneSets(genesets)
79
80def cytobandGeneSets():
81    """
82    Create cytoband gene sets from Stanford Microarray Database
83    """
84    import urllib2
85
86    url = "http://www-stat.stanford.edu/~tibs/GSA/cytobands-stanford.gmt"
87    stream = urllib2.urlopen(url)
88    data = stream.read().splitlines()
89
90    genesets = []
91    for band in data:
92        b = band.split("\t")
93        genesets.append(GeneSet(id=b[0], name=b[1], genes=b[2:] if b[2:] else [], hierarchy=("Cytobands",), organism="9606", link=""))         
94
95    return GeneSets(genesets)
96
97def reactomePathwaysGeneSets():
98    """
99    Prepare human pathways gene sets from reactome pathways
100    """
101    import urllib
102    import io
103    from zipfile import ZipFile
104
105    url = urllib.urlopen("http://www.reactome.org/download/current/ReactomePathways.gmt.zip")
106    memfile = io.BytesIO(url.read())
107    with ZipFile(memfile, "r") as myzip:
108        f = myzip.open("ReactomePathways.gmt")
109        content = f.read().splitlines()     
110
111    genesets = [GeneSet(id=path.split("\t")[0], name=path.split("\t")[0], genes=path.split("\t")[2:] if path.split("\t")[2:] else [], hierarchy=("Reactome", "Pathways"), organism="9606", link="") for path in content]
112    return GeneSets(genesets)
113
114
115def omimGeneSets():
116    """
117    Return gene sets from OMIM (Online Mendelian Inheritance in Man) diseses
118    """
119    from . import obiOMIM   
120    genesets = [GeneSet(id=disease.id, name=disease.name, genes=obiOMIM.disease_genes(disease), hierarchy=("OMIM",), organism="9606",
121                    link=("http://www.omim.org/entry/%s" % disease.id if disease.id else None)) \
122                    for disease in obiOMIM.diseases()]
123    return GeneSets(genesets)
124
125def miRNAGeneSets(org):
126    """
127    Return gene sets from miRNA targets
128    """
129    from . import obimiRNA
130    org_code = obiKEGG.from_taxid(org)
131    link_fmt = "http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s"
132    mirnas = [(id, obimiRNA.get_info(id)) for id in obimiRNA.ids(org_code)]
133    genesets = [GeneSet(id=mirna.matACC, name=mirna.matID, genes=mirna.targets.split(","), hierarchy=("miRNA", "Targets"),
134                        organism=org, link=link_fmt % mirna.matID) for id, mirna in mirnas]
135    return GeneSets(genesets)
136
137def go_miRNASets(org, ontology=None, enrichment=True, pval=0.05, treshold=0.04):
138    from . import obimiRNA, obiGO
139    mirnas = obimiRNA.ids(int(org))
140    if ontology is None:
141        ontology = obiGO.Ontology()
142
143    annotations = obiGO.Annotations(org, ontology=ontology)
144
145    go_sets = obimiRNA.get_GO(mirnas, annotations, enrichment=enrichment, pval=pval, goSwitch=False)
146
147    go_sets = obimiRNA.filter_GO(go_sets, annotations, treshold=treshold)
148
149    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
150    gsets = [GeneSet(id=key, name=ontology[key].name, genes=value, hierarchy=("miRNA", "go_sets",),
151                        organism=org, link=link_fmt % key) for key, value in go_sets.items()]
152    gset = GeneSets(gsets)
153    return gset
154
155def loadGMT(contents, name):
156    """
157    Eech line consists of tab separated elements. First is
158    the geneset name, next is it's description.
159
160    For now the description is skipped.
161
162    Example Gene Set (.gmt) file:
163    anti-liver_sw   anti-liver_sw   BMPR1A  APOD    WSB1    BMI1    SLC2A1  ...
164    B-cells_sw  B-cells_sw  E2F5    NCF1    PALM2-AKAP2 IRF4    SLC2A1  ...
165    Bladder_sw  Bladder_sw  PLCD4   ANGPTL1 LOC286191   ST0N1   LOC283904   ...
166    cerebellum_sw   cerebellum_sw   C19orf43    LOC653464   KI110802    ...
167    Cervix_sw   Cervix_sw   LAMA4   GSTM5   SNX19   DKK1    NT5E    ...
168    """
169
170    def hline(s):
171        tabs = [tab.strip() for tab in s.split("\t")]
172        return GeneSet(id=tabs[0], description=tabs[1],
173                                   hierarchy=("Custom Gene Sets",name), genes=tabs[2:])
174
175    def handleNELines(s, fn):
176        """
177        Run function on nonempty lines of a string.
178        Return a list of results for each line.
179        """
180        lines = (l.strip() for l in s.splitlines())
181        return [fn(l) for l in lines if l]
182
183    return GeneSets(handleNELines(contents, hline))
184
185def getGenesetsStats(genesets):
186    num_sets = len(genesets)
187    unique_genes = len(set([gene for geneset in genesets for gene in geneset.genes]))
188    genes_per_geneset = sum([len(geneset.genes) for geneset in genesets])/num_sets
189    return num_sets, unique_genes, genes_per_geneset
190"""
191We have multiple paths for gene set data:
192buffer/bigfiles/gene_sets
193and
194buffer/gene_sets_local
195both have available.txt
196"""
197
198def omakedirs(dir):
199    try:
200        os.makedirs(dir)
201    except OSError:
202        pass
203
204def local_path():
205    """ Returns local path for gene sets. Creates it if it does not exists
206    yet. """
207    from Orange.orng import orngEnviron
208    pth = os.path.join(orngEnviron.directoryNames["bufferDir"], "gene_sets_local")
209    omakedirs(pth)
210    return pth
211
212def build_index(dir):
213    """ Returns gene set availability index for some folder. """
214    pass
215
216def filename(hierarchy, organism):
217    """ Obtain a filename for given hierarchy and organism. """
218    return "gs_" + "_._".join(hierarchy + \
219        (organism if organism != None else "",)) + ".pck"
220
221def filename_parse(fn):
222    """ Returns a hierarchy and the organism from the filename."""
223    fn = fn[3:-4]
224    parts = fn.split("_._")
225    hierarchy = tuple(parts[:-1])
226    org = parts[-1] if parts[-1] != "" else None
227    return hierarchy, org
228
229def is_genesets_file(fn):
230    return fn.startswith("gs_") and fn.endswith(".pck")
231
232def list_local():
233    """ Returns available gene sets from the local repository:
234    a list of (hierarchy, organism, on_local) """
235    pth = local_path()
236    gs_files = filter(is_genesets_file, os.listdir(pth))
237    return [ filename_parse(fn) + (True,) for fn in gs_files ]
238
239def remove_local(gene_set):
240    """ Removes a given gene set from the local repository. """
241    pth = local_path()
242    gs_files = filter(is_genesets_file, os.listdir(pth)) 
243    for setfile in gs_files:
244        if setfile.__contains__(gene_set):
245            setBgone = os.path.join(pth, setfile)
246            os.remove(setBgone) 
247
248def modification_date(file):
249    t = os.path.getmtime(file)
250    return datetime.datetime.fromtimestamp(t)
251
252def list_serverfiles_from_flist(flist):
253    gs_files = filter(is_genesets_file, flist)
254    localfiles = set(orngServerFiles.listfiles(sfdomain))
255    return [ filename_parse(fn) + \
256        ((True,) if fn in localfiles else (False,)) for fn in gs_files ]
257
258def list_serverfiles_conn(serverfiles=None):
259    """ Returns available gene sets from the server files
260    repository: a list of (hierarchy, organism, on_local) """
261    if serverfiles == None:
262        serverfiles = orngServerFiles.ServerFiles()
263    flist = serverfiles.listfiles(sfdomain)
264    return list_serverfiles_from_flist(flist)
265
266def list_serverfiles():
267    fname = orngServerFiles.localpath_download(sfdomain, "index.pck")
268    flist = pickle.load(open(fname, 'r'))
269    return list_serverfiles_from_flist(flist)
270
271def list_all():
272    """
273    return a list of (hier, org, avalable_locally)
274    If something for a specific (hier, org) is not downloaded
275    yet, show it as not-local. """
276    flist = list_local() + list_serverfiles()
277    d = {}
278    for h,o,local in flist:
279        d[h,o] = min(local, d.get((h,o),True))
280    return [ (h,o,local) for (h,o),local in d.items() ]
281
282def update_server_list(serverfiles_upload, serverfiles_list=None):
283    if serverfiles_list == None:
284        serverfiles_list = orngServerFiles.ServerFiles()
285
286    flist = map(lambda x: filename(*x[:2]), list_serverfiles_conn(serverfiles_list))
287
288    tfname = pickle_temp(flist)
289
290    try:
291        fn = "index.pck"
292        title = "Gene sets: index"
293        tags = [ "gene sets", "index", "essential" ]
294        serverfiles_upload.upload(sfdomain, fn, tfname, title, tags)
295        serverfiles_upload.unprotect(sfdomain, fn)
296    except Exception,e:
297        raise e
298    finally:
299        os.remove(tfname)
300
301def _register_local(genesets):
302    """ Registers using the common hierarchy and organism. """
303    pth = local_path()
304    org = genesets.common_org()
305    hierarchy = genesets.common_hierarchy()
306    fn = filename(hierarchy, org)
307
308    with open(os.path.join(pth, fn), "w") as f:
309        pickle.dump(genesets, f)
310
311    return fn
312
313def pickle_temp(obj):
314    """ Pickle a file to a temporary file and returns its name """
315    fd,tfname = tempfile.mkstemp()
316    os.close(fd)
317    f = open(tfname, 'wb')
318    pickle.dump(obj, f)
319    f.close()
320    return tfname
321
322def _register_serverfiles(genesets, serverFiles):
323    """ Registers using the common hierarchy and organism. """
324    org = genesets.common_org()
325    hierarchy = genesets.common_hierarchy()
326    fn = filename(hierarchy, org)
327
328    #save to temporary file
329    tfname = pickle_temp(genesets)
330
331    try:
332        taxname = obiTaxonomy.name(org)
333        title = "Gene sets: " + ", ".join(hierarchy) + \
334            ((" (" + taxname + ")") if org != None else "")
335        tags = list(hierarchy) + [ "gene sets", taxname ] + obiTaxonomy.shortname(org) +\
336            ([ "essential" ] if org in obiTaxonomy.essential_taxids() else [] )
337        serverFiles.upload(sfdomain, fn, tfname, title, tags)
338        serverFiles.unprotect(sfdomain, fn)
339    finally:
340        os.remove(tfname)
341
342    update_server_list(serverFiles)
343
344def register(genesets, serverFiles=None):
345    """
346    Hierarchy is induced from the gene set names.
347    """
348    if serverFiles == None:
349        _register_local(genesets)
350    else:
351        _register_serverfiles(genesets, serverFiles)
352
353def build_hierarchy_dict(files):
354    hierd = defaultdict(list)
355    for ind,f in enumerate(files):
356        hier, org = f
357        for i in range(len(hier)+1):
358            hierd[(hier[:i], org)].append(ind)
359    return hierd
360
361
362
363
364    files = map(lambda x: x[:2], list_local())
365    hierd = build_hierarchy_dict(files)
366
367    out = GeneSets()
368    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
369        fname = os.path.join(local_path(), filename(h, o))
370        out.update(pickle.load(open(fname, 'r')))
371    return out
372
373def load_serverfiles(hierarchy, organism):
374    files = map(lambda x: x[:2], list_serverfiles())
375    hierd = build_hierarchy_dict(files)
376    out = GeneSets()
377    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
378        fname = orngServerFiles.localpath_download(sfdomain,
379            filename(h, o))
380        out.update(pickle.load(open(fname, 'r')))
381    return out
382
383def load(hierarchy, organism):
384    """ First try to load from the local registred folder. If the file
385    is not available, load it from the server files. """
386    ret = load_local(hierarchy, organism)
387    if len(ret) == 0:
388        ret.update(load_serverfiles(hierarchy, organism))
389    return ret
390
391def collections(*args):
392    """
393    Input is a list of collections.
394    Collection can either be a tuple (hierarchy, orgranism), where
395    hierarchy is a tuple also.
396    """
397    result = GeneSets()
398
399    for collection in args:
400        try:
401            result.update(collection)
402        except (ValueError, TypeError):
403            if issequencens(collection): #have a hierarchy, organism specification
404                new = load(*collection)
405                result.update(new)
406            else:
407                if collection.lower()[-4:] == ".gmt": #format from webpage
408                    result.update(loadGMT(open(collection,"rt").read(), collection))
409                else:
410                    raise Exception("collection() accepts files in .gmt format only.")
411
412    return result
413
414def issequencens(x):
415    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and it is not an instance of basestring."
416    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
417
418class TException(Exception): pass
419
420def upload_genesets(rsf):
421    """
422    Builds the default gene sets and
423    """
424    orngServerFiles.update_local_files()
425
426    genesetsfn = [ keggGeneSets, goGeneSets, miRNAGeneSets]
427    organisms = obiTaxonomy.common_taxids()
428    for fn in genesetsfn:
429        for org in organisms:
430            try:
431                print "Uploading ORG", org, fn
432                genesets = fn(org).split_by_hierarchy()
433                for gs in genesets:
434                    print "registering", gs.common_hierarchy()
435                    register(gs, rsf) #server files
436                    #register(gs)
437                    print "successful", gs.common_hierarchy()
438            except obiTaxonomy.UnknownSpeciesIdentifier:
439                print "Organism ontology not available", org
440            except GenesetRegException:
441                print "Empty gene sets.", org
442
443if __name__ == "__main__":
444    rsf = orngServerFiles.ServerFiles(username=sys.argv[1], password=sys.argv[2])
445    upload_genesets(rsf)
446    pass
447
448
Note: See TracBrowser for help on using the repository browser.