source: orange-bioinformatics/_bioinformatics/obiGeneSets.py @ 1794:3a530b619d7f

Revision 1794:3a530b619d7f, 13.2 KB checked in by Flashpoint <vid.flashpoint@…>, 11 months ago (diff)

Added Cytobands genesets and modified obiOMIM.py. Both need some extra work.

Line 
1"""
2Maintainer: Marko Toplak
3"""
4
5from __future__ import absolute_import, with_statement
6
7if __name__ == "__main__":
8    __package__ = "Orange.bio"
9
10import cPickle as pickle, os, tempfile, sys
11from collections import defaultdict
12
13import Orange.core as orange
14from Orange.orng import orngServerFiles
15
16from . import obiGO, obiKEGG, obiTaxonomy
17
18sfdomain = "gene_sets"
19
20def nth(l,n):
21    return [ a[n] for a in l]
22
23from Orange.bio.geneset import GeneSet, GeneSets, GenesetRegException
24
25def goGeneSets(org):
26    """Returns gene sets from GO."""
27    ontology = obiGO.Ontology()
28    annotations = obiGO.Annotations(org, ontology=ontology)
29
30    genesets = []
31    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
32    for termn, term in ontology.terms.items():
33        genes = annotations.GetAllGenes(termn)
34        hier = ("GO", term.namespace)
35        if len(genes) > 0:
36            gs = GeneSet(id=termn, name=term.name, genes=genes, hierarchy=hier, organism=org, link=link_fmt % termn)
37            genesets.append(gs)
38
39    return GeneSets(genesets)
40
41def keggGeneSets(org):
42    """
43    Returns gene sets from KEGG pathways.
44    """
45
46    kegg = obiKEGG.KEGGOrganism(org)
47
48    genesets = []
49    for id in kegg.pathways():
50        print id
51        pway = obiKEGG.KEGGPathway(id)
52        hier = ("KEGG","pathways")
53        gs = GeneSet(id=id,
54                                 name=pway.title,
55                                 genes=kegg.get_genes_by_pathway(id),
56                                 hierarchy=hier,
57                                 organism=org,
58                                 link=pway.link)
59        genesets.append(gs)
60
61    return GeneSets(genesets)
62
63def dictyMutantSets():
64    """
65    Return dicty mutant phenotype gene sets from Dictybase
66    """
67    from . import obiDictyMutants
68    link_fmt = "http://dictybase.org/db/cgi-bin/dictyBase/SC/scsearch.pl?searchdb=strains&search_term=%s&column=all&B1=Submit" 
69    #genesets = [GeneSet(id=mutant.name, name=mutant.descriptor, genes=obiDictyMutants.mutant_genes(mutant), hierarchy=("Dictybase", "Mutants"), organism="352472", # 352472 gathered from obiGO.py code_map -> Dicty identifier
70    #                    link=(link_fmt % mutant.name if mutant.name else None)) \
71    #                    for mutant in obiDictyMutants.mutants()]
72 
73    genesets = [GeneSet(id=phenotype, name=phenotype, genes=[obiDictyMutants.mutant_genes(mutant)[0] for mutant in mutants], hierarchy=("Dictybase", "Phenotypes"), organism="352472", # 352472 gathered from obiGO.py code_map -> Dicty identifier
74                        link="") \
75                        for phenotype, mutants in obiDictyMutants.phenotype_mutants().items()]
76
77    return GeneSets(genesets)
78
79def cytobandGeneSets():
80    """
81    Return cytoband gene sets from Stanford Microarray Database
82    """
83    from . import obiCytobands
84   
85    genesets = [GeneSet(id=band.name, name=band.descriptor, genes=obiCytobands.band_genes(band), hierarchy=("Cytobands",), organism="9606", # 352472 gathered from obiGO.py code_map -> Dicty identifier
86                        link="") \
87                        for band in obiCytobands.bands()]
88 
89    return GeneSets(genesets)
90
91def omimGeneSets():
92    """
93    Return gene sets from OMIM (Online Mendelian Inheritance in Man) diseses
94    """
95    from . import obiOMIM   
96    genesets = [GeneSet(id=disease.id, name=disease.name, genes=obiOMIM.disease_genes(disease), hierarchy=("OMIM",), organism="9606",
97                    link=("http://www.omim.org/entry/%s" % disease.id if disease.id else None)) \
98                    for disease in obiOMIM.diseases()]
99    return GeneSets(genesets)
100
101def miRNAGeneSets(org):
102    """
103    Return gene sets from miRNA targets
104    """
105    from . import obimiRNA
106    org_code = obiKEGG.from_taxid(org)
107    link_fmt = "http://www.mirbase.org/cgi-bin/mirna_entry.pl?acc=%s"
108    mirnas = [(id, obimiRNA.get_info(id)) for id in obimiRNA.ids(org_code)]
109    genesets = [GeneSet(id=mirna.matACC, name=mirna.matID, genes=mirna.targets.split(","), hierarchy=("miRNA", "Targets"),
110                        organism=org, link=link_fmt % mirna.matID) for id, mirna in mirnas]
111    return GeneSets(genesets)
112
113def go_miRNASets(org, ontology=None, enrichment=True, pval=0.05, treshold=0.04):
114    from . import obimiRNA, obiGO
115    mirnas = obimiRNA.ids(int(org))
116    if ontology is None:
117        ontology = obiGO.Ontology()
118
119    annotations = obiGO.Annotations(org, ontology=ontology)
120
121    go_sets = obimiRNA.get_GO(mirnas, annotations, enrichment=enrichment, pval=pval, goSwitch=False)
122    print go_sets
123
124    go_sets = obimiRNA.filter_GO(go_sets, annotations, treshold=treshold)
125
126    link_fmt = "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=%s"
127    gsets = [GeneSet(id=key, name=ontology[key].name, genes=value, hierarchy=("miRNA", "go_sets",),
128                        organism=org, link=link_fmt % key) for key, value in go_sets.items()]
129    gset = GeneSets(gsets)
130    return gset
131
132def loadGMT(contents, name):
133    """
134    Eech line consists of tab separated elements. First is
135    the geneset name, next is it's description.
136
137    For now the description is skipped.
138    """
139
140    def hline(s):
141        tabs = [tab.strip() for tab in s.split("\t")]
142        return GeneSet(id=tabs[0], description=tabs[1],
143                                   hierarchy=(name,), genes=tabs[2:])
144
145    def handleNELines(s, fn):
146        """
147        Run function on nonempty lines of a string.
148        Return a list of results for each line.
149        """
150        lines = (l.strip() for l in s.splitlines())
151        return [fn(l) for l in lines if l]
152
153    return GeneSets(handleNELines(contents, hline))
154
155"""
156We have multiple paths for gene set data:
157buffer/bigfiles/gene_sets
158and
159buffer/gene_sets_local
160both have available.txt
161"""
162
163def omakedirs(dir):
164    try:
165        os.makedirs(dir)
166    except OSError:
167        pass
168
169def local_path():
170    """ Returns local path for gene sets. Creates it if it does not exists
171    yet. """
172    from Orange.orng import orngEnviron
173    pth = os.path.join(orngEnviron.directoryNames["bufferDir"], "gene_sets_local")
174    omakedirs(pth)
175    return pth
176
177def build_index(dir):
178    """ Returns gene set availability index for some folder. """
179    pass
180
181def filename(hierarchy, organism):
182    """ Obtain a filename for given hierarchy and organism. """
183    return "gs_" + "_._".join(hierarchy + \
184        (organism if organism != None else "",)) + ".pck"
185
186def filename_parse(fn):
187    """ Returns a hierarchy and the organism from the filename."""
188    fn = fn[3:-4]
189    parts = fn.split("_._")
190    hierarchy = tuple(parts[:-1])
191    org = parts[-1] if parts[-1] != "" else None
192    return hierarchy, org
193
194def is_genesets_file(fn):
195    return fn.startswith("gs_") and fn.endswith(".pck")
196
197def list_local():
198    """ Returns available gene sets from the local repository:
199    a list of (hierarchy, organism, on_local) """
200    pth = local_path()
201    gs_files = filter(is_genesets_file, os.listdir(pth))
202    return [ filename_parse(fn) + (True,) for fn in gs_files ]
203
204def list_serverfiles_from_flist(flist):
205    gs_files = filter(is_genesets_file, flist)
206    localfiles = set(orngServerFiles.listfiles(sfdomain))
207    return [ filename_parse(fn) + \
208        ((True,) if fn in localfiles else (False,)) for fn in gs_files ]
209
210def list_serverfiles_conn(serverfiles=None):
211    """ Returns available gene sets from the server files
212    repository: a list of (hierarchy, organism, on_local) """
213    if serverfiles == None:
214        serverfiles = orngServerFiles.ServerFiles()
215    flist = serverfiles.listfiles(sfdomain)
216    return list_serverfiles_from_flist(flist)
217
218def list_serverfiles():
219    fname = orngServerFiles.localpath_download(sfdomain, "index.pck")
220    flist = pickle.load(open(fname, 'r'))
221    return list_serverfiles_from_flist(flist)
222
223def list_all():
224    """
225    return a list of (hier, org, avalable_locally)
226    If something for a specific (hier, org) is not downloaded
227    yet, show it as not-local. """
228    flist = list_local() + list_serverfiles()
229    d = {}
230    for h,o,local in flist:
231        d[h,o] = min(local, d.get((h,o),True))
232    return [ (h,o,local) for (h,o),local in d.items() ]
233
234def update_server_list(serverfiles_upload, serverfiles_list=None):
235    if serverfiles_list == None:
236        serverfiles_list = orngServerFiles.ServerFiles()
237
238    flist = map(lambda x: filename(*x[:2]), list_serverfiles_conn(serverfiles_list))
239
240    tfname = pickle_temp(flist)
241
242    try:
243        fn = "index.pck"
244        title = "Gene sets: index"
245        tags = [ "gene sets", "index", "essential" ]
246        serverfiles_upload.upload(sfdomain, fn, tfname, title, tags)
247        serverfiles_upload.unprotect(sfdomain, fn)
248    except Exception,e:
249        raise e
250    finally:
251        os.remove(tfname)
252
253def _register_local(genesets):
254    """ Registers using the common hierarchy and organism. """
255    pth = local_path()
256
257    org = genesets.common_org()
258    hierarchy = genesets.common_hierarchy()
259    fn = filename(hierarchy, org)
260
261    with open(os.path.join(pth, fn), "w") as f:
262        pickle.dump(genesets, f)
263
264    return fn
265
266def pickle_temp(obj):
267    """ Pickle a file to a temporary file and returns its name """
268    fd,tfname = tempfile.mkstemp()
269    os.close(fd)
270    f = open(tfname, 'wb')
271    pickle.dump(obj, f)
272    f.close()
273    return tfname
274
275def _register_serverfiles(genesets, serverFiles):
276    """ Registers using the common hierarchy and organism. """
277    org = genesets.common_org()
278    hierarchy = genesets.common_hierarchy()
279    fn = filename(hierarchy, org)
280
281    #save to temporary file
282    tfname = pickle_temp(genesets)
283
284    try:
285        taxname = obiTaxonomy.name(org)
286        title = "Gene sets: " + ", ".join(hierarchy) + \
287            ((" (" + taxname + ")") if org != None else "")
288        tags = list(hierarchy) + [ "gene sets", taxname ] + \
289            ([ "essential" ] if org in obiTaxonomy.essential_taxids() else [] )
290        serverFiles.upload(sfdomain, fn, tfname, title, tags)
291        serverFiles.unprotect(sfdomain, fn)
292    finally:
293        os.remove(tfname)
294
295    update_server_list(serverFiles)
296
297def register(genesets, serverFiles=None):
298    """
299    Hierarchy is induced from the gene set names.
300    """
301    if serverFiles == None:
302        _register_local(genesets)
303    else:
304        _register_serverfiles(genesets, serverFiles)
305
306def build_hierarchy_dict(files):
307    hierd = defaultdict(list)
308    for ind,f in enumerate(files):
309        hier, org = f
310        for i in range(len(hier)+1):
311            hierd[(hier[:i], org)].append(ind)
312    return hierd
313
314def load_local(hierarchy, organism):
315    files = map(lambda x: x[:2], list_local())
316    hierd = build_hierarchy_dict(files)
317
318    out = GeneSets()
319    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
320        fname = os.path.join(local_path(), filename(h, o))
321        out.update(pickle.load(open(fname, 'r')))
322    return out
323
324def load_serverfiles(hierarchy, organism):
325    files = map(lambda x: x[:2], list_serverfiles())
326    hierd = build_hierarchy_dict(files)
327    out = GeneSets()
328    for (h, o) in [ files[i] for i in hierd[(hierarchy, organism)]]:
329        fname = orngServerFiles.localpath_download(sfdomain,
330            filename(h, o))
331        out.update(pickle.load(open(fname, 'r')))
332    return out
333
334def load(hierarchy, organism):
335    """ First try to load from the local registred folder. If the file
336    is not available, load it from the server files. """
337    ret = load_local(hierarchy, organism)
338    if len(ret) == 0:
339        ret.update(load_serverfiles(hierarchy, organism))
340    return ret
341
342def collections(*args):
343    """
344    Input is a list of collections.
345    Collection can either be a tuple (hierarchy, orgranism), where
346    hierarchy is a tuple also.
347    """
348    result = GeneSets()
349
350    for collection in args:
351        try:
352            result.update(collection)
353        except (ValueError, TypeError):
354            if issequencens(collection): #have a hierarchy, organism specification
355                new = load(*collection)
356                result.update(new)
357            else:
358                if collection.lower()[-4:] == ".gmt": #format from webpage
359                    result.update(loadGMT(open(collection,"rt").read(), collection))
360                else:
361                    raise Exception("collection() accepts files in .gmt format only.")
362
363    return result
364
365def issequencens(x):
366    "Is x a sequence and not string ? We say it is if it has a __getitem__ method and it is not an instance of basestring."
367    return hasattr(x, '__getitem__') and not isinstance(x, basestring)
368
369class TException(Exception): pass
370
371def upload_genesets(rsf):
372    """
373    Builds the default gene sets and
374    """
375    orngServerFiles.update_local_files()
376
377    genesetsfn = [ keggGeneSets, goGeneSets, miRNAGeneSets]
378    organisms = obiTaxonomy.common_taxids()
379    for fn in genesetsfn:
380        for org in organisms:
381            try:
382                print "Uploading ORG", org, fn
383                genesets = fn(org).split_by_hierarchy()
384                for gs in genesets:
385                    print "registering", gs.common_hierarchy()
386                    register(gs, rsf) #server files
387                    #register(gs)
388                    print "successful", gs.common_hierarchy()
389            except (obiKEGG.OrganismNotFoundError, GenesetRegException):
390                print "organism not found", org
391
392if __name__ == "__main__":
393    rsf = orngServerFiles.ServerFiles(username=sys.argv[1], password=sys.argv[2])
394    upload_genesets(rsf)
395    pass
396
397
Note: See TracBrowser for help on using the repository browser.