source: orange-bioinformatics/_bioinformatics/obiGeneAtlasSets.py @ 1826:f402285698e5

Revision 1826:f402285698e5, 2.7 KB checked in by Flashpoint <vid.flashpoint@…>, 10 months ago (diff)

Created the obiGeneAtlasSets.py script, to easily prepare gene sets from GXA

Line 
1#!/usr/bin/env python
2
3from collections import defaultdict
4from Orange.bio.obiGeneAtlas import run_simple_query
5from Orange.bio.obiGeneSets import GeneSets, GeneSet
6
7def display_string(string):
8    return string.capitalize().replace("_", " ")
9
10regulation = "updown"
11organism   = "Homo sapiens"
12condition  = "cell_line"
13org_code   = "9606"
14max_pvalue = 1e-5
15
16query = run_simple_query(regulation=regulation, organism=organism, condition=condition, start=0, rows=5) # Do a short pre-query to see how many genes there are
17total_genes = query["totalResults"]
18
19no_of_genes = 50 # 50 genes per query used, since the HTTP requests don't seem stable with more than 50 genes at a time.
20no_of_pages = total_genes / no_of_genes  # Gene Expression Atlas API only permits access to [50,200] genes at a time.
21
22print total_genes
23#exit()
24
25i=0
26sets = defaultdict(list)
27
28for start in range(no_of_pages+1):
29    query = run_simple_query(regulation=regulation, organism=organism, condition=condition, start=start*no_of_genes, rows=no_of_genes)
30    for result in query["results"]:
31        i += 1
32        #print result["gene"]["name"] + "\t" + str(i) # For printing out gene names and the current number of genes during debugging
33        for exp in result["expressions"]: 
34            diff_exp = [e for e in exp["experiments"] if e["pvalue"] <= max_pvalue] # Use only genes that are significantly diff. expressed
35            if diff_exp:
36                try:
37                    sets[exp["ef"], exp["efv"]].append(result["gene"]["name"]) # The Gene Expression Atlas entries are not consistent.
38                except:
39                    sets[exp["efoTerm"], exp["efoId"]].append(result["gene"]["name"])
40       
41gene_sets = []
42for (ef, efv), genes in sets.items():
43    ef_display = display_string(ef)
44    gs = GeneSet(genes, "Diff. expressed in %s=%s." % (ef_display, efv), id=ef + ":" + efv,
45                 description="Diff. expressed in %s=%s." % (ef_display, efv),
46                 link="http://www.ebi.ac.uk/gxa/qrs?specie_0={organism}&gprop_0=&gnot_0=&gval_0=&fact_1=&fexp_1=UPDOWN&fmex_1=&fval_1=%22{efv}%22+&view=hm".format( \
47                         organism = "+".join(organism.lower().split()), efv = "+".join(efv.lower().split())), organism=org_code, hierarchy=("Gene Expression Atlas", display_string(condition)))
48    gene_sets.append(gs)
49
50final_set = GeneSets(gene_sets)
51
52#print final_set
53#exit()
54
55from Orange.bio.obiGeneSets import register
56import Orange.utils.serverfiles as serverfiles
57import sys
58
59try:
60    sf_server = serverfiles.ServerFiles(sys.argv[1], sys.argv[2])
61except:
62    print "argv[1] = username, argv[2] = password"
63    exit()
64
65set_split = final_set.split_by_hierarchy()
66
67for s in set_split:
68    register(s, sf_server)
69
70print "Gene sets successfully registered..."
Note: See TracBrowser for help on using the repository browser.