source: orange-bioinformatics/server_update/updateGEO.py @ 1721:921b3780c6cc

Revision 1721:921b3780c6cc, 4.4 KB checked in by markotoplak, 20 months ago (diff)

Moved common functionality of the server files update scripts to a separate file.

Line 
1##!interval=7
2##!contact=blaz.zupan@fri.uni-lj.si
3
4from Orange.bio import obiTaxonomy, obiGEO
5import cPickle
6import re
7import ftplib
8import time
9from datetime import datetime
10
11from common import *
12
# Server-side domain and file identifiers for the GEO info file.
DOMAIN = "GEO"
GDS_INFO = "gds_info.pickled"
TITLE = "Gene Expression Omnibus data sets information"
TAGS = ["Gene Expression Omnibus", "data sets", "GEO", "GDS"]

# NCBI FTP host and the directory that holds the GDS SOFT files.
FTP_NCBI = "ftp.ncbi.nih.gov"
NCBI_DIR = "pub/geo/DATA/SOFT/GDS"
20
# Set to True to rebuild the whole GDS info file instead of updating it.
force_update = False
# check if the DOMAIN/files are already on the server, else, create
if DOMAIN not in sf_server.listdomains():
    # DOMAIN does not exist on the server, create it
    sf_server.create_domain(DOMAIN)

# Local cache path of the pickled (gds_info, excluded) pair.
localfile = sf_local.localpath(DOMAIN, GDS_INFO)
28
29def _create_path_for_file(target): #KEGG uses this!
30    try:
31        os.makedirs(os.path.dirname(target))
32    except OSError:
33        pass
34
35path = sf_local.localpath(DOMAIN)
36if GDS_INFO in sf_server.listfiles(DOMAIN):
37    print "Updating info file from server ..."
38    sf_local.update(DOMAIN, GDS_INFO)
39    info = sf_local.info(DOMAIN, GDS_INFO)
40    gds_info_datetime = datetime.strptime(info["datetime"], "%Y-%m-%d %H:%M:%S.%f")
41   
42else:
43    print "Creating a local path..."
44    _create_path_for_file(localfile)
45    f = file(localfile, "wb")
46    cPickle.dump(({}, {}), f, True)
47    f.close()
48    sf_server.upload(DOMAIN, GDS_INFO, localfile, TITLE, TAGS)
49    sf_server.protect(DOMAIN, GDS_INFO, "0")
50    gds_info_datetime = datetime.fromtimestamp(0)
51   
52
53
# read the information from the local file
f = open(localfile, "rb")
try:
    gds_info, excluded = cPickle.load(f)
finally:
    # Close explicitly -- the original left the file handle dangling.
    f.close()
# excluded should be a dictionary (GEO_ID, TAX_ID)

# if need to refresh the data base
if force_update:
    gds_info, excluded = ({}, {})

# list of common organisms may have changed, rescan excluded list.
# Fetch the taxid list once and use a set for O(1) membership tests
# (the original called obiTaxonomy.common_taxids() three times and
# scanned a list each time).
common_taxids = set(obiTaxonomy.common_taxids())
excluded = dict([(id, taxid) for id, taxid in excluded.items()
                 if taxid not in common_taxids])
excluded.update([(id, info["taxid"]) for id, info in gds_info.items()
                 if info["taxid"] not in common_taxids])
gds_info = dict([(id, info) for id, info in gds_info.items()
                 if info["taxid"] in common_taxids])
69
# get the list of GDS files from NCBI directory

# Retrieve an "ls -l"-style listing of the GDS directory; each entry's
# name and modification date are parsed out of these lines below.
print "Retrieving ftp directory ..."
ftp = ftplib.FTP(FTP_NCBI)
ftp.login()  # anonymous login
ftp.cwd(NCBI_DIR)
dirlist = []  # raw listing lines, one per directory entry
ftp.dir(dirlist.append)
79
80from datetime import datetime
81def modified(line):
82    line = line.split()
83    try:
84        date  = " ".join(line[5: 8] + [str(datetime.today().year)])
85        return datetime.strptime(date, "%b %d %H:%M %Y")
86    except ValueError:
87        pass
88    try:
89        date = " ".join(line[5: 8])
90        return datetime.strptime(date, "%b %d %Y")
91    except ValueError:
92        print "Warning: could not retrieve modified date for\n%s" % line
93    return datetime.today()
94   
# Collect (name, modified-time) pairs for every GDS entry in the listing,
# running the regex only once per line (the original searched twice).
m = re.compile(r"GDS[0-9]*")
gds_names = []
for entry in dirlist:
    match = m.search(entry)
    if match:
        gds_names.append((match.group(0), modified(entry)))
# Keep entries that are new (neither processed nor excluded yet) or that
# were modified on the server after our cached info file was built.
# (Dead commented-out variants removed; one referenced an undefined
# `time_t` variable.)
gds_names = [name for name, time_m in gds_names
             if not (name in gds_info or name in excluded)
             or time_m > gds_info_datetime]
skipped = []
102
# Download info for each new/updated GDS entry and fold it into the cache.
if len(gds_names):
    for count, gds_name in enumerate(gds_names):
        print "%3d of %3d -- Adding %s ..." % (count+1, len(gds_names), gds_name)
        try:
            time.sleep(1)  # throttle requests to NCBI
            gds = obiGEO.GDS(gds_name)
            if gds.info["taxid"] not in obiTaxonomy.common_taxids():
                # Unsupported organism: remember it so it is skipped next run.
                excluded[gds_name] = gds.info["taxid"]
                print "... excluded (%s)." % gds.info["sample_organism"]
            else:
                gds_info.update({gds_name: gds.info})
                # Persist after every addition so progress survives a crash.
                f = file(localfile, "wb")
                cPickle.dump((gds_info, excluded), f, True)
                f.close()
                print "... added."
        except Exception, ex:
            # Best effort: a failed entry is logged and retried on the next run.
            print "... skipped (error):", str(ex)
            skipped.append(gds_name)

    print "Updating %s:%s on the server ..." % (DOMAIN, GDS_INFO)

    sf_server.upload(DOMAIN, GDS_INFO, localfile, TITLE, TAGS)
    sf_server.protect(DOMAIN, GDS_INFO, "0")
else:
    print "No update required."
128
129print
130print "GDS data sets: %d" % len(gds_info)
131print "Organisms:"
132organisms = [info["sample_organism"] for info in gds_info.values()]
133for org in set(organisms):
134    print %s (%d)" % (org, organisms.count(org))
Note: See TracBrowser for help on using the repository browser.