source: orange/install-scripts/orngServer/serverUpdateScripts/updateGEO.py @ 7116:0d4858ea0613

Revision 7116:0d4858ea0613, 4.5 KB checked in by ales_erjavec <ales.erjavec@…>, 3 years ago (diff)
  • updateGEO checks the modification dates of existing files on the FTP server and updates the info.
Line 
1##!interval=7
2##!contact=blaz.zupan@fri.uni-lj.si
3
4import obiTaxonomy
5import sys
6import orngServerFiles
7from getopt import getopt
8import cPickle
9import re
10import ftplib
11import time
12from datetime import datetime
13import obiGEO
14
# Where the GDS info file lives on the file server and how it is described.
DOMAIN = "GEO"
GDS_INFO = "gds_info.pickled"
TITLE = "Gene Expression Omnibus data sets information"
TAGS = ["Gene Expression Omnibus", "data sets", "GEO", "GDS"]

# NCBI FTP host and the directory whose listing is scanned for GDS files.
FTP_NCBI = "ftp.ncbi.nih.gov"
NCBI_DIR = "pub/geo/DATA/SOFT/GDS"

# Server credentials are taken from -u/--user and -p/--password; the short
# flag wins when both spellings are given, and placeholder values are used
# when neither is present.
_parsed_options, _positional = getopt(
    sys.argv[1:], "u:p:", ["user=", "password="]
)
opt = dict(_parsed_options)

if "-u" in opt:
    username = opt["-u"]
else:
    username = opt.get("--user", "username")

if "-p" in opt:
    password = opt["-p"]
else:
    password = opt.get("--password", "password")

server = orngServerFiles.ServerFiles(username, password)
27
# Set to True to throw away the cached info and rebuild it from scratch.
force_update = False

# The DOMAIN must exist on the server before any of its files are queried.
if DOMAIN not in server.listdomains():
    server.create_domain(DOMAIN)

localfile = orngServerFiles.localpath(DOMAIN, GDS_INFO)

# Kept for compatibility; not referenced again in the visible script.
path = orngServerFiles.localpath(DOMAIN)

if GDS_INFO in server.listfiles(DOMAIN):
    # The info file already exists: refresh the local copy and remember when
    # the server copy was last written, so newer FTP files can be detected.
    print("Updating info file from server ...")
    orngServerFiles.update(DOMAIN, GDS_INFO)
    info = orngServerFiles.info(DOMAIN, GDS_INFO)
    try:
        gds_info_datetime = datetime.strptime(
            info["datetime"], "%Y-%m-%d %H:%M:%S.%f")
    except ValueError:
        # NOTE(review): presumably the stamp is str(datetime), which omits
        # the fractional part when microseconds are 0 -- accept that form too
        # instead of aborting the whole update.
        gds_info_datetime = datetime.strptime(
            info["datetime"], "%Y-%m-%d %H:%M:%S")
else:
    # First run: seed the server with an empty (gds_info, excluded) pair.
    print("Creating a local path...")
    orngServerFiles.createPathForFile(localfile)
    # The with-block closes the file even if pickling fails.
    with open(localfile, "wb") as f:
        cPickle.dump(({}, {}), f, True)
    server.upload(DOMAIN, GDS_INFO, localfile, TITLE, TAGS)
    server.protect(DOMAIN, GDS_INFO, "0")
    # Epoch start: every file in the FTP listing counts as newer than this.
    gds_info_datetime = datetime.fromtimestamp(0)
52   
53
54
# Read the cached state from the local file: a pair of dictionaries
# (gds_info, excluded), where excluded maps GEO_ID -> TAX_ID.
# NOTE(review): the file comes from the file server and is unpickled as-is;
# unpickling is only safe while that server is trusted.
with open(localfile, "rb") as f:
    gds_info, excluded = cPickle.load(f)

# A forced refresh starts from an empty cache so everything is re-fetched.
if force_update:
    gds_info, excluded = ({}, {})

# The list of common organisms may have changed since the cache was written,
# so re-partition both dictionaries against the current list.  The list is
# fetched once instead of once per entry.
# NOTE(review): assumes common_taxids() returns a reusable container (the
# original code tested membership against a fresh call each time) -- confirm.
common_taxids = obiTaxonomy.common_taxids()

# Drop exclusions whose organism has become "common" ...
excluded = dict((gds_id, taxid) for gds_id, taxid in excluded.items()
                if taxid not in common_taxids)
# ... move cached data sets whose organism is no longer common to excluded ...
excluded.update((gds_id, gds["taxid"]) for gds_id, gds in gds_info.items()
                if gds["taxid"] not in common_taxids)
# ... and keep only data sets of common organisms in the info dictionary.
gds_info = dict((gds_id, gds) for gds_id, gds in gds_info.items()
                if gds["taxid"] in common_taxids)
70
# Get the list of GDS files from the NCBI directory.  Each element of
# dirlist is one raw line of the server's directory listing.
print("Retreiving ftp directory ...")
dirlist = []
ftp = ftplib.FTP(FTP_NCBI)
try:
    ftp.login()
    ftp.cwd(NCBI_DIR)
    ftp.dir(dirlist.append)
finally:
    # Always release the connection, whether or not the listing succeeded.
    try:
        ftp.quit()
    except ftplib.all_errors:
        # The server may already have dropped the connection; close the
        # socket on our side without masking the outcome of the listing.
        ftp.close()
80
81from datetime import datetime
def modified(line):
    """Return the modification time parsed from one FTP directory-listing line.

    The whitespace-separated fields 5..7 of *line* are expected to hold the
    date either as ``Mon DD HH:MM`` (no year) or as ``Mon DD YYYY``.

    For the year-less form the current year is tried first.  If that would
    place the file more than a day in the future (a December file listed in
    January, say), the previous year is used instead.  The one-day slack
    tolerates a clock or time-zone difference with the server.

    If neither form can be parsed, a warning is printed and the current
    time is returned.
    """
    # Local import: the surrounding file only imports ``datetime`` itself.
    from datetime import timedelta

    fields = line.split()
    stamp = fields[5:8]
    now = datetime.today()

    # Year-less form: pick the most recent year that is not in the future.
    for year in (now.year, now.year - 1):
        try:
            parsed = datetime.strptime(" ".join(stamp + [str(year)]),
                                       "%b %d %H:%M %Y")
        except ValueError:
            # Wrong form, or a day that does not exist in that year (Feb 29).
            continue
        if parsed <= now + timedelta(days=1):
            return parsed

    # Explicit-year form.
    try:
        return datetime.strptime(" ".join(stamp), "%b %d %Y")
    except ValueError:
        print("Warning: could not retrieve modified date for\n%s" % fields)
    return datetime.today()
95   
# Pick out the GDS identifier and modification time of every listing line
# that mentions one.
m = re.compile("GDS[0-9]*")
_listed = []
for _entry in dirlist:
    _found = m.search(_entry)
    if _found:
        _listed.append((_found.group(0), modified(_entry)))

# A data set needs (re)processing when it is neither cached nor excluded,
# or when its file on the FTP is newer than the info file on the server.
gds_names = []
for _name, _time_m in _listed:
    _already_known = _name in gds_info or _name in excluded
    if not _already_known or _time_m > gds_info_datetime:
        gds_names.append(_name)
# Names of data sets that could not be processed in this run.
skipped = []

if gds_names:
    # Fetch the organism list once for the whole run.
    # NOTE(review): assumes common_taxids() returns a reusable container.
    common_taxids = obiTaxonomy.common_taxids()

    for count, gds_name in enumerate(gds_names):
        print("%3d of %3d -- Adding %s ..." % (count + 1, len(gds_names), gds_name))
        try:
            # Pause between requests (presumably to go easy on the server).
            time.sleep(1)
            gds = obiGEO.GDS(gds_name)
            if gds.info["taxid"] not in common_taxids:
                excluded[gds_name] = gds.info["taxid"]
                outcome = "... excluded (%s)." % gds.info["sample_organism"]
            else:
                gds_info[gds_name] = gds.info
                outcome = "... added."
            # Checkpoint after every processed data set.  Writing only in
            # the "added" branch would lose any exclusions recorded after
            # the last added data set, and the file uploaded below would
            # then be missing them.
            with open(localfile, "wb") as f:
                cPickle.dump((gds_info, excluded), f, True)
            print(outcome)
        except Exception as ex:
            # Best effort: one failing data set must not abort the update.
            print("... skipped (error): %s" % str(ex))
            skipped.append(gds_name)

    print("Updating %s:%s on the server ..." % (DOMAIN, GDS_INFO))

    server.upload(DOMAIN, GDS_INFO, localfile, TITLE, TAGS)
    server.protect(DOMAIN, GDS_INFO, "0")
else:
    print("No update required.")
129
# Final report: number of cached data sets and a per-organism breakdown.
print("")
print("GDS data sets: %d" % len(gds_info))
print("Organisms:")
organisms = [gds["sample_organism"] for gds in gds_info.values()]

# Tally in one pass instead of calling organisms.count() per organism.
organism_counts = {}
for org in organisms:
    organism_counts[org] = organism_counts.get(org, 0) + 1

for org in set(organisms):
    # The opening quote of this format string was missing in the source as
    # shown; the two-space indent is an assumption -- TODO confirm.
    print("  %s (%d)" % (org, organism_counts[org]))
Note: See TracBrowser for help on using the repository browser.