source: orange/install-scripts/orngServer/serverUpdateScripts/updateGEO.py @ 7741:83ddc19a3068

Revision 7741:83ddc19a3068, 4.7 KB checked in by markotoplak, 3 years ago (diff)

Removed dependency on orngServerFiles.createPath

Line 
1##!interval=7
2##!contact=blaz.zupan@fri.uni-lj.si
3
4import obiTaxonomy
5import sys
6import orngServerFiles
7from getopt import getopt
8import cPickle
9import re
10import ftplib
11import time
12from datetime import datetime
13import obiGEO
14import os
15
# Where the pickled GDS summary lives on the Orange file server, and how it
# is described there.
DOMAIN = "GEO"
GDS_INFO = "gds_info.pickled"
TITLE = "Gene Expression Omnibus data sets information"
TAGS = ["Gene Expression Omnibus", "data sets", "GEO", "GDS"]

# NCBI FTP host and the directory whose listing is scanned for GDS files.
FTP_NCBI = "ftp.ncbi.nih.gov"
NCBI_DIR = "pub/geo/DATA/SOFT/GDS"

# Server credentials come from -u/--user and -p/--password; the short form
# wins when both are given, and placeholder values are used when neither is.
parsed_options = getopt(sys.argv[1:], "u:p:", ["user=", "password="])[0]
opt = dict(parsed_options)
long_user = opt.get("--user", "username")
long_password = opt.get("--password", "password")
username = opt.get("-u", long_user)
password = opt.get("-p", long_password)
server = orngServerFiles.ServerFiles(username, password)

# When set to True the cached information is discarded and rebuilt.
force_update = False

# The domain has to exist on the server before files can be listed in it
# or uploaded to it.
existing_domains = server.listdomains()
if DOMAIN not in existing_domains:
    server.create_domain(DOMAIN)

# Local path of the cached copy of the info file.
localfile = orngServerFiles.localpath(DOMAIN, GDS_INFO)
36
def _create_path_for_file(target): #KEGG uses this!
    """Create the directory that will contain the file `target`.

    Nothing is done when `target` has no directory component or when the
    directory already exists. Any other failure (for example missing
    permissions, or a regular file blocking the path) raises OSError
    instead of being silently ignored, so the problem is reported here
    rather than as a confusing error when the file is opened later.
    """
    directory = os.path.dirname(target)
    if not directory:
        # A bare file name lives in the current directory.
        return
    try:
        os.makedirs(directory)
    except OSError:
        # Only "the directory is already there" is an acceptable failure.
        if not os.path.isdir(directory):
            raise
42
# Make sure a local copy of the info file exists and set gds_info_datetime,
# the reference time against which FTP modification dates are compared
# further below.
path = orngServerFiles.localpath(DOMAIN)
if GDS_INFO in server.listfiles(DOMAIN):
    print("Updating info file from server ...")
    orngServerFiles.update(DOMAIN, GDS_INFO)
    info = orngServerFiles.info(DOMAIN, GDS_INFO)
    try:
        gds_info_datetime = datetime.strptime(info["datetime"], "%Y-%m-%d %H:%M:%S.%f")
    except ValueError:
        # str(datetime) omits the fractional part when microsecond == 0,
        # so accept a timestamp without it as well.
        gds_info_datetime = datetime.strptime(info["datetime"], "%Y-%m-%d %H:%M:%S")
else:
    print("Creating a local path...")
    _create_path_for_file(localfile)
    # Seed the server with an empty (gds_info, excluded) pair; the with
    # block closes the file even if pickling fails.
    with open(localfile, "wb") as f:
        cPickle.dump(({}, {}), f, True)
    server.upload(DOMAIN, GDS_INFO, localfile, TITLE, TAGS)
    server.protect(DOMAIN, GDS_INFO, "0")
    # Epoch start: every file in the FTP listing counts as newer.
    gds_info_datetime = datetime.fromtimestamp(0)
59   
60
61
# read the information from the local file
# NOTE(review): unpickling can execute arbitrary code, so this is only safe
# while the info file comes from a trusted server.
with open(localfile, "rb") as f:
    gds_info, excluded = cPickle.load(f)
# excluded should be a dictionary (GEO_ID, TAX_ID)

# if need to refresh the data base
if force_update:
    gds_info, excluded = ({}, {})

# list of common organisms may have changed, rescan excluded list
# The taxonomy is queried once instead of once per entry; assumes the
# returned taxids are hashable (presumably strings) -- TODO confirm.
common_taxids = set(obiTaxonomy.common_taxids())
excluded = dict((gds_id, taxid) for gds_id, taxid in excluded.items()
                if taxid not in common_taxids)
excluded.update((gds_id, gds["taxid"]) for gds_id, gds in gds_info.items()
                if gds["taxid"] not in common_taxids)
gds_info = dict((gds_id, gds) for gds_id, gds in gds_info.items()
                if gds["taxid"] in common_taxids)
77
# get the list of GDS files from NCBI directory
# dirlist receives one raw listing line per directory entry.
print("Retreiving ftp directory ...")
dirlist = []
ftp = ftplib.FTP(FTP_NCBI)
try:
    ftp.login()
    ftp.cwd(NCBI_DIR)
    ftp.dir(dirlist.append)
finally:
    # Release the connection whether or not the listing succeeded.
    ftp.close()
87
88from datetime import datetime
def modified(line):
    """Return the modification time encoded in one FTP listing line.

    Whitespace-separated fields 5-7 of `line` are read as either
    "Mon DD HH:MM" (year omitted) or "Mon DD YYYY".

    For the year-less form the current year is tried first; if that would
    put the date more than a day in the future (e.g. a December entry read
    in January) the previous year is used instead. When neither form
    parses, a warning is printed and the current time is returned.
    """
    from datetime import timedelta

    stamp = line.split()[5:8]
    now = datetime.today()
    # One day of slack, in case the remote clock or time zone is ahead.
    latest_plausible = now + timedelta(days=1)

    for year in (now.year, now.year - 1):
        try:
            parsed = datetime.strptime(" ".join(stamp + [str(year)]),
                                       "%b %d %H:%M %Y")
        except ValueError:
            continue
        if parsed <= latest_plausible:
            return parsed

    try:
        return datetime.strptime(" ".join(stamp), "%b %d %Y")
    except ValueError:
        print("Warning: could not retrieve modified date for\n%s" % line)
    return now
102   
# Pick the data sets to (re)fetch: those not seen before, plus those whose
# listing date is newer than the info file.
# At least one digit is required so that a bare "GDS" is not taken for a
# data set name.
m = re.compile("GDS[0-9]+")
gds_names = []
for d in dirlist:
    match = m.search(d)
    if not match:
        continue
    name = match.group(0)
    time_m = modified(d)
    if not (name in gds_info or name in excluded) or time_m > gds_info_datetime:
        gds_names.append(name)
# Names whose download or parsing fails are collected here.
skipped = []
110
# Fetch every selected data set, file it under gds_info or excluded, and
# upload the refreshed info file.
if gds_names:
    total = len(gds_names)
    for count, gds_name in enumerate(gds_names):
        print("%3d of %3d -- Adding %s ..." % (count + 1, total, gds_name))
        try:
            # presumably throttles requests to NCBI -- TODO confirm
            time.sleep(1)
            gds = obiGEO.GDS(gds_name)
            if gds.info["taxid"] not in obiTaxonomy.common_taxids():
                excluded[gds_name] = gds.info["taxid"]
                print("... excluded (%s)." % gds.info["sample_organism"])
            else:
                gds_info[gds_name] = gds.info
                # Checkpoint after every addition so an interrupted run
                # keeps what it has fetched so far.
                with open(localfile, "wb") as f:
                    cPickle.dump((gds_info, excluded), f, True)
                print("... added.")
        except Exception as ex:
            # One bad data set must not abort the whole update.
            print("... skipped (error): %s" % str(ex))
            skipped.append(gds_name)

    # Save once more before uploading: exclusions recorded after the last
    # addition would otherwise never reach the file.
    with open(localfile, "wb") as f:
        cPickle.dump((gds_info, excluded), f, True)

    print("Updating %s:%s on the server ..." % (DOMAIN, GDS_INFO))

    server.upload(DOMAIN, GDS_INFO, localfile, TITLE, TAGS)
    server.protect(DOMAIN, GDS_INFO, "0")
else:
    print("No update required.")
136
# Summary: number of stored data sets and how many there are per organism.
print("")
print("GDS data sets: %d" % len(gds_info))
print("Organisms:")
organisms = [info["sample_organism"] for info in gds_info.values()]
for org in set(organisms):
    print("  %s (%d)" % (org, organisms.count(org)))
Note: See TracBrowser for help on using the repository browser.