source: orange-bioinformatics/server_update/updateGEO.py @ 1718:146dc04e42de

Revision 1718:146dc04e42de, 4.7 KB checked in by markotoplak, 20 months ago (diff)

Removed KEGG update scripts, updated (and ran) dicty base and GEO.

Line 
##!interval=7
##!contact=blaz.zupan@fri.uni-lj.si
3
import cPickle
import ftplib
import os
import re
import sys
import time
from datetime import datetime
from getopt import getopt

import Orange.utils.serverfiles as orngServerFiles
from Orange.bio import obiTaxonomy, obiGEO
14
# Server-files domain and the name of the pickled index stored inside it.
DOMAIN = "GEO"
GDS_INFO = "gds_info.pickled"
# Title and tags passed along whenever the index file is uploaded.
TITLE = "Gene Expression Omnibus data sets information"
TAGS = ["Gene Expression Omnibus", "data sets", "GEO", "GDS"]

# FTP host and directory whose listing is scanned for GDS files.
FTP_NCBI = "ftp.ncbi.nih.gov"
NCBI_DIR = "pub/geo/DATA/SOFT/GDS"

# Command line options:
#   -u/--user, -p/--password   credentials for the file server
#   -f/--force                 ignore the cached index and rebuild it
opt = dict(getopt(sys.argv[1:], "u:p:f", ["user=", "password=", "force"])[0])
username = opt.get("-u", opt.get("--user", "username"))
password = opt.get("-p", opt.get("--password", "password"))
server = orngServerFiles.ServerFiles(username, password)

# This used to be a hard-coded False, which made the "refresh" branch further
# down unreachable without editing the script.  It stays False unless the
# optional flag is given, so existing invocations behave exactly as before.
force_update = "-f" in opt or "--force" in opt

# The domain has to exist on the server before its files can be listed or
# uploaded; create it on first use.
if DOMAIN not in server.listdomains():
    server.create_domain(DOMAIN)

localfile = orngServerFiles.localpath(DOMAIN, GDS_INFO)
35
36def _create_path_for_file(target): #KEGG uses this!
37    try:
38        os.makedirs(os.path.dirname(target))
39    except OSError:
40        pass
41
path = orngServerFiles.localpath(DOMAIN)
if GDS_INFO in server.listfiles(DOMAIN):
    # The index already exists on the server: refresh the local copy and
    # remember when the server copy was last written.
    print("Updating info file from server ...")
    orngServerFiles.update(DOMAIN, GDS_INFO)
    info = orngServerFiles.info(DOMAIN, GDS_INFO)
    try:
        gds_info_datetime = datetime.strptime(info["datetime"], "%Y-%m-%d %H:%M:%S.%f")
    except ValueError:
        # NOTE(review): presumably the server stores str(datetime), which
        # drops the fractional part when microseconds are zero -- confirm.
        # Accept that form as well instead of aborting the whole update.
        gds_info_datetime = datetime.strptime(info["datetime"], "%Y-%m-%d %H:%M:%S")
else:
    # First run: upload an empty (gds_info, excluded) index and use the epoch
    # as its timestamp, so that every listed file counts as newer.
    print("Creating a local path...")
    _create_path_for_file(localfile)
    with open(localfile, "wb") as f:
        cPickle.dump(({}, {}), f, True)
    server.upload(DOMAIN, GDS_INFO, localfile, TITLE, TAGS)
    server.protect(DOMAIN, GDS_INFO, "0")
    gds_info_datetime = datetime.fromtimestamp(0)
58   
59
60
# read the information from the local file
# NOTE(review): unpickling runs whatever the file encodes; this is only safe
# as long as the server copy of the index is trusted.
with open(localfile, "rb") as f:
    gds_info, excluded = cPickle.load(f)
# excluded should be a dictionary (GEO_ID, TAX_ID)

# if need to refresh the data base
if force_update:
    gds_info, excluded = ({}, {})

# list of common organisms may have changed, rescan excluded list
# (the list is fetched once instead of once per entry; generator expressions
# keep the loop variables from leaking into the module namespace on Python 2)
common_taxids = obiTaxonomy.common_taxids()
# 1. drop exclusions whose organism has become a common one
excluded = dict((gds_id, taxid) for gds_id, taxid in excluded.items()
                if taxid not in common_taxids)
# 2. move known data sets whose organism is no longer common to the exclusions
excluded.update((gds_id, entry["taxid"]) for gds_id, entry in gds_info.items()
                if entry["taxid"] not in common_taxids)
# 3. keep only the data sets of common organisms in the index
gds_info = dict((gds_id, entry) for gds_id, entry in gds_info.items()
                if entry["taxid"] in common_taxids)
76
# get the list of GDS files from NCBI directory
print("Retrieving ftp directory ...")
ftp = ftplib.FTP(FTP_NCBI)
dirlist = []
try:
    ftp.login()
    ftp.cwd(NCBI_DIR)
    # one raw listing line per directory entry
    ftp.dir(dirlist.append)
finally:
    # The connection used to stay open for the rest of the (long) run.
    # Say goodbye politely and fall back to dropping the socket if the
    # server no longer answers.
    try:
        ftp.quit()
    except ftplib.all_errors:
        ftp.close()
86
from datetime import datetime
def modified(line):
    """Return the modification time found in one FTP directory listing line.

    line -- a raw listing line; whitespace separated fields 6 to 8 are
            expected to hold month, day and either "HH:MM" or a year.

    Entries written as "Mon DD HH:MM" carry no year.  The current year is
    tried first; when that would place the file more than a day in the future
    the previous year is used instead, since stamping such files with the
    current year made them look newer than any stored index.  Entries written
    as "Mon DD YYYY" are parsed as they are.

    When neither form matches, a warning is printed and the current time is
    returned.
    """
    stamp = line.split()[5:8]
    now = datetime.today()

    # "Mon DD HH:MM": the year has to be guessed.
    for year in (now.year, now.year - 1):
        try:
            when = datetime.strptime(" ".join(stamp + [str(year)]), "%b %d %H:%M %Y")
        except ValueError:
            # Different layout, or a Feb 29 that does not exist in this year.
            continue
        # One day of slack covers a clock or time zone difference between the
        # FTP server and this machine.
        if (when - now).days < 1:
            return when

    # "Mon DD YYYY": the year is explicit.
    try:
        return datetime.strptime(" ".join(stamp), "%b %d %Y")
    except ValueError:
        print("Warning: could not retrieve modified date for\n%s" % line)
    return datetime.today()
101   
# A data set name is "GDS" followed by at least one digit; the old pattern
# ("GDS[0-9]*") also accepted a bare "GDS".
m = re.compile("GDS[0-9]+")

# Collect every name once, in listing order, together with the newest
# modification time seen for it.  A listing can mention the same data set on
# more than one line; without this step such a name would be fetched
# repeatedly.
newest = {}
listing_order = []
for entry in dirlist:
    found = m.search(entry)
    if not found:
        continue
    name = found.group(0)
    stamp = modified(entry)
    if name not in newest:
        listing_order.append(name)
        newest[name] = stamp
    elif stamp > newest[name]:
        newest[name] = stamp

# Fetch a data set when it is unknown (neither indexed nor excluded) or when
# its file changed after the index was last written.
gds_names = [name for name in listing_order
             if not (name in gds_info or name in excluded)
             or newest[name] > gds_info_datetime]
# names of data sets that raised an error while being fetched
skipped = []

if gds_names:
    for count, gds_name in enumerate(gds_names):
        print("%3d of %3d -- Adding %s ..." % (count + 1, len(gds_names), gds_name))
        try:
            # pause before every request (presumably to go easy on the server)
            time.sleep(1)
            gds = obiGEO.GDS(gds_name)
            if gds.info["taxid"] not in obiTaxonomy.common_taxids():
                excluded[gds_name] = gds.info["taxid"]
                outcome = "... excluded (%s)." % gds.info["sample_organism"]
            else:
                gds_info.update({gds_name: gds.info})
                outcome = "... added."
            # Save after every processed data set.  Previously only additions
            # were written, so exclusions recorded after the last addition
            # never reached the file and were looked up again on the next run.
            with open(localfile, "wb") as f:
                cPickle.dump((gds_info, excluded), f, True)
            print(outcome)
        except Exception as ex:
            # One broken data set must not abort the whole update.
            print("... skipped (error): %s" % str(ex))
            skipped.append(gds_name)

    if skipped:
        print("Skipped %d data set(s): %s" % (len(skipped), ", ".join(skipped)))

    print("Updating %s:%s on the server ..." % (DOMAIN, GDS_INFO))

    server.upload(DOMAIN, GDS_INFO, localfile, TITLE, TAGS)
    server.protect(DOMAIN, GDS_INFO, "0")
else:
    print("No update required.")
135
# Final summary: number of indexed data sets and a per-organism breakdown.
print("")
print("GDS data sets: %d" % len(gds_info))
print("Organisms:")
# Count in a single pass instead of calling list.count() once per organism.
organism_counts = {}
for info in gds_info.values():
    org = info["sample_organism"]
    organism_counts[org] = organism_counts.get(org, 0) + 1
# The format string had lost its opening quote, which is a syntax error.
# NOTE(review): the two-space indent inside the literal is a reconstruction.
# Sorted so that the report is stable between runs.
for org in sorted(organism_counts):
    print("  %s (%d)" % (org, organism_counts[org]))
Note: See TracBrowser for help on using the repository browser.