Changeset 1756:ec01f8aa98a9 in orange-bioinformatics for _bioinformatics/obiPPI.py


Ignore:
Timestamp:
04/19/13 16:40:44 (12 months ago)
Author:
markotoplak
Branch:
default
Message:

STRING database build: do not unzip files.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • _bioinformatics/obiPPI.py

    r1720 r1756  
    653 653
    654 654        base_url = "http://www.string-db.org/newstring_download/"
    655         links = base_url + "protein.links.{version}.txt.gz" 
    656         actions = base_url + "protein.actions.{version}.txt.gz" 
    657         aliases = base_url + "protein.aliases.{version}.txt.gz" 
    658          
    659         wget(links.format(version=version), dir, progress=True) 
    660         wget(actions.format(version=version), dir, progress=True) 
    661         wget(aliases.format(version=version), dir, progress=True) 
    662          
    663         links_filename = os.path.join(dir, "protein.links.{version}.txt".format(version=version)) 
    664         actions_filename = os.path.join(dir, "protein.actions.{version}.txt".format(version=version)) 
    665         aliases_filename = os.path.join(dir, "protein.aliases.{version}.txt".format(version=version)) 
    666          
    667         progress = ConsoleProgressBar("Extracting files:") 
    668         progress(1.0) 
    669         links_file = gzip.GzipFile(links_filename + ".gz", "rb") 
    670         shutil.copyfileobj(links_file, open(links_filename, "wb")) 
    671          
    672         progress(60.0) 
    673         actions_file = gzip.GzipFile(actions_filename + ".gz", "rb") 
    674         shutil.copyfileobj(actions_file, open(actions_filename, "wb")) 
    675          
    676         progress(90.0) 
    677         aliases_file = gzip.GzipFile(aliases_filename + ".gz", "rb") 
    678         shutil.copyfileobj(aliases_file, open(aliases_filename, "wb")) 
    679         progress.finish() 
     655        links = "protein.links.{version}.txt.gz".format(version=version) 
     656        actions = "protein.actions.{version}.txt.gz".format(version=version) 
     657        aliases = "protein.aliases.{version}.txt.gz".format(version=version) 
     658 
     659        def wgeti(f, dir, progress): 
     660            if not os.path.exists(os.path.join(dir, f)): 
     661                print "Downloading:", f 
     662                wget(base_url + f, dir, progress=progress) 
     663            else: 
     664                print "Already downloaded - skiping:", f 
     665 
     666        wgeti(links, dir, progress=True) 
     667        wgeti(actions, dir, progress=True) 
     668        wgeti(aliases, dir, progress=True) 
    680 669
    681 670        cls.init_db(version, taxids)
     
    700 689        dir = orngServerFiles.localpath(cls.DOMAIN)
    701 690
    702         links_filename = os.path.join(dir, "protein.links.{version}.txt".format(version=version)) 
    703         actions_filename = os.path.join(dir, "protein.actions.{version}.txt".format(version=version)) 
    704         aliases_filename = os.path.join(dir, "protein.aliases.{version}.txt".format(version=version)) 
    705          
    706         links_file = open(links_filename, "rb") 
    707         actions_file = open(actions_filename, "rb") 
    708         aliases_file = open(aliases_filename, "rb") 
     691        links_filename = os.path.join(dir, "protein.links.{version}.txt.gz".format(version=version)) 
     692        actions_filename = os.path.join(dir, "protein.actions.{version}.txt.gz".format(version=version)) 
     693        aliases_filename = os.path.join(dir, "protein.aliases.{version}.txt.gz".format(version=version)) 
     694         
     695        links_file = gzip.GzipFile(links_filename, "rb") 
     696        actions_file = gzip.GzipFile(actions_filename, "rb") 
     697        aliases_file = gzip.GzipFile(aliases_filename, "rb") 
    709 698
    710 699        progress = ConsoleProgressBar("Processing links:")
    711 700        progress(0.0)
    712         filesize = os.stat(links_filename).st_size 
     701        filesize = os.stat(links_filename).st_size*10 #not the correct size! 
    713 702
    714 703        if taxids:
     
    773 762            progress.finish()
    774 763
    775             filesize = os.stat(actions_filename).st_size 
     764            filesize = os.stat(actions_filename).st_size*10 
    776 765
    777 766            actions_file.readline() # read header
     
    796 785            progress.finish()
    797 786
    798             filesize = os.stat(aliases_filename).st_size 
     787            filesize = os.stat(aliases_filename).st_size*10 
    799 788            aliases_file.readline() # read header
    800 789
     
    939 928            print "Already downloaded - skiping"
    940 929
    941         gz = gzip.open(os.path.join(dir, links_filename), "rb") 
    942  
    943         # Strip .gz extension 
    944         links_filename = os.path.join(dir, os.path.splitext(links_filename)[0]) 
    945         if not os.path.exists(links_filename): 
    946             shutil.copyfileobj(gz, open(links_filename, "wb")) 
    947          
    948 930        cls.init_db(version, taxids)
    949 931
     
    953 935        dir = orngServerFiles.localpath(cls.DOMAIN)
    954 936
    955         links_filename = "protein.links.detailed.{version}.txt".format(version=version) 
     937        links_filename = "protein.links.detailed.{version}.txt.gz".format(version=version) 
    956 938        links_filename = os.path.join(dir, links_filename)
    957 939
     
    961 943            taxids = set(cls.common_taxids())
    962 944
    963         links_file = open(links_filename, "rb") 
     945        links_file = gzip.GzipFile(links_filename, "rb") 
    964 946
    965 947        con = sqlite3.connect(os.path.join(dir, cls.FILENAME_DETAILED))
     
    985 967            links = csv.reader(links_file, delimiter=" ")
    986 968            links.next() # Read header
    987             filesize = os.stat(links_filename).st_size 
     969            filesize = os.stat(links_filename).st_size*10 #not the correct size 
    988 970
    989 971            progress = ConsoleProgressBar("Processing links file:")
Note: See TracChangeset for help on using the changeset viewer.