# Copyright (c) 2013 The WebM project authors. All Rights Reserved.
#
# Use of this source code is governed by a BSD-style license
# that can be found in the LICENSE file in the root of the source
# tree. An additional intellectual property rights grant can be found
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
#
# This simple script pulls test files from the webm homepage
# It is intelligent enough to only pull files if
#   1) File / test_data folder does not exist
#   2) SHA mismatch

import csv
import getopt
import hashlib
import os
import re
import sys
import urllib.request

# Constants
FTP_RETRIES = 3     # download attempts per file before giving up
SHA_COL = 0         # column index of the SHA-1 digest in the csv
NAME_COL = 1        # column index of the file name in the csv
EXPECTED_COL = 2    # expected number of columns in a valid csv row
HASH_CHUNK = 65536  # bytes per read while hashing/downloading


def get_file_sha(filename):
    """Return the SHA-1 hex digest of *filename*, or None if unreadable.

    Reads in HASH_CHUNK-sized pieces so arbitrarily large test files are
    never loaded into memory at once.
    """
    try:
        sha_hash = hashlib.sha1()
        with open(filename, 'rb') as f:
            for buf in iter(lambda: f.read(HASH_CHUNK), b''):
                sha_hash.update(buf)
        return sha_hash.hexdigest()
    except IOError:
        # Best effort: report and return None so the caller's sha
        # comparison simply fails and the file is re-downloaded.
        print("Error reading " + filename)
        return None


def download_and_check_sha(url, filename, sha, output_dir):
    """Download url/filename into output_dir and verify its SHA-1.

    Returns True when the downloaded file's digest matches *sha*.
    The file handles are managed by `with`, so they are closed even if
    the transfer raises.
    """
    path = os.path.join(output_dir, filename)
    with urllib.request.urlopen(url + "/" + filename) as response, \
            open(path, "wb") as fp:
        while True:
            chunk = response.read(HASH_CHUNK)
            if not chunk:
                break
            fp.write(chunk)
    return get_file_sha(path) == sha


def parse_file_list(file_list_path):
    """Parse the sha/name listing file; return (file_shas, file_names).

    The listing uses runs of spaces as a delimiter, but Python's csv
    module only supports single-character delimiters, so each line is
    collapsed to single spaces before parsing. Rows without exactly
    EXPECTED_COL columns are skipped.
    """
    file_shas = []
    file_names = []
    with open(file_list_path, "r") as file_list_csv:
        reader = csv.reader(
            (re.sub(' +', ' ', line) for line in file_list_csv),
            delimiter=' ')
        for row in reader:
            if len(row) != EXPECTED_COL:
                continue
            file_shas.append(row[SHA_COL])
            file_names.append(row[NAME_COL])
    return file_shas, file_names


def main(argv):
    """Entry point: parse options, then fetch any missing/stale files."""
    url = ''
    file_list_path = ''
    local_resource_path = ''

    try:
        opts, _ = getopt.getopt(
            argv, "u:i:o:", ["url=", "input_csv=", "output_dir="])
    except getopt.GetoptError:
        print('get_files.py -u <url> -i <input_csv> -o <output_dir>')
        sys.exit(2)

    for opt, arg in opts:
        # Original only matched '-u' here, so '--url' was silently
        # ignored; accept both forms for every option.
        if opt in ("-u", "--url"):
            url = arg
        elif opt in ("-i", "--input_csv"):
            file_list_path = os.path.join(arg)
        elif opt in ("-o", "--output_dir"):
            local_resource_path = os.path.join(arg)

    # Validate by what was parsed, not by argv length: the old
    # `len(sys.argv) != 7` check rejected valid --long=value forms.
    if not (url and file_list_path and local_resource_path):
        print("Expects two paths and a url!")
        sys.exit(1)

    if not os.path.isdir(local_resource_path):
        os.makedirs(local_resource_path)

    file_shas, file_names = parse_file_list(file_list_path)

    # Download files, only if they don't already exist and have correct shas
    for filename, sha in zip(file_names, file_shas):
        path = os.path.join(local_resource_path, filename)
        if os.path.isfile(path) and get_file_sha(path) == sha:
            print(path + ' exists, skipping')
            continue
        for _ in range(FTP_RETRIES):
            print("Downloading " + path)
            if download_and_check_sha(url, filename, sha,
                                      local_resource_path):
                break
            print("Sha does not match, retrying...")


if __name__ == "__main__":
    main(sys.argv[1:])