#!/usr/bin/python2.4
#
# Copyright (C) 2008 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Module to compress directories into a series of zip files.

This module will take a directory and compress all of its contents, including
child directories, into a series of zip files named N.zip, where 'N' counts up
from 0. Each zip file is kept at or below a specified maximum size.

The directory is compressed with a depth-first traversal, each directory's
file contents being compressed as it is visited, before the compression of any
child directory's contents. In this way the files within an archive are
ordered and the archives themselves are ordered.

The class also constructs a 'main.py' file intended for use with Google App
Engine with a custom App Engine program not currently distributed with this
code base. The custom App Engine runtime can leverage the index files written
out by this class to more quickly locate which zip file to serve a given URL
from.
"""

__author__ = 'jmatt@google.com (Justin Mattson)'

import optparse
import os
import sys
import zipfile

import divide_and_compress_constants


def CreateOptionsParser():
  """Creates the parser for command line arguments.

  Returns:
    A configured optparse.OptionParser object.
  """
  rtn = optparse.OptionParser()
  rtn.add_option('-s', '--sourcefiles', dest='sourcefiles', default=None,
                 help='The directory containing the files to compress')
  rtn.add_option('-d', '--destination', dest='destination', default=None,
                 help=('Where to put the archive files; this should not be'
                       ' a child of where the source files exist.'))
  rtn.add_option('-f', '--filesize', dest='filesize', default='1M',
                 help=('Maximum size of archive files. A number followed by '
                       'a magnitude indicator, either "B", "K", "M", or "G". '
                       'Examples:\n 1000000B == one million BYTES\n'
                       ' 1.2M == one point two MEGABYTES\n'
                       ' 1M == 1048576 BYTES'))
  rtn.add_option('-n', '--nocompress', action='store_false', dest='compress',
                 default=True,
                 help=('Whether the archive files should be compressed, or '
                       'just a concatenation of the source files'))
  return rtn


def VerifyArguments(options, parser):
  """Runs simple checks on the correctness of command line arguments.

  Prints the usage message and exits if the required arguments are missing.

  Args:
    options: The command line options passed.
    parser: The parser object used to parse the command string.
  """
  try:
    if options.sourcefiles is None or options.destination is None:
      parser.print_help()
      sys.exit(-1)
  except AttributeError:
    parser.print_help()
    sys.exit(-1)


def ParseSize(size_str):
  """Parses the file size argument from a string to a number of bytes.

  Args:
    size_str: The string representation of the file size.

  Returns:
    The file size in bytes.
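    For example, '1M' parses to 1048576 bytes and '2.5K' to 2560.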

  Raises:
    ValueError: Raised if the numeric or magnitude portion of the file size
      argument is invalid.
  """
  if len(size_str) < 2:
    raise ValueError('filesize argument not understood, please include'
                     ' a numeric value and magnitude indicator')
  magnitude = size_str[-1]
  if magnitude not in ('B', 'K', 'M', 'G'):
    raise ValueError('filesize magnitude indicator not valid, must be "B", '
                     '"K", "M", or "G"')
  numeral = float(size_str[:-1])
  if magnitude == 'K':
    numeral *= 1024
  elif magnitude == 'M':
    numeral *= 1048576
  elif magnitude == 'G':
    numeral *= 1073741824
  return int(numeral)


class DirectoryZipper(object):
  """Compresses a directory and all of its sub-directories."""

  def __init__(self, output_path, base_dir, archive_size, enable_compression):
    """DirectoryZipper constructor.

    Args:
      output_path: A string, the path to write the archives and index file to.
      base_dir: A string, the directory to compress.
      archive_size: A number, the maximum size, in bytes, of a single
        archive file.
      enable_compression: A boolean, whether or not compression should be
        enabled. If disabled, the files will be written into an uncompressed
        zip.
    """
    self.output_dir = output_path
    self.current_archive = '0.zip'
    self.base_path = base_dir
    self.max_size = archive_size
    self.compress = enable_compression

    # Set index_fp to None, because we don't know what it will be yet.
    self.index_fp = None

  def StartCompress(self):
    """Starts compression of the directory.

    This starts the compression process and writes the archives to the
    specified output directory. It also produces a 'main.py' index file in
    the output directory that maps each file to the archive containing it.
    """
    self.index_fp = open(os.path.join(self.output_dir, 'main.py'), 'w')
    self.index_fp.write(divide_and_compress_constants.file_preamble)
    os.path.walk(self.base_path, self.CompressDirectory, 1)
    self.index_fp.write(divide_and_compress_constants.file_endpiece)
    self.index_fp.close()

  def RemoveLastFile(self, archive_path=None):
    """Removes the last item in the archive.

    This removes the last item in the archive by reading the items out of the
    archive, adding all but the last one to a new archive, deleting the old
    archive, and moving the new archive to the location of the old archive.

    Args:
      archive_path: Path to the archive to modify. This archive should not be
        open elsewhere, since it will need to be deleted.
    """
    if archive_path is None:
      archive_path = os.path.join(self.output_dir, self.current_archive)

    # Move the old file aside and create a new one at its old location.
    root, ext = os.path.splitext(archive_path)
    old_archive = ''.join([root, '-old', ext])
    os.rename(archive_path, old_archive)
    old_fp = self.OpenZipFileAtPath(old_archive, mode='r')

    # By default, store uncompressed.
    compress_bit = zipfile.ZIP_STORED
    if self.compress:
      compress_bit = zipfile.ZIP_DEFLATED
    new_fp = self.OpenZipFileAtPath(archive_path,
                                    mode='w',
                                    compress=compress_bit)

    # Copy every entry except the last one from the old archive into the new
    # archive.
    for zip_member in old_fp.infolist()[:-1]:
      new_fp.writestr(zip_member, old_fp.read(zip_member.filename))

    # Close both archives and delete the old one.
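    # (Closing new_fp is what writes the zip central directory; until then
    # the rewritten archive on disk is incomplete.)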
    old_fp.close()
    new_fp.close()
    os.unlink(old_archive)

  def OpenZipFileAtPath(self, path, mode=None, compress=zipfile.ZIP_DEFLATED):
    """Opens the zip file at 'path'.

    This method exists mainly for testing purposes, e.g. dependency injection.
    """
    if mode is None:
      if os.path.exists(path):
        mode = 'a'
      else:
        mode = 'w'

    if mode == 'r':
      return zipfile.ZipFile(path, mode)
    else:
      return zipfile.ZipFile(path, mode, compress)

  def CompressDirectory(self, unused_id, dir_path, dir_contents):
    """Compresses the given directory.

    This method compresses the directory 'dir_path'. It adds to an existing
    zip file that still has space and creates new ones as necessary to keep
    zip file sizes under the maximum specified size. It also writes the
    mapping of files to archives to the self.index_fp file object.

    Args:
      unused_id: A numeric identifier passed through by os.path.walk; not
        used by this method.
      dir_path: A string, the path to the directory to compress.
      dir_contents: A list of directory contents to be compressed.
    """
    # Build the queue of files to be added. dir_contents is not guaranteed to
    # be in any particular order, so sort it to keep the archive contents in
    # alphabetical order.
    dir_contents.sort()
    zip_queue = []
    for filename in dir_contents:
      zip_queue.append(os.path.join(dir_path, filename))
    compress_bit = zipfile.ZIP_DEFLATED
    if not self.compress:
      compress_bit = zipfile.ZIP_STORED

    # Zip all files in this directory, adding to existing archives and
    # creating new ones as necessary.
    while zip_queue:
      target_file = zip_queue[0]
      if os.path.isfile(target_file):
        self.AddFileToArchive(target_file, compress_bit)

        # See if adding the new file made our archive too large.
        if not self.ArchiveIsValid():

          # If fixing fails, the last added file was too large on its own, so
          # skip it. Otherwise the current archive simply filled up; start a
          # new one and try adding the file again.
          if not self.FixArchive('SIZE'):
            zip_queue.pop(0)
          else:
            self.current_archive = '%i.zip' % (
                int(self.current_archive[
                    0:self.current_archive.rfind('.zip')]) + 1)
        else:

          # Write an index record if necessary.
          self.WriteIndexRecord()
          zip_queue.pop(0)
      else:
        zip_queue.pop(0)

  def WriteIndexRecord(self):
    """Writes an index record to the index file.

    An index record is only written if this is the first file to go into the
    current archive.

    Returns:
      True if an index record is written, False if it isn't.
    """
    archive = self.OpenZipFileAtPath(
        os.path.join(self.output_dir, self.current_archive), 'r')
    archive_index = archive.infolist()
    if len(archive_index) == 1:
      self.index_fp.write(
          '[\'%s\', \'%s\'],\n' % (self.current_archive,
                                   archive_index[0].filename))
      archive.close()
      return True
    else:
      archive.close()
      return False

  def FixArchive(self, problem):
    """Makes the current archive compliant.

    Args:
      problem: An enum, the reason the archive is invalid.

    Returns:
      Whether the file(s) removed to fix the archive could conceivably be
      stored in some archive, but for some reason cannot be added to this one.
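      In practice, True means this archive simply filled up and the caller
      should retry the removed file in a fresh archive; False means the file
      is too large to store on its own and should be skipped.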
    """
    archive_path = os.path.join(self.output_dir, self.current_archive)
    return_value = None

    if problem == 'SIZE':
      archive_obj = self.OpenZipFileAtPath(archive_path, mode='r')
      num_archive_files = len(archive_obj.infolist())

      # If the archive holds only a single file, that file is too large to
      # store at all, so delete the archive we created. Otherwise do normal
      # finalization by dropping the last file that was added.
      if num_archive_files == 1:
        print ('WARNING: %s%s is too large to store.' % (
            self.base_path, archive_obj.infolist()[0].filename))
        archive_obj.close()
        os.unlink(archive_path)
        return_value = False
      else:
        archive_obj.close()
        self.RemoveLastFile(
            os.path.join(self.output_dir, self.current_archive))
        print 'Final archive size for %s is %i' % (
            self.current_archive, os.path.getsize(archive_path))
        return_value = True
    return return_value

  def AddFileToArchive(self, filepath, compress_bit):
    """Adds the file at 'filepath' to the current archive.

    Args:
      filepath: A string, the path of the file to add.
      compress_bit: The zipfile compression method to use when adding the
        file, either zipfile.ZIP_DEFLATED or zipfile.ZIP_STORED.

    Returns:
      True if the file could be added (typically because it is a regular
      file) or False if it couldn't be added (typically because it is a
      directory or a symlink).
    """
    curr_archive_path = os.path.join(self.output_dir, self.current_archive)
    if os.path.isfile(filepath) and not os.path.islink(filepath):
      if os.path.getsize(filepath) > 1048576:
        print 'Warning: %s is potentially too large to serve on GAE' % filepath
      archive = self.OpenZipFileAtPath(curr_archive_path,
                                       compress=compress_bit)
      # Add the file to the archive under its path relative to base_path.
      archive.write(filepath, filepath[len(self.base_path):])
      archive.close()
      return True
    else:
      return False

  def ArchiveIsValid(self):
    """Checks whether the current archive is valid.

    Currently this only checks whether the archive is under the required
    size. The thought is that eventually this will do additional validation.

    Returns:
      True if the archive is valid, False if it's not.
    """
    archive_path = os.path.join(self.output_dir, self.current_archive)
    return os.path.getsize(archive_path) <= self.max_size


def main(argv):
  parser = CreateOptionsParser()
  (options, unused_args) = parser.parse_args(args=argv[1:])
  VerifyArguments(options, parser)
  zipper = DirectoryZipper(options.destination,
                           options.sourcefiles,
                           ParseSize(options.filesize),
                           options.compress)
  zipper.StartCompress()


if __name__ == '__main__':
  main(sys.argv)
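
# Example invocation (a sketch; the script filename and paths below are
# assumed, not taken from this file):
#
#   python divide_and_compress.py --sourcefiles=/path/to/docs \
#       --destination=/path/to/output --filesize=1M
#
# This writes 0.zip, 1.zip, ... plus the main.py index into the destination
# directory. Pass --nocompress to store the files without deflating them.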