Home | History | Annotate | Download | only in scripts
      1 #!/usr/bin/python2.4
      2 #
      3 # Copyright (C) 2008 Google Inc.
      4 #
      5 # Licensed under the Apache License, Version 2.0 (the "License");
      6 # you may not use this file except in compliance with the License.
      7 # You may obtain a copy of the License at
      8 #
      9 #      http://www.apache.org/licenses/LICENSE-2.0
     10 #
     11 # Unless required by applicable law or agreed to in writing, software
     12 # distributed under the License is distributed on an "AS IS" BASIS,
     13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 # See the License for the specific language governing permissions and
     15 # limitations under the License.
     16 #
     17 
     18 """Module to compress directories into a series of zip files.
     19 
     20 This module will take a directory and compress all its contents, including
     21 child directories into a series of zip files named N.zip where 'N' ranges from
     22 0 to infinity. The zip files will all be below a certain specified maximum
     23 threshold.
     24 
     25 The directory is compressed with a depth first traversal, each directory's
     26 file contents being compressed as it is visited, before the compression of any
     27 child directory's contents. In this way the files within an archive are ordered
     28 and the archives themselves are ordered.
     29 
     30 The class also constructs a 'main.py' file intended for use with Google App
     31 Engine with a custom App Engine program not currently distributed with this
     32 code base. The custom App Engine runtime can leverage the index files written
     33 out by this class to more quickly locate which zip file to serve a given URL
     34 from.
     35 """
     36 
     37 __author__ = 'jmatt (at] google.com (Justin Mattson)'
     38 
     39 import optparse
     40 import os
     41 import stat
     42 import sys
     43 import zipfile
     44 import divide_and_compress_constants
     45 
     46 
     47 def CreateOptionsParser():
     48   """Creates the parser for command line arguments.
     49 
     50   Returns:
     51     A configured optparse.OptionParser object.
     52   """
     53   rtn = optparse.OptionParser()
     54   rtn.add_option('-s', '--sourcefiles', dest='sourcefiles', default=None,
     55                  help='The directory containing the files to compress')
     56   rtn.add_option('-d', '--destination', dest='destination', default=None,
     57                  help=('Where to put the archive files, this should not be'
     58                        ' a child of where the source files exist.'))
     59   rtn.add_option('-f', '--filesize', dest='filesize', default='1M',
     60                  help=('Maximum size of archive files. A number followed by '
     61                        'a magnitude indicator either "B", "K", "M", or "G". '
     62                        'Examples:\n  1000000B == one million BYTES\n'
     63                        '  1.2M == one point two MEGABYTES\n'
     64                        '  1M == 1048576 BYTES'))
     65   rtn.add_option('-n', '--nocompress', action='store_false', dest='compress',
     66                  default=True,
     67                  help=('Whether the archive files should be compressed, or '
     68                        'just a concatenation of the source files'))
     69   return rtn
     70 
     71 
     72 def VerifyArguments(options, parser):
     73   """Runs simple checks on correctness of commandline arguments.
     74 
     75   Args:
     76     options: The command line options passed.
     77     parser: The parser object used to parse the command string.
     78   """
     79   try:
     80     if options.sourcefiles is None or options.destination is None:
     81       parser.print_help()
     82       sys.exit(-1)
     83   except AttributeError:
     84     parser.print_help()
     85     sys.exit(-1)
     86 
     87 
     88 def ParseSize(size_str):
     89   """Parse the file size argument from a string to a number of bytes.
     90 
     91   Args:
     92     size_str: The string representation of the file size.
     93 
     94   Returns:
     95     The file size in bytes.
     96 
     97   Raises:
     98     ValueError: Raises an error if the numeric or qualifier portions of the
     99       file size argument is invalid.
    100   """
    101   if len(size_str) < 2:
    102     raise ValueError(('filesize argument not understood, please include'
    103                       ' a numeric value and magnitude indicator'))
    104   magnitude = size_str[-1]
    105   if not magnitude in ('B', 'K', 'M', 'G'):
    106     raise ValueError(('filesize magnitude indicator not valid, must be "B",'
    107                       '"K","M", or "G"'))
    108   numeral = float(size_str[:-1])
    109   if magnitude == 'K':
    110     numeral *= 1024
    111   elif magnitude == 'M':
    112     numeral *= 1048576
    113   elif magnitude == 'G':
    114     numeral *= 1073741824
    115   return int(numeral)
    116 
    117 
    118 class DirectoryZipper(object):
    119   """Class to compress a directory and all its sub-directories."""
    120 
    121   def __init__(self, output_path, base_dir, archive_size, enable_compression):
    122     """DirectoryZipper constructor.
    123 
    124     Args:
    125       output_path: A string, the path to write the archives and index file to.
    126       base_dir: A string, the directory to compress.
    127       archive_size: An number, the maximum size, in bytes, of a single
    128         archive file.
    129       enable_compression: A boolean, whether or not compression should be
    130         enabled, if disabled, the files will be written into an uncompresed
    131         zip.
    132     """
    133     self.output_dir = output_path
    134     self.current_archive = '0.zip'
    135     self.base_path = base_dir
    136     self.max_size = archive_size
    137     self.compress = enable_compression
    138 
    139     # Set index_fp to None, because we don't know what it will be yet.
    140     self.index_fp = None
    141 
    142   def StartCompress(self):
    143     """Start compress of the directory.
    144 
    145     This will start the compression process and write the archives to the
    146     specified output directory. It will also produce an 'index.txt' file in the
    147     output directory that maps from file to archive.
    148     """
    149     self.index_fp = open(os.path.join(self.output_dir, 'main.py'), 'w')
    150     self.index_fp.write(divide_and_compress_constants.file_preamble)
    151     os.path.walk(self.base_path, self.CompressDirectory, 1)
    152     self.index_fp.write(divide_and_compress_constants.file_endpiece)
    153     self.index_fp.close()
    154 
    155   def RemoveLastFile(self, archive_path=None):
    156     """Removes the last item in the archive.
    157 
    158     This removes the last item in the archive by reading the items out of the
    159     archive, adding them to a new archive, deleting the old archive, and
    160     moving the new archive to the location of the old archive.
    161 
    162     Args:
    163       archive_path: Path to the archive to modify. This archive should not be
    164         open elsewhere, since it will need to be deleted.
    165 
    166     Returns:
    167       A new ZipFile object that points to the modified archive file.
    168     """
    169     if archive_path is None:
    170       archive_path = os.path.join(self.output_dir, self.current_archive)
    171 
    172     # Move the old file and create a new one at its old location.
    173     root, ext = os.path.splitext(archive_path)
    174     old_archive = ''.join([root, '-old', ext])
    175     os.rename(archive_path, old_archive)
    176     old_fp = self.OpenZipFileAtPath(old_archive, mode='r')
    177 
    178     # By default, store uncompressed.
    179     compress_bit = zipfile.ZIP_STORED
    180     if self.compress:
    181       compress_bit = zipfile.ZIP_DEFLATED
    182     new_fp = self.OpenZipFileAtPath(archive_path,
    183                                     mode='w',
    184                                     compress=compress_bit)
    185 
    186     # Read the old archive in a new archive, except the last one.
    187     for zip_member in old_fp.infolist()[:-1]:
    188       new_fp.writestr(zip_member, old_fp.read(zip_member.filename))
    189 
    190     # Close files and delete the old one.
    191     old_fp.close()
    192     new_fp.close()
    193     os.unlink(old_archive)
    194 
    195   def OpenZipFileAtPath(self, path, mode=None, compress=zipfile.ZIP_DEFLATED):
    196     """This method is mainly for testing purposes, eg dependency injection."""
    197     if mode is None:
    198       if os.path.exists(path):
    199         mode = 'a'
    200       else:
    201         mode = 'w'
    202 
    203     if mode == 'r':
    204       return zipfile.ZipFile(path, mode)
    205     else:
    206       return zipfile.ZipFile(path, mode, compress)
    207 
    208   def CompressDirectory(self, unused_id, dir_path, dir_contents):
    209     """Method to compress the given directory.
    210 
    211     This method compresses the directory 'dir_path'. It will add to an existing
    212     zip file that still has space and create new ones as necessary to keep zip
    213     file sizes under the maximum specified size. This also writes out the
    214     mapping of files to archives to the self.index_fp file descriptor
    215 
    216     Args:
    217       unused_id: A numeric identifier passed by the os.path.walk method, this
    218         is not used by this method.
    219       dir_path: A string, the path to the directory to compress.
    220       dir_contents: A list of directory contents to be compressed.
    221     """
    222     # Construct the queue of files to be added that this method will use
    223     # it seems that dir_contents is given in reverse alphabetical order,
    224     # so put them in alphabetical order by inserting to front of the list.
    225     dir_contents.sort()
    226     zip_queue = []
    227     for filename in dir_contents:
    228       zip_queue.append(os.path.join(dir_path, filename))
    229     compress_bit = zipfile.ZIP_DEFLATED
    230     if not self.compress:
    231       compress_bit = zipfile.ZIP_STORED
    232 
    233     # Zip all files in this directory, adding to existing archives and creating
    234     # as necessary.
    235     while zip_queue:
    236       target_file = zip_queue[0]
    237       if os.path.isfile(target_file):
    238         self.AddFileToArchive(target_file, compress_bit)
    239 
    240         # See if adding the new file made our archive too large.
    241         if not self.ArchiveIsValid():
    242 
    243           # IF fixing fails, the last added file was to large, skip it
    244           # ELSE the current archive filled normally, make a new one and try
    245           #  adding the file again.
    246           if not self.FixArchive('SIZE'):
    247             zip_queue.pop(0)
    248           else:
    249             self.current_archive = '%i.zip' % (
    250                 int(self.current_archive[
    251                     0:self.current_archive.rfind('.zip')]) + 1)
    252         else:
    253 
    254           # Write an index record if necessary.
    255           self.WriteIndexRecord()
    256           zip_queue.pop(0)
    257       else:
    258         zip_queue.pop(0)
    259 
    260   def WriteIndexRecord(self):
    261     """Write an index record to the index file.
    262 
    263     Only write an index record if this is the first file to go into archive
    264 
    265     Returns:
    266       True if an archive record is written, False if it isn't.
    267     """
    268     archive = self.OpenZipFileAtPath(
    269         os.path.join(self.output_dir, self.current_archive), 'r')
    270     archive_index = archive.infolist()
    271     if len(archive_index) == 1:
    272       self.index_fp.write(
    273           '[\'%s\', \'%s\'],\n' % (self.current_archive,
    274                                    archive_index[0].filename))
    275       archive.close()
    276       return True
    277     else:
    278       archive.close()
    279       return False
    280 
    281   def FixArchive(self, problem):
    282     """Make the archive compliant.
    283 
    284     Args:
    285       problem: An enum, the reason the archive is invalid.
    286 
    287     Returns:
    288       Whether the file(s) removed to fix the archive could conceivably be
    289       in an archive, but for some reason can't be added to this one.
    290     """
    291     archive_path = os.path.join(self.output_dir, self.current_archive)
    292     return_value = None
    293 
    294     if problem == 'SIZE':
    295       archive_obj = self.OpenZipFileAtPath(archive_path, mode='r')
    296       num_archive_files = len(archive_obj.infolist())
    297 
    298       # IF there is a single file, that means its too large to compress,
    299       # delete the created archive
    300       # ELSE do normal finalization.
    301       if num_archive_files == 1:
    302         print ('WARNING: %s%s is too large to store.' % (
    303             self.base_path, archive_obj.infolist()[0].filename))
    304         archive_obj.close()
    305         os.unlink(archive_path)
    306         return_value = False
    307       else:
    308         archive_obj.close()
    309         self.RemoveLastFile(
    310           os.path.join(self.output_dir, self.current_archive))
    311         print 'Final archive size for %s is %i' % (
    312             self.current_archive, os.path.getsize(archive_path))
    313         return_value = True
    314     return return_value
    315 
    316   def AddFileToArchive(self, filepath, compress_bit):
    317     """Add the file at filepath to the current archive.
    318 
    319     Args:
    320       filepath: A string, the path of the file to add.
    321       compress_bit: A boolean, whether or not this file should be compressed
    322         when added.
    323 
    324     Returns:
    325       True if the file could be added (typically because this is a file) or
    326       False if it couldn't be added (typically because its a directory).
    327     """
    328     curr_archive_path = os.path.join(self.output_dir, self.current_archive)
    329     if os.path.isfile(filepath) and not os.path.islink(filepath):
    330       if os.path.getsize(filepath) > 1048576:
    331         print 'Warning: %s is potentially too large to serve on GAE' % filepath
    332       archive = self.OpenZipFileAtPath(curr_archive_path,
    333                                        compress=compress_bit)
    334       # Add the file to the archive.
    335       archive.write(filepath, filepath[len(self.base_path):])
    336       archive.close()
    337       return True
    338     else:
    339       return False
    340 
    341   def ArchiveIsValid(self):
    342     """Check whether the archive is valid.
    343 
    344     Currently this only checks whether the archive is under the required size.
    345     The thought is that eventually this will do additional validation
    346 
    347     Returns:
    348       True if the archive is valid, False if its not.
    349     """
    350     archive_path = os.path.join(self.output_dir, self.current_archive)
    351     return os.path.getsize(archive_path) <= self.max_size
    352 
    353 
    354 def main(argv):
    355   parser = CreateOptionsParser()
    356   (options, unused_args) = parser.parse_args(args=argv[1:])
    357   VerifyArguments(options, parser)
    358   zipper = DirectoryZipper(options.destination,
    359                            options.sourcefiles,
    360                            ParseSize(options.filesize),
    361                            options.compress)
    362   zipper.StartCompress()
    363 
    364 
    365 if __name__ == '__main__':
    366   main(sys.argv)
    367