# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import difflib
import hashlib
import itertools
import json
import os
import sys
import zipfile


# When set and a difference is detected, a diff of what changed is printed.
PRINT_EXPLANATIONS = int(os.environ.get('PRINT_BUILD_EXPLANATIONS', 0))

# An escape hatch that causes all targets to be rebuilt.
_FORCE_REBUILD = int(os.environ.get('FORCE_REBUILD', 0))
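
# Illustrative usage (a sketch, not a prescribed workflow; the output directory
# and target name below are hypothetical). Both variables are read from the
# environment, so they can simply prefix the build command, e.g.:
#   PRINT_BUILD_EXPLANATIONS=1 ninja -C out/Default some_target
#   FORCE_REBUILD=1 ninja -C out/Default some_target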


def CallAndRecordIfStale(
    function, record_path=None, input_paths=None, input_strings=None,
    output_paths=None, force=False, pass_changes=False):
     24   """Calls function if outputs are stale.
     25 
     26   Outputs are considered stale if:
     27   - any output_paths are missing, or
     28   - the contents of any file within input_paths has changed, or
     29   - the contents of input_strings has changed.
     30 
     31   To debug which files are out-of-date, set the environment variable:
     32       PRINT_MD5_DIFFS=1
     33 
     34   Args:
     35     function: The function to call.
     36     record_path: Path to record metadata.
     37       Defaults to output_paths[0] + '.md5.stamp'
     38     input_paths: List of paths to calcualte an md5 sum on.
     39     input_strings: List of strings to record verbatim.
     40     output_paths: List of output paths.
     41     force: Whether to treat outputs as missing regardless of whether they
     42       actually are.
     43     pass_changes: Whether to pass a Changes instance to |function|.
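
  Example (an illustrative sketch; the callback, input paths, flags, and
  output path shown here are hypothetical):
    def _OnStale(changes):
      print 'Rebuilding because: %s' % changes.DescribeDifference()

    CallAndRecordIfStale(
        _OnStale,
        input_paths=['rules.txt', 'classes.jar'],
        input_strings=['--optimize'],
        output_paths=['out/processed.zip'],
        pass_changes=True)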
     44   """
     45   assert record_path or output_paths
     46   input_paths = input_paths or []
     47   input_strings = input_strings or []
     48   output_paths = output_paths or []
     49   record_path = record_path or output_paths[0] + '.md5.stamp'
     50 
     51   assert record_path.endswith('.stamp'), (
     52       'record paths must end in \'.stamp\' so that they are easy to find '
     53       'and delete')
     54 
     55   new_metadata = _Metadata()
     56   new_metadata.AddStrings(input_strings)
     57 
     58   for path in input_paths:
     59     if _IsZipFile(path):
     60       entries = _ExtractZipEntries(path)
     61       new_metadata.AddZipFile(path, entries)
     62     else:
     63       new_metadata.AddFile(path, _Md5ForPath(path))
     64 
     65   old_metadata = None
     66   force = force or _FORCE_REBUILD
     67   missing_outputs = [x for x in output_paths if force or not os.path.exists(x)]
     68   # When outputs are missing, don't bother gathering change information.
     69   if not missing_outputs and os.path.exists(record_path):
     70     with open(record_path, 'r') as jsonfile:
     71       try:
     72         old_metadata = _Metadata.FromFile(jsonfile)
     73       except:  # pylint: disable=bare-except
     74         pass  # Not yet using new file format.
     75 
     76   changes = Changes(old_metadata, new_metadata, force, missing_outputs)
     77   if not changes.HasChanges():
     78     return
     79 
     80   if PRINT_EXPLANATIONS:
     81     print '=' * 80
     82     print 'Target is stale: %s' % record_path
     83     print changes.DescribeDifference()
     84     print '=' * 80
     85 
     86   args = (changes,) if pass_changes else ()
     87   function(*args)
     88 
     89   with open(record_path, 'w') as f:
     90     new_metadata.ToFile(f)
     91 
     92 
     93 class Changes(object):
     94   """Provides and API for querying what changed between runs."""

  def __init__(self, old_metadata, new_metadata, force, missing_outputs):
    self.old_metadata = old_metadata
    self.new_metadata = new_metadata
    self.force = force
    self.missing_outputs = missing_outputs

  def _GetOldTag(self, path, subpath=None):
    return self.old_metadata and self.old_metadata.GetTag(path, subpath)

  def HasChanges(self):
    """Returns whether any changes exist."""
    return (self.force or
            not self.old_metadata or
            self.old_metadata.StringsMd5() != self.new_metadata.StringsMd5() or
            self.old_metadata.FilesMd5() != self.new_metadata.FilesMd5())

  def AddedOrModifiedOnly(self):
    """Returns whether the only changes were from added or modified (sub)files.

    No missing outputs, no removed paths/subpaths.
    """
    if (self.force or
        not self.old_metadata or
        self.old_metadata.StringsMd5() != self.new_metadata.StringsMd5()):
      return False
    if any(self.IterRemovedPaths()):
      return False
    for path in self.IterModifiedPaths():
      if any(self.IterRemovedSubpaths(path)):
        return False
    return True

  def IterAllPaths(self):
    """Generator for paths."""
    return self.new_metadata.IterPaths()

  def IterAllSubpaths(self, path):
    """Generator for subpaths."""
    return self.new_metadata.IterSubpaths(path)

  def IterAddedPaths(self):
    """Generator for paths that were added."""
    for path in self.new_metadata.IterPaths():
      if self._GetOldTag(path) is None:
        yield path

  def IterAddedSubpaths(self, path):
    """Generator for paths that were added within the given zip file."""
    for subpath in self.new_metadata.IterSubpaths(path):
      if self._GetOldTag(path, subpath) is None:
        yield subpath

  def IterRemovedPaths(self):
    """Generator for paths that were removed."""
    if self.old_metadata:
      for path in self.old_metadata.IterPaths():
        if self.new_metadata.GetTag(path) is None:
          yield path

  def IterRemovedSubpaths(self, path):
    """Generator for paths that were removed within the given zip file."""
    if self.old_metadata:
      for subpath in self.old_metadata.IterSubpaths(path):
        if self.new_metadata.GetTag(path, subpath) is None:
          yield subpath

  def IterModifiedPaths(self):
    """Generator for paths whose contents have changed."""
    for path in self.new_metadata.IterPaths():
      old_tag = self._GetOldTag(path)
      new_tag = self.new_metadata.GetTag(path)
      if old_tag is not None and old_tag != new_tag:
        yield path

  def IterModifiedSubpaths(self, path):
    """Generator for paths within a zip file whose contents have changed."""
    for subpath in self.new_metadata.IterSubpaths(path):
      old_tag = self._GetOldTag(path, subpath)
      new_tag = self.new_metadata.GetTag(path, subpath)
      if old_tag is not None and old_tag != new_tag:
        yield subpath

  def IterChangedPaths(self):
    """Generator for all changed paths (added/removed/modified)."""
    return itertools.chain(self.IterRemovedPaths(),
                           self.IterModifiedPaths(),
                           self.IterAddedPaths())

  def IterChangedSubpaths(self, path):
    """Generator for paths within a zip that were added/removed/modified."""
    return itertools.chain(self.IterRemovedSubpaths(path),
                           self.IterModifiedSubpaths(path),
                           self.IterAddedSubpaths(path))

  def DescribeDifference(self):
    """Returns a human-readable description of what changed."""
    if self.force:
      return 'force=True'
    elif self.missing_outputs:
      return 'Outputs do not exist:\n  ' + '\n  '.join(self.missing_outputs)
    elif self.old_metadata is None:
      return 'Previous stamp file not found.'

    if self.old_metadata.StringsMd5() != self.new_metadata.StringsMd5():
      ndiff = difflib.ndiff(self.old_metadata.GetStrings(),
                            self.new_metadata.GetStrings())
      changed = [s for s in ndiff if not s.startswith(' ')]
      return 'Input strings changed:\n  ' + '\n  '.join(changed)

    if self.old_metadata.FilesMd5() == self.new_metadata.FilesMd5():
      return "There's no difference."

    lines = []
    lines.extend('Added: ' + p for p in self.IterAddedPaths())
    lines.extend('Removed: ' + p for p in self.IterRemovedPaths())
    for path in self.IterModifiedPaths():
      lines.append('Modified: ' + path)
      lines.extend('  -> Subpath added: ' + p
                   for p in self.IterAddedSubpaths(path))
      lines.extend('  -> Subpath removed: ' + p
                   for p in self.IterRemovedSubpaths(path))
      lines.extend('  -> Subpath modified: ' + p
                   for p in self.IterModifiedSubpaths(path))
    if lines:
      return 'Input files changed:\n  ' + '\n  '.join(lines)
    return 'I have no idea what changed (there is a bug).'


class _Metadata(object):
  """Data model for tracking change metadata."""
  # Schema:
  # {
  #   "files-md5": "VALUE",
  #   "strings-md5": "VALUE",
  #   "input-files": [
  #     {
  #       "path": "path.jar",
  #       "tag": "{MD5 of entries}",
  #       "entries": [
  #         { "path": "org/chromium/base/Foo.class", "tag": "{CRC32}" }, ...
  #       ]
  #     }, {
  #       "path": "path.txt",
  #       "tag": "{MD5}",
  #     }
  #   ],
  #   "input-strings": ["a", "b", ...],
  # }
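  #
  # Illustrative round trip (a sketch; the file names are hypothetical):
  #   meta = _Metadata()
  #   meta.AddStrings(['--flag'])
  #   meta.AddFile('rules.txt', _Md5ForPath('rules.txt'))
  #   with open('out/result.zip.md5.stamp', 'w') as f:
  #     meta.ToFile(f)
  #   with open('out/result.zip.md5.stamp', 'r') as f:
  #     restored = _Metadata.FromFile(f)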
  def __init__(self):
    self._files_md5 = None
    self._strings_md5 = None
    self._files = []
    self._strings = []
    # Map of (path, subpath) -> entry. Created upon first call to _GetEntry().
    self._file_map = None

  @classmethod
  def FromFile(cls, fileobj):
    """Returns a _Metadata initialized from a file object."""
    ret = cls()
    obj = json.load(fileobj)
    ret._files_md5 = obj['files-md5']
    ret._strings_md5 = obj['strings-md5']
    ret._files = obj['input-files']
    ret._strings = obj['input-strings']
    return ret

  def ToFile(self, fileobj):
    """Serializes metadata to the given file object."""
    obj = {
        "files-md5": self.FilesMd5(),
        "strings-md5": self.StringsMd5(),
        "input-files": self._files,
        "input-strings": self._strings,
    }
    json.dump(obj, fileobj, indent=2)

  def _AssertNotQueried(self):
    assert self._files_md5 is None
    assert self._strings_md5 is None
    assert self._file_map is None

  def AddStrings(self, values):
    self._AssertNotQueried()
    self._strings.extend(str(v) for v in values)

  def AddFile(self, path, tag):
    """Adds metadata for a non-zip file.

    Args:
      path: Path to the file.
      tag: A short string representative of the file contents.
    """
    self._AssertNotQueried()
    self._files.append({
        'path': path,
        'tag': tag,
    })

  def AddZipFile(self, path, entries):
    """Adds metadata for a zip file.

    Args:
      path: Path to the file.
      entries: List of (subpath, tag) tuples for entries within the zip.
    """
    self._AssertNotQueried()
    tag = _ComputeInlineMd5(itertools.chain((e[0] for e in entries),
                                            (e[1] for e in entries)))
    self._files.append({
        'path': path,
        'tag': tag,
        'entries': [{"path": e[0], "tag": e[1]} for e in entries],
    })

  def GetStrings(self):
    """Returns the list of input strings."""
    return self._strings

  def FilesMd5(self):
    """Lazily computes and returns the aggregate md5 of input files."""
    if self._files_md5 is None:
      # Omit paths from md5 since temporary files have random names.
      self._files_md5 = _ComputeInlineMd5(
          self.GetTag(p) for p in sorted(self.IterPaths()))
    return self._files_md5

  def StringsMd5(self):
    """Lazily computes and returns the aggregate md5 of input strings."""
    if self._strings_md5 is None:
      self._strings_md5 = _ComputeInlineMd5(self._strings)
    return self._strings_md5

  def _GetEntry(self, path, subpath=None):
    """Returns the JSON entry for the given path / subpath."""
    if self._file_map is None:
      self._file_map = {}
      for entry in self._files:
        self._file_map[(entry['path'], None)] = entry
        for subentry in entry.get('entries', ()):
          self._file_map[(entry['path'], subentry['path'])] = subentry
    return self._file_map.get((path, subpath))

  def GetTag(self, path, subpath=None):
    """Returns the tag for the given path / subpath."""
    ret = self._GetEntry(path, subpath)
    return ret and ret['tag']

  def IterPaths(self):
    """Returns a generator for all top-level paths."""
    return (e['path'] for e in self._files)

  def IterSubpaths(self, path):
    """Returns a generator for all subpaths in the given zip.

    If the given path is not a zip file or doesn't exist, returns an empty
    iterable.
    """
    outer_entry = self._GetEntry(path)
    if not outer_entry:
      return ()
    subentries = outer_entry.get('entries', [])
    return (entry['path'] for entry in subentries)


def _UpdateMd5ForFile(md5, path, block_size=2**16):
  with open(path, 'rb') as infile:
    while True:
      data = infile.read(block_size)
      if not data:
        break
      md5.update(data)


def _UpdateMd5ForDirectory(md5, dir_path):
  for root, _, files in os.walk(dir_path):
    for f in files:
      _UpdateMd5ForFile(md5, os.path.join(root, f))


def _Md5ForPath(path):
  md5 = hashlib.md5()
  if os.path.isdir(path):
    _UpdateMd5ForDirectory(md5, path)
  else:
    _UpdateMd5ForFile(md5, path)
  return md5.hexdigest()


def _ComputeInlineMd5(iterable):
  """Computes the md5 of the concatenated parameters."""
  md5 = hashlib.md5()
  for item in iterable:
    md5.update(str(item))
  return md5.hexdigest()


def _IsZipFile(path):
  """Returns whether to treat the given file as a zip file."""
  # ijar doesn't set the CRC32 field.
  if path.endswith('.interface.jar'):
    return False
  return path[-4:] in ('.zip', '.apk', '.jar') or path.endswith('.srcjar')


def _ExtractZipEntries(path):
    402   """Returns a list of (path, CRC32) of all files within |path|."""
  entries = []
  with zipfile.ZipFile(path) as zip_file:
    for zip_info in zip_file.infolist():
      # Skip directories and empty files.
      if zip_info.CRC:
        entries.append(
            (zip_info.filename, zip_info.CRC + zip_info.compress_type))
  return entries