Home | History | Annotate | Download | only in rebaseline_server
      1 #!/usr/bin/python
      2 
      3 """
      4 Copyright 2013 Google Inc.
      5 
      6 Use of this source code is governed by a BSD-style license that can be
      7 found in the LICENSE file.
      8 
Calculate differences between image pairs, and store them in a database.
     10 """
     11 
     12 # System-level imports
     13 import contextlib
     14 import errno
     15 import json
     16 import logging
     17 import os
     18 import Queue
     19 import re
     20 import shutil
     21 import tempfile
     22 import threading
     23 import time
     24 import urllib
     25 
     26 # Must fix up PYTHONPATH before importing from within Skia
     27 import rs_fixpypath  # pylint: disable=W0611
     28 
     29 # Imports from within Skia
     30 import find_run_binary
     31 from py.utils import gs_utils
     32 
     33 
# Path to the 'skpdiff' program, looked up once (at import time) through
# find_run_binary.find_path_to_program(); DiffRecord passes it to
# find_run_binary.run_command() as the executable to run.
SKPDIFF_BINARY = find_run_binary.find_path_to_program('skpdiff')
     35 
     36 DEFAULT_IMAGE_SUFFIX = '.png'
     37 DEFAULT_IMAGES_SUBDIR = 'images'
     38 # TODO(epoger): Figure out a better default number of threads; for now,
     39 # using a conservative default value.
     40 DEFAULT_NUM_WORKER_THREADS = 1
     41 
     42 DISALLOWED_FILEPATH_CHAR_REGEX = re.compile('[^\w\-]')
     43 
     44 RGBDIFFS_SUBDIR = 'diffs'
     45 WHITEDIFFS_SUBDIR = 'whitediffs'
     46 
     47 # Keys used within DiffRecord dictionary representations.
     48 # NOTE: Keep these in sync with static/constants.js
     49 KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL = 'maxDiffPerChannel'
     50 KEY__DIFFERENCES__NUM_DIFF_PIXELS = 'numDifferingPixels'
     51 KEY__DIFFERENCES__PERCENT_DIFF_PIXELS = 'percentDifferingPixels'
     52 KEY__DIFFERENCES__PERCEPTUAL_DIFF = 'perceptualDifference'
     53 KEY__DIFFERENCES__DIFF_URL = 'diffUrl'
     54 KEY__DIFFERENCES__WHITE_DIFF_URL = 'whiteDiffUrl'
     55 
     56 # Special values within ImageDiffDB._diff_dict
     57 _DIFFRECORD_FAILED = 'failed'
     58 _DIFFRECORD_PENDING = 'pending'
     59 
     60 # How often to report tasks_queue size
     61 QUEUE_LOGGING_GRANULARITY = 1000
     62 
     63 # Temporary variable to keep track of how many times we download
     64 # the same file in multiple threads.
     65 # TODO(epoger): Delete this, once we see that the number stays close to 0.
     66 global_file_collisions = 0
     67 
     68 
     69 class DiffRecord(object):
     70   """ Record of differences between two images. """
     71 
     72   def __init__(self, gs, storage_root,
     73                expected_image_url, expected_image_locator,
     74                actual_image_url, actual_image_locator,
     75                expected_images_subdir=DEFAULT_IMAGES_SUBDIR,
     76                actual_images_subdir=DEFAULT_IMAGES_SUBDIR,
     77                image_suffix=DEFAULT_IMAGE_SUFFIX):
     78     """Download this pair of images (unless we already have them on local disk),
     79     and prepare a DiffRecord for them.
     80 
     81     Args:
     82       gs: instance of GSUtils object we can use to download images
     83       storage_root: root directory on local disk within which we store all
     84           images
     85       expected_image_url: file, GS, or HTTP url from which we will download the
     86           expected image
     87       expected_image_locator: a unique ID string under which we will store the
     88           expected image within storage_root (probably including a checksum to
     89           guarantee uniqueness)
     90       actual_image_url: file, GS, or HTTP url from which we will download the
     91           actual image
     92       actual_image_locator: a unique ID string under which we will store the
     93           actual image within storage_root (probably including a checksum to
     94           guarantee uniqueness)
     95       expected_images_subdir: the subdirectory expected images are stored in.
     96       actual_images_subdir: the subdirectory actual images are stored in.
     97       image_suffix: the suffix of images.
     98     """
     99     expected_image_locator = _sanitize_locator(expected_image_locator)
    100     actual_image_locator = _sanitize_locator(actual_image_locator)
    101 
    102     # Download the expected/actual images, if we don't have them already.
    103     expected_image_file = os.path.join(
    104         storage_root, expected_images_subdir,
    105         str(expected_image_locator) + image_suffix)
    106     actual_image_file = os.path.join(
    107         storage_root, actual_images_subdir,
    108         str(actual_image_locator) + image_suffix)
    109     for image_file, image_url in [
    110         (expected_image_file, expected_image_url),
    111         (actual_image_file, actual_image_url)]:
    112       if image_file and image_url:
    113         try:
    114           _download_file(gs, image_file, image_url)
    115         except Exception:
    116           logging.exception('unable to download image_url %s to file %s' %
    117                             (image_url, image_file))
    118           raise
    119 
    120     # Return early if we do not need to generate diffs.
    121     if (expected_image_url == actual_image_url or
    122         not expected_image_url or not actual_image_url):
    123       return
    124 
    125     # Get all diff images and values using the skpdiff binary.
    126     skpdiff_output_dir = tempfile.mkdtemp()
    127     try:
    128       skpdiff_summary_file = os.path.join(skpdiff_output_dir,
    129                                           'skpdiff-output.json')
    130       skpdiff_rgbdiff_dir = os.path.join(storage_root, RGBDIFFS_SUBDIR)
    131       skpdiff_whitediff_dir = os.path.join(storage_root, WHITEDIFFS_SUBDIR)
    132       _mkdir_unless_exists(skpdiff_rgbdiff_dir)
    133       _mkdir_unless_exists(skpdiff_rgbdiff_dir)
    134 
    135       # TODO(epoger): Consider calling skpdiff ONCE for all image pairs,
    136       # instead of calling it separately for each image pair.
    137       # Pro: we'll incur less overhead from making repeated system calls,
    138       # spinning up the skpdiff binary, etc.
    139       # Con: we would have to wait until all image pairs were loaded before
    140       # generating any of the diffs?
    141       # Note(stephana): '--longnames' was added to allow for this 
    142       # case (multiple files at once) versus specifying output diffs 
    143       # directly.
    144       find_run_binary.run_command(
    145           [SKPDIFF_BINARY, '-p', expected_image_file, actual_image_file,
    146            '--jsonp', 'false',
    147            '--longnames', 'true',
    148            '--output', skpdiff_summary_file,
    149            '--differs', 'perceptual', 'different_pixels',
    150            '--rgbDiffDir', skpdiff_rgbdiff_dir,
    151            '--whiteDiffDir', skpdiff_whitediff_dir,
    152            ])
    153 
    154       # Get information out of the skpdiff_summary_file.
    155       with contextlib.closing(open(skpdiff_summary_file)) as fp:
    156         data = json.load(fp)
    157 
    158       # For now, we can assume there is only one record in the output summary,
    159       # since we passed skpdiff only one pair of images.
    160       record = data['records'][0]
    161       self._width = record['width']
    162       self._height = record['height']
    163       self._diffUrl = os.path.split(record['rgbDiffPath'])[1]
    164       self._whiteDiffUrl = os.path.split(record['whiteDiffPath'])[1]
    165 
    166       # TODO: make max_diff_per_channel a tuple instead of a list, because the
    167       # structure is meaningful (first element is red, second is green, etc.)
    168       # See http://stackoverflow.com/a/626871
    169       self._max_diff_per_channel = [
    170           record['maxRedDiff'], record['maxGreenDiff'], record['maxBlueDiff']]
    171       per_differ_stats = record['diffs']
    172       for stats in per_differ_stats:
    173         differ_name = stats['differName']
    174         if differ_name == 'different_pixels':
    175           self._num_pixels_differing = stats['pointsOfInterest']
    176         elif differ_name == 'perceptual':
    177           perceptual_similarity = stats['result']
    178 
    179       # skpdiff returns the perceptual similarity; convert it to get the
    180       # perceptual difference percentage.
    181       # skpdiff outputs -1 if the images are different sizes. Treat any
    182       # output that does not lie in [0, 1] as having 0% perceptual
    183       # similarity.
    184       if not 0 <= perceptual_similarity <= 1:
    185         perceptual_similarity = 0
    186       self._perceptual_difference = 100 - (perceptual_similarity * 100)
    187     finally:
    188       shutil.rmtree(skpdiff_output_dir)
    189 
    190   # TODO(epoger): Use properties instead of getters throughout.
    191   # See http://stackoverflow.com/a/6618176
    192   def get_num_pixels_differing(self):
    193     """Returns the absolute number of pixels that differ."""
    194     return self._num_pixels_differing
    195 
    196   def get_percent_pixels_differing(self):
    197     """Returns the percentage of pixels that differ, as a float between
    198     0 and 100 (inclusive)."""
    199     return ((float(self._num_pixels_differing) * 100) /
    200             (self._width * self._height))
    201 
    202   def get_perceptual_difference(self):
    203     """Returns the perceptual difference percentage."""
    204     return self._perceptual_difference
    205 
    206   def get_max_diff_per_channel(self):
    207     """Returns the maximum difference between the expected and actual images
    208     for each R/G/B channel, as a list."""
    209     return self._max_diff_per_channel
    210 
    211   def as_dict(self):
    212     """Returns a dictionary representation of this DiffRecord, as needed when
    213     constructing the JSON representation."""
    214     return {
    215         KEY__DIFFERENCES__NUM_DIFF_PIXELS: self._num_pixels_differing,
    216         KEY__DIFFERENCES__PERCENT_DIFF_PIXELS:
    217             self.get_percent_pixels_differing(),
    218         KEY__DIFFERENCES__MAX_DIFF_PER_CHANNEL: self._max_diff_per_channel,
    219         KEY__DIFFERENCES__PERCEPTUAL_DIFF: self._perceptual_difference,
    220         KEY__DIFFERENCES__DIFF_URL: self._diffUrl,
    221         KEY__DIFFERENCES__WHITE_DIFF_URL: self._whiteDiffUrl, 
    222     }
    223 
    224 
    225 
    226 class ImageDiffDB(object):
    227   """ Calculates differences between image pairs, maintaining a database of
    228   them for download."""
    229 
    230   def __init__(self, storage_root, gs=None,
    231                num_worker_threads=DEFAULT_NUM_WORKER_THREADS):
    232     """
    233     Args:
    234       storage_root: string; root path within the DB will store all of its stuff
    235       gs: instance of GSUtils object we can use to download images
    236       num_worker_threads: how many threads that download images and
    237           generate diffs simultaneously
    238     """
    239     self._storage_root = storage_root
    240     self._gs = gs
    241 
    242     # Mechanism for reporting queue size periodically.
    243     self._last_queue_size_reported = None
    244     self._queue_size_report_lock = threading.RLock()
    245 
    246     # Dictionary of DiffRecords, keyed by (expected_image_locator,
    247     # actual_image_locator) tuples.
    248     # Values can also be _DIFFRECORD_PENDING, _DIFFRECORD_FAILED.
    249     #
    250     # Any thread that modifies _diff_dict must first acquire
    251     # _diff_dict_writelock!
    252     #
    253     # TODO(epoger): Disk is limitless, but RAM is not... so, we should probably
    254     # remove items from self._diff_dict if they haven't been accessed for a
    255     # long time.  We can always regenerate them by diffing the images we
    256     # previously downloaded to local disk.
    257     # I guess we should figure out how expensive it is to download vs diff the
    258     # image pairs... if diffing them is expensive too, we can write these
    259     # _diff_dict objects out to disk if there's too many to hold in RAM.
    260     # Or we could use virtual memory to handle that automatically.
    261     self._diff_dict = {}
    262     self._diff_dict_writelock = threading.RLock()
    263 
    264     # Set up the queue for asynchronously loading DiffRecords, and start the
    265     # worker threads reading from it.
    266     # The queue maxsize must be 0 (infinite size queue), so that asynchronous
    267     # calls can return as soon as possible.
    268     self._tasks_queue = Queue.Queue(maxsize=0)
    269     self._workers = []
    270     for i in range(num_worker_threads):
    271       worker = threading.Thread(target=self.worker, args=(i,))
    272       worker.daemon = True
    273       worker.start()
    274       self._workers.append(worker)
    275 
    276   def log_queue_size_if_changed(self, limit_verbosity=True):
    277     """Log the size of self._tasks_queue, if it has changed since the last call.
    278 
    279     Reports the current queue size, using log.info(), unless the queue is the
    280     same size as the last time we reported it.
    281 
    282     Args:
    283       limit_verbosity: if True, only log if the queue size is a multiple of
    284           QUEUE_LOGGING_GRANULARITY
    285     """
    286     # Acquire the lock, to synchronize access to self._last_queue_size_reported
    287     self._queue_size_report_lock.acquire()
    288     try:
    289       size = self._tasks_queue.qsize()
    290       if size == self._last_queue_size_reported:
    291         return
    292       if limit_verbosity and (size % QUEUE_LOGGING_GRANULARITY != 0):
    293         return
    294       logging.info('tasks_queue size is %d' % size)
    295       self._last_queue_size_reported = size
    296     finally:
    297       self._queue_size_report_lock.release()
    298 
    299   def worker(self, worker_num):
    300     """Launch a worker thread that pulls tasks off self._tasks_queue.
    301 
    302     Args:
    303       worker_num: (integer) which worker this is
    304     """
    305     while True:
    306       self.log_queue_size_if_changed()
    307       params = self._tasks_queue.get()
    308       key, expected_image_url, actual_image_url = params
    309       try:
    310         diff_record = DiffRecord(
    311             self._gs, self._storage_root,
    312             expected_image_url=expected_image_url,
    313             expected_image_locator=key[0],
    314             actual_image_url=actual_image_url,
    315             actual_image_locator=key[1])
    316       except Exception:
    317         logging.exception(
    318             'exception while creating DiffRecord for key %s' % str(key))
    319         diff_record = _DIFFRECORD_FAILED
    320       self._diff_dict_writelock.acquire()
    321       try:
    322         self._diff_dict[key] = diff_record
    323       finally:
    324         self._diff_dict_writelock.release()
    325 
    326   @property
    327   def storage_root(self):
    328     return self._storage_root
    329 
    330   def add_image_pair(self,
    331                      expected_image_url, expected_image_locator,
    332                      actual_image_url, actual_image_locator):
    333     """Asynchronously prepare a DiffRecord for a pair of images.
    334 
    335     This method will return quickly; calls to get_diff_record() will block
    336     until the DiffRecord is available (or we have given up on creating it).
    337 
    338     If we already have a DiffRecord for this particular image pair, no work
    339     will be done.
    340 
    341     If expected_image_url (or its locator) is None, just download actual_image.
    342     If actual_image_url (or its locator) is None, just download expected_image.
    343 
    344     Args:
    345       expected_image_url: file, GS, or HTTP url from which we will download the
    346           expected image
    347       expected_image_locator: a unique ID string under which we will store the
    348           expected image within storage_root (probably including a checksum to
    349           guarantee uniqueness)
    350       actual_image_url: file, GS, or HTTP url from which we will download the
    351           actual image
    352       actual_image_locator: a unique ID string under which we will store the
    353           actual image within storage_root (probably including a checksum to
    354           guarantee uniqueness)
    355     """
    356     expected_image_locator = _sanitize_locator(expected_image_locator)
    357     actual_image_locator = _sanitize_locator(actual_image_locator)
    358     key = (expected_image_locator, actual_image_locator)
    359     must_add_to_queue = False
    360 
    361     self._diff_dict_writelock.acquire()
    362     try:
    363       if not key in self._diff_dict:
    364         # If we have already requested a diff between these two images,
    365         # we don't need to request it again.
    366         must_add_to_queue = True
    367         self._diff_dict[key] = _DIFFRECORD_PENDING
    368     finally:
    369       self._diff_dict_writelock.release()
    370 
    371     if must_add_to_queue:
    372       self._tasks_queue.put((key, expected_image_url, actual_image_url))
    373       self.log_queue_size_if_changed()
    374 
    375   def get_diff_record(self, expected_image_locator, actual_image_locator):
    376     """Returns the DiffRecord for this image pair.
    377 
    378     This call will block until the diff record is available, or we were unable
    379     to generate it.
    380 
    381     Args:
    382       expected_image_locator: a unique ID string under which we will store the
    383           expected image within storage_root (probably including a checksum to
    384           guarantee uniqueness)
    385       actual_image_locator: a unique ID string under which we will store the
    386           actual image within storage_root (probably including a checksum to
    387           guarantee uniqueness)
    388 
    389     Returns the DiffRecord for this image pair, or None if we were unable to
    390     generate one.
    391     """
    392     key = (_sanitize_locator(expected_image_locator),
    393            _sanitize_locator(actual_image_locator))
    394     diff_record = self._diff_dict[key]
    395 
    396     # If we have no results yet, block until we do.
    397     while diff_record == _DIFFRECORD_PENDING:
    398       time.sleep(1)
    399       diff_record = self._diff_dict[key]
    400 
    401     # Once we have the result...
    402     if diff_record == _DIFFRECORD_FAILED:
    403       logging.error(
    404           'failed to create a DiffRecord for expected_image_locator=%s , '
    405           'actual_image_locator=%s' % (
    406               expected_image_locator, actual_image_locator))
    407       return None
    408     else:
    409       return diff_record
    410 
    411 
    412 # Utility functions
    413 
    414 def _download_file(gs, local_filepath, url):
    415   """Download a file from url to local_filepath, unless it is already there.
    416 
    417   Args:
    418     gs: instance of GSUtils object, in case the url points at Google Storage
    419     local_filepath: path on local disk where the image should be stored
    420     url: HTTP or GS URL from which we can download the image if we don't have
    421         it yet
    422   """
    423   global global_file_collisions
    424   if not os.path.exists(local_filepath):
    425     _mkdir_unless_exists(os.path.dirname(local_filepath))
    426 
    427     # First download the file contents into a unique filename, and
    428     # then rename that file.  That way, if multiple threads are downloading
    429     # the same filename at the same time, they won't interfere with each
    430     # other (they will both download the file, and one will "win" in the end)
    431     temp_filename = '%s-%d' % (local_filepath,
    432                                threading.current_thread().ident)
    433     if gs_utils.GSUtils.is_gs_url(url):
    434       (bucket, path) = gs_utils.GSUtils.split_gs_url(url)
    435       gs.download_file(source_bucket=bucket, source_path=path,
    436                        dest_path=temp_filename)
    437     else:
    438       with contextlib.closing(urllib.urlopen(url)) as url_handle:
    439         with open(temp_filename, 'wb') as file_handle:
    440           shutil.copyfileobj(fsrc=url_handle, fdst=file_handle)
    441 
    442     # Rename the file to its real filename.
    443     # Keep count of how many colliding downloads we encounter;
    444     # if it's a large number, we may want to change our download strategy
    445     # to minimize repeated downloads.
    446     if os.path.exists(local_filepath):
    447       global_file_collisions += 1
    448     else:
    449       os.rename(temp_filename, local_filepath)
    450 
    451 
    452 def _mkdir_unless_exists(path):
    453   """Unless path refers to an already-existing directory, create it.
    454 
    455   Args:
    456     path: path on local disk
    457   """
    458   try:
    459     os.makedirs(path)
    460   except OSError as e:
    461     if e.errno == errno.EEXIST:
    462       pass
    463 
    464 
    465 def _sanitize_locator(locator):
    466   """Returns a sanitized version of a locator (one in which we know none of the
    467   characters will have special meaning in filenames).
    468 
    469   Args:
    470     locator: string, or something that can be represented as a string.
    471         If None or '', it is returned without modification, because empty
    472         locators have a particular meaning ("there is no image for this")
    473   """
    474   if locator:
    475     return DISALLOWED_FILEPATH_CHAR_REGEX.sub('_', str(locator))
    476   else:
    477     return locator
    478