# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Component for automatically creating masks of changing areas of a website.

Works by repeatedly invoking a browser and scraping the resulting page.
Areas that differ between scrapes are added to the auto-generated mask. The
mask generator considers the mask complete when further scrapes fail to
produce any new differences in the mask.
"""
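# Example invocation (illustrative; assumes the enclosing site_compare driver
# dispatches the "maskmaker" command registered by CreateCommand below):
#
#   site_compare.py maskmaker --browser chrome --outdir masks \
#       --url http://www.example.com/ --scrapes 5 --wait 30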
import os            # Path manipulation and directory listing
import tempfile      # Get a temporary directory to hold intermediates
import time          # Used for sleep() and naming masks by time

import command_line
import drivers
from PIL import Image
from PIL import ImageChops
import scrapers


def CreateCommand(cmdline):
  """Inserts the command and arguments into a command line for parsing."""
  cmd = cmdline.AddCommand(
    ["maskmaker"],
    "Automatically generates a mask from a list of URLs",
    ValidateMaskmaker,
    ExecuteMaskmaker)

  cmd.AddArgument(
    ["-bp", "--browserpath"], "Full path to browser's executable",
    type="readfile", metaname="PATH")
  cmd.AddArgument(
    ["-b", "--browser"], "Which browser to use", type="string",
    default="chrome")
  cmd.AddArgument(
    ["-bv", "--browserver"], "Version of the browser", metaname="VERSION")
  cmd.AddArgument(
    ["-o", "--outdir"], "Directory to store generated masks", metaname="DIR",
    required=True)
  cmd.AddArgument(
    ["-u", "--url"], "URL to compare")
  cmd.AddArgument(
    ["-l", "--list"], "List of URLs to compare", type="readfile")
  cmd.AddMutualExclusion(["--url", "--list"])
  cmd.AddRequiredGroup(["--url", "--list"])
  cmd.AddArgument(
    ["-s", "--startline"], "First line of URL list", type="int")
  cmd.AddArgument(
    ["-e", "--endline"], "Last line of URL list (exclusive)", type="int")
  cmd.AddArgument(
    ["-c", "--count"], "Number of lines of URL file to use", type="int")
  cmd.AddDependency("--startline", "--list")
  cmd.AddDependency("--endline", "--list")
  cmd.AddDependency("--count", "--list")
  cmd.AddDependency("--count", "--startline")
  cmd.AddMutualExclusion(["--count", "--endline"])
  cmd.AddArgument(
    ["-t", "--timeout"], "Amount of time (seconds) to wait for browser to "
    "finish loading",
    type="int", default=60)
  cmd.AddArgument(
    ["-w", "--wait"],
    "Amount of time (in seconds) to wait between successive scrapes",
    type="int", default=60)
  cmd.AddArgument(
    ["-sc", "--scrapes"],
    "Number of successive scrapes which must result in no change to a mask "
    "before mask creation is considered complete", type="int", default=10)
  cmd.AddArgument(
    ["-sz", "--size"], "Browser window size", default=(800, 600), type="coords")
  cmd.AddArgument(["-sd", "--scrapedir"], "Directory to store scrapes")
  cmd.AddArgument(
    ["-gu", "--giveup"],
    "Number of scrape passes to attempt before giving up", type="int",
    default=50)
  cmd.AddArgument(
    ["-th", "--threshold"],
    "Percentage of different pixels (0-100) above which the scrape will be "
    "discarded and the mask not updated.", type="int", default=100)
  cmd.AddArgument(
    ["-er", "--errors"],
    "Number of times a scrape can fail before giving up on the URL.",
    type="int", default=1)


def ValidateMaskmaker(command):
  """Validates the arguments to maskmaker. Raises ParseError on failure."""
  executables = [".exe", ".com", ".bat"]
  if command["--browserpath"]:
    if os.path.splitext(command["--browserpath"])[1].lower() not in executables:
      raise command_line.ParseError("Browser filename must be an executable")


def ExecuteMaskmaker(command):
  """Performs automatic mask generation."""

  # Get the list of URLs to generate masks for
  class MaskmakerURL(object):
    """Helper class for holding information about a URL passed to maskmaker."""
    __slots__ = ['url', 'consecutive_successes', 'errors']
    def __init__(self, url):
      self.url = url
      self.consecutive_successes = 0
      self.errors = 0

  if command["--url"]:
    url_list = [MaskmakerURL(command["--url"])]
  else:
    startline = command["--startline"]
    if command["--count"]:
      endline = startline + command["--count"]
    else:
      endline = command["--endline"]
    with open(command["--list"], "r") as url_file:
      url_list = [MaskmakerURL(url.strip()) for url in
                  url_file.readlines()[startline:endline]]

  complete_list = []
  error_list = []

  outdir = command["--outdir"]
  scrapes = command["--scrapes"]
  errors = command["--errors"]
  size = command["--size"]
  scrape_pass = 0

  scrapedir = command["--scrapedir"]
  if not scrapedir:
    scrapedir = tempfile.gettempdir()

  # Get the scraper
  scraper = scrapers.GetScraper((command["--browser"], command["--browserver"]))

  # Repeatedly iterate through the list of URLs until every URL either has a
  # successful mask or has hit too many errors, or we've exceeded the giveup
  # limit
  while url_list and scrape_pass < command["--giveup"]:
    # Scrape each URL
    for url in url_list:
      print "Processing %r..." % url.url
      mask_filename = drivers.windowing.URLtoFilename(url.url, outdir, ".bmp")

      # Load the existing mask. This is in a loop so we can try to recover
      # from error conditions
      while True:
        try:
          mask = Image.open(mask_filename)
          if mask.size != size:
            print "  %r already exists and is the wrong size! (%r vs %r)" % (
              mask_filename, mask.size, size)
            mask_filename = "%s_%r%s" % (
              mask_filename[:-4], size, mask_filename[-4:])
            print "  Trying again as %r..." % mask_filename
            continue
          break
        except IOError:
          print "  %r does not exist, creating" % mask_filename
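          # The mask is a 1-bit image the size of the browser window,
          # initialized to all white (1). White pixels are compared on each
          # scrape; pixels found to change are painted black (0) and are
          # excluded from all future comparisons.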
          mask = Image.new("1", size, 1)
          mask.save(mask_filename)

      # Find the stored scrape path
      mask_scrape_dir = os.path.join(
        scrapedir, os.path.splitext(os.path.basename(mask_filename))[0])
      drivers.windowing.PreparePath(mask_scrape_dir)

      # Find the baseline image
      mask_scrapes = os.listdir(mask_scrape_dir)
      mask_scrapes.sort()

      if not mask_scrapes:
        print "  No baseline image found, mask will not be updated"
        baseline = None
      else:
        baseline = Image.open(os.path.join(mask_scrape_dir, mask_scrapes[0]))

      mask_scrape_filename = os.path.join(mask_scrape_dir,
                                          time.strftime("%y%m%d-%H%M%S.bmp"))

      # Do the scrape
      result = scraper.Scrape(
        [url.url], mask_scrape_dir, size, (0, 0),
        command["--timeout"], path=command["--browserpath"],
        filename=mask_scrape_filename)

      if result:
        # A return value other than None means an error
        print "  Scrape failed with error %r" % result
        url.errors += 1
        if url.errors >= errors:
          print "  ** Exceeded maximum error count for this URL, giving up"
        continue

      # Load the new scrape
      scrape = Image.open(mask_scrape_filename)

      # Calculate the difference between the new scrape and the baseline,
      # subject to the current mask
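      # (Multiplying by the mask zeroes out differences in areas that are
      # already black in the mask, so previously flagged pixels cannot
      # re-trigger an update.)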
      if baseline:
        diff = ImageChops.multiply(ImageChops.difference(scrape, baseline),
                                   mask.convert(scrape.mode))

        # If the difference is none, there's nothing to update
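        # (getextrema() on a multiband image returns a (min, max) tuple per
        # band, so this checks that every band is identically zero.)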
        if max(diff.getextrema()) == (0, 0):
          print "  Scrape identical to baseline, no change in mask"
          url.consecutive_successes += 1
          if url.consecutive_successes >= scrapes:
            print "  ** No change for %r scrapes, done!" % scrapes
        else:
          # Convert the difference to black and white, then change all
          # black pixels (where the scrape and the baseline were identical)
          # to white, and all others (where the scrape and the baseline
          # differed) to black.
          #
          # Since the command below is a little unclear, here's how it works.
          #    1. convert("L") converts the RGB image to grayscale
          #    2. point() maps grayscale values (or the individual channels
          #       of an RGB image) to different ones. Because it operates on
          #       individual channels, the grayscale conversion from step 1
          #       is necessary.
          #    3. The second parameter to point(), "1", outputs the result as
          #       a monochrome bitmap. If the original RGB image were converted
          #       directly to monochrome, PIL would dither it.
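          # The lookup table [255] + [0]*255 implements the remapping: a
          # grayscale value of 0 (identical pixels) becomes 255 (white),
          # and every nonzero value (differing pixels) becomes 0 (black).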
          diff = diff.convert("L").point([255]+[0]*255, "1")

          # Count the number of different pixels
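          # getcolors() returns a list of (count, value) tuples; PIL derives
          # the list from the histogram in ascending value order, so on this
          # bilevel image the black entry (value 0) is listed first and
          # [0][0] is the number of differing pixels. (diff.histogram()[0]
          # would be an equivalent, more explicit way to get this count.)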
          diff_pixels = diff.getcolors()[0][0]

          # Is this too much?
          diff_pixel_percent = diff_pixels * 100.0 / (mask.size[0]*mask.size[1])
          if diff_pixel_percent > command["--threshold"]:
            print ("  Scrape differed from baseline by %.2f percent, ignoring"
                   % diff_pixel_percent)
          else:
            print "  Scrape differed in %d pixels, updating mask" % diff_pixels
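            # Multiplying the mask by the diff blackens every mask pixel
            # where the diff is black, permanently adding the newly changed
            # pixels to the masked-out area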
            mask = ImageChops.multiply(mask, diff)
            mask.save(mask_filename)

            # Reset the number of consecutive "good" scrapes
            url.consecutive_successes = 0

    # Remove URLs whose mask is deemed done or which have had too many errors
    complete_list.extend(
      [url for url in url_list if url.consecutive_successes >= scrapes])
    error_list.extend(
      [url for url in url_list if url.errors >= errors])
    url_list = [
      url for url in url_list if
      url.consecutive_successes < scrapes and
      url.errors < errors]

    scrape_pass += 1
    print "**Done with scrape pass %d\n" % scrape_pass
    if scrape_pass >= command["--giveup"]:
      print "**Exceeded giveup threshold. Giving up."
    else:
      print "Waiting %d seconds..." % command["--wait"]
      time.sleep(command["--wait"])

  print
  print "*** MASKMAKER COMPLETE ***"
  print "Summary report:"
  print "  %d masks successfully generated" % len(complete_list)
  for url in complete_list:
    print "    ", url.url
  print "  %d masks failed with too many errors" % len(error_list)
  for url in error_list:
    print "    ", url.url
  if scrape_pass >= command["--giveup"]:
    print ("  %d masks were not completed before "
           "reaching the giveup threshold" % len(url_list))
    for url in url_list:
      print "    ", url.url
    273