# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Component for automatically creating masks of changing areas of a website.

Works by repeated invocation of a browser and scraping of the resulting page.
Areas that differ will be added to the auto-generated mask. The mask generator
considers the mask complete when further scrapes fail to produce any
differences in the mask.
"""

import os        # Functions for walking the directory tree
import tempfile  # Get a temporary directory to hold intermediates
import time      # Used for sleep() and naming masks by time

import command_line
import drivers
from PIL import Image
from PIL import ImageChops
import scrapers


def CreateCommand(cmdline):
  """Inserts the command and arguments into a command line for parsing."""
  cmd = cmdline.AddCommand(
    ["maskmaker"],
    "Automatically generates a mask from a list of URLs",
    ValidateMaskmaker,
    ExecuteMaskmaker)

  cmd.AddArgument(
    ["-bp", "--browserpath"], "Full path to browser's executable",
    type="readfile", metaname="PATH")
  cmd.AddArgument(
    ["-b", "--browser"], "Which browser to use", type="string",
    default="chrome")
  cmd.AddArgument(
    ["-bv", "--browserver"], "Version of the browser", metaname="VERSION")
  cmd.AddArgument(
    ["-o", "--outdir"], "Directory to store generated masks", metaname="DIR",
    required=True)
  cmd.AddArgument(
    ["-u", "--url"], "URL to compare")
  cmd.AddArgument(
    ["-l", "--list"], "List of URLs to compare", type="readfile")
  cmd.AddMutualExclusion(["--url", "--list"])
  cmd.AddArgument(
    ["-s", "--startline"], "First line of URL list", type="int")
  cmd.AddArgument(
    ["-e", "--endline"], "Last line of URL list (exclusive)", type="int")
  cmd.AddArgument(
    ["-c", "--count"], "Number of lines of URL file to use", type="int")
  cmd.AddDependency("--startline", "--list")
  cmd.AddRequiredGroup(["--url", "--list"])
  cmd.AddDependency("--endline", "--list")
  cmd.AddDependency("--count", "--list")
  cmd.AddMutualExclusion(["--count", "--endline"])
  cmd.AddDependency("--count", "--startline")
  cmd.AddArgument(
    ["-t", "--timeout"], "Amount of time (in seconds) to wait for the "
    "browser to finish loading",
    type="int", default=60)
  cmd.AddArgument(
    ["-w", "--wait"],
    "Amount of time (in seconds) to wait between successive scrapes",
    type="int", default=60)
  cmd.AddArgument(
    ["-sc", "--scrapes"],
    "Number of successive scrapes which must result in no change to a mask "
    "before mask creation is considered complete", type="int", default=10)
  cmd.AddArgument(
    ["-sz", "--size"], "Browser window size", default=(800, 600),
    type="coords")
  cmd.AddArgument(["-sd", "--scrapedir"], "Directory to store scrapes")
  cmd.AddArgument(
    ["-gu", "--giveup"],
    "Number of times to scrape before giving up", type="int", default=50)
  cmd.AddArgument(
    ["-th", "--threshold"],
    "Percentage of different pixels (0-100) above which the scrape will be "
    "discarded and the mask not updated.", type="int", default=100)
  cmd.AddArgument(
    ["-er", "--errors"],
    "Number of times a scrape can fail before giving up on the URL.",
    type="int", default=1)


def ValidateMaskmaker(command):
  """Validate the arguments to maskmaker. Raises ParseError if failed."""
  executables = [".exe", ".com", ".bat"]
  if command["--browserpath"]:
    if os.path.splitext(command["--browserpath"])[1].lower() not in executables:
      raise command_line.ParseError("Browser filename must be an executable")
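

# A rough sketch of how this command might be invoked. The driver script name
# and file paths here are illustrative assumptions, not part of this module:
#
#   site_compare.py maskmaker -b chrome -o masks -l urls.txt \
#       -s 0 -c 100 -w 30 -sc 5
#
# This would scrape the first 100 URLs of urls.txt every 30 seconds, declaring
# a mask complete once 5 consecutive scrapes produce no new differences.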


def ExecuteMaskmaker(command):
  """Performs automatic mask generation."""

  # Get the list of URLs to generate masks for
  class MaskmakerURL(object):
    """Helper class for holding information about a URL passed to maskmaker."""
    __slots__ = ['url', 'consecutive_successes', 'errors']

    def __init__(self, url):
      self.url = url
      self.consecutive_successes = 0
      self.errors = 0

  if command["--url"]:
    url_list = [MaskmakerURL(command["--url"])]
  else:
    startline = command["--startline"]
    if command["--count"]:
      endline = startline + command["--count"]
    else:
      endline = command["--endline"]
    url_list = [MaskmakerURL(url.strip()) for url in
                open(command["--list"], "r").readlines()[startline:endline]]

  complete_list = []
  error_list = []

  outdir = command["--outdir"]
  scrapes = command["--scrapes"]
  errors = command["--errors"]
  size = command["--size"]
  scrape_pass = 0

  scrapedir = command["--scrapedir"]
  if not scrapedir: scrapedir = tempfile.gettempdir()

  # Get the scraper
  scraper = scrapers.GetScraper((command["--browser"], command["--browserver"]))

  # Repeatedly iterate through the list of URLs until either every URL has
  # a successful mask or too many errors, or we've exceeded the giveup limit
  while url_list and scrape_pass < command["--giveup"]:
    # Scrape each URL
    for url in url_list:
      print "Processing %r..." % url.url
      mask_filename = drivers.windowing.URLtoFilename(url.url, outdir, ".bmp")

      # Load the existing mask. This is in a loop so we can try to recover
      # from error conditions
      while True:
        try:
          mask = Image.open(mask_filename)
          if mask.size != size:
            print "  %r already exists and is the wrong size! (%r vs %r)" % (
              mask_filename, mask.size, size)
            mask_filename = "%s_%r%s" % (
              mask_filename[:-4], size, mask_filename[-4:])
            print "  Trying again as %r..." % mask_filename
            continue
          break
        except IOError:
          print "  %r does not exist, creating" % mask_filename
          mask = Image.new("1", size, 1)
          mask.save(mask_filename)
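
      # Note on the mask format: the mask is a 1-bit image in which white
      # pixels (1) mark areas still being compared and black pixels (0) mark
      # areas known to change between scrapes. A fresh mask starts all white,
      # and the ImageChops.multiply() update below can only turn pixels black,
      # so a mask grows monotonically until it stabilizes.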

      # Find the stored scrape path
      mask_scrape_dir = os.path.join(
        scrapedir, os.path.splitext(os.path.basename(mask_filename))[0])
      drivers.windowing.PreparePath(mask_scrape_dir)

      # Find the baseline image
      mask_scrapes = os.listdir(mask_scrape_dir)
      mask_scrapes.sort()

      if not mask_scrapes:
        print "  No baseline image found, mask will not be updated"
        baseline = None
      else:
        baseline = Image.open(os.path.join(mask_scrape_dir, mask_scrapes[0]))

      mask_scrape_filename = os.path.join(mask_scrape_dir,
                                          time.strftime("%y%m%d-%H%M%S.bmp"))

      # Do the scrape
      result = scraper.Scrape(
        [url.url], mask_scrape_dir, size, (0, 0),
        command["--timeout"], path=command["--browserpath"],
        filename=mask_scrape_filename)

      if result:
        # Return value other than None means an error
        print "  Scrape failed with error '%r'" % result
        url.errors += 1
        if url.errors >= errors:
          print "  ** Exceeded maximum error count for this URL, giving up"
        continue

      # Load the new scrape
      scrape = Image.open(mask_scrape_filename)

      # Calculate the difference between the new scrape and the baseline,
      # subject to the current mask
      if baseline:
        diff = ImageChops.multiply(ImageChops.difference(scrape, baseline),
                                   mask.convert(scrape.mode))

        # If the difference is none, there's nothing to update
        if max(diff.getextrema()) == (0, 0):
          print "  Scrape identical to baseline, no change in mask"
          url.consecutive_successes += 1
          if url.consecutive_successes >= scrapes:
            print "  ** No change for %r scrapes, done!" % scrapes
        else:
          # Convert the difference to black and white, then change all black
          # pixels (where the scrape and the baseline were identical) to
          # white, and all others (where the scrape and the baseline
          # differed) to black.
          #
          # Since the command below is a little unclear, here's how it works:
          #   1. convert("L") converts the RGB image to grayscale.
          #   2. point() maps grayscale values (or the individual channels
          #      of an RGB image) to different ones. Because it operates on
          #      individual channels, the grayscale conversion from step 1
          #      is necessary.
          #   3. The "1" second parameter to point() outputs the result as
          #      a monochrome bitmap. If the original RGB image were
          #      converted directly to monochrome, PIL would dither it.
          diff = diff.convert("L").point([255]+[0]*255, "1")
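
          # Worked example of the lookup table above (illustrative only):
          # a pixel where scrape and baseline matched has grayscale value 0,
          # which point() maps to 255 (white, "keep comparing"); any nonzero
          # difference (1-255) maps to 0 (black, "mask out"). For instance:
          #
          #   Image.new("L", (1, 1), 0).point([255]+[0]*255, "1")   # white
          #   Image.new("L", (1, 1), 17).point([255]+[0]*255, "1")  # black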

          # Count the number of different pixels
          diff_pixels = diff.getcolors()[0][0]

          # Is this too much?
          diff_pixel_percent = diff_pixels * 100.0 / (mask.size[0]*mask.size[1])
          if diff_pixel_percent > command["--threshold"]:
            print ("  Scrape differed from baseline by %.2f percent, ignoring"
                   % diff_pixel_percent)
          else:
            print "  Scrape differed in %d pixels, updating mask" % diff_pixels
            mask = ImageChops.multiply(mask, diff)
            mask.save(mask_filename)

            # Reset the number of consecutive "good" scrapes
            url.consecutive_successes = 0

    # Remove URLs whose mask is deemed done
    complete_list.extend(
      [url for url in url_list if url.consecutive_successes >= scrapes])
    error_list.extend(
      [url for url in url_list if url.errors >= errors])
    url_list = [
      url for url in url_list if
      url.consecutive_successes < scrapes and
      url.errors < errors]

    scrape_pass += 1
    print "**Done with scrape pass %d\n" % scrape_pass

    if scrape_pass >= command["--giveup"]:
      print "**Exceeded giveup threshold. Giving up."
    else:
      print "Waiting %d seconds..." % command["--wait"]
      time.sleep(command["--wait"])

  print
  print "*** MASKMAKER COMPLETE ***"
  print "Summary report:"
  print "  %d masks successfully generated" % len(complete_list)
  for url in complete_list:
    print "    ", url.url
  print "  %d masks failed with too many errors" % len(error_list)
  for url in error_list:
    print "    ", url.url
  if scrape_pass >= command["--giveup"]:
    print ("  %d masks were not completed before "
           "reaching the giveup threshold" % len(url_list))
    for url in url_list:
      print "    ", url.url
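

# A minimal sketch of how a downstream comparison might consume a generated
# mask, mirroring the difference/multiply pattern used above. The file name
# and the scrape_a/scrape_b images are hypothetical:
#
#   mask = Image.open("www_example_com.bmp")
#   diff = ImageChops.multiply(ImageChops.difference(scrape_a, scrape_b),
#                              mask.convert(scrape_a.mode))
#   if max(diff.getextrema()) == (0, 0):
#     print "Pages match (ignoring masked areas)"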