Home | History | Annotate | Download | only in real_world_impact
      1 #!/usr/bin/env python
      2 # Copyright 2014 The Chromium Authors. All rights reserved.
      3 # Use of this source code is governed by a BSD-style license that can be
      4 # found in the LICENSE file.
      5 
      6 # Tool for seeing the real world impact of a patch.
      7 #
      8 # Layout Tests can tell you whether something has changed, but this can help
      9 # you determine whether a subtle/controversial change is beneficial or not.
     10 #
     11 # It dumps the rendering of a large number of sites, both with and without a
     12 # patch being evaluated, then sorts them by greatest difference in rendering,
     13 # such that a human reviewer can quickly review the most impacted sites,
     14 # rather than having to manually try sites to see if anything changes.
     15 #
     16 # In future it might be possible to extend this to other kinds of differences,
     17 # e.g. page load times.
     18 
     19 import argparse
     20 from argparse import RawTextHelpFormatter
     21 from contextlib import closing
     22 import datetime
     23 import errno
     24 from distutils.spawn import find_executable
     25 from operator import itemgetter
     26 import multiprocessing
     27 import os
     28 import re
     29 from cStringIO import StringIO
     30 import subprocess
     31 import sys
     32 import textwrap
     33 import time
     34 from urllib2 import urlopen
     35 from urlparse import urlparse
     36 import webbrowser
     37 from zipfile import ZipFile
     38 
     39 from nsfw_urls import nsfw_urls
     40 
# Module-level configuration/state, populated by main() and SetupPathsAndOut()
# before any work is done.
action = None  # One of "download", "before", "after" or "compare".
allow_js = False  # If True, pages are rendered with Javascript left enabled.
additional_content_shell_flags = ""  # Extra flags passed to content_shell.
chromium_src_root = ""  # Absolute path of the Chromium src/ directory.
chromium_out_dir = ""  # src/out (or src/out_linux for users of cr).
image_diff = ""  # Path to the Release image_diff binary.
content_shell = ""  # Path to the Release content_shell binary.
output_dir = ""  # Root directory for downloads, screenshots and reports.
num_sites = 100  # Sample size; may be overridden on the command line.
urls = []  # The sampled urls currently being processed.
# Serializes progress printing from the download worker processes.
print_lock = multiprocessing.Lock()
     52 
     53 
     54 def MakeDirsIfNotExist(dir):
     55   try:
     56     os.makedirs(dir)
     57   except OSError as e:
     58     if e.errno != errno.EEXIST:
     59       raise
     60 
     61 
     62 def SetupPathsAndOut():
     63   global chromium_src_root, chromium_out_dir, output_dir
     64   global image_diff, content_shell
     65   chromium_src_root = os.path.abspath(os.path.join(os.path.dirname(__file__),
     66                                                    os.pardir,
     67                                                    os.pardir))
     68   # Find out directory (might be out_linux for users of cr).
     69   for out_suffix in ["_linux", ""]:
     70     out_dir = os.path.join(chromium_src_root, "out" + out_suffix)
     71     if os.path.exists(out_dir):
     72       chromium_out_dir = out_dir
     73       break
     74   if not chromium_out_dir:
     75     return False
     76 
     77   this_script_name = "real_world_impact"
     78   output_dir = os.path.join(chromium_out_dir,
     79                             "Release",
     80                             this_script_name)
     81   MakeDirsIfNotExist(output_dir)
     82 
     83   image_diff = os.path.join(chromium_out_dir, "Release", "image_diff")
     84 
     85   if sys.platform == 'darwin':
     86     content_shell = os.path.join(chromium_out_dir, "Release",
     87                     "Content Shell.app/Contents/MacOS/Content Shell")
     88   elif sys.platform.startswith('linux'):
     89     content_shell = os.path.join(chromium_out_dir, "Release",
     90                     "content_shell")
     91   elif sys.platform.startswith('win'):
     92     content_shell = os.path.join(chromium_out_dir, "Release",
     93                     "content_shell.exe")
     94   return True
     95 
     96 
     97 def CheckPrerequisites():
     98   if not find_executable("wget"):
     99     print "wget not found! Install wget and re-run this."
    100     return False
    101   if not os.path.exists(image_diff):
    102     print "image_diff not found (%s)!" % image_diff
    103     print "Build the image_diff target and re-run this."
    104     return False
    105   if not os.path.exists(content_shell):
    106     print "Content shell not found (%s)!" % content_shell
    107     print "Build Release/content_shell and re-run this."
    108     return False
    109   return True
    110 
    111 
def PickSampleUrls():
  """Chooses the sample of sites to test, storing it in the `urls` global.

  Downloads the Alexa top-1,000,000 CSV on first use, filters out urls
  previously recorded in bad_urls.txt, and reuses any existing sample file
  for this num_sites so that 'before N', 'after N' and 'compare N' all
  operate on the same sample.

  Returns:
    False if action is 'compare' but no sample file exists yet (meaning
    'before'/'after' were never run for this N); True otherwise.
  """
  global urls
  data_dir = os.path.join(output_dir, "data")
  MakeDirsIfNotExist(data_dir)

  # Download Alexa top 1,000,000 sites
  # TODO(johnme): Should probably update this when it gets too stale...
  csv_path = os.path.join(data_dir, "top-1m.csv")
  if not os.path.exists(csv_path):
    print "Downloading list of top 1,000,000 sites from Alexa..."
    csv_url = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip"
    with closing(urlopen(csv_url)) as stream:
      # The zip is extracted fully in memory before the CSV is written out.
      ZipFile(StringIO(stream.read())).extract("top-1m.csv", data_dir)

  # Urls that failed to download on previous runs; skipped when sampling.
  bad_urls_path = os.path.join(data_dir, "bad_urls.txt")
  if os.path.exists(bad_urls_path):
    with open(bad_urls_path) as f:
      bad_urls = set(f.read().splitlines())
  else:
    bad_urls = set()

  # See if we've already selected a sample of size num_sites (this way, if you
  # call this script with arguments "before N" then "after N", where N is the
  # same number, we'll use the same sample, as expected!).
  urls_path = os.path.join(data_dir, "%06d_urls.txt" % num_sites)
  if not os.path.exists(urls_path):
    if action == 'compare':
      print ("Error: you must run 'before %d' and 'after %d' before "
             "running 'compare %d'") % (num_sites, num_sites, num_sites)
      return False
    print "Picking %d sample urls..." % num_sites

    # TODO(johnme): For now this just gets the top num_sites entries. In future
    # this should pick a weighted random sample. For example, it could fit a
    # power-law distribution, which is a good model of website popularity
    # (http://www.useit.com/alertbox/9704b.html).
    urls = []
    remaining_num_sites = num_sites
    with open(csv_path) as f:
      # Each CSV row is "rank,hostname".
      for entry in f:
        if remaining_num_sites <= 0:
          break
        remaining_num_sites -= 1
        hostname = entry.strip().split(',')[1]
        if not '/' in hostname:  # Skip Alexa 1,000,000 entries that have paths.
          url = "http://%s/" % hostname
          if not url in bad_urls:
            urls.append(url)
    # Don't write these to disk yet; we'll do that in SaveWorkingUrls below
    # once we have tried to download them and seen which ones fail.
  else:
    with open(urls_path) as f:
      urls = [u for u in f.read().splitlines() if not u in bad_urls]
  return True
    166 
    167 
    168 def SaveWorkingUrls():
    169   # TODO(johnme): Update the list if a url that used to work goes offline.
    170   urls_path = os.path.join(output_dir, "data", "%06d_urls.txt" % num_sites)
    171   if not os.path.exists(urls_path):
    172     with open(urls_path, 'w') as f:
    173       f.writelines(u + '\n' for u in urls)
    174 
    175 
    176 def PrintElapsedTime(elapsed, detail=""):
    177   elapsed = round(elapsed * 10) / 10.0
    178   m = elapsed / 60
    179   s = elapsed % 60
    180   print "Took %dm%.1fs" % (m, s), detail
    181 
    182 
def DownloadStaticCopyTask(url):
  """Downloads a static snapshot of `url` and its page requisites via wget.

  Runs as a multiprocessing pool worker; progress messages are printed under
  print_lock so output from concurrent workers doesn't interleave.

  Returns:
    True if the site's index.html ended up on disk, False otherwise (the
    failure is also reported on stdout).
  """
  url_parts = urlparse(url)
  host_dir = os.path.join(output_dir, "data", url_parts.hostname)
  # Use wget for now, as does a reasonable job of spidering page dependencies
  # (e.g. CSS, JS, images).
  success = True
  try:
    subprocess.check_call(["wget",
                           "--execute", "robots=off",
                           # Claim to be desktop Chrome so sites serve their
                           # normal desktop content.
                           ("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS "
                            "X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) C"
                            "hrome/32.0.1700.14 Safari/537.36"),
                           "--page-requisites",
                           "--span-hosts",
                           "--adjust-extension",
                           "--convert-links",
                           "--directory-prefix=" + host_dir,
                           "--force-directories",
                           "--default-page=index.html",
                           "--no-check-certificate",
                           "--timeout=5", # 5s timeout
                           "--tries=2",
                           "--quiet",
                           url])
  except KeyboardInterrupt:
    success = False
  except subprocess.CalledProcessError:
    # Ignoring these for now, as some sites have issues with their subresources
    # yet still produce a renderable index.html
    pass #success = False
  if success:
    # --force-directories puts the page at <host_dir>/<hostname>/index.html;
    # its absence means the download effectively failed.
    download_path = os.path.join(host_dir, url_parts.hostname, "index.html")
    if not os.path.exists(download_path):
      success = False
    else:
      with print_lock:
        print "Downloaded:", url
  if not success:
    with print_lock:
      print "Failed to download:", url
    return False
  return True
    225 
    226 
    227 def DownloadStaticCopies():
    228   global urls
    229   new_urls = []
    230   for url in urls:
    231     url_parts = urlparse(url)
    232     host_dir = os.path.join(output_dir, "data", url_parts.hostname)
    233     download_path = os.path.join(host_dir, url_parts.hostname, "index.html")
    234     if not os.path.exists(download_path):
    235       new_urls.append(url)
    236 
    237   if new_urls:
    238     print "Downloading static copies of %d sites..." % len(new_urls)
    239     start_time = time.time()
    240 
    241     results = multiprocessing.Pool(20).map(DownloadStaticCopyTask, new_urls)
    242     failed_urls = [new_urls[i] for i,ret in enumerate(results) if not ret]
    243     if failed_urls:
    244       bad_urls_path = os.path.join(output_dir, "data", "bad_urls.txt")
    245       with open(bad_urls_path, 'a') as f:
    246         f.writelines(u + '\n' for u in failed_urls)
    247       failed_urls_set = set(failed_urls)
    248       urls = [u for u in urls if u not in failed_urls_set]
    249 
    250     PrintElapsedTime(time.time() - start_time)
    251 
    252   SaveWorkingUrls()
    253 
    254 
def RunDrtTask(url):
  """Renders one downloaded site in content_shell and saves a PNG screenshot.

  Unless allow_js is set, a cached "index-nojs.html" variant is rendered
  instead, with script-ish tags and onload/onerror attributes crudely
  stripped out via regex.

  Returns:
    (elapsed_seconds, url) on success, or False if the saved page was empty
    or no PNG could be found in content_shell's output.
  """
  url_parts = urlparse(url)
  host_dir = os.path.join(output_dir, "data", url_parts.hostname)
  html_path = os.path.join(host_dir, url_parts.hostname, "index.html")

  if not allow_js:
    nojs_path = os.path.join(host_dir, url_parts.hostname, "index-nojs.html")
    if not os.path.exists(nojs_path):
      with open(html_path) as f:
        html = f.read()
      if not html:
        return False
      # These aren't intended to be XSS safe :)
      block_tags = (r'<\s*(script|object|video|audio|iframe|frameset|frame)'
                    r'\b.*?<\s*\/\s*\1\s*>')
      block_attrs = r'\s(onload|onerror)\s*=\s*(\'[^\']*\'|"[^"]*|\S*)'
      html = re.sub(block_tags, '', html, flags=re.I|re.S)
      html = re.sub(block_attrs, '', html, flags=re.I)
      with open(nojs_path, 'w') as f:
        f.write(html)
    html_path = nojs_path

  start_time = time.time()

  with open(os.devnull, "w") as fnull:
    p = subprocess.Popen([content_shell,
                          "--dump-render-tree",
                          additional_content_shell_flags,
                          # The single quote is not a typo, it's a separator!
                          html_path + "'--pixel-test"
                         ],
                         shell=False,
                         stdout=subprocess.PIPE,
                         stderr=fnull)
  result = p.stdout.read()
  # PNG file signature and IEND trailer bytes: the screenshot is embedded in
  # the mixed text/binary dump content_shell writes to stdout, so carve it
  # out by locating these markers.
  PNG_START = b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A"
  PNG_END = b"\x49\x45\x4E\x44\xAE\x42\x60\x82"
  try:
    start = result.index(PNG_START)
    end = result.rindex(PNG_END) + 8  # +8 keeps the complete IEND chunk.
  except ValueError:
    return False

  # Screenshots land in <output_dir>/<before|after>/<hostname>.png.
  png_path = os.path.join(output_dir, action, url_parts.hostname + ".png")
  MakeDirsIfNotExist(os.path.dirname(png_path))
  with open(png_path, 'wb') as f:
    f.write(result[start:end])
  elapsed_time = (time.time() - start_time, url)
  return elapsed_time
    304 
    305 
    306 def RunDrt():
    307   print "Taking screenshots of %d pages..." % len(urls)
    308   start_time = time.time()
    309 
    310   results = multiprocessing.Pool().map(RunDrtTask, urls, 1)
    311 
    312   max_time, url = max(t for t in results if t)
    313   elapsed_detail = "(slowest: %.2fs on %s)" % (max_time, url)
    314   PrintElapsedTime(time.time() - start_time, elapsed_detail)
    315 
    316 
def CompareResultsTask(url):
  """Diffs the before/after screenshots of one url using image_diff.

  Returns:
    (score, url, image_path): score is 0 if the screenshots are identical,
    a combined histogram/exact percentage if they differ, 200 if only one of
    the two screenshots exists, and -100 if neither exists. image_path is
    the image the HTML report should display for this row.
  """
  url_parts = urlparse(url)
  before_path = os.path.join(output_dir, "before", url_parts.hostname + ".png")
  after_path = os.path.join(output_dir, "after", url_parts.hostname + ".png")
  diff_path = os.path.join(output_dir, "diff", url_parts.hostname + ".png")
  MakeDirsIfNotExist(os.path.join(output_dir, "diff"))

  # TODO(johnme): Don't hardcode "real_world_impact".
  # Tiny inline GIF data URI, shown in the report when screenshots are missing.
  red_path = ("data:image/gif;base64,R0lGODlhAQABAPAAAP8AAP///yH5BAAAAAAALAAAAA"
              "ABAAEAAAICRAEAOw==")

  before_exists = os.path.exists(before_path)
  after_exists = os.path.exists(after_path)
  if not before_exists and not after_exists:
    # TODO(johnme): Make this more informative.
    return (-100, url, red_path)
  if before_exists != after_exists:
    # TODO(johnme): Make this more informative.
    return (200, url, red_path)

  # Get percentage difference.
  p = subprocess.Popen([image_diff, "--histogram",
                        before_path, after_path],
                        shell=False,
                        stdout=subprocess.PIPE)
  output,_ = p.communicate()
  if p.returncode == 0:
    # image_diff exited 0: the screenshots are treated as identical.
    return (0, url, before_path)
  diff_match = re.match(r'histogram diff: (\d+\.\d{2})% (?:passed|failed)\n'
                         'exact diff: (\d+\.\d{2})% (?:passed|failed)', output)
  if not diff_match:
    raise Exception("image_diff output format changed")
  histogram_diff = float(diff_match.group(1))
  exact_diff = float(diff_match.group(2))
  # Weight the exact (per-pixel) diff lightly, and floor the score at 0.001
  # so pages that differ never tie with pages that are identical.
  combined_diff = max(histogram_diff + exact_diff / 8, 0.001)

  # Produce diff PNG.
  subprocess.call([image_diff, "--diff", before_path, after_path, diff_path])
  return (combined_diff, url, diff_path)
    356 
    357 
def CompareResults():
  """Diffs all before/after screenshots and writes an interactive report.

  Rows are sorted by decreasing difference score, written to diff.html in
  output_dir, and the report is opened in the default web browser. Sites in
  nsfw_urls are collapsed behind a <details> element.
  """
  print "Running image_diff on %d pages..." % len(urls)
  start_time = time.time()

  results = multiprocessing.Pool().map(CompareResultsTask, urls)
  # Sort by score so the most-impacted sites come first in the report.
  results.sort(key=itemgetter(0), reverse=True)

  PrintElapsedTime(time.time() - start_time)

  now = datetime.datetime.today().strftime("%a %Y-%m-%d %H:%M")
  # The inline script lets a reviewer flip all images between before/after/
  # diff with the 1/2/3 keys, or rapidly alternate one image by hovering it.
  html_start = textwrap.dedent("""\
  <!DOCTYPE html>
  <html>
  <head>
  <title>Real World Impact report %s</title>
  <script>
    var togglingImg = null;
    var toggleTimer = null;

    var before = true;
    function toggle() {
      var newFolder = before ? "before" : "after";
      togglingImg.src = togglingImg.src.replace(/before|after|diff/, newFolder);
      before = !before;
      toggleTimer = setTimeout(toggle, 300);
    }

    function startToggle(img) {
      before = true;
      togglingImg = img;
      if (!img.origSrc)
        img.origSrc = img.src;
      toggle();
    }
    function stopToggle(img) {
      clearTimeout(toggleTimer);
      img.src = img.origSrc;
    }

    document.onkeydown = function(e) {
      e = e || window.event;
      var keyCode = e.keyCode || e.which;
      var newFolder;
      switch (keyCode) {
        case 49: //'1'
          newFolder = "before"; break;
        case 50: //'2'
          newFolder = "after"; break;
        case 51: //'3'
          newFolder = "diff"; break;
        default:
          return;
      }
      var imgs = document.getElementsByTagName("img");
      for (var i = 0; i < imgs.length; i++) {
        imgs[i].src = imgs[i].src.replace(/before|after|diff/, newFolder);
      }
    };
  </script>
  <style>
    h1 {
      font-family: sans;
    }
    h2 {
      font-family: monospace;
      white-space: pre;
    }
    .nsfw-spacer {
      height: 50vh;
    }
    .nsfw-warning {
      background: yellow;
      border: 10px solid red;
    }
    .info {
      font-size: 1.2em;
      font-style: italic;
    }
    body:not(.details-supported) details {
      display: none;
    }
  </style>
  </head>
  <body>
    <script>
    if ('open' in document.createElement('details'))
      document.body.className = "details-supported";
    </script>
    <!--<div class="nsfw-spacer"></div>-->
    <p class="nsfw-warning">Warning: sites below are taken from the Alexa top %d
    and may be NSFW.</p>
    <!--<div class="nsfw-spacer"></div>-->
    <h1>Real World Impact report %s</h1>
    <p class="info">Press 1, 2 and 3 to switch between before, after and diff
    screenshots respectively; or hover over the images to rapidly alternate
    between before and after.</p>
  """ % (now, num_sites, now))

  html_same_row = """\
  <h2>No difference on <a href="%s">%s</a>.</h2>
  """

  html_diff_row = """\
  <h2>%7.3f%% difference on <a href="%s">%s</a>:</h2>
  <img src="%s" width="800" height="600"
       onmouseover="startToggle(this)" onmouseout="stopToggle(this)">
  """

  html_nsfw_diff_row = """\
  <h2>%7.3f%% difference on <a href="%s">%s</a>:</h2>
  <details>
    <summary>This site may be NSFW. Click to expand/collapse.</summary>
    <img src="%s" width="800" height="600"
         onmouseover="startToggle(this)" onmouseout="stopToggle(this)">
  </details>
  """

  html_end = textwrap.dedent("""\
  </body>
  </html>""")

  html_path = os.path.join(output_dir, "diff.html")
  with open(html_path, 'w') as f:
    f.write(html_start)
    # Each result row is (score, url, image_path) from CompareResultsTask.
    for (diff_float, url, diff_path) in results:
      diff_path = os.path.relpath(diff_path, output_dir)
      if diff_float == 0:
        f.write(html_same_row % (url, url))
      elif url in nsfw_urls:
        f.write(html_nsfw_diff_row % (diff_float, url, url, diff_path))
      else:
        f.write(html_diff_row % (diff_float, url, url, diff_path))
    f.write(html_end)

  webbrowser.open_new_tab("file://" + html_path)
    493 
    494 
    495 def main(argv):
    496   global num_sites, action, allow_js, additional_content_shell_flags
    497 
    498   parser = argparse.ArgumentParser(
    499       formatter_class=RawTextHelpFormatter,
    500       description="Compare the real world impact of a content shell change.",
    501       epilog=textwrap.dedent("""\
    502           Example usage:
    503             1. Build content_shell in out/Release without any changes.
    504             2. Run: %s before [num sites to test (default %d)].
    505             3. Either:
    506                  a. Apply your controversial patch and rebuild content_shell.
    507                  b. Pass --additional_flags="--enable_your_flag" in step 4.
    508             4. Run: %s after [num sites to test (default %d)].
    509             5. Run: %s compare [num sites to test (default %d)].
    510                This will open the results in your web browser.
    511           """ % (argv[0], num_sites, argv[0], num_sites, argv[0], num_sites)))
    512   parser.add_argument("--allow_js", help="Don't disable Javascript",
    513                       action="store_true")
    514   parser.add_argument("--additional_flags",
    515                       help="Additional flags to pass to content shell")
    516   parser.add_argument("action",
    517                       help=textwrap.dedent("""\
    518                         Action to perform.
    519                           download - Just download the sites.
    520                           before - Run content shell and record 'before' result.
    521                           after - Run content shell and record 'after' result.
    522                           compare - Compare before and after results.
    523                       """),
    524                       choices=["download", "before", "after", "compare"])
    525   parser.add_argument("num_sites",
    526                       help="Number of sites (default %s)" % num_sites,
    527                       type=int, default=num_sites, nargs='?')
    528   args = parser.parse_args()
    529 
    530   action = args.action
    531 
    532   if (args.num_sites):
    533     num_sites = args.num_sites
    534 
    535   if (args.allow_js):
    536     allow_js = args.allow_js
    537 
    538   if (args.additional_flags):
    539     additional_content_shell_flags = args.additional_flags
    540 
    541   if not SetupPathsAndOut() or not CheckPrerequisites() or not PickSampleUrls():
    542     return 1
    543 
    544   if action == 'compare':
    545     CompareResults()
    546   else:
    547     DownloadStaticCopies()
    548     if action != 'download':
    549       RunDrt()
    550   return 0
    551 
    552 
# Script entry point: exit status is main()'s return value.
if __name__ == '__main__':
  sys.exit(main(sys.argv))