Home | History | Annotate | Download | only in calmbench
      1 #!/usr/bin/python
      2 # encoding: utf-8
      3 
      4 # Copyright 2017 Google Inc.
      5 #
      6 # Use of this source code is governed by a BSD-style license that can be found
      7 # in the LICENSE file.
      8 #
      9 # This is an A/B test utility script used by calmbench.py
     10 #
     11 # For each bench, we get a distribution of min_ms measurements from nanobench.
     12 # From that, we try to recover the 1/3 and 2/3 quantiles of the distribution.
     13 # If range (1/3 quantile, 2/3 quantile) is completely disjoint between A and B,
     14 # we report that as a regression.
     15 #
     16 # The more measurements we have for a bench, the more accurate our quantiles
     17 # are. However, taking more measurements is time consuming. Hence we'll prune
     18 # out benches and only take more measurements for benches whose current quantile
     19 # ranges are disjoint.
     20 #
     21 # P.S. The current script is brute forcely translated from a ruby script. So it
     22 # may be ugly...
     23 
     24 import re
     25 import os
     26 import sys
     27 import time
     28 import json
     29 import subprocess
     30 import shlex
     31 import multiprocessing
     32 import traceback
     33 from argparse import ArgumentParser
     34 from multiprocessing import Process
     35 from threading import Thread
     36 from threading import Lock
     37 from pdb import set_trace
     38 
     39 
     40 HELP = """
     41 \033[31mPlease call calmbench.py to drive this script if you're not doing so.
     42 This script is not supposed to be used by itself. (At least, it's not easy to
     43 use by itself. The calmbench bots may use this script directly.)
     44 \033[0m
     45 """
     46 
     47 FACTOR  = 3     # lower/upper quantile factor
     48 DIFF_T  = 0.99  # different enough threshold
     49 TERM    = 10    # terminate after this no. of iterations without suspect changes
     50 MAXTRY  = 30    # max number of nanobench tries to narrow down suspects
     51 
     52 UNITS   = "ns s ms s".split()
     53 
     54 
     55 timesLock = Lock()
     56 timesA  = {}
     57 timesB  = {}
     58 
     59 
     60 def parse_args():
     61   parser = ArgumentParser(description=HELP)
     62 
     63   parser.add_argument('outdir', type=str, help="output directory")
     64   parser.add_argument('a', type=str, help="name of A")
     65   parser.add_argument('b', type=str, help="name of B")
     66   parser.add_argument('nano_a', type=str, help="path to A's nanobench binary")
     67   parser.add_argument('nano_b', type=str, help="path to B's nanobench binary")
     68   parser.add_argument('arg_a', type=str, help="args for A's nanobench run")
     69   parser.add_argument('arg_b', type=str, help="args for B's nanobench run")
     70   parser.add_argument('repeat', type=int, help="number of initial runs")
     71   parser.add_argument('skip_b', type=str, help=("whether to skip running B"
     72                                                 " ('true' or 'false')"))
     73   parser.add_argument('config', type=str, help="nanobenh config")
     74   parser.add_argument('threads', type=int, help="number of threads to run")
     75   parser.add_argument('noinit', type=str, help=("whether to skip running B"
     76                                                 " ('true' or 'false')"))
     77 
     78   parser.add_argument('--concise', dest='concise', action="store_true",
     79       help="If set, no verbose thread info will be printed.")
     80   parser.set_defaults(concise=False)
     81 
     82   # Additional args for bots
     83   BHELP = "bot specific options"
     84   parser.add_argument('--githash', type=str, default="", help=BHELP)
     85   parser.add_argument('--keys', type=str, default=[], nargs='+', help=BHELP)
     86 
     87   args = parser.parse_args()
     88   args.skip_b = args.skip_b == "true"
     89   args.noinit = args.noinit == "true"
     90 
     91   if args.threads == -1:
     92     args.threads = 1
     93     if args.config in ["8888", "565"]: # multi-thread for CPU only
     94         args.threads = max(1, multiprocessing.cpu_count() / 2)
     95 
     96   return args
     97 
     98 def append_dict_sorted_array(dict_array, key, value):
     99   if key not in dict_array:
    100     dict_array[key] = []
    101   dict_array[key].append(value)
    102   dict_array[key].sort()
    103 
    104 
    105 def add_time(args, name, bench, t, unit):
    106   normalized_t = t * 1000 ** UNITS.index(unit);
    107   if name.startswith(args.a):
    108     append_dict_sorted_array(timesA, bench, normalized_t)
    109   else:
    110     append_dict_sorted_array(timesB, bench, normalized_t)
    111 
    112 
    113 def append_times_from_file(args, name, filename):
    114   with open(filename) as f:
    115     lines = f.readlines()
    116   for line in lines:
    117     items = line.split()
    118     if len(items) > 10:
    119       bench = items[10]
    120       matches = re.search("([+-]?\d*.?\d+)(s|ms|s|ns)", items[3])
    121       if (not matches or items[9] != args.config):
    122         continue
    123       time_num = matches.group(1)
    124       time_unit = matches.group(2)
    125       add_time(args, name, bench, float(time_num), time_unit)
    126 
    127 
    128 class ThreadWithException(Thread):
    129   def __init__(self, target):
    130     super(ThreadWithException, self).__init__(target = target)
    131     self.exception = None
    132 
    133   def run(self):
    134     try:
    135       self._Thread__target(*self._Thread__args, **self._Thread__kwargs)
    136     except BaseException as e:
    137       self.exception = e
    138 
    139   def join(self, timeout=None):
    140     super(ThreadWithException, self).join(timeout)
    141 
    142 
    143 class ThreadRunner:
    144   """Simplest and stupidiest threaded executer."""
    145   def __init__(self, args):
    146     self.concise = args.concise
    147     self.threads = []
    148 
    149   def add(self, args, fn):
    150     if len(self.threads) >= args.threads:
    151       self.wait()
    152     t = ThreadWithException(target = fn)
    153     t.daemon = True
    154     self.threads.append(t)
    155     t.start()
    156 
    157   def wait(self):
    158     def spin():
    159       i = 0
    160       spinners = [".  ", ".. ", "..."]
    161       while len(self.threads) > 0:
    162         timesLock.acquire()
    163         sys.stderr.write(
    164             "\r" + spinners[i % len(spinners)] +
    165             " (%d threads running)" % len(self.threads) +
    166             "           \r" # spaces for erasing characters
    167         )
    168         timesLock.release()
    169         time.sleep(0.5)
    170         i += 1
    171 
    172     if not self.concise:
    173       ts = Thread(target = spin);
    174       ts.start()
    175 
    176     for t in self.threads:
    177       t.join()
    178 
    179     exceptions = []
    180     for t in self.threads:
    181       if t.exception:
    182         exceptions.append(t.exception)
    183 
    184     self.threads = []
    185 
    186     if not self.concise:
    187       ts.join()
    188 
    189     if len(exceptions):
    190       for exc in exceptions:
    191         print exc
    192       raise exceptions[0]
    193 
    194 
    195 def split_arg(arg):
    196   raw = shlex.split(arg)
    197   result = []
    198   for r in raw:
    199     if '~' in r:
    200       result.append(os.path.expanduser(r))
    201     else:
    202       result.append(r)
    203   return result
    204 
    205 
    206 def run(args, threadRunner, name, nano, arg, i):
    207   def task():
    208     file_i = "%s/%s.out%d" % (args.outdir, name, i)
    209 
    210     should_run = not args.noinit and not (name == args.b and args.skip_b)
    211     if i <= 0:
    212       should_run = True # always run for suspects
    213 
    214     if should_run:
    215       if i > 0:
    216         timesLock.acquire()
    217         print "Init run %d for %s..." % (i, name)
    218         timesLock.release()
    219       subprocess.check_call(["touch", file_i])
    220       with open(file_i, 'w') as f:
    221         subprocess.check_call([nano] + split_arg(arg) +
    222                               ["--config", args.config], stderr=f, stdout=f)
    223 
    224     timesLock.acquire()
    225     append_times_from_file(args, name, file_i)
    226     timesLock.release()
    227 
    228   threadRunner.add(args, task)
    229 
    230 
    231 def init_run(args):
    232   threadRunner = ThreadRunner(args)
    233   for i in range(1, max(args.repeat, args.threads / 2) + 1):
    234     run(args, threadRunner, args.a, args.nano_a, args.arg_a, i)
    235     run(args, threadRunner, args.b, args.nano_b, args.arg_b, i)
    236   threadRunner.wait()
    237 
    238 
    239 def get_lower_upper(values):
    240   i = max(0, (len(values) - 1) / FACTOR)
    241   return values[i], values[-i - 1]
    242 
    243 
    244 def different_enough(lower1, upper2):
    245   return upper2 < DIFF_T * lower1
    246 
    247 
    248 # TODO(liyuqian): we used this hacky criteria mainly because that I didn't have
    249 # time to study more rigorous statistical tests. We should adopt a more rigorous
    250 # test in the future.
    251 def get_suspects():
    252   suspects = []
    253   for bench in timesA.keys():
    254     if bench not in timesB:
    255       continue
    256     lowerA, upperA = get_lower_upper(timesA[bench])
    257     lowerB, upperB = get_lower_upper(timesB[bench])
    258     if different_enough(lowerA, upperB) or different_enough(lowerB, upperA):
    259       suspects.append(bench)
    260   return suspects
    261 
    262 
    263 def process_bench_pattern(s):
    264   if ".skp" in s: # skp bench won't match their exact names...
    265     return "^\"" + s[0:(s.index(".skp") + 3)] + "\""
    266   else:
    267     return "^\"" + s + "\"$"
    268 
    269 
    270 def suspects_arg(suspects):
    271   patterns = map(process_bench_pattern, suspects)
    272   return " --match " + (" ".join(patterns))
    273 
    274 
    275 def median(array):
    276   return array[len(array) / 2]
    277 
    278 
    279 def regression(bench):
    280   a = median(timesA[bench])
    281   b = median(timesB[bench])
    282   if (a == 0): # bad bench, just return no regression
    283     return 1
    284   return b / a
    285 
    286 
    287 def percentage(x):
    288   return (x - 1) * 100
    289 
    290 
    291 def format_r(r):
    292   return ('%6.2f' % percentage(r)) + "%"
    293 
    294 
    295 def normalize_r(r):
    296   if r > 1.0:
    297     return r - 1.0
    298   else:
    299     return 1.0 - 1/r
    300 
    301 
    302 def test():
    303   args = parse_args()
    304 
    305   init_run(args)
    306   last_unchanged_iter = 0
    307   last_suspect_number = -1
    308   tryCnt = 0
    309   it = 0
    310   while tryCnt < MAXTRY:
    311     it += 1
    312     suspects = get_suspects()
    313     if len(suspects) != last_suspect_number:
    314       last_suspect_number = len(suspects)
    315       last_unchanged_iter = it
    316     if (len(suspects) == 0 or it - last_unchanged_iter >= TERM):
    317       break
    318 
    319     print "Number of suspects at iteration %d: %d" % (it, len(suspects))
    320     threadRunner = ThreadRunner(args)
    321     for j in range(1, max(1, args.threads / 2) + 1):
    322       run(args, threadRunner, args.a, args.nano_a,
    323           args.arg_a + suspects_arg(suspects), -j)
    324       run(args, threadRunner, args.b, args.nano_b,
    325           args.arg_b + suspects_arg(suspects), -j)
    326       tryCnt += 1
    327     threadRunner.wait()
    328 
    329   suspects = get_suspects()
    330   if len(suspects) == 0:
    331     print ("%s and %s does not seem to have significant " + \
    332            "performance differences.") % (args.a, args.b)
    333   else:
    334     suspects.sort(key = regression)
    335     print "%s (compared to %s) is likely" % (args.a, args.b)
    336     for suspect in suspects:
    337       r = regression(suspect)
    338       if r < 1:
    339         print "\033[31m  %s slower in %s\033[0m" % \
    340                 (format_r(1/r), suspect)
    341       else:
    342         print "\033[32m  %s faster in %s\033[0m" % \
    343                 (format_r(r), suspect)
    344 
    345   with open("%s/bench_%s_%s.json" % (args.outdir, args.a, args.b), 'w') as f:
    346     results = {}
    347     for bench in timesA:
    348       r = regression(bench) if bench in suspects else 1.0
    349       results[bench] = {
    350         args.config: {
    351           "signed_regression": normalize_r(r),
    352           "lower_quantile_ms": get_lower_upper(timesA[bench])[0] * 1e-6,
    353           "upper_quantile_ms": get_lower_upper(timesA[bench])[1] * 1e-6,
    354           "options": {
    355             # TODO(liyuqian): let ab.py call nanobench with --outResultsFile so
    356             # nanobench could generate the json for us that's exactly the same
    357             # as that being used by perf bots. Currently, we cannot guarantee
    358             # that bench is the name (e.g., bench may have additional resolution
    359             # information appended after name).
    360             "name": bench
    361           }
    362         }
    363       }
    364 
    365     output = {"results": results}
    366     if args.githash:
    367       output["gitHash"] = args.githash
    368     if args.keys:
    369       keys = {}
    370       for i in range(len(args.keys) / 2):
    371         keys[args.keys[i * 2]] = args.keys[i * 2 + 1]
    372       output["key"] = keys
    373     f.write(json.dumps(output, indent=4))
    374     print ("\033[36mJSON results available in %s\033[0m" % f.name)
    375 
    376   with open("%s/bench_%s_%s.csv" % (args.outdir, args.a, args.b), 'w') as out:
    377     out.write(("bench, significant?, raw regresion, " +
    378                    "%(A)s quantile (ns), %(B)s quantile (ns), " +
    379                    "%(A)s (ns), %(B)s (ns)\n") % {'A': args.a, 'B': args.b})
    380     for bench in suspects + timesA.keys():
    381       if (bench not in timesA or bench not in timesB):
    382         continue
    383       ta = timesA[bench]
    384       tb = timesB[bench]
    385       out.write(
    386           "%s, %s, %f, " % (bench, bench in suspects, regression(bench)) +
    387           ' '.join(map(str, get_lower_upper(ta))) + ", " +
    388           ' '.join(map(str, get_lower_upper(tb))) + ", " +
    389           ("%s, %s\n" % (' '.join(map(str, ta)), ' '.join(map(str, tb))))
    390       )
    391     print (("\033[36m" +
    392            "Compared %d benches. " +
    393            "%d of them seem to be significantly differrent." +
    394            "\033[0m") %
    395            (len([x for x in timesA if x in timesB]), len(suspects)))
    396     print ("\033[36mPlease see detailed bench results in %s\033[0m" %
    397             out.name)
    398 
    399 
    400 if __name__ == "__main__":
    401   try:
    402     test()
    403   except Exception as e:
    404     print e
    405     print HELP
    406     traceback.print_exc()
    407     raise e
    408