#!/usr/bin/env python
# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Runs a statistical hypothesis test on two given benchmark results.

Evaluates two benchmark results, given as Chart JSON files, to determine
whether they differ in a statistically significant way. The Chart JSON files
should be created by one of the available benchmarks in
tools/perf/run_benchmark.

A "benchmark" (e.g. startup.cold.blank_page) includes several "metrics" (e.g.
first_main_frame_load_time).
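
Example invocation (result file paths are placeholders; the -p flag
additionally prints the p-value for each metric, see the argument parser
below):
  python <this_script> results_before.json results_after.json -p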
     15 """
     16 
     17 from __future__ import print_function
     18 import argparse
     19 import json
     20 import os
     21 import sys
     22 
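# Add the parent directory to sys.path so that the statistical_analysis
# package can be imported when this script is run directly.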
sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname(__file__),
                                                '..')))
from statistical_analysis import results_stats


DEFAULT_SIGNIFICANCE_LEVEL = 0.05
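# results_stats.MANN selects the Mann-Whitney U-test (the default of the
# --statistical-test option below).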
DEFAULT_STATISTICAL_TEST = results_stats.MANN


def LoadJsonFromPath(json_path):
  """Returns the parsed JSON content from the specified location."""
  with open(os.path.abspath(json_path)) as data_file:
    return json.load(data_file)


def PrintOutcomeLine(name, max_name_length, outcome, print_p_value):
  """Prints a single output line, e.g. 'metric_1  True  0.03'."""
  print('{:{}}{}'.format(name, max_name_length + 2, outcome[0]), end='')
  if print_p_value:
    print('\t{:.10f}'.format(outcome[1]), end='')
  print()


def PrintTestOutcome(test_outcome_dict, test_name, significance_level,
                     print_p_value):
  """Prints the given test outcomes to the command line.

  Prints the p-value for each metric's outcome if |print_p_value| is True, and
  also prints the name of the executed statistical test and the significance
  level.
  """
  print('Statistical analysis results (True=Performance difference likely)\n'
        '(Test: {}, Significance Level: {})\n'.format(test_name,
                                                      significance_level))

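  # Width of the longest metric name; used to align the outcome columns.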
  max_metric_name_len = max([len(metric_name) for metric_name in
                             test_outcome_dict])

  for metric_name, outcome in test_outcome_dict.items():
    PrintOutcomeLine(metric_name, max_metric_name_len, outcome, print_p_value)


def PrintPagesetTestOutcome(test_outcome_dict, test_name, significance_level,
                            print_p_value, print_details):
  """Prints the given test outcomes to the command line.

  Prints a summary that combines the p-values across the pageset for each
  metric, then prints results for each metric/page combination if
  |print_details| is True.
  """
  print('Statistical analysis results (True=Performance difference likely)\n'
        '(Test: {}, Significance Level: {})\n'.format(test_name,
                                                      significance_level))

  # Print summarized version at the top.
  max_metric_name_len = max([len(metric_name) for metric_name in
                             test_outcome_dict])
  print('Summary (combined p-values for all pages in pageset):\n')
  for metric_name, pageset in test_outcome_dict.items():
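    # Combine the per-page p-values into a single p-value for this metric and
    # apply the significance threshold to the combined value.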
    combined_p_value = results_stats.CombinePValues([p[1] for p in
                                                     pageset.values()])
    outcome = (combined_p_value < significance_level, combined_p_value)
    PrintOutcomeLine(metric_name, max_metric_name_len, outcome, print_p_value)
  print()

  if not print_details:
    return

  # Print outcome for every metric/page combination.
  for metric_name, pageset in test_outcome_dict.items():
    max_page_name_len = max([len(page_name) for page_name in pageset])
    print('{}:'.format(metric_name))
    for page_name, page_outcome in pageset.items():
      PrintOutcomeLine(page_name, max_page_name_len, page_outcome,
                       print_p_value)
    print()


def main(args=None):
  """Sets up the parser and runs a statistical test on given benchmark results.

  Sets up the command line parser and its arguments, then loads the Chart
  JSONs from the given paths, runs the specified statistical hypothesis test
  on the results, and prints the test outcomes.
  """
  if args is None:
    args = sys.argv[1:]

  parser = argparse.ArgumentParser(description="""Runs statistical significance
                                   tests on two given Chart JSON benchmark
                                   results produced by the Telemetry
                                   benchmarks.""")

  parser.add_argument(dest='json_paths', nargs=2,
                      help='Paths to the two Chart JSON files to compare.')

  parser.add_argument('--significance', dest='significance_level',
                      default=DEFAULT_SIGNIFICANCE_LEVEL, type=float,
                      help="""The significance level is the type I error rate,
                      which is the probability of concluding that the
                      benchmark results are different even though they are not.
                      Default: {}, which is common in statistical hypothesis
                      testing.""".format(DEFAULT_SIGNIFICANCE_LEVEL))

  parser.add_argument('--statistical-test', dest='statistical_test',
                      default=DEFAULT_STATISTICAL_TEST,
                      choices=results_stats.ALL_TEST_OPTIONS,
                      help="""Specifies the statistical hypothesis test that is
                      used. Choices are: Mann-Whitney U-test,
                      Kolmogorov-Smirnov, Welch's t-test. Default: Mann-Whitney
                      U-test.""")

  parser.add_argument('-p', action='store_true', dest='print_p_value',
                      help="""If the -p flag is set, the output will include
                      the p-value for each metric.""")

  parser.add_argument('-d', action='store_true', dest='print_details',
                      help="""If the -d flag is set, the output will be more
                      detailed for benchmarks containing pagesets, giving
                      results for every metric/page combination after a summary
                      at the top.""")

  args = parser.parse_args(args)

  result_jsons = [LoadJsonFromPath(json_path) for json_path in args.json_paths]

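  # Chart JSONs that contain a pageset (results per page) are handled
  # separately from those that do not.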
  if (results_stats.DoesChartJSONContainPageset(result_jsons[0]) and
      results_stats.DoesChartJSONContainPageset(result_jsons[1])):
    # Benchmark containing a pageset.
    result_dict_1, result_dict_2 = (
        [results_stats.CreatePagesetBenchmarkResultDict(result_json)
         for result_json in result_jsons])
    test_outcome_dict = results_stats.ArePagesetBenchmarkResultsDifferent(
        result_dict_1, result_dict_2, args.statistical_test,
        args.significance_level)

    PrintPagesetTestOutcome(test_outcome_dict, args.statistical_test,
                            args.significance_level, args.print_p_value,
                            args.print_details)

  else:
    # Benchmark not containing a pageset.
    # (If only one JSON contains a pageset, results_stats raises an error.)
    result_dict_1, result_dict_2 = (
        [results_stats.CreateBenchmarkResultDict(result_json)
         for result_json in result_jsons])
    test_outcome_dict = (
        results_stats.AreBenchmarkResultsDifferent(result_dict_1, result_dict_2,
                                                   args.statistical_test,
                                                   args.significance_level))

    PrintTestOutcome(test_outcome_dict, args.statistical_test,
                     args.significance_level, args.print_p_value)


if __name__ == '__main__':
  sys.exit(main())