# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Statistical hypothesis testing for comparing benchmark results."""

try:
  import numpy as np
except ImportError:
  np = None

try:
  from scipy import stats
  import scipy.version
except ImportError:
  stats = None


MANN = 'mann'
KOLMOGOROV = 'kolmogorov'
WELCH = 'welch'
ALL_TEST_OPTIONS = [MANN, KOLMOGOROV, WELCH]


class DictMismatchError(Exception):
  """Provides exception for result dicts with mismatching keys/metrics."""
  def __str__(self):
    return ("Provided benchmark result dicts' keys/metrics do not match. "
            "Check if they have been created by the same benchmark.")


class SampleSizeError(Exception):
  """Provides exception for sample sizes too small for Mann-Whitney U-test."""
  def __str__(self):
    return ('At least one sample size is smaller than 20, which is too small '
            'for the Mann-Whitney U-test.')


class NonNormalSampleError(Exception):
  """Provides exception for samples that are not normally distributed."""
  def __str__(self):
    return ("At least one sample is not normally distributed, as required by "
            "Welch's t-test.")


def IsScipyMannTestOneSided():
  """Checks if the installed Scipy version is older than 0.17.0.

  0.17.0 is the version in which stats.mannwhitneyu(...) changed from
  returning a one-sided to returning a two-sided p-value.
  """
  scipy_version = [int(num) for num in scipy.version.version.split('.')]
  return scipy_version[0] < 1 and scipy_version[1] < 17
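
# Illustration of the version handling in AreSamplesDifferent below: with an
# older Scipy such as 0.15.1 (version number used only as an example),
# IsScipyMannTestOneSided() returns True, and a one-sided p-value of e.g. 0.03
# from stats.mannwhitneyu is doubled to the two-sided value 0.06.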


def GetChartsFromBenchmarkResultJson(benchmark_result_json):
  """Returns the 'charts' element from a given Chart JSON.

  Excludes entries that are not list_of_scalar_values and empty entries. Also
  raises errors for an invalid JSON format or an empty 'charts' element.

  Raises:
    ValueError: Provided chart JSON is either not valid or 'charts' is empty.
  """
  try:
    charts = benchmark_result_json['charts']
  except KeyError:
    raise ValueError('Invalid benchmark result format. Make sure input is a '
                     'Chart-JSON.\nProvided JSON:\n%s' %
                     repr(benchmark_result_json))
  if not charts:
    raise ValueError("Invalid benchmark result format. Dict entry 'charts' is "
                     "empty.")

  def IsValidPageContent(page_content):
    return (page_content['type'] == 'list_of_scalar_values' and
            'values' in page_content)

  def CreatePageDict(metric_content):
    return {page_name: page_content
            for page_name, page_content in metric_content.iteritems()
            if IsValidPageContent(page_content)}

  charts_valid_entries_only = {}
  for metric_name, metric_content in charts.iteritems():
    inner_page_dict = CreatePageDict(metric_content)
    if not inner_page_dict:
      continue
    charts_valid_entries_only[metric_name] = inner_page_dict

  return charts_valid_entries_only
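
# Sketch of the Chart-JSON fragment this module consumes; the metric and page
# names below are made up. Only pages of type 'list_of_scalar_values' with a
# 'values' entry are kept by GetChartsFromBenchmarkResultJson:
#
#   {
#     'charts': {
#       'tab_load_time': {
#         'summary': {
#           'type': 'list_of_scalar_values',
#           'values': [650, 700, ...]
#         }
#       }
#     }
#   }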


def DoesChartJSONContainPageset(benchmark_result_json):
  """Checks if given Chart JSON contains results for a pageset.

  A metric in a benchmark NOT containing a pageset contains only two elements
  ("Only_page_in_this_benchmark" and "Summary", as opposed to "Ex_page_1",
  "Ex_page_2", ..., and "Summary").
  """
  charts = GetChartsFromBenchmarkResultJson(benchmark_result_json)

  arbitrary_metric_in_charts = charts.itervalues().next()
  return len(arbitrary_metric_in_charts) > 2


def CreateBenchmarkResultDict(benchmark_result_json):
  """Creates a dict of format {metric_name: list of benchmark results}.

  Takes a raw result Chart-JSON produced when using '--output-format=chartjson'
  for 'run_benchmark'.

  Args:
    benchmark_result_json: Benchmark result Chart-JSON produced by Telemetry.

  Returns:
    Dictionary of benchmark results.
    Example dict entry: 'tab_load_time': [650, 700, ...].
  """
  charts = GetChartsFromBenchmarkResultJson(benchmark_result_json)

  benchmark_result_dict = {}
  for metric_name, metric_content in charts.iteritems():
    benchmark_result_dict[metric_name] = metric_content['summary']['values']

  return benchmark_result_dict


def CreatePagesetBenchmarkResultDict(benchmark_result_json):
  """Creates a dict of format {metric_name: {page_name: list of page results}}.

  Takes a raw result Chart-JSON produced by 'run_benchmark' when using
  '--output-format=chartjson' and when specifying a benchmark that has a
  pageset (e.g. top25mobile). Run 'DoesChartJSONContainPageset' to check if
  your Chart-JSON contains a pageset.

  Args:
    benchmark_result_json: Benchmark result Chart-JSON produced by Telemetry.

  Returns:
    Dictionary of benchmark results.
    Example dict entry: 'tab_load_time': {'Gmail.com': [650, 700, ...]}.
  """
  charts = GetChartsFromBenchmarkResultJson(benchmark_result_json)

  benchmark_result_dict = {}
  for metric_name, metric_content in charts.iteritems():
    benchmark_result_dict[metric_name] = {}
    for page_name, page_content in metric_content.iteritems():
      if page_name == 'summary':
        continue
      benchmark_result_dict[metric_name][page_name] = page_content['values']

  return benchmark_result_dict


def CombinePValues(p_values):
  """Combines p-values from a number of tests using Fisher's Method.

  The tests that produced the p-values must test the same null hypothesis and
  be independent.

  Args:
    p_values: List of p-values.

  Returns:
    combined_p_value: Combined p-value according to Fisher's method.
  """
  # TODO (wierichs): Update to use scipy.stats.combine_pvalues(p_values) when
  # Scipy v0.15.0 becomes available as standard version.
  if not np:
    raise ImportError('This function requires Numpy.')

  if not stats:
    raise ImportError('This function requires Scipy.')

  test_statistic = -2 * np.sum(np.log(p_values))
  p_value = stats.chi2.sf(test_statistic, 2 * len(p_values))
  return p_value
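
# Worked example with made-up p-values: for p_values = [0.1, 0.2, 0.3] the
# statistic is -2 * (ln 0.1 + ln 0.2 + ln 0.3) ~= 10.23 with 2 * 3 = 6 degrees
# of freedom, so CombinePValues([0.1, 0.2, 0.3]) returns roughly 0.115.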


def IsNormallyDistributed(sample, significance_level=0.05):
  """Calculates the Shapiro-Wilk test for normality for a single sample.

  Note that normality is a requirement for Welch's t-test.

  Args:
    sample: List of values.
    significance_level: The significance level the p-value is compared against.

  Returns:
    is_normally_distributed: True or False.
    p_value: The calculated p-value.
  """
  if not stats:
    raise ImportError('This function requires Scipy.')

  # pylint: disable=unbalanced-tuple-unpacking
  _, p_value = stats.shapiro(sample)

  is_normally_distributed = p_value >= significance_level
  return is_normally_distributed, p_value
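
# Minimal usage sketch, assuming Numpy is available; np.random.normal is used
# only to fabricate an example sample.
def _ExampleNormalityCheck():
  sample = list(np.random.normal(loc=100, scale=10, size=50))
  # A truly normal sample passes the check in the vast majority of runs.
  return IsNormallyDistributed(sample)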


def AreSamplesDifferent(sample_1, sample_2, test=MANN,
                        significance_level=0.05):
  """Calculates the specified statistical test for the given samples.

  The null hypothesis for each test is that the two populations that the
  samples are taken from are not significantly different. Tests are two-tailed.

  Raises:
    ImportError: Scipy is not installed.
    SampleSizeError: Sample size is too small for MANN.
    NonNormalSampleError: Sample is not normally distributed as required by
      WELCH.
    NotImplementedError: The given test is not one of ALL_TEST_OPTIONS.

  Args:
    sample_1: First list of values.
    sample_2: Second list of values.
    test: Statistical test that is used.
    significance_level: The significance level the p-value is compared against.

  Returns:
    is_different: True or False, depending on the test outcome.
    p_value: The p-value the test has produced.
  """
  if not stats:
    raise ImportError('This function requires Scipy.')

  if test == MANN:
    if len(sample_1) < 20 or len(sample_2) < 20:
      raise SampleSizeError()
    try:
      _, p_value = stats.mannwhitneyu(sample_1, sample_2, use_continuity=True)
    except ValueError:
      # If sum of ranks of values in |sample_1| and |sample_2| is equal,
      # scipy.stats.mannwhitneyu raises ValueError. Treat this as a 1.0 p-value
      # (indistinguishable).
      return (False, 1.0)

    if IsScipyMannTestOneSided():
      p_value = p_value * 2 if p_value < 0.5 else 1

  elif test == KOLMOGOROV:
    _, p_value = stats.ks_2samp(sample_1, sample_2)

  elif test == WELCH:
    if not (IsNormallyDistributed(sample_1, significance_level)[0] and
            IsNormallyDistributed(sample_2, significance_level)[0]):
      raise NonNormalSampleError()
    _, p_value = stats.ttest_ind(sample_1, sample_2, equal_var=False)
  # TODO: Add k sample anderson darling test
  else:
    raise NotImplementedError('Unknown test: %s. Valid options are: %s.' %
                              (test, ALL_TEST_OPTIONS))

  is_different = p_value <= significance_level
  return is_different, p_value
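
# Minimal usage sketch with fabricated samples; the values are illustrative,
# and the lists have 20 elements to satisfy the Mann-Whitney size check.
def _ExampleSampleComparison():
  sample_1 = [650 + i for i in range(20)]
  sample_2 = [700 + i for i in range(20)]
  # Expected to report a significant difference for these clearly shifted
  # samples, e.g. (True, <small p-value>).
  return AreSamplesDifferent(sample_1, sample_2, test=MANN)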


def AssertThatKeysMatch(result_dict_1, result_dict_2):
  """Raises an exception if benchmark dicts do not contain the same metrics."""
  if result_dict_1.viewkeys() != result_dict_2.viewkeys():
    raise DictMismatchError()


def AreBenchmarkResultsDifferent(result_dict_1, result_dict_2, test=MANN,
                                 significance_level=0.05):
  """Runs the given test on the results of each metric in the benchmarks.

  Checks if the dicts have been created from the same benchmark, i.e. if
  metric names match (e.g. first_non_empty_paint_time). Then runs the specified
  statistical test on each metric's samples to find if they vary significantly.

  Args:
    result_dict_1: Benchmark result dict of format {metric: list of values}.
    result_dict_2: Benchmark result dict of format {metric: list of values}.
    test: Statistical test that is used.
    significance_level: The significance level the p-value is compared against.

  Returns:
    test_outcome_dict: Format {metric: (bool is_different, p-value)}.
  """
  AssertThatKeysMatch(result_dict_1, result_dict_2)

  test_outcome_dict = {}
  for metric in result_dict_1:
    is_different, p_value = AreSamplesDifferent(result_dict_1[metric],
                                                result_dict_2[metric],
                                                test, significance_level)
    test_outcome_dict[metric] = (is_different, p_value)

  return test_outcome_dict


def ArePagesetBenchmarkResultsDifferent(result_dict_1, result_dict_2, test=MANN,
                                        significance_level=0.05):
  """Runs the given test on the results of each metric/page combination.

  Checks if the dicts have been created from the same benchmark, i.e. if metric
  names and pagesets match (e.g. metric first_non_empty_paint_time and page
  Google.com). Then runs the specified statistical test on each metric/page
  combination's sample to find if they vary significantly.

  Args:
    result_dict_1: Benchmark result dict of format
      {metric: {page: list of values}}.
    result_dict_2: Benchmark result dict of format
      {metric: {page: list of values}}.
    test: Statistical test that is used.
    significance_level: The significance level the p-value is compared against.

  Returns:
    test_outcome_dict: Format {metric: {page: (bool is_different, p-value)}}.
  """
  AssertThatKeysMatch(result_dict_1, result_dict_2)

  # Pagesets should also match.
  for metric in result_dict_1.iterkeys():
    AssertThatKeysMatch(result_dict_1[metric], result_dict_2[metric])

  test_outcome_dict = {}
  for metric in result_dict_1.iterkeys():
    test_outcome_dict[metric] = {}
    for page in result_dict_1[metric]:
      is_different, p_value = AreSamplesDifferent(result_dict_1[metric][page],
                                                  result_dict_2[metric][page],
                                                  test, significance_level)
      test_outcome_dict[metric][page] = (is_different, p_value)

  return test_outcome_dict
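

# End-to-end sketch, assuming two Chart-JSON files produced by
# 'run_benchmark --output-format=chartjson'; the file names are hypothetical.
def _ExampleCompareTwoRuns(path_1='run_1.chartjson', path_2='run_2.chartjson'):
  import json
  with open(path_1) as file_1, open(path_2) as file_2:
    json_1 = json.load(file_1)
    json_2 = json.load(file_2)
  if DoesChartJSONContainPageset(json_1):
    # Per-page comparison, e.g. {'tab_load_time': {'Gmail.com': (True, 0.01)}}.
    return ArePagesetBenchmarkResultsDifferent(
        CreatePagesetBenchmarkResultDict(json_1),
        CreatePagesetBenchmarkResultDict(json_2))
  # Summary comparison, e.g. {'tab_load_time': (True, 0.01)}.
  return AreBenchmarkResultsDifferent(CreateBenchmarkResultDict(json_1),
                                      CreateBenchmarkResultDict(json_2))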