Home | History | Annotate | Download | only in auto_bisect
      1 # Copyright 2014 The Chromium Authors. All rights reserved.
      2 # Use of this source code is governed by a BSD-style license that can be
      3 # found in the LICENSE file.
      4 
      5 import math
      6 import os
      7 
      8 import bisect_utils
      9 import math_utils
     10 import ttest
     11 
     12 
     13 def ConfidenceScore(good_results_lists, bad_results_lists):
     14   """Calculates a confidence score.
     15 
     16   This score is a percentage which represents our degree of confidence in the
     17   proposition that the good results and bad results are distinct groups, and
     18   their differences aren't due to chance alone.
     19 
     20 
     21   Args:
     22     good_results_lists: A list of lists of "good" result numbers.
     23     bad_results_lists: A list of lists of "bad" result numbers.
     24 
     25   Returns:
     26     A number in the range [0, 100].
     27   """
     28   # If there's only one item in either list, this means only one revision was
     29   # classified good or bad; this isn't good enough evidence to make a decision.
     30   # If an empty list was passed, that also implies zero confidence.
     31   if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1:
     32     return 0.0
     33 
     34   # Flatten the lists of results lists.
     35   sample1 = sum(good_results_lists, [])
     36   sample2 = sum(bad_results_lists, [])
     37 
     38   # If there were only empty lists in either of the lists (this is unexpected
     39   # and normally shouldn't happen), then we also want to return 0.
     40   if not sample1 or not sample2:
     41     return 0.0
     42 
     43   # The p-value is approximately the probability of obtaining the given set
     44   # of good and bad values just by chance.
     45   _, _, p_value = ttest.WelchsTTest(sample1, sample2)
     46   return 100.0 * (1.0 - p_value)
     47 
     48 
     49 class BisectResults(object):
     50 
     51   def __init__(self, depot_registry, source_control):
     52     self._depot_registry = depot_registry
     53     self.revision_data = {}
     54     self.error = None
     55     self._source_control = source_control
     56 
     57   @staticmethod
     58   def _FindOtherRegressions(revision_data_sorted, bad_greater_than_good):
     59     """Compiles a list of other possible regressions from the revision data.
     60 
     61     Args:
     62       revision_data_sorted: Sorted list of (revision, revision data) pairs.
     63       bad_greater_than_good: Whether the result value at the "bad" revision is
     64           numerically greater than the result value at the "good" revision.
     65 
     66     Returns:
     67       A list of [current_rev, previous_rev, confidence] for other places where
     68       there may have been a regression.
     69     """
     70     other_regressions = []
     71     previous_values = []
     72     previous_id = None
     73     for current_id, current_data in revision_data_sorted:
     74       current_values = current_data['value']
     75       if current_values:
     76         current_values = current_values['values']
     77         if previous_values:
     78           confidence = ConfidenceScore(previous_values, [current_values])
     79           mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
     80           mean_of_current_runs = math_utils.Mean(current_values)
     81 
     82           # Check that the potential regression is in the same direction as
     83           # the overall regression. If the mean of the previous runs < the
     84           # mean of the current runs, this local regression is in same
     85           # direction.
     86           prev_less_than_current = mean_of_prev_runs < mean_of_current_runs
     87           is_same_direction = (prev_less_than_current if
     88               bad_greater_than_good else not prev_less_than_current)
     89 
     90           # Only report potential regressions with high confidence.
     91           if is_same_direction and confidence > 50:
     92             other_regressions.append([current_id, previous_id, confidence])
     93         previous_values.append(current_values)
     94         previous_id = current_id
     95     return other_regressions
     96 
     97   def GetResultsDict(self):
     98     """Prepares and returns information about the final resulsts as a dict.
     99 
    100     Returns:
    101       A dictionary with the following fields
    102 
    103       'first_working_revision': First good revision.
    104       'last_broken_revision': Last bad revision.
    105       'culprit_revisions': A list of revisions, which contain the bad change
    106           introducing the failure.
    107       'other_regressions': A list of tuples representing other regressions,
    108           which may have occured.
    109       'regression_size': For performance bisects, this is a relative change of
    110           the mean metric value. For other bisects this field always contains
    111           'zero-to-nonzero'.
    112       'regression_std_err': For performance bisects, it is a pooled standard
    113           error for groups of good and bad runs. Not used for other bisects.
    114       'confidence': For performance bisects, it is a confidence that the good
    115           and bad runs are distinct groups. Not used for non-performance
    116           bisects.
    117       'revision_data_sorted': dict mapping revision ids to data about that
    118           revision. Each piece of revision data consists of a dict with the
    119           following keys:
    120 
    121           'passed': Represents whether the performance test was successful at
    122               that revision. Possible values include: 1 (passed), 0 (failed),
    123               '?' (skipped), 'F' (build failed).
    124           'depot': The depot that this revision is from (i.e. WebKit)
    125           'external': If the revision is a 'src' revision, 'external' contains
    126               the revisions of each of the external libraries.
    127           'sort': A sort value for sorting the dict in order of commits.
    128 
    129           For example:
    130           {
    131             'CL #1':
    132             {
    133               'passed': False,
    134               'depot': 'chromium',
    135               'external': None,
    136               'sort': 0
    137             }
    138           }
    139     """
    140     revision_data_sorted = sorted(self.revision_data.iteritems(),
    141                                   key = lambda x: x[1]['sort'])
    142 
    143     # Find range where it possibly broke.
    144     first_working_revision = None
    145     first_working_revision_index = -1
    146     last_broken_revision = None
    147     last_broken_revision_index = -1
    148 
    149     culprit_revisions = []
    150     other_regressions = []
    151     regression_size = 0.0
    152     regression_std_err = 0.0
    153     confidence = 0.0
    154 
    155     for i in xrange(len(revision_data_sorted)):
    156       k, v = revision_data_sorted[i]
    157       if v['passed'] == 1:
    158         if not first_working_revision:
    159           first_working_revision = k
    160           first_working_revision_index = i
    161 
    162       if not v['passed']:
    163         last_broken_revision = k
    164         last_broken_revision_index = i
    165 
    166     if last_broken_revision != None and first_working_revision != None:
    167       broken_means = []
    168       for i in xrange(0, last_broken_revision_index + 1):
    169         if revision_data_sorted[i][1]['value']:
    170           broken_means.append(revision_data_sorted[i][1]['value']['values'])
    171 
    172       working_means = []
    173       for i in xrange(first_working_revision_index, len(revision_data_sorted)):
    174         if revision_data_sorted[i][1]['value']:
    175           working_means.append(revision_data_sorted[i][1]['value']['values'])
    176 
    177       # Flatten the lists to calculate mean of all values.
    178       working_mean = sum(working_means, [])
    179       broken_mean = sum(broken_means, [])
    180 
    181       # Calculate the approximate size of the regression
    182       mean_of_bad_runs = math_utils.Mean(broken_mean)
    183       mean_of_good_runs = math_utils.Mean(working_mean)
    184 
    185       regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
    186                                                       mean_of_bad_runs)
    187       if math.isnan(regression_size):
    188         regression_size = 'zero-to-nonzero'
    189 
    190       regression_std_err = math.fabs(math_utils.PooledStandardError(
    191           [working_mean, broken_mean]) /
    192           max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0
    193 
    194       # Give a "confidence" in the bisect. At the moment we use how distinct the
    195       # values are before and after the last broken revision, and how noisy the
    196       # overall graph is.
    197       confidence = ConfidenceScore(working_means, broken_means)
    198 
    199       culprit_revisions = []
    200 
    201       cwd = os.getcwd()
    202       self._depot_registry.ChangeToDepotDir(
    203           self.revision_data[last_broken_revision]['depot'])
    204 
    205       if self.revision_data[last_broken_revision]['depot'] == 'cros':
    206         # Want to get a list of all the commits and what depots they belong
    207         # to so that we can grab info about each.
    208         cmd = ['repo', 'forall', '-c',
    209             'pwd ; git log --pretty=oneline --before=%d --after=%d' % (
    210             last_broken_revision, first_working_revision + 1)]
    211         output, return_code = bisect_utils.RunProcessAndRetrieveOutput(cmd)
    212 
    213         changes = []
    214         assert not return_code, ('An error occurred while running '
    215                                  '"%s"' % ' '.join(cmd))
    216         last_depot = None
    217         cwd = os.getcwd()
    218         for l in output.split('\n'):
    219           if l:
    220             # Output will be in form:
    221             # /path_to_depot
    222             # /path_to_other_depot
    223             # <SHA1>
    224             # /path_again
    225             # <SHA1>
    226             # etc.
    227             if l[0] == '/':
    228               last_depot = l
    229             else:
    230               contents = l.split(' ')
    231               if len(contents) > 1:
    232                 changes.append([last_depot, contents[0]])
    233         for c in changes:
    234           os.chdir(c[0])
    235           info = self._source_control.QueryRevisionInfo(c[1])
    236           culprit_revisions.append((c[1], info, None))
    237       else:
    238         for i in xrange(last_broken_revision_index, len(revision_data_sorted)):
    239           k, v = revision_data_sorted[i]
    240           if k == first_working_revision:
    241             break
    242           self._depot_registry.ChangeToDepotDir(v['depot'])
    243           info = self._source_control.QueryRevisionInfo(k)
    244           culprit_revisions.append((k, info, v['depot']))
    245       os.chdir(cwd)
    246 
    247       # Check for any other possible regression ranges.
    248       other_regressions = self._FindOtherRegressions(
    249           revision_data_sorted, mean_of_bad_runs > mean_of_good_runs)
    250 
    251     return {
    252         'first_working_revision': first_working_revision,
    253         'last_broken_revision': last_broken_revision,
    254         'culprit_revisions': culprit_revisions,
    255         'other_regressions': other_regressions,
    256         'regression_size': regression_size,
    257         'regression_std_err': regression_std_err,
    258         'confidence': confidence,
    259         'revision_data_sorted': revision_data_sorted
    260     }
    261