# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import math
import os

import bisect_utils
import math_utils
import ttest


def ConfidenceScore(good_results_lists, bad_results_lists,
                    accept_single_bad_or_good=False):
  """Calculates a confidence score.

  This score is a percentage which represents our degree of confidence in the
  proposition that the good results and bad results are distinct groups, and
  their differences aren't due to chance alone.

  Args:
    good_results_lists: A list of lists of "good" result numbers.
    bad_results_lists: A list of lists of "bad" result numbers.
    accept_single_bad_or_good: If True, compute a confidence score even when
        only a single "good" or "bad" group of results was given.

  Returns:
    A number in the range [0, 100].
  """
  # If there's only one item in either list, this means only one revision was
  # classified good or bad; this isn't good enough evidence to make a decision
  # unless the caller explicitly accepts it. If an empty list was passed, that
  # also implies zero confidence.
  if ((len(good_results_lists) <= 1 or len(bad_results_lists) <= 1) and
      not accept_single_bad_or_good):
    return 0.0

  # Flatten the lists of results lists.
  sample1 = sum(good_results_lists, [])
  sample2 = sum(bad_results_lists, [])

  # If there were only empty lists in either of the lists (this is unexpected
  # and normally shouldn't happen), then we also want to return 0.
  if not sample1 or not sample2:
    return 0.0

  # The p-value is approximately the probability of obtaining the given set
  # of good and bad values just by chance.
  _, _, p_value = ttest.WelchsTTest(sample1, sample2)
  return 100.0 * (1.0 - p_value)


class BisectResults(object):

  def __init__(self, depot_registry, source_control):
    self._depot_registry = depot_registry
    self.revision_data = {}
    self.error = None
    self._source_control = source_control

  @staticmethod
  def _FindOtherRegressions(revision_data_sorted, bad_greater_than_good):
    """Compiles a list of other possible regressions from the revision data.

    Args:
      revision_data_sorted: Sorted list of (revision, revision data) pairs.
      bad_greater_than_good: Whether the result value at the "bad" revision is
          numerically greater than the result value at the "good" revision.

    Returns:
      A list of [current_rev, previous_rev, confidence] for other places where
      there may have been a regression.
    """
    other_regressions = []
    previous_values = []
    previous_id = None
    for current_id, current_data in revision_data_sorted:
      current_values = current_data['value']
      if current_values:
        current_values = current_values['values']
        if previous_values:
          # The "bad" group here is always a single revision's results, so
          # single-group comparison must be accepted; otherwise
          # ConfidenceScore would unconditionally return 0 for this call.
          confidence = ConfidenceScore(previous_values, [current_values],
                                       accept_single_bad_or_good=True)
          mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
          mean_of_current_runs = math_utils.Mean(current_values)

          # Check that the potential regression is in the same direction as
          # the overall regression. If the mean of the previous runs < the
          # mean of the current runs, this local regression is in the same
          # direction.
          prev_less_than_current = mean_of_prev_runs < mean_of_current_runs
          is_same_direction = (prev_less_than_current if
              bad_greater_than_good else not prev_less_than_current)

          # Only report potential regressions with high confidence.
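          # A score above 50 means the Welch's t-test p-value was below 0.5.
          # For example (illustrative values), previous runs [[10.0], [10.1]]
          # against current runs [20.0, 19.9] score near 100, while heavily
          # overlapping samples score near 0.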
          if is_same_direction and confidence > 50:
            other_regressions.append([current_id, previous_id, confidence])
        previous_values.append(current_values)
        previous_id = current_id
    return other_regressions

  def GetResultsDict(self):
    """Prepares and returns information about the final results as a dict.

    Returns:
      A dictionary with the following fields:

      'first_working_revision': First good revision.
      'last_broken_revision': Last bad revision.
      'culprit_revisions': A list of revisions which contain the bad change
          introducing the failure.
      'other_regressions': A list of tuples representing other regressions
          which may have occurred.
      'regression_size': For performance bisects, this is a relative change of
          the mean metric value. For other bisects this field always contains
          'zero-to-nonzero'.
      'regression_std_err': For performance bisects, it is a pooled standard
          error for groups of good and bad runs. Not used for other bisects.
      'confidence': For performance bisects, it is a confidence that the good
          and bad runs are distinct groups. Not used for non-performance
          bisects.
      'revision_data_sorted': A list of (revision id, revision data) pairs,
          sorted in order of commits. Each piece of revision data is a dict
          with the following keys:

          'passed': Represents whether the performance test was successful at
              that revision. Possible values include: 1 (passed), 0 (failed),
              '?' (skipped), 'F' (build failed).
          'depot': The depot that this revision is from (e.g. WebKit).
          'external': If the revision is a 'src' revision, 'external' contains
              the revisions of each of the external libraries.
          'sort': A sort value for sorting the dict in order of commits.

          For example:
          [('CL #1',
            {
              'passed': 0,
              'depot': 'chromium',
              'external': None,
              'sort': 0
            })]
    """
    revision_data_sorted = sorted(self.revision_data.iteritems(),
                                  key=lambda x: x[1]['sort'])

    # Find the range where it possibly broke.
    first_working_revision = None
    first_working_revision_index = -1
    last_broken_revision = None
    last_broken_revision_index = -1

    culprit_revisions = []
    other_regressions = []
    regression_size = 0.0
    regression_std_err = 0.0
    confidence = 0.0

    for i in xrange(len(revision_data_sorted)):
      k, v = revision_data_sorted[i]
      if v['passed'] == 1:
        if not first_working_revision:
          first_working_revision = k
          first_working_revision_index = i

      if not v['passed']:
        last_broken_revision = k
        last_broken_revision_index = i

    if last_broken_revision is not None and first_working_revision is not None:
      # Revisions are sorted from the bad (newer) revision down to the good
      # (older) one, so everything up to and including the last broken
      # revision is a "bad" run, and everything from the first working
      # revision onward is a "good" run.
      broken_means = []
      for i in xrange(0, last_broken_revision_index + 1):
        if revision_data_sorted[i][1]['value']:
          broken_means.append(revision_data_sorted[i][1]['value']['values'])

      working_means = []
      for i in xrange(first_working_revision_index, len(revision_data_sorted)):
        if revision_data_sorted[i][1]['value']:
          working_means.append(revision_data_sorted[i][1]['value']['values'])

      # Flatten the lists to calculate the mean of all values.
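      # sum(list_of_lists, []) concatenates, e.g. [[1, 2], [3]] -> [1, 2, 3].
      # Despite their names, working_mean and broken_mean below hold these
      # flattened value lists, not the means themselves.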
      working_mean = sum(working_means, [])
      broken_mean = sum(broken_means, [])

      # Calculate the approximate size of the regression.
      mean_of_bad_runs = math_utils.Mean(broken_mean)
      mean_of_good_runs = math_utils.Mean(working_mean)

      regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
                                                        mean_of_bad_runs)
      if math.isnan(regression_size):
        regression_size = 'zero-to-nonzero'

      regression_std_err = math.fabs(math_utils.PooledStandardError(
          [working_mean, broken_mean]) /
          max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0

      # Give a "confidence" in the bisect. At the moment we use how distinct
      # the values are before and after the last broken revision, and how
      # noisy the overall graph is.
      confidence = ConfidenceScore(working_means, broken_means)

      culprit_revisions = []

      cwd = os.getcwd()
      self._depot_registry.ChangeToDepotDir(
          self.revision_data[last_broken_revision]['depot'])

      if self.revision_data[last_broken_revision]['depot'] == 'cros':
        # Want to get a list of all the commits and what depots they belong
        # to so that we can grab info about each.
        cmd = ['repo', 'forall', '-c',
               'pwd ; git log --pretty=oneline --before=%d --after=%d' % (
                   last_broken_revision, first_working_revision + 1)]
        output, return_code = bisect_utils.RunProcessAndRetrieveOutput(cmd)

        changes = []
        assert not return_code, ('An error occurred while running '
                                 '"%s"' % ' '.join(cmd))
        last_depot = None
        cwd = os.getcwd()
        for l in output.split('\n'):
          if l:
            # Output will be in the form:
            # /path_to_depot
            # /path_to_other_depot
            # <SHA1>
            # /path_again
            # <SHA1>
            # etc.
            if l[0] == '/':
              last_depot = l
            else:
              contents = l.split(' ')
              if len(contents) > 1:
                changes.append([last_depot, contents[0]])
        for c in changes:
          os.chdir(c[0])
          info = self._source_control.QueryRevisionInfo(c[1])
          culprit_revisions.append((c[1], info, None))
      else:
        for i in xrange(last_broken_revision_index, len(revision_data_sorted)):
          k, v = revision_data_sorted[i]
          if k == first_working_revision:
            break
          self._depot_registry.ChangeToDepotDir(v['depot'])
          info = self._source_control.QueryRevisionInfo(k)
          culprit_revisions.append((k, info, v['depot']))
      os.chdir(cwd)

      # Check for any other possible regression ranges.
      other_regressions = self._FindOtherRegressions(
          revision_data_sorted, mean_of_bad_runs > mean_of_good_runs)

    return {
        'first_working_revision': first_working_revision,
        'last_broken_revision': last_broken_revision,
        'culprit_revisions': culprit_revisions,
        'other_regressions': other_regressions,
        'regression_size': regression_size,
        'regression_std_err': regression_std_err,
        'confidence': confidence,
        'revision_data_sorted': revision_data_sorted,
    }
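# A minimal usage sketch (illustrative only; 'depot_registry' and
# 'source_control' stand in for the real bisect dependency objects, and the
# revision ids and values are made up). The revision data layout follows the
# GetResultsDict docstring above; lower 'sort' values are the newer (bad)
# revisions.
#
#   results = BisectResults(depot_registry, source_control)
#   results.revision_data = {
#       'rev_new': {'passed': 0, 'depot': 'chromium', 'external': None,
#                   'sort': 0, 'value': {'values': [20.1, 19.8]}},
#       'rev_old': {'passed': 1, 'depot': 'chromium', 'external': None,
#                   'sort': 1, 'value': {'values': [10.0, 10.2]}},
#   }
#   summary = results.GetResultsDict()
#   # summary['first_working_revision'] == 'rev_old'
#   # summary['last_broken_revision'] == 'rev_new'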