Home | History | Annotate | Download | only in findit
      1 # Copyright (c) 2014 The Chromium Authors. All rights reserved.
      2 # Use of this source code is governed by a BSD-style license that can be
      3 # found in the LICENSE file.
      4 
      5 import os
      6 from threading import Lock
      7 
      8 import blame
      9 from common import utils
     10 import component_dictionary
     11 import crash_utils
     12 import git_repository_parser
     13 import match_set
     14 import svn_repository_parser
     15 
     16 
     17 LINE_CHANGE_PRIORITY = 1
     18 FILE_CHANGE_PRIORITY = 2
     19 _THIS_DIR = os.path.abspath(os.path.dirname(__file__))
     20 CONFIG = crash_utils.ParseURLsFromConfig(os.path.join(_THIS_DIR,
     21                                                       'config.ini'))
     22 
     23 
     24 def GenerateMatchEntry(
     25     matches, revision_info, revision_number, file_path, function,
     26     component_path, component_name, crashed_line_numbers, stack_frame_indices,
     27     file_change_type, repository_parser):
     28   """Generates a match object and adds it to the match set.
     29 
     30   Args:
     31     matches: A matchset object, a map from CL to a match object.
     32     revision_info: The revision information, a map from fields (message,
     33                    changed files, etc) to its values.
     34     revision_number: The SVN revision number or git hash.
     35     file_path: The path of the file.
     36     function: The function that caused an crash.
     37     component_path: The path of the component this file is from.
     38     component_name: The name of the component the file is from.
     39     crashed_line_numbers: The list of the lines in the file that caused
     40                           the crash.
     41     stack_frame_indices: The list of positions of this file within a stack.
     42     file_change_type: Whether file is modified, added or deleted.
     43     repository_parser: The parser object to parse line diff.
     44   """
     45   # Check if this CL should be ignored.
     46   with matches.matches_lock:
     47     if revision_number in matches.cls_to_ignore:
     48       return
     49 
     50     # If this CL is not already identified as suspected, create a new entry.
     51     if revision_number not in matches.matches:
     52       match = match_set.Match(revision_info, component_name)
     53       message = revision_info['message']
     54       # TODO(jeun): Don't hold lock while issuing http request.
     55       match.ParseMessage(message, matches.codereview_api_url)
     56 
     57       # If this match is a revert, add to the set of CLs to be ignored.
     58       if match.is_revert:
     59         matches.cls_to_ignore.add(revision_number)
     60 
     61         # If this match has info on what CL it is reverted from, add that CL.
     62         if match.revert_of:
     63           matches.cls_to_ignore.add(match.revert_of)
     64 
     65         return
     66 
     67       matches.matches[revision_number] = match
     68 
     69     else:
     70       match = matches.matches[revision_number]
     71 
     72   (diff_url, changed_line_numbers, changed_line_contents) = (
     73       repository_parser.ParseLineDiff(
     74           file_path, component_path, file_change_type, revision_number))
     75 
     76   # Ignore this match if the component is not supported for svn.
     77   if not diff_url:
     78     return
     79 
     80   # Find the intersection between the lines that this file crashed on and
     81   # the changed lines.
     82   (line_number_intersection, stack_frame_index_intersection, functions) = (
     83       crash_utils.Intersection(
     84           crashed_line_numbers, stack_frame_indices, changed_line_numbers,
     85           function))
     86 
     87   # Find the minimum distance between the changed lines and crashed lines.
     88   (min_distance, min_crashed_line, min_changed_line) = \
     89       crash_utils.FindMinLineDistance(crashed_line_numbers,
     90                                       changed_line_numbers)
     91 
     92   # Check whether this CL changes the crashed lines or not.
     93   if line_number_intersection:
     94     priority = LINE_CHANGE_PRIORITY
     95   else:
     96     priority = FILE_CHANGE_PRIORITY
     97 
     98   # Add the parsed information to the object.
     99   with matches.matches_lock:
    100     match.crashed_line_numbers.append(line_number_intersection)
    101 
    102     file_name = file_path.split('/')[-1]
    103     match.changed_files.append(file_name)
    104 
    105     # Update the min distance only if it is less than the current one.
    106     if min_distance < match.min_distance:
    107       match.min_distance = min_distance
    108       match.min_distance_info = (file_name, min_crashed_line, min_changed_line)
    109 
    110     # If this CL does not change the crashed line, all occurrence of this
    111     # file in the stack has the same priority.
    112     if not stack_frame_index_intersection:
    113       stack_frame_index_intersection = stack_frame_indices
    114       functions = function
    115     match.stack_frame_indices.append(stack_frame_index_intersection)
    116     match.changed_file_urls.append(diff_url)
    117     match.priorities.append(priority)
    118     match.function_list.append(functions)
    119 
    120 
    121 def FindMatch(revisions_info_map, file_to_revision_info, file_to_crash_info,
    122               component_path, component_name, repository_parser,
    123               codereview_api_url):
    124   """Finds a CL that modifies file in the stacktrace.
    125 
    126   Args:
    127     revisions_info_map: A dictionary mapping revision number to the CL
    128                         information.
    129     file_to_revision_info: A dictionary mapping file to the revision that
    130                            modifies it.
    131     file_to_crash_info: A dictionary mapping file to its occurrence in
    132                        stacktrace.
    133     component_path: The path of the component to search for.
    134     component_name: The name of the component to search for.
    135     repository_parser: The parser object to parse the line diff.
    136     codereview_api_url: A code review url to retrieve data from.
    137 
    138   Returns:
    139     Matches, a set of match objects.
    140   """
    141   matches = match_set.MatchSet(codereview_api_url)
    142 
    143   tasks = []
    144   # Iterate through the crashed files in the stacktrace.
    145   for crashed_file_path in file_to_crash_info:
    146     # Ignore header file.
    147     if crashed_file_path.endswith('.h'):
    148       continue
    149 
    150     # If the file in the stacktrace is not changed in any commits, continue.
    151     for changed_file_path in file_to_revision_info:
    152       changed_file_name = changed_file_path.split('/')[-1].lower()
    153       crashed_file_name = crashed_file_path.split('/')[-1].lower()
    154       if changed_file_name != crashed_file_name:
    155         continue
    156 
    157       if not crash_utils.GuessIfSameSubPath(
    158           changed_file_path.lower(), crashed_file_path.lower()):
    159         continue
    160 
    161       crashed_line_numbers = file_to_crash_info.GetCrashedLineNumbers(
    162           crashed_file_path)
    163       stack_frame_nums = file_to_crash_info.GetCrashStackFrameIndices(
    164           crashed_file_path)
    165       functions = file_to_crash_info.GetCrashFunctions(crashed_file_path)
    166 
    167       # Iterate through the CLs that this file path is changed.
    168       for (cl, file_change_type) in file_to_revision_info[changed_file_path]:
    169         # If the file change is delete, ignore this CL.
    170         if file_change_type == 'D':
    171           continue
    172 
    173         revision = revisions_info_map[cl]
    174 
    175         tasks.append({
    176             'function': GenerateMatchEntry,
    177             'args':[matches, revision, cl, changed_file_path, functions,
    178                     component_path, component_name, crashed_line_numbers,
    179                     stack_frame_nums, file_change_type,
    180                    repository_parser]})
    181 
    182   # Run all the tasks.
    183   crash_utils.RunTasks(tasks)
    184 
    185   matches.RemoveRevertedCLs()
    186 
    187   return matches
    188 
    189 
    190 def FindMatchForComponent(component_path, file_to_crash_info, changelog,
    191                           callstack_priority, results, results_lock):
    192   """Parses changelog and finds suspected CLs for a given component.
    193 
    194   Args:
    195     component_path: The path of component to look for the culprit CL.
    196     file_to_crash_info: A dictionary mapping file to its occurrence in
    197                         stackframe.
    198     changelog: The parsed changelog for this component.
    199     callstack_priority: The priority of this call stack, 0 if from crash stack,
    200                         1 if from freed, 2 if from previously allocated.
    201     results: A dictionary to store the result.
    202     results_lock: A lock that guards results.
    203   """
    204   (repository_parser, component_name, revisions, file_to_revision_map) = \
    205       changelog
    206 
    207   # Find match for this component.
    208   codereview_api_url = CONFIG['codereview']['review_url']
    209   component_result = FindMatch(
    210       revisions, file_to_revision_map, file_to_crash_info, component_path,
    211       component_name, repository_parser, codereview_api_url)
    212   matches = component_result.matches
    213 
    214   # For all the match results in a dictionary, add to the list so that it
    215   # can be sorted.
    216   with results_lock:
    217     for cl in matches:
    218       match = matches[cl]
    219       results.append((callstack_priority, cl, match))
    220 
    221 
    222 def FindMatchForCallstack(
    223     callstack, components, component_to_changelog_map, results,
    224     results_lock):
    225   """Finds culprit cl for a stack within a stacktrace.
    226 
    227   For each components to look for, create new thread that computes the matches
    228   and join the results at the end.
    229 
    230   Args:
    231     callstack: A callstack in a stacktrace to find the result for.
    232     components: A set of components to look for.
    233     component_to_changelog_map: A map from component to its parsed changelog.
    234     results: A list to aggregrate results from all stacktraces.
    235     results_lock: A lock that guards results.
    236   """
    237   # Create component dictionary from the component and call stack.
    238   component_dict = component_dictionary.ComponentDictionary(callstack,
    239                                                             components)
    240   callstack_priority = callstack.priority
    241 
    242   # Iterate through all components.
    243   for component_path in component_dict:
    244     # If the component to consider in this callstack is not in the parsed list
    245     # of components, ignore this one.
    246     if component_path not in component_to_changelog_map:
    247       continue
    248 
    249     changelog = component_to_changelog_map[component_path]
    250     file_to_crash_info = component_dict.GetFileDict(component_path)
    251     FindMatchForComponent(component_path, file_to_crash_info, changelog,
    252                           callstack_priority, results, results_lock)
    253 
    254 
    255 def FindMatchForStacktrace(stacktrace, components,
    256                            component_to_regression_dict):
    257   """Finds the culprit CL for stacktrace.
    258 
    259   The passed stacktrace is either from release build stacktrace
    260   or debug build stacktrace.
    261 
    262   Args:
    263     stacktrace: A list of parsed stacks within a stacktrace.
    264     components: A set of components to look for.
    265     component_to_regression_dict: A dictionary mapping component path to
    266                                   its regression.
    267 
    268   Returns:
    269     A list of match results from all stacks.
    270   """
    271   # A list to aggregate results from all the callstacks in the stacktrace.
    272   results = []
    273   results_lock = Lock()
    274 
    275   # Setup parsers.
    276   svn_parser = svn_repository_parser.SVNParser(CONFIG['svn'])
    277   git_parser = git_repository_parser.GitParser(component_to_regression_dict,
    278                                                CONFIG['git'])
    279 
    280   # Create a cache of parsed revisions.
    281   component_to_changelog_map = {}
    282   for component_path in components:
    283     component_object = component_to_regression_dict[component_path]
    284     range_start = component_object['old_revision']
    285     range_end = component_object['new_revision']
    286 
    287     # If range start is 0, the range is too large and the crash has been
    288     # introduced the archived build, so ignore this case.
    289     if range_start == '0':
    290       continue
    291 
    292     component_name = component_to_regression_dict[component_path]['name']
    293 
    294     is_git = utils.IsGitHash(range_start)
    295     if is_git:
    296       repository_parser = git_parser
    297     else:
    298       repository_parser = svn_parser
    299 
    300     (revisions, file_to_revision_map) = repository_parser.ParseChangelog(
    301         component_path, range_start, range_end)
    302 
    303     # If the returned map from ParseChangeLog is empty, we don't need to look
    304     # further because either the parsing failed or the changelog is empty.
    305     if not (revisions and file_to_revision_map):
    306       continue
    307 
    308     component_to_changelog_map[component_path] = (repository_parser,
    309                                                   component_name,
    310                                                   revisions,
    311                                                   file_to_revision_map)
    312 
    313   # Analyze each of the call stacks in the stacktrace.
    314   for callstack in stacktrace.stack_list:
    315     FindMatchForCallstack(callstack, components, component_to_changelog_map,
    316                           results, results_lock)
    317 
    318   return results
    319 
    320 
    321 def SortMatchesFunction(match_with_stack_priority):
    322   """A function to sort the match triple.
    323 
    324   Currently, it sorts the list by:
    325   1) The highest priority file change in the CL (changing crashed line is
    326   higher priority than just changing the file).
    327   2) The callstack this match is computed (crash stack, freed, allocation).
    328   3) The minimum stack frame number of the changed file in the match.
    329   4) The number of files this CL changes (higher the better).
    330   5) The minimum distance between the lines that the CL changes and crashed
    331   lines.
    332 
    333   Args:
    334     match_with_stack_priority: A match object, with the CL it is from and what
    335                                callstack it is from.
    336 
    337   Returns:
    338     A sort key.
    339   """
    340   (stack_priority, _, match) = match_with_stack_priority
    341 
    342   return (min(match.priorities),
    343           stack_priority,
    344           match.min_distance,
    345           crash_utils.FindMinStackFrameNumber(match.stack_frame_indices,
    346                                               match.priorities),
    347           -len(match.changed_files))
    348 
    349 
    350 def SortAndFilterMatches(matches, num_important_frames=5):
    351   """Filters the list of potential culprit CLs to remove noise.
    352 
    353   Args:
    354     matches: A list containing match results.
    355     num_important_frames: A number of frames on the top of the frame to Check
    356                           for when filtering the results. A match with a file
    357                           that is in top num_important_frames of the stacktrace
    358                           is regarded more probable then others.
    359 
    360   Returns:
    361     Filtered match results.
    362   """
    363   new_matches = []
    364   line_changed = False
    365   is_important_frame = False
    366   highest_priority_stack = crash_utils.INFINITY
    367   matches.sort(key=SortMatchesFunction)
    368   # Iterate through the matches to find out what results are significant.
    369   for stack_priority, cl, match in matches:
    370     # Check if the current match changes crashed line.
    371     is_line_change = (min(match.priorities) == LINE_CHANGE_PRIORITY)
    372 
    373     # Check which stack this match is from, and finds the highest priority
    374     # callstack up to this point.
    375     current_stack = stack_priority
    376     if current_stack < highest_priority_stack:
    377       highest_priority_stack = current_stack
    378 
    379     # Check if current match changes a file that occurs in crash state.
    380     flattened_stack_frame_indices = [frame for frame_indices in
    381                                      match.stack_frame_indices
    382                                      for frame in frame_indices]
    383     current_is_important = (
    384         min(flattened_stack_frame_indices) < num_important_frames)
    385 
    386     # This match and anything lower than this should be ignored if:
    387     # - Current match does not change crashed lines but there are matches
    388     # that do so.
    389     # - Current match is not in crash state but there are matches in it.
    390     # - There are other matches that came from higher priority stack.
    391     if (line_changed and not is_line_change) or (
    392         is_important_frame and not current_is_important) or (
    393             current_stack > highest_priority_stack):
    394       break
    395 
    396     # Update the variables.
    397     if is_line_change:
    398       line_changed = True
    399     if current_is_important:
    400       is_important_frame = True
    401 
    402     # Add current match to the filtered result.
    403     new_matches.append((stack_priority, cl, match))
    404 
    405   return new_matches
    406 
    407 
    408 def GenerateReasonForMatches(matches):
    409   """Generates a reason that a match (CL) is a culprit cl.
    410 
    411   Args:
    412     matches: A list of match objects.
    413   """
    414   # Iterate through the matches in the list.
    415   for i, _, match in matches:
    416     reason = []
    417 
    418     # Zip the files in the match by the reason they are suspected
    419     # (how the file is modified).
    420     match_by_priority = zip(
    421         match.priorities, match.crashed_line_numbers, match.changed_files,
    422         match.stack_frame_indices, match.function_list)
    423 
    424     # Sort the zipped changed files in the match by their priority so that the
    425     # changed lines comes first in the reason.
    426     match_by_priority.sort(
    427         key=lambda (priority, crashed_line_numbers, file_name,
    428                     stack_frame_indices, function_list): priority)
    429 
    430     # Iterate through the sorted match.
    431     for i in range(len(match_by_priority)):
    432       (priority, crashed_line_numbers, file_name, stack_frame_indices,
    433        function_list) = match_by_priority[i]
    434 
    435       # If the file in the match is a line change, append a explanation.
    436       if priority == LINE_CHANGE_PRIORITY:
    437         crashed_line_numbers = [crashed_line_number
    438                                 for lines in crashed_line_numbers
    439                                 for crashed_line_number in lines]
    440         reason.append(
    441             'Lines %s of file %s which potentially caused crash '
    442             'are changed in this cl (%s).\n' %
    443             (utils.JoinLineNumbers(crashed_line_numbers, accepted_gap=4),
    444              file_name,
    445              crash_utils.PrettifyFrameInfo(stack_frame_indices, function_list)))
    446 
    447       else:
    448         # Get all the files that are not line change.
    449         rest_of_the_files = match_by_priority[i:]
    450 
    451         if len(rest_of_the_files) == 1:
    452           file_string = 'File %s is changed in this cl '
    453         else:
    454           file_string = 'Files %s are changed in this cl '
    455 
    456         # Create a list of file names, and prettify the list.
    457         file_names = [
    458             file_name for (_, _, file_name, _, _) in rest_of_the_files]
    459         pretty_file_names = crash_utils.PrettifyList(file_names)
    460 
    461         # Add the reason, break because we took care of the rest of the files.
    462         file_string += ('(and is part of stack %s)' %
    463             crash_utils.PrettifyFrameInfo(stack_frame_indices, function_list))
    464         reason.append(file_string % pretty_file_names)
    465         break
    466 
    467     # Set the reason as string.
    468     match.reason = '\n'.join(reason)
    469 
    470 
    471 def CombineMatches(matches):
    472   """Combine possible duplicates in matches.
    473 
    474   Args:
    475     matches: A list of matches object, along with its callstack priority and
    476              CL it is from.
    477   Returns:
    478     A combined list of matches.
    479   """
    480   combined_matches = []
    481 
    482   for stack_index, cl, match in matches:
    483     found_match = None
    484 
    485     # Iterate through the list of combined matches.
    486     for _, cl_combined, match_combined in combined_matches:
    487       # Check for if current CL is already higher up in the result.
    488       if cl == cl_combined:
    489         found_match = match_combined
    490         break
    491 
    492     # If current match is not already in, add it to the list of matches.
    493     if not found_match:
    494       combined_matches.append((stack_index, cl, match))
    495       continue
    496 
    497     # Combine the reason if the current match is already in there.
    498     found_match.reason += match.reason
    499     if match.min_distance < found_match.min_distance:
    500       found_match.min_distance = match.min_distance
    501       found_match.min_distance_info = match.min_distance_info
    502 
    503   for stack_index, cl, match in combined_matches:
    504     if match.min_distance_info:
    505       file_name, min_crashed_line, min_changed_line = match.min_distance_info
    506       match.reason += \
    507           ('\nMinimum distance from crash line to modified line: %d. '
    508            '(file: %s, crashed on: %d, modified: %d).\n' %
    509            (match.min_distance, file_name, min_crashed_line, min_changed_line))
    510 
    511   return combined_matches
    512 
    513 
    514 def FilterAndGenerateReasonForMatches(result):
    515   """A wrapper function.
    516 
    517   It generates reasons for the matches and returns string representation
    518   of filtered results.
    519 
    520   Args:
    521     result: A list of match objects.
    522 
    523   Returns:
    524     A string representation of filtered results.
    525   """
    526   new_result = SortAndFilterMatches(result)
    527   GenerateReasonForMatches(new_result)
    528   combined_matches = CombineMatches(new_result)
    529   return crash_utils.MatchListToResultList(combined_matches)
    530 
    531 
    532 def ParseCrashComponents(main_stack):
    533   """Parses the crashing component.
    534 
    535   Crashing components is a component that top_n_frames of the stacktrace is
    536   from.
    537 
    538   Args:
    539     main_stack: Main stack from the stacktrace.
    540 
    541   Returns:
    542     A set of components.
    543   """
    544   components = set()
    545 
    546   for frame in main_stack.frame_list:
    547     components.add(frame.component_path)
    548 
    549   return components
    550 
    551 
    552 def GenerateAndFilterBlameList(callstack, component_to_crash_revision_dict,
    553                                component_to_regression_dict):
    554   """A wrapper function.
    555 
    556   Finds blame information for stack and returns string representation.
    557 
    558   Args:
    559     callstack: A callstack to find the blame information.
    560     component_to_crash_revision_dict: A dictionary mapping component to its
    561                                       crash revision.
    562     component_to_regression_dict: A dictionary mapping component to its
    563                                   regression.
    564 
    565   Returns:
    566     A list of blame results.
    567   """
    568   if component_to_regression_dict:
    569     parsed_deps = component_to_regression_dict
    570   else:
    571     parsed_deps = component_to_crash_revision_dict
    572 
    573   # Setup parser objects to use for parsing blame information.
    574   svn_parser = svn_repository_parser.SVNParser(CONFIG['svn'])
    575   git_parser = git_repository_parser.GitParser(parsed_deps, CONFIG['git'])
    576   parsers = {}
    577   parsers['svn'] = svn_parser
    578   parsers['git'] = git_parser
    579 
    580   # Create and generate the blame objects from the callstack.
    581   blame_list = blame.BlameList()
    582   blame_list.FindBlame(callstack, component_to_crash_revision_dict,
    583                        component_to_regression_dict,
    584                        parsers)
    585 
    586   blame_list.FilterAndSortBlameList()
    587   return crash_utils.BlameListToResultList(blame_list)
    588 
    589 
    590 def FindItForCrash(stacktrace_list,
    591                    callstack,
    592                    component_to_regression_dict,
    593                    component_to_crash_revision_dict):
    594   """Finds the culprit CL from the list of stacktrace.
    595 
    596   Args:
    597     stacktrace_list: A list of stacktraces to look for, in the order of
    598                      decreasing significance.
    599     callstack: A callstack object to show blame information for, if there are
    600                no results for all stacktraces in the stacktrace_list.
    601     component_to_regression_dict: A parsed regression information as a
    602                                   result of parsing DEPS file.
    603     component_to_crash_revision_dict: A parsed crash revision information.
    604 
    605   Returns:
    606     A list of result objects, with the message how the result is created.
    607   """
    608   # If regression information is not available, return blame information.
    609   if not component_to_regression_dict:
    610     result = GenerateAndFilterBlameList(callstack,
    611                                         component_to_crash_revision_dict,
    612                                         component_to_regression_dict)
    613     if result:
    614       return_message = (
    615           'Regression information is not available. The result is '
    616           'the blame information.')
    617     else:
    618       return_message = ('Findit could not find any suspected CLs.')
    619 
    620     return (return_message, result)
    621 
    622   for stacktrace in stacktrace_list:
    623     # Check the next stacktrace if current one is empty.
    624     if not stacktrace.stack_list:
    625       continue
    626 
    627     # Get the crash stack for this stacktrace, and extract crashing components
    628     # from it.
    629     main_stack = stacktrace.GetCrashStack()
    630     components = ParseCrashComponents(main_stack)
    631 
    632     result_for_stacktrace = FindMatchForStacktrace(
    633         stacktrace, components, component_to_regression_dict)
    634     filtered_result = FilterAndGenerateReasonForMatches(result_for_stacktrace)
    635 
    636     # If the result is empty, check the next stacktrace. Else, return the
    637     # filtered result.
    638     if not filtered_result:
    639       continue
    640 
    641     return_message = (
    642         'The result is a list of CLs that change the crashed files.')
    643     return (return_message, filtered_result)
    644 
    645   # If no match is found, return the blame information for the input
    646   # callstack.
    647   result = GenerateAndFilterBlameList(
    648       callstack, component_to_crash_revision_dict,
    649       component_to_regression_dict)
    650 
    651   if result:
    652     return_message = (
    653         'No CL in the regression range changes the crashed files. '
    654         'The result is the blame information.')
    655 
    656   # When findit could not find any CL that changes file in stacktrace or if
    657   # if cannot get any blame information, return a message saying that no
    658   # results are available.
    659   else:
    660     return_message = ('Findit could not find any suspected CLs.')
    661 
    662   return (return_message, result)
    663 
    664