tools/findit/findit_for_crash.py

# Copyright (c) 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import os
from threading import Lock

import blame
from common import utils
import component_dictionary
import crash_utils
import git_repository_parser
import match_set
import svn_repository_parser


LINE_CHANGE_PRIORITY = 1
FILE_CHANGE_PRIORITY = 2
_THIS_DIR = os.path.abspath(os.path.dirname(__file__))
CONFIG = crash_utils.ParseURLsFromConfig(os.path.join(_THIS_DIR,
                                                      'config.ini'))


def GenerateMatchEntry(
    matches, revision_info, revision_number, file_path, function,
    component_path, component_name, crashed_line_numbers, stack_frame_indices,
    file_change_type, repository_parser):
  """Generates a match object and adds it to the match set.

  Args:
    matches: A matchset object, a map from CL to a match object.
    revision_info: The revision information, a map from fields (message,
                   changed files, etc) to its values.
    revision_number: The SVN revision number or git hash.
    file_path: The path of the file.
    function: The function that caused an crash.
    component_path: The path of the component this file is from.
    component_name: The name of the component the file is from.
    crashed_line_numbers: The list of the lines in the file that caused
                          the crash.
    stack_frame_indices: The list of positions of this file within a stack.
    file_change_type: Whether file is modified, added or deleted.
    repository_parser: The parser object to parse line diff.
  """
  # Check if this CL should be ignored.
  with matches.matches_lock:
    if revision_number in matches.cls_to_ignore:
      return

    # If this CL is not already identified as suspected, create a new entry.
    if revision_number not in matches.matches:
      match = match_set.Match(revision_info, component_name)
      message = revision_info['message']
      # TODO(jeun): Don't hold lock while issuing http request.
      match.ParseMessage(message, matches.codereview_api_url)

      # If this match is a revert, add to the set of CLs to be ignored.
      if match.is_revert:
        matches.cls_to_ignore.add(revision_number)

        # If this match has info on what CL it is reverted from, add that CL.
        if match.revert_of:
          matches.cls_to_ignore.add(match.revert_of)

        return

      matches.matches[revision_number] = match

    else:
      match = matches.matches[revision_number]

  (diff_url, changed_line_numbers, changed_line_contents) = (
      repository_parser.ParseLineDiff(
          file_path, component_path, file_change_type, revision_number))

  # Ignore this match if the component is not supported for svn.
  if not diff_url:
    return

  # Find the intersection between the lines that this file crashed on and
  # the changed lines.
  (line_number_intersection, stack_frame_index_intersection, functions) = (
      crash_utils.Intersection(
          crashed_line_numbers, stack_frame_indices, changed_line_numbers,
          function))

  # Find the minimum distance between the changed lines and crashed lines.
  (min_distance, min_crashed_line, min_changed_line) = \
      crash_utils.FindMinLineDistance(crashed_line_numbers,
                                      changed_line_numbers)

  # Check whether this CL changes the crashed lines or not.
  if line_number_intersection:
    priority = LINE_CHANGE_PRIORITY
  else:
    priority = FILE_CHANGE_PRIORITY

  # Add the parsed information to the object.
  with matches.matches_lock:
    match.crashed_line_numbers.append(line_number_intersection)

    file_name = file_path.split('/')[-1]
    match.changed_files.append(file_name)

    # Update the min distance only if it is less than the current one.
    if min_distance < match.min_distance:
      match.min_distance = min_distance
      match.min_distance_info = (file_name, min_crashed_line, min_changed_line)

    # If this CL does not change the crashed line, all occurrence of this
    # file in the stack has the same priority.
    if not stack_frame_index_intersection:
      stack_frame_index_intersection = stack_frame_indices
      functions = function
    match.stack_frame_indices.append(stack_frame_index_intersection)
    match.changed_file_urls.append(diff_url)
    match.priorities.append(priority)
    match.function_list.append(functions)


def FindMatch(revisions_info_map, file_to_revision_info, file_to_crash_info,
              component_path, component_name, repository_parser,
              codereview_api_url):
  """Finds a CL that modifies file in the stacktrace.

  Args:
    revisions_info_map: A dictionary mapping revision number to the CL
                        information.
    file_to_revision_info: A dictionary mapping file to the revision that
                           modifies it.
    file_to_crash_info: A dictionary mapping file to its occurrence in
                       stacktrace.
    component_path: The path of the component to search for.
    component_name: The name of the component to search for.
    repository_parser: The parser object to parse the line diff.
    codereview_api_url: A code review url to retrieve data from.

  Returns:
    Matches, a set of match objects.
  """
  matches = match_set.MatchSet(codereview_api_url)

  tasks = []
  # Iterate through the crashed files in the stacktrace.
  for crashed_file_path in file_to_crash_info:
    # Ignore header file.
    if crashed_file_path.endswith('.h'):
      continue

    # If the file in the stacktrace is not changed in any commits, continue.
    for changed_file_path in file_to_revision_info:
      changed_file_name = changed_file_path.split('/')[-1].lower()
      crashed_file_name = crashed_file_path.split('/')[-1].lower()
      if changed_file_name != crashed_file_name:
        continue

      if not crash_utils.GuessIfSameSubPath(
          changed_file_path.lower(), crashed_file_path.lower()):
        continue

      crashed_line_numbers = file_to_crash_info.GetCrashedLineNumbers(
          crashed_file_path)
      stack_frame_nums = file_to_crash_info.GetCrashStackFrameIndices(
          crashed_file_path)
      functions = file_to_crash_info.GetCrashFunctions(crashed_file_path)

      # Iterate through the CLs that this file path is changed.
      for (cl, file_change_type) in file_to_revision_info[changed_file_path]:
        # If the file change is delete, ignore this CL.
        if file_change_type == 'D':
          continue

        revision = revisions_info_map[cl]

        tasks.append({
            'function': GenerateMatchEntry,
            'args':[matches, revision, cl, changed_file_path, functions,
                    component_path, component_name, crashed_line_numbers,
                    stack_frame_nums, file_change_type,
                   repository_parser]})

  # Run all the tasks.
  crash_utils.RunTasks(tasks)

  matches.RemoveRevertedCLs()

  return matches


def FindMatchForComponent(component_path, file_to_crash_info, changelog,
                          callstack_priority, results, results_lock):
  """Parses changelog and finds suspected CLs for a given component.

  Args:
    component_path: The path of component to look for the culprit CL.
    file_to_crash_info: A dictionary mapping file to its occurrence in
                        stackframe.
    changelog: The parsed changelog for this component.
    callstack_priority: The priority of this call stack, 0 if from crash stack,
                        1 if from freed, 2 if from previously allocated.
    results: A dictionary to store the result.
    results_lock: A lock that guards results.
  """
  (repository_parser, component_name, revisions, file_to_revision_map) = \
      changelog

  # Find match for this component.
  codereview_api_url = CONFIG['codereview']['review_url']
  component_result = FindMatch(
      revisions, file_to_revision_map, file_to_crash_info, component_path,
      component_name, repository_parser, codereview_api_url)
  matches = component_result.matches

  # For all the match results in a dictionary, add to the list so that it
  # can be sorted.
  with results_lock:
    for cl in matches:
      match = matches[cl]
      results.append((callstack_priority, cl, match))


def FindMatchForCallstack(
    callstack, components, component_to_changelog_map, results,
    results_lock):
  """Finds culprit cl for a stack within a stacktrace.

  For each components to look for, create new thread that computes the matches
  and join the results at the end.

  Args:
    callstack: A callstack in a stacktrace to find the result for.
    components: A set of components to look for.
    component_to_changelog_map: A map from component to its parsed changelog.
    results: A list to aggregrate results from all stacktraces.
    results_lock: A lock that guards results.
  """
  # Create component dictionary from the component and call stack.
  component_dict = component_dictionary.ComponentDictionary(callstack,
                                                            components)
  callstack_priority = callstack.priority

  # Iterate through all components.
  for component_path in component_dict:
    # If the component to consider in this callstack is not in the parsed list
    # of components, ignore this one.
    if component_path not in component_to_changelog_map:
      continue

    changelog = component_to_changelog_map[component_path]
    file_to_crash_info = component_dict.GetFileDict(component_path)
    FindMatchForComponent(component_path, file_to_crash_info, changelog,
                          callstack_priority, results, results_lock)


def FindMatchForStacktrace(stacktrace, components,
                           component_to_regression_dict):
  """Finds the culprit CL for stacktrace.

  The passed stacktrace is either from release build stacktrace
  or debug build stacktrace.

  Args:
    stacktrace: A list of parsed stacks within a stacktrace.
    components: A set of components to look for.
    component_to_regression_dict: A dictionary mapping component path to
                                  its regression.

  Returns:
    A list of match results from all stacks.
  """
  # A list to aggregate results from all the callstacks in the stacktrace.
  results = []
  results_lock = Lock()

  # Setup parsers.
  svn_parser = svn_repository_parser.SVNParser(CONFIG['svn'])
  git_parser = git_repository_parser.GitParser(component_to_regression_dict,
                                               CONFIG['git'])

  # Create a cache of parsed revisions.
  component_to_changelog_map = {}
  for component_path in components:
    component_object = component_to_regression_dict[component_path]
    range_start = component_object['old_revision']
    range_end = component_object['new_revision']

    # If range start is 0, the range is too large and the crash has been
    # introduced the archived build, so ignore this case.
    if range_start == '0':
      continue

    component_name = component_to_regression_dict[component_path]['name']

    is_git = utils.IsGitHash(range_start)
    if is_git:
      repository_parser = git_parser
    else:
      repository_parser = svn_parser

    (revisions, file_to_revision_map) = repository_parser.ParseChangelog(
        component_path, range_start, range_end)

    # If the returned map from ParseChangeLog is empty, we don't need to look
    # further because either the parsing failed or the changelog is empty.
    if not (revisions and file_to_revision_map):
      continue

    component_to_changelog_map[component_path] = (repository_parser,
                                                  component_name,
                                                  revisions,
                                                  file_to_revision_map)

  # Analyze each of the call stacks in the stacktrace.
  for callstack in stacktrace.stack_list:
    FindMatchForCallstack(callstack, components, component_to_changelog_map,
                          results, results_lock)

  return results


def SortMatchesFunction(match_with_stack_priority):
  """A function to sort the match triple.

  Currently, it sorts the list by:
  1) The highest priority file change in the CL (changing crashed line is
  higher priority than just changing the file).
  2) The callstack this match is computed (crash stack, freed, allocation).
  3) The minimum stack frame number of the changed file in the match.
  4) The number of files this CL changes (higher the better).
  5) The minimum distance between the lines that the CL changes and crashed
  lines.

  Args:
    match_with_stack_priority: A match object, with the CL it is from and what
                               callstack it is from.

  Returns:
    A sort key.
  """
  (stack_priority, _, match) = match_with_stack_priority

  return (min(match.priorities),
          stack_priority,
          match.min_distance,
          crash_utils.FindMinStackFrameNumber(match.stack_frame_indices,
                                              match.priorities),
          -len(match.changed_files))


def SortAndFilterMatches(matches, num_important_frames=5):
  """Filters the list of potential culprit CLs to remove noise.

  Args:
    matches: A list containing match results.
    num_important_frames: A number of frames on the top of the frame to Check
                          for when filtering the results. A match with a file
                          that is in top num_important_frames of the stacktrace
                          is regarded more probable then others.

  Returns:
    Filtered match results.
  """
  new_matches = []
  line_changed = False
  is_important_frame = False
  highest_priority_stack = crash_utils.INFINITY
  matches.sort(key=SortMatchesFunction)
  # Iterate through the matches to find out what results are significant.
  for stack_priority, cl, match in matches:
    # Check if the current match changes crashed line.
    is_line_change = (min(match.priorities) == LINE_CHANGE_PRIORITY)

    # Check which stack this match is from, and finds the highest priority
    # callstack up to this point.
    current_stack = stack_priority
    if current_stack < highest_priority_stack:
      highest_priority_stack = current_stack

    # Check if current match changes a file that occurs in crash state.
    flattened_stack_frame_indices = [frame for frame_indices in
                                     match.stack_frame_indices
                                     for frame in frame_indices]
    current_is_important = (
        min(flattened_stack_frame_indices) < num_important_frames)

    # This match and anything lower than this should be ignored if:
    # - Current match does not change crashed lines but there are matches
    # that do so.
    # - Current match is not in crash state but there are matches in it.
    # - There are other matches that came from higher priority stack.
    if (line_changed and not is_line_change) or (
        is_important_frame and not current_is_important) or (
            current_stack > highest_priority_stack):
      break

    # Update the variables.
    if is_line_change:
      line_changed = True
    if current_is_important:
      is_important_frame = True

    # Add current match to the filtered result.
    new_matches.append((stack_priority, cl, match))

  return new_matches


def GenerateReasonForMatches(matches):
  """Generates a reason that a match (CL) is a culprit cl.

  Args:
    matches: A list of match objects.
  """
  # Iterate through the matches in the list.
  for i, _, match in matches:
    reason = []

    # Zip the files in the match by the reason they are suspected
    # (how the file is modified).
    match_by_priority = zip(
        match.priorities, match.crashed_line_numbers, match.changed_files,
        match.stack_frame_indices, match.function_list)

    # Sort the zipped changed files in the match by their priority so that the
    # changed lines comes first in the reason.
    match_by_priority.sort(
        key=lambda (priority, crashed_line_numbers, file_name,
                    stack_frame_indices, function_list): priority)

    # Iterate through the sorted match.
    for i in range(len(match_by_priority)):
      (priority, crashed_line_numbers, file_name, stack_frame_indices,
       function_list) = match_by_priority[i]

      # If the file in the match is a line change, append a explanation.
      if priority == LINE_CHANGE_PRIORITY:
        crashed_line_numbers = [crashed_line_number
                                for lines in crashed_line_numbers
                                for crashed_line_number in lines]
        reason.append(
            'Lines %s of file %s which potentially caused crash '
            'are changed in this cl (%s).\n' %
            (utils.JoinLineNumbers(crashed_line_numbers, accepted_gap=4),
             file_name,
             crash_utils.PrettifyFrameInfo(stack_frame_indices, function_list)))

      else:
        # Get all the files that are not line change.
        rest_of_the_files = match_by_priority[i:]

        if len(rest_of_the_files) == 1:
          file_string = 'File %s is changed in this cl '
        else:
          file_string = 'Files %s are changed in this cl '

        # Create a list of file names, and prettify the list.
        file_names = [
            file_name for (_, _, file_name, _, _) in rest_of_the_files]
        pretty_file_names = crash_utils.PrettifyList(file_names)

        # Add the reason, break because we took care of the rest of the files.
        file_string += ('(and is part of stack %s)' %
            crash_utils.PrettifyFrameInfo(stack_frame_indices, function_list))
        reason.append(file_string % pretty_file_names)
        break

    # Set the reason as string.
    match.reason = '\n'.join(reason)


def CombineMatches(matches):
  """Combine possible duplicates in matches.

  Args:
    matches: A list of matches object, along with its callstack priority and
             CL it is from.
  Returns:
    A combined list of matches.
  """
  combined_matches = []

  for stack_index, cl, match in matches:
    found_match = None

    # Iterate through the list of combined matches.
    for _, cl_combined, match_combined in combined_matches:
      # Check for if current CL is already higher up in the result.
      if cl == cl_combined:
        found_match = match_combined
        break

    # If current match is not already in, add it to the list of matches.
    if not found_match:
      combined_matches.append((stack_index, cl, match))
      continue

    # Combine the reason if the current match is already in there.
    found_match.reason += match.reason
    if match.min_distance < found_match.min_distance:
      found_match.min_distance = match.min_distance
      found_match.min_distance_info = match.min_distance_info

  for stack_index, cl, match in combined_matches:
    if match.min_distance_info:
      file_name, min_crashed_line, min_changed_line = match.min_distance_info
      match.reason += \
          ('\nMinimum distance from crash line to modified line: %d. '
           '(file: %s, crashed on: %d, modified: %d).\n' %
           (match.min_distance, file_name, min_crashed_line, min_changed_line))

  return combined_matches


def FilterAndGenerateReasonForMatches(result):
  """A wrapper function.

  It generates reasons for the matches and returns string representation
  of filtered results.

  Args:
    result: A list of match objects.

  Returns:
    A string representation of filtered results.
  """
  new_result = SortAndFilterMatches(result)
  GenerateReasonForMatches(new_result)
  combined_matches = CombineMatches(new_result)
  return crash_utils.MatchListToResultList(combined_matches)


def ParseCrashComponents(main_stack):
  """Parses the crashing component.

  Crashing components is a component that top_n_frames of the stacktrace is
  from.

  Args:
    main_stack: Main stack from the stacktrace.

  Returns:
    A set of components.
  """
  components = set()

  for frame in main_stack.frame_list:
    components.add(frame.component_path)

  return components


def GenerateAndFilterBlameList(callstack, component_to_crash_revision_dict,
                               component_to_regression_dict):
  """A wrapper function.

  Finds blame information for stack and returns string representation.

  Args:
    callstack: A callstack to find the blame information.
    component_to_crash_revision_dict: A dictionary mapping component to its
                                      crash revision.
    component_to_regression_dict: A dictionary mapping component to its
                                  regression.

  Returns:
    A list of blame results.
  """
  if component_to_regression_dict:
    parsed_deps = component_to_regression_dict
  else:
    parsed_deps = component_to_crash_revision_dict

  # Setup parser objects to use for parsing blame information.
  svn_parser = svn_repository_parser.SVNParser(CONFIG['svn'])
  git_parser = git_repository_parser.GitParser(parsed_deps, CONFIG['git'])
  parsers = {}
  parsers['svn'] = svn_parser
  parsers['git'] = git_parser

  # Create and generate the blame objects from the callstack.
  blame_list = blame.BlameList()
  blame_list.FindBlame(callstack, component_to_crash_revision_dict,
                       component_to_regression_dict,
                       parsers)

  blame_list.FilterAndSortBlameList()
  return crash_utils.BlameListToResultList(blame_list)


def FindItForCrash(stacktrace_list,
                   callstack,
                   component_to_regression_dict,
                   component_to_crash_revision_dict):
  """Finds the culprit CL from the list of stacktrace.

  Args:
    stacktrace_list: A list of stacktraces to look for, in the order of
                     decreasing significance.
    callstack: A callstack object to show blame information for, if there are
               no results for all stacktraces in the stacktrace_list.
    component_to_regression_dict: A parsed regression information as a
                                  result of parsing DEPS file.
    component_to_crash_revision_dict: A parsed crash revision information.

  Returns:
    A list of result objects, with the message how the result is created.
  """
  # If regression information is not available, return blame information.
  if not component_to_regression_dict:
    result = GenerateAndFilterBlameList(callstack,
                                        component_to_crash_revision_dict,
                                        component_to_regression_dict)
    if result:
      return_message = (
          'Regression information is not available. The result is '
          'the blame information.')
    else:
      return_message = ('Findit could not find any suspected CLs.')

    return (return_message, result)

  for stacktrace in stacktrace_list:
    # Check the next stacktrace if current one is empty.
    if not stacktrace.stack_list:
      continue

    # Get the crash stack for this stacktrace, and extract crashing components
    # from it.
    main_stack = stacktrace.GetCrashStack()
    components = ParseCrashComponents(main_stack)

    result_for_stacktrace = FindMatchForStacktrace(
        stacktrace, components, component_to_regression_dict)
    filtered_result = FilterAndGenerateReasonForMatches(result_for_stacktrace)

    # If the result is empty, check the next stacktrace. Else, return the
    # filtered result.
    if not filtered_result:
      continue

    return_message = (
        'The result is a list of CLs that change the crashed files.')
    return (return_message, filtered_result)

  # If no match is found, return the blame information for the input
  # callstack.
  result = GenerateAndFilterBlameList(
      callstack, component_to_crash_revision_dict,
      component_to_regression_dict)

  if result:
    return_message = (
        'No CL in the regression range changes the crashed files. '
        'The result is the blame information.')

  # When findit could not find any CL that changes file in stacktrace or if
  # if cannot get any blame information, return a message saying that no
  # results are available.
  else:
    return_message = ('Findit could not find any suspected CLs.')

  return (return_message, result)