# Copyright (c) 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import base64
import xml.dom.minidom as minidom
from xml.parsers.expat import ExpatError

import crash_utils
from repository_parser_interface import ParserInterface

FILE_CHANGE_TYPE_MAP = {
    'add': 'A',
    'copy': 'C',
    'delete': 'D',
    'modify': 'M',
    'rename': 'R'
}


def _ConvertToFileChangeType(file_action):
  """Converts a googlesource file action into a single-letter change type."""
  # TODO(stgao): verify impact on code that checks the file change type.
  return file_action[0].upper()
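
# For example (a sketch; the results mirror FILE_CHANGE_TYPE_MAP above):
#   _ConvertToFileChangeType('modify')  # -> 'M'
#   _ConvertToFileChangeType('add')     # -> 'A'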


class GitParser(ParserInterface):
  """Parser for a Git repository hosted on googlesource.

  Attributes:
    parsed_deps: A map from component path to its repository name, regression,
                 etc.
    url_parts_map: A map from url type to its url parts. These parts are
                   appended to the base url to form different urls.
  """

  def __init__(self, parsed_deps, url_parts_map):
    self.component_to_url_map = parsed_deps
    self.url_parts_map = url_parts_map
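
    # A hypothetical sketch of the expected inputs (the actual values come
    # from the caller, not from this file):
    #   parsed_deps = {
    #       'src/': {'repository': 'https://chromium.googlesource.com/...'}}
    #   url_parts_map = {
    #       'changelog_url': '/+log/%s..%s',  # % (range_start, range_end)
    #       'revision_url': '/+/%s',          # % githash
    #       'diff_url': '/+/%s/%s',           # % (githash, file path)
    #       'blame_url': '/+blame/%s/%s'}     # % (revision, file path)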

  def ParseChangelog(self, component_path, range_start, range_end):
    """Parses the changelog of the component over the regression range.

    Returns:
      A tuple (revision_map, file_to_revision_map).
    """
    file_to_revision_map = {}
    revision_map = {}
    base_url = self.component_to_url_map[component_path]['repository']
    changelog_url = base_url + self.url_parts_map['changelog_url']
    revision_url = base_url + self.url_parts_map['revision_url']

    # Retrieve data from the url; return empty maps if it fails. The html url
    # is a url from which the changelog can be parsed.
    url = changelog_url % (range_start, range_end)
    html_url = url + '?pretty=fuller'
    response = crash_utils.GetDataFromURL(html_url)
    if not response:
      return (revision_map, file_to_revision_map)

    # Parse xml out of the returned string. If it fails, try parsing it
    # as JSON instead.
    try:
      dom = minidom.parseString(response)
    except ExpatError:
      self.ParseChangelogFromJSON(range_start, range_end, changelog_url,
                                  revision_url, revision_map,
                                  file_to_revision_map)
      return (revision_map, file_to_revision_map)

    # The revision information lives in the third div through the
    # second-to-last one.
    divs = dom.getElementsByTagName('div')[2:-1]
    pres = dom.getElementsByTagName('pre')
    uls = dom.getElementsByTagName('ul')

    # Divs, pres and uls each contain revision information for one CL, so
    # they should all have the same length.
    if not divs or len(divs) != len(pres) or len(pres) != len(uls):
      self.ParseChangelogFromJSON(range_start, range_end, changelog_url,
                                  revision_url, revision_map,
                                  file_to_revision_map)
      return (revision_map, file_to_revision_map)

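    # A sketch of the html structure each CL is assumed to have (only the
    # nodes read below are shown; the real page contains more markup):
    #   <div><table><tr>commit <a>githash</a></tr>
    #               <tr><td>author &lt;email&gt;</td></tr>
    #               <tr>committer</tr></table></div>
    #   <pre>commit message</pre>
    #   <ul><li><a>path/of/changed/file</a> <span class="modify"/></li></ul>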
    # Iterate through the divs and parse the revisions.
    for (div, pre, ul) in zip(divs, pres, uls):
      # Create a new revision object for each revision.
      revision = {}

      # There must be exactly three <tr>s. If not, this page is malformed.
      trs = div.getElementsByTagName('tr')
      if len(trs) != 3:
        continue

      # Retrieve git hash.
      githash = trs[0].getElementsByTagName('a')[0].firstChild.nodeValue

      # Retrieve and set author.
      author = trs[1].getElementsByTagName(
          'td')[0].firstChild.nodeValue.split('<')[0]
      revision['author'] = author

      # Retrieve and set message.
      revision['message'] = pre.firstChild.nodeValue

      # Set url of this CL.
      revision_url_part = self.url_parts_map['revision_url'] % githash
      revision['url'] = base_url + revision_url_part

      # Go through the changed files; they are in <li>s.
      lis = ul.getElementsByTagName('li')
      for li in lis:
        # Retrieve the path and action of the changed file.
        file_path = li.getElementsByTagName('a')[0].firstChild.nodeValue
        file_change_type = li.getElementsByTagName('span')[
            0].getAttribute('class')

        # Normalize the file action so that it matches the SVN parser.
        file_change_type = _ConvertToFileChangeType(file_change_type)

        # Add the changed file to the map.
        if file_path not in file_to_revision_map:
          file_to_revision_map[file_path] = []
        file_to_revision_map[file_path].append((githash, file_change_type))

      # Add this revision object to the map.
      revision_map[githash] = revision

    # Parse one more revision for the start of the range, because
    # googlesource does not include the start of the range in the changelog.
    self.ParseRevision(revision_url, range_start, revision_map,
                       file_to_revision_map)

    return (revision_map, file_to_revision_map)

  def ParseChangelogFromJSON(self, range_start, range_end, changelog_url,
                             revision_url, revision_map, file_to_revision_map):
    """Parses the changelog by going over the JSON response.

    Args:
      range_start: Starting revision of the regression range.
      range_end: Ending revision of the regression range.
      changelog_url: The url to retrieve the changelog from.
      revision_url: The url to retrieve an individual revision from.
      revision_map: A map from a git hash number to its revision information.
      file_to_revision_map: A map from file to a git hash in which it occurs.
    """
    # Compute the url from the given range and retrieve the changelog.
    # Stop if it fails.
    changelog_url %= (range_start, range_end)
    json_url = changelog_url + '?format=json'
    response = crash_utils.GetDataFromURL(json_url)
    if not response:
      return

    # Parse the changelog from the returned string, which should start with
    # the anti-XSSI prefix ")]}'\n"; skip the first five characters.
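    # A sketch of the JSON shape consumed below (real responses contain
    # more fields):
    #   )]}'
    #   {"log": [{"commit": "<githash>", ...}, ...]}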
    revisions = crash_utils.LoadJSON(response[5:])
    if not revisions:
      return

    # Parse each individual revision in the log.
    for revision in revisions['log']:
      githash = revision['commit']
      self.ParseRevision(revision_url, githash, revision_map,
                         file_to_revision_map)

    # Parse the revision at range_start, because googlesource leaves it out
    # of the changelog.
    self.ParseRevision(revision_url, range_start, revision_map,
                       file_to_revision_map)

  def ParseRevision(self, revision_url, githash, revision_map,
                    file_to_revision_map):
    """Parses a single revision and adds its information to the maps."""
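
    # A sketch of the revision JSON fields read below (real responses
    # contain more fields):
    #   {'commit': '<githash>',
    #    'author': {'name': '...'},
    #    'message': '...',
    #    'tree_diff': [{'new_path': 'path/of/file', 'type': 'modify'}, ...]}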
    # Retrieve data from the url; return if it fails.
    url = revision_url % githash
    response = crash_utils.GetDataFromURL(url + '?format=json')
    if not response:
      return

    # Load a JSON object from the string. If it fails, terminate the function.
    json_revision = crash_utils.LoadJSON(response[5:])
    if not json_revision:
      return

    # Create a map representing this revision and get the githash from the
    # JSON object.
    revision = {}
    githash = json_revision['commit']

    # Set the author, message and url of this CL.
    revision['author'] = json_revision['author']['name']
    revision['message'] = json_revision['message']
    revision['url'] = url

    # Iterate through the changed files.
    for diff in json_revision['tree_diff']:
      file_path = diff['new_path']
      file_change_type = diff['type']

      # Normalize the file action so that it matches svn_repository_parser.
      file_change_type = _ConvertToFileChangeType(file_change_type)

      # Add the file to the map.
      if file_path not in file_to_revision_map:
        file_to_revision_map[file_path] = []
      file_to_revision_map[file_path].append((githash, file_change_type))

    # Add this CL to the map.
    revision_map[githash] = revision

  def ParseLineDiff(self, path, component, file_change_type, githash):
    changed_line_numbers = []
    changed_line_contents = []
    base_url = self.component_to_url_map[component]['repository']
    backup_url = (base_url + self.url_parts_map['revision_url']) % githash

    # If the file is added, copied or renamed (not modified), treat it as if
    # it were not changed.
    if file_change_type in ('A', 'C', 'R'):
      # TODO(stgao): Maybe return whole file change for Add, Rename, and Copy?
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Retrieve the diff data from the url; if it fails, return empty lists.
    url = (base_url + self.url_parts_map['diff_url']) % (githash, path)
    data = crash_utils.GetDataFromURL(url + '?format=text')
    if not data:
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Decode the base64-encoded data into the line diff.
    diff = base64.b64decode(data).splitlines()

    # Iterate through the lines in the diff. current_line is -1 whenever we
    # are not inside a diff chunk.
    current_line = -1
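
    # A sketch of how a diff chunk is walked (hypothetical input):
    #   @@ -1,2 +10,3 @@   -> current_line = 10 (parsed from '+10,3')
    #    unchanged         -> current_line becomes 11
    #   +added             -> line 11 is recorded; current_line becomes 12
    #   -removed           -> current_line stays 12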
    for line in diff:
      line = line.strip()

      # If the line starts with @@, a new chunk starts.
      if line.startswith('@@'):
        current_line = int(line.split('+')[1].split(',')[0])

      # If we are inside a chunk.
      elif current_line != -1:
        # If the line was added or modified.
        if line.startswith('+'):
          changed_line_numbers.append(current_line)
          changed_line_contents.append(line[2:])

        # Do not increment the current line if the line was deleted.
        if not line.startswith('-'):
          current_line += 1

    # Return the url without '?format=text'.
    return (url, changed_line_numbers, changed_line_contents)

  def ParseBlameInfo(self, component, file_path, line, revision):
    base_url = self.component_to_url_map[component]['repository']

    # Retrieve the blame JSON from googlesource. If it fails, return None.
    url_part = self.url_parts_map['blame_url'] % (revision, file_path)
    blame_url = base_url + url_part
    json_string = crash_utils.GetDataFromURL(blame_url)
    if not json_string:
      return

    # Parse a JSON object from the string, which should start with the
    # anti-XSSI prefix ")]}'\n"; skip the first five characters.
    annotation = crash_utils.LoadJSON(json_string[5:])
    if not annotation:
      return

    # Go through the regions; each region is a run of consecutive lines with
    # the same author/revision.
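    # A sketch of a region entry as read below (real responses contain
    # more fields):
    #   {'start': 120, 'count': 5, 'commit': '<githash>',
    #    'author': {'name': '...'}}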
    for blame_line in annotation['regions']:
      start = blame_line['start']
      count = blame_line['count']

      # For each region, check whether the line we want blame info for falls
      # in this region.
      if start <= line <= start + count - 1:
        # If we are in the right region, get the information from the line.
        revision = blame_line['commit']
        author = blame_line['author']['name']
        revision_url_parts = self.url_parts_map['revision_url'] % revision
        revision_url = base_url + revision_url_parts
        # TODO(jeun): Add a way to get content from the JSON object.
        content = None

        (revision_info, _) = self.ParseChangelog(component, revision, revision)
        message = revision_info[revision]['message']
        return (content, revision, author, revision_url, message)

    # Return None if no region contains the line.
    return None