Home | History | Annotate | Download | only in findit
      1 # Copyright (c) 2014 The Chromium Authors. All rights reserved.
      2 # Use of this source code is governed by a BSD-style license that can be
      3 # found in the LICENSE file.
      4 
      5 import xml.dom.minidom as minidom
      6 from xml.parsers.expat import ExpatError
      7 
      8 import crash_utils
      9 from repository_parser_interface import ParserInterface
     10 
     11 
# This number is 6 because each linediff page in src.chromium.org should
# contain the following tables: table with revision number, table with actual
# diff, table with dropdown menu, table with legend, a border table and a table
# containing page information.
NUM_TABLES_IN_LINEDIFF_PAGE = 6
# Each row of linediff info should contain 3 tds: one for the changed line
# number, and two for the line contents before/after the change.
NUM_TDS_IN_LINEDIFF_PAGE = 3
     20 
     21 
     22 class SVNParser(ParserInterface):
     23   """Parser for SVN repository using chromium.org, for components in config.
     24 
     25   Attributes:
     26     url_map: A map from component to the urls, where urls are for changelog,
     27              revision, line diff and annotation.
     28   """
     29 
     30   def __init__(self, url_map):
     31     self.component_to_urls_map = url_map
     32 
     33   def ParseChangelog(self, component, range_start, range_end):
     34     file_to_revision_map = {}
     35     revision_map = {}
     36 
     37     # Check if the current component is supported by reading the components
     38     # parsed from config file. If it is not, fail.
     39 
     40     url_map = self.component_to_urls_map.get(component)
     41     if not url_map:
     42       return (revision_map, file_to_revision_map)
     43 
     44     # Retrieve data from the url, return empty map if fails.
     45     revision_range_str = '%s:%s' % (range_start, range_end)
     46     url = url_map['changelog_url'] % revision_range_str
     47     response = crash_utils.GetDataFromURL(url)
     48     if not response:
     49       return (revision_map, file_to_revision_map)
     50 
     51     # Parse xml out of the returned string. If it fails, return empty map.
     52     try:
     53       xml_revisions = minidom.parseString(response)
     54     except ExpatError:
     55       return (revision_map, file_to_revision_map)
     56 
     57     # Iterate through the returned XML object.
     58     revisions = xml_revisions.getElementsByTagName('logentry')
     59     for revision in revisions:
     60       # Create new revision object for each of the revision.
     61       revision_object = {}
     62 
     63       # Set author of the CL.
     64       revision_object['author'] = revision.getElementsByTagName(
     65           'author')[0].firstChild.nodeValue
     66 
     67       # Get the revision number from xml.
     68       revision_number = int(revision.getAttribute('revision'))
     69 
     70       # Iterate through the changed paths in the CL.
     71       paths = revision.getElementsByTagName('paths')
     72       if paths:
     73         for changed_path in paths[0].getElementsByTagName('path'):
     74           # Get path and file change type from the xml.
     75           file_path = changed_path.firstChild.nodeValue
     76           file_change_type = changed_path.getAttribute('action')
     77 
     78           if file_path.startswith('/trunk/'):
     79             file_path = file_path[len('/trunk/'):]
     80 
     81           # Add file to the map.
     82           if file_path not in file_to_revision_map:
     83             file_to_revision_map[file_path] = []
     84           file_to_revision_map[file_path].append(
     85               (revision_number, file_change_type))
     86 
     87       # Set commit message of the CL.
     88       revision_object['message'] = revision.getElementsByTagName('msg')[
     89           0].firstChild.nodeValue
     90 
     91       # Set url of this CL.
     92       revision_url = url_map['revision_url'] % revision_number
     93       revision_object['url'] = revision_url
     94 
     95       # Add this CL to the revision map.
     96       revision_map[revision_number] = revision_object
     97 
     98     return (revision_map, file_to_revision_map)
     99 
    100   def ParseLineDiff(self, path, component, file_change_type, revision_number):
    101     changed_line_numbers = []
    102     changed_line_contents = []
    103 
    104     url_map = self.component_to_urls_map.get(component)
    105     if not url_map:
    106       return (None, None, None)
    107 
    108     # If the file is added (not modified), treat it as if it is not changed.
    109     backup_url = url_map['revision_url'] % revision_number
    110     if file_change_type == 'A':
    111       return (backup_url, changed_line_numbers, changed_line_contents)
    112 
    113     # Retrieve data from the url. If no data is retrieved, return empty lists.
    114     url = url_map['diff_url'] % (path, revision_number - 1,
    115                                  revision_number, revision_number)
    116     data = crash_utils.GetDataFromURL(url)
    117     if not data:
    118       return (backup_url, changed_line_numbers, changed_line_contents)
    119 
    120     line_diff_html = minidom.parseString(data)
    121     tables = line_diff_html.getElementsByTagName('table')
    122     # If there are not NUM_TABLES tables in the html page, there should be an
    123     # error in the html page.
    124     if len(tables) != NUM_TABLES_IN_LINEDIFF_PAGE:
    125       return (backup_url, changed_line_numbers, changed_line_contents)
    126 
    127     # Diff content is in the second table. Each line of the diff content
    128     # is in <tr>.
    129     trs = tables[1].getElementsByTagName('tr')
    130     prefix_len = len('vc_diff_')
    131 
    132     # Filter trs so that it only contains diff chunk with contents.
    133     filtered_trs = []
    134     for tr in trs:
    135       tr_class = tr.getAttribute('class')
    136 
    137       # Check for the classes of the <tr>s.
    138       if tr_class:
    139         tr_class = tr_class[prefix_len:]
    140 
    141         # Do not have to add header.
    142         if tr_class == 'header' or tr_class == 'chunk_header':
    143           continue
    144 
    145         # If the class of tr is empty, this page does not have any change.
    146         if tr_class == 'empty':
    147           return (backup_url, changed_line_numbers, changed_line_contents)
    148 
    149       filtered_trs.append(tr)
    150 
    151     # Iterate through filtered trs, and grab line diff information.
    152     for tr in filtered_trs:
    153       tds = tr.getElementsByTagName('td')
    154 
    155       # If there aren't 3 tds, this line does should not contain line diff.
    156       if len(tds) != NUM_TDS_IN_LINEDIFF_PAGE:
    157         continue
    158 
    159       # If line number information is not in hyperlink, ignore this line.
    160       try:
    161         line_num = tds[0].getElementsByTagName('a')[0].firstChild.nodeValue
    162         left_diff_type = tds[1].getAttribute('class')[prefix_len:]
    163         right_diff_type = tds[2].getAttribute('class')[prefix_len:]
    164       except IndexError:
    165         continue
    166 
    167       # Treat the line as modified only if both left and right diff has type
    168       # changed or both have different change type, and if the change is not
    169       # deletion.
    170       if (left_diff_type != right_diff_type) or (
    171           left_diff_type == 'change' and right_diff_type == 'change'):
    172 
    173         # Check if the line content is not empty.
    174         try:
    175           new_line = tds[2].firstChild.nodeValue
    176         except AttributeError:
    177           new_line = ''
    178 
    179         if not (left_diff_type == 'remove' and right_diff_type == 'empty'):
    180           changed_line_numbers.append(int(line_num))
    181           changed_line_contents.append(new_line.strip())
    182 
    183     return (url, changed_line_numbers, changed_line_contents)
    184 
    185   def ParseBlameInfo(self, component, file_path, line, revision):
    186     url_map = self.component_to_urls_map.get(component)
    187     if not url_map:
    188       return None
    189 
    190     # Retrieve blame data from url, return None if fails.
    191     url = url_map['blame_url'] % (file_path, revision, revision)
    192     data = crash_utils.GetDataFromURL(url)
    193     if not data:
    194       return None
    195 
    196     blame_html = minidom.parseString(data)
    197 
    198     title = blame_html.getElementsByTagName('title')
    199     # If the returned html page is an exception page, return None.
    200     if title[0].firstChild.nodeValue == 'ViewVC Exception':
    201       return None
    202 
    203     # Each of the blame result is in <tr>.
    204     blame_results = blame_html.getElementsByTagName('tr')
    205     try:
    206       blame_result = blame_results[line]
    207     except IndexError:
    208       return None
    209 
    210     # There must be 4 <td> for each <tr>. If not, this page is wrong.
    211     tds = blame_result.getElementsByTagName('td')
    212     if len(tds) != 4:
    213       return None
    214 
    215     # The third <td> has the line content, separated by <span>s. Combine
    216     # those to get a string of changed line. If it has nothing, the line
    217     # is empty.
    218     line_content = ''
    219     if tds[3].hasChildNodes():
    220       contents = tds[3].childNodes
    221 
    222       for content in contents:
    223         # Nodetype 3 means it is text node.
    224         if content.nodeType == minidom.Node.TEXT_NODE:
    225           line_content += content.nodeValue
    226         else:
    227           line_content += content.firstChild.nodeValue
    228 
    229       line_content = line_content.strip()
    230 
    231     # If the current line has the same author/revision as the previous lines,
    232     # the result is not shown. Propagate up until we find the line with info.
    233     while not tds[1].firstChild:
    234       line -= 1
    235       blame_result = blame_results[line]
    236       tds = blame_result.getElementsByTagName('td')
    237     author = tds[1].firstChild.nodeValue
    238 
    239     # Revision can either be in hyperlink or plain text.
    240     try:
    241       revision = tds[2].getElementsByTagName('a')[0].firstChild.nodeValue
    242     except IndexError:
    243       revision = tds[2].firstChild.nodeValue
    244 
    245     (revision_info, _) = self.ParseChangelog(component, revision, revision)
    246     message = revision_info[int(revision)]['message']
    247 
    248     # Return the parsed information.
    249     revision_url = url_map['revision_url'] % int(revision)
    250     return (line_content, revision, author, revision_url, message)
    251