Home | History | Annotate | Download | only in tools
      1 #!/usr/bin/python2
      2 
      3 # Copyright 2014 Google Inc.
      4 #
      5 # Use of this source code is governed by a BSD-style license that can be
      6 # found in the LICENSE file.
      7 
      8 """Skia's Chromium Codereview Comparison Script.
      9 
     10 This script takes two Codereview URLs, looks at the trybot results for
     11 the two codereviews and compares the results.
     12 
     13 Usage:
     14   compare_codereview.py CONTROL_URL ROLL_URL
     15 """
     16 
     17 import collections
     18 import os
     19 import re
     20 import sys
     21 import urllib2
     22 import HTMLParser
     23 
     24 
     25 class CodeReviewHTMLParser(HTMLParser.HTMLParser):
     26   """Parses CodeReview web page.
     27 
     28   Use the CodeReviewHTMLParser.parse static function to make use of
     29   this class.
     30 
     31   This uses the HTMLParser class because it's the best thing in
     32   Python's standard library.  We need a little more power than a
     33   regex.  [Search for "You can't parse [X]HTML with regex." for more
     34   information.
     35   """
     36   # pylint: disable=I0011,R0904
     37   @staticmethod
     38   def parse(url):
     39     """Parses a CodeReview web pages.
     40 
     41     Args:
     42       url (string), a codereview URL like this:
     43         'https://codereview.chromium.org/?????????'.
     44 
     45     Returns:
     46       A dictionary; the keys are bot_name strings, the values
     47       are CodeReviewHTMLParser.Status objects
     48     """
     49     parser = CodeReviewHTMLParser()
     50     try:
     51       parser.feed(urllib2.urlopen(url).read())
     52     except (urllib2.URLError,):
     53       print >> sys.stderr, 'Error getting', url
     54       return None
     55     parser.close()
     56     return parser.statuses
     57 
     58   # namedtuples are like lightweight structs in Python.  The low
     59   # overhead of a tuple, but the ease of use of an object.
     60   Status = collections.namedtuple('Status', ['status', 'url'])
     61 
     62   def __init__(self):
     63     HTMLParser.HTMLParser.__init__(self)
     64     self._id = None
     65     self._status = None
     66     self._href = None
     67     self._anchor_data = ''
     68     self._currently_parsing_trybotdiv = False
     69     # statuses is a dictionary of CodeReviewHTMLParser.Status
     70     self.statuses = {}
     71 
     72   def handle_starttag(self, tag, attrs):
     73     """Overrides the HTMLParser method to implement functionality.
     74 
     75     [[begin standard library documentation]]
     76     This method is called to handle the start of a tag
     77     (e.g. <div id="main">).
     78 
     79     The tag argument is the name of the tag converted to lower
     80     case. The attrs argument is a list of (name, value) pairs
     81     containing the attributes found inside the tag's <>
     82     brackets. The name will be translated to lower case, and
     83     quotes in the value have been removed, and character and
     84     entity references have been replaced.
     85 
     86     For instance, for the tag <A HREF="http://www.cwi.nl/">, this
     87     method would be called as handle_starttag('a', [('href',
     88     'http://www.cwi.nl/')]).
     89     [[end standard library documentation]]
     90     """
     91     attrs = dict(attrs)
     92     if tag == 'div':
     93       # We are looking for <div id="tryjobdiv*">.
     94       id_attr = attrs.get('id','')
     95       if id_attr.startswith('tryjobdiv'):
     96         self._id = id_attr
     97     if (self._id and tag == 'a'
     98       and 'build-result' in attrs.get('class', '').split()):
     99       # If we are already inside a <div id="tryjobdiv*">, we
    100       # look for a link if the form
    101       # <a class="build-result" href="*">.  Then we save the
    102       # (non-standard) status attribute and the URL.
    103       self._status = attrs.get('status')
    104       self._href = attrs.get('href')
    105       self._currently_parsing_trybotdiv = True
    106       # Start saving anchor data.
    107 
    108   def handle_data(self, data):
    109     """Overrides the HTMLParser method to implement functionality.
    110 
    111     [[begin standard library documentation]]
    112     This method is called to process arbitrary data (e.g. text
    113     nodes and the content of <script>...</script> and
    114     <style>...</style>).
    115     [[end standard library documentation]]
    116     """
    117     # Save the text inside the <a></a> tags.  Assume <a> tags
    118     # aren't nested.
    119     if self._currently_parsing_trybotdiv:
    120       self._anchor_data += data
    121 
    122   def handle_endtag(self, tag):
    123     """Overrides the HTMLParser method to implement functionality.
    124 
    125     [[begin standard library documentation]]
    126     This method is called to handle the end tag of an element
    127     (e.g. </div>).  The tag argument is the name of the tag
    128     converted to lower case.
    129     [[end standard library documentation]]
    130     """
    131     if tag == 'a' and self._status:
    132       # We take the accumulated self._anchor_data and save it as
    133       # the bot name.
    134       bot = self._anchor_data.strip()
    135       stat = CodeReviewHTMLParser.Status(status=self._status,
    136                          url=self._href)
    137       if bot:
    138         # Add to accumulating dictionary.
    139         self.statuses[bot] = stat
    140       # Reset state to search for the next bot.
    141       self._currently_parsing_trybotdiv = False
    142       self._anchor_data = ''
    143       self._status = None
    144       self._href = None
    145 
    146 
    147 class BuilderHTMLParser(HTMLParser.HTMLParser):
    148   """parses Trybot web pages.
    149 
    150   Use the BuilderHTMLParser.parse static function to make use of
    151   this class.
    152 
    153   This uses the HTMLParser class because it's the best thing in
    154   Python's standard library.  We need a little more power than a
    155   regex.  [Search for "You can't parse [X]HTML with regex." for more
    156   information.
    157   """
    158   # pylint: disable=I0011,R0904
    159   @staticmethod
    160   def parse(url):
    161     """Parses a Trybot web page.
    162 
    163     Args:
    164       url (string), a trybot result URL.
    165 
    166     Returns:
    167       An array of BuilderHTMLParser.Results, each a description
    168       of failure results, along with an optional url
    169     """
    170     parser = BuilderHTMLParser()
    171     try:
    172       parser.feed(urllib2.urlopen(url).read())
    173     except (urllib2.URLError,):
    174       print >> sys.stderr, 'Error getting', url
    175       return []
    176     parser.close()
    177     return parser.failure_results
    178 
    179   Result = collections.namedtuple('Result', ['text', 'url'])
    180 
    181   def __init__(self):
    182     HTMLParser.HTMLParser.__init__(self)
    183     self.failure_results = []
    184     self._current_failure_result = None
    185     self._divlevel = None
    186     self._li_level = 0
    187     self._li_data = ''
    188     self._current_failure = False
    189     self._failure_results_url = ''
    190 
    191   def handle_starttag(self, tag, attrs):
    192     """Overrides the HTMLParser method to implement functionality.
    193 
    194     [[begin standard library documentation]]
    195     This method is called to handle the start of a tag
    196     (e.g. <div id="main">).
    197 
    198     The tag argument is the name of the tag converted to lower
    199     case. The attrs argument is a list of (name, value) pairs
    200     containing the attributes found inside the tag's <>
    201     brackets. The name will be translated to lower case, and
    202     quotes in the value have been removed, and character and
    203     entity references have been replaced.
    204 
    205     For instance, for the tag <A HREF="http://www.cwi.nl/">, this
    206     method would be called as handle_starttag('a', [('href',
    207     'http://www.cwi.nl/')]).
    208     [[end standard library documentation]]
    209     """
    210     attrs = dict(attrs)
    211     if tag == 'li':
    212       # <li> tags can be nested.  So we have to count the
    213       # nest-level for backing out.
    214       self._li_level += 1
    215       return
    216     if tag == 'div' and attrs.get('class') == 'failure result':
    217       # We care about this sort of thing:
    218       # <li>
    219       #   <li>
    220       #   <li>
    221       #     <div class="failure result">...</div>
    222       #   </li>
    223       #   </li>
    224       #   We want this text here.
    225       # </li>
    226       if self._li_level > 0:
    227         self._current_failure = True  # Tells us to keep text.
    228       return
    229 
    230     if tag == 'a' and self._current_failure:
    231       href = attrs.get('href')
    232       # Sometimes we want to keep the stdio url.  We always
    233       # return it, just in case.
    234       if href.endswith('/logs/stdio'):
    235         self._failure_results_url = href
    236 
    237   def handle_data(self, data):
    238     """Overrides the HTMLParser method to implement functionality.
    239 
    240     [[begin standard library documentation]]
    241     This method is called to process arbitrary data (e.g. text
    242     nodes and the content of <script>...</script> and
    243     <style>...</style>).
    244     [[end standard library documentation]]
    245     """
    246     if self._current_failure:
    247       self._li_data += data
    248 
    249   def handle_endtag(self, tag):
    250     """Overrides the HTMLParser method to implement functionality.
    251 
    252     [[begin standard library documentation]]
    253     This method is called to handle the end tag of an element
    254     (e.g. </div>).  The tag argument is the name of the tag
    255     converted to lower case.
    256     [[end standard library documentation]]
    257     """
    258     if tag == 'li':
    259       self._li_level -= 1
    260       if 0 == self._li_level:
    261         if self._current_failure:
    262           result = self._li_data.strip()
    263           first = result.split()[0]
    264           if first:
    265             result = re.sub(
    266               r'^%s(\s+%s)+' % (first, first), first, result)
    267             # Sometimes, it repeats the same thing
    268             # multiple times.
    269           result = re.sub(r'unexpected flaky.*', '', result)
    270           # Remove some extra unnecessary text.
    271           result = re.sub(r'\bpreamble\b', '', result)
    272           result = re.sub(r'\bstdio\b', '', result)
    273           url = self._failure_results_url
    274           self.failure_results.append(
    275             BuilderHTMLParser.Result(result, url))
    276           self._current_failure_result = None
    277         # Reset the state.
    278         self._current_failure = False
    279         self._li_data = ''
    280         self._failure_results_url = ''
    281 
    282 
    283 def printer(indent, string):
    284   """Print indented, wrapped text.
    285   """
    286   def wrap_to(line, columns):
    287     """Wrap a line to the given number of columns, return a list
    288     of strings.
    289     """
    290     ret = []
    291     nextline = ''
    292     for word in line.split():
    293       if nextline:
    294         if len(nextline) + 1 + len(word) > columns:
    295           ret.append(nextline)
    296           nextline = word
    297         else:
    298           nextline += (' ' + word)
    299       else:
    300         nextline = word
    301     if nextline:
    302       ret.append(nextline)
    303     return ret
    304   out = sys.stdout
    305   spacer = '  '
    306   for line in string.split('\n'):
    307     for i, wrapped_line in enumerate(wrap_to(line, 68 - (2 * indent))):
    308       out.write(spacer * indent)
    309       if i > 0:
    310         out.write(spacer)
    311       out.write(wrapped_line)
    312       out.write('\n')
    313   out.flush()
    314 
    315 
    316 def main(control_url, roll_url, verbosity=1):
    317   """Compare two Codereview URLs
    318 
    319   Args:
    320     control_url, roll_url: (strings) URL of the format
    321       https://codereview.chromium.org/?????????
    322 
    323     verbosity: (int) verbose level.  0, 1, or 2.
    324   """
    325   # pylint: disable=I0011,R0914,R0912
    326   control = CodeReviewHTMLParser.parse(control_url)
    327   roll = CodeReviewHTMLParser.parse(roll_url)
    328   all_bots = set(control) & set(roll)  # Set intersection.
    329   if not all_bots:
    330     print >> sys.stderr, (
    331       'Error:  control %s and roll %s have no common trybots.'
    332       % (list(control), list(roll)))
    333     return
    334 
    335   control_name = '[control %s]' % control_url.split('/')[-1]
    336   roll_name = '[roll %s]' % roll_url.split('/')[-1]
    337 
    338   out = sys.stdout
    339 
    340   for bot in sorted(all_bots):
    341     if (roll[bot].status == 'success'):
    342       if verbosity > 1:
    343         printer(0, '==%s==' % bot)
    344         printer(1, 'OK')
    345       continue
    346 
    347     if control[bot].status != 'failure' and roll[bot].status != 'failure':
    348       continue
    349     printer(0, '==%s==' % bot)
    350 
    351     formatted_results = []
    352     for (status, name, url) in [
    353             (control[bot].status, control_name, control[bot].url),
    354             (   roll[bot].status,    roll_name,    roll[bot].url)]:
    355       lines = []
    356       if status == 'failure':
    357         results = BuilderHTMLParser.parse(url)
    358         for result in results:
    359           formatted_result = re.sub(r'(\S*\.html) ', '\n__\g<1>\n', result.text)
    360           # Strip runtimes.
    361           formatted_result = re.sub(r'\(.*\)', '', formatted_result)
    362           lines.append((2, formatted_result))
    363           if ('compile' in result.text or '...and more' in result.text):
    364             lines.append((3, re.sub('/[^/]*$', '/', url) + result.url))
    365       formatted_results.append(lines)
    366 
    367     identical = formatted_results[0] == formatted_results[1]
    368 
    369 
    370     for (formatted_result, (status, name, url)) in zip(
    371         formatted_results,
    372         [(control[bot].status, control_name, control[bot].url),
    373           (roll[bot].status,  roll_name,  roll[bot].url)]):
    374       if status != 'failure' and not identical:
    375         printer(1, name)
    376         printer(2, status)
    377       elif status == 'failure':
    378         if identical:
    379           printer(1, control_name + ' and ' + roll_name + ' failed identically')
    380         else:
    381           printer(1, name)
    382         for (indent, line) in formatted_result:
    383           printer(indent, line)
    384         if identical:
    385           break
    386     out.write('\n')
    387 
    388   if verbosity > 0:
    389     # Print out summary of all of the bots.
    390     out.write('%11s %11s %4s %s\n\n' %
    391           ('CONTROL', 'ROLL', 'DIFF', 'BOT'))
    392     for bot in sorted(all_bots):
    393       if roll[bot].status == 'success':
    394         diff = ''
    395       elif (control[bot].status == 'success' and
    396            roll[bot].status == 'failure'):
    397         diff = '!!!!'
    398       elif ('pending' in control[bot].status or
    399           'pending' in roll[bot].status):
    400         diff = '....'
    401       else:
    402         diff = '****'
    403       out.write('%11s %11s %4s %s\n' % (
    404           control[bot].status, roll[bot].status, diff, bot))
    405     out.write('\n')
    406     out.flush()
    407 
    408 if __name__ == '__main__':
    409   if len(sys.argv) < 3:
    410     print >> sys.stderr, __doc__
    411     exit(1)
    412   main(sys.argv[1], sys.argv[2],
    413      int(os.environ.get('COMPARE_CODEREVIEW_VERBOSITY', 1)))
    414 
    415