tools/bug_hunter/bug_hunter.py

#!/usr/bin/env python
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""This script queries the Chromium issue tracker and e-mails the results.

It queries issue tracker using Issue Tracker API. The query
parameters can be specified by command-line arguments. For example, with the
following command:

  'python bug_hunter.py -q video Status:Unconfirmed OR audio Status:Unconfirmed
   -s sender (at] chromium.org -r receiver (at] chromium.org -v 100 -u days'

You will find all 'Unconfirmed' issues created in the last 100 days containing
'video' or 'audio' in their content/comments. The content of these issues are
sent to receiver (at] chromium.org.

TODO(imasaki): users can specify the interval as say: "100d" for "100 days".

There are two limitations in the current implementation of issue tracker API
and UI:
* only outermost OR is valid. For example, the query
  'video OR audio Status:Unconfirmed' is translated into
  'video OR (audio AND Status:Unconfirmed)'
* brackets are not supported. For example, the query
  '(video OR audio) Status:Unconfirmed' does not work.

You need to install following to run this script
  gdata-python-client (http://code.google.com/p/gdata-python-client/)
  rfc3339.py (http://henry.precheur.org/projects/rfc3339)

Links:
* Chromium issue tracker: http://code.google.com/p/chromium/issues/list
* Issue tracker API: http://code.google.com/p/support/wiki/IssueTrackerAPI
* Search tips for the issue tracker:
    http://code.google.com/p/chromium/issues/searchtips
"""

import csv
import datetime
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import logging
from operator import itemgetter
import optparse
import re
import smtplib
import socket
import sys
import urllib

try:
  import gdata.data
  import gdata.projecthosting.client
except ImportError:
  logging.error('gdata-client needs to be installed. Please install\n'
                'and try again (http://code.google.com/p/gdata-python-client/)')
  sys.exit(1)

try:
  import rfc3339
except ImportError:
  logging.error('rfc3339 needs to be installed. Please install\n'
                'and try again (http://henry.precheur.org/projects/rfc3339)')
  sys.exit(1)

# A list of default values.
_DEFAULT_INTERVAL_UNIT = 'hours'
_DEFAULT_ISSUE_ELEMENT_IN_EMAIL = ('author', 'status', 'state', 'content',
                                   'comments', 'labels', 'urls')
_DEFAULT_PROJECT_NAME = 'chromium'
_DEFAULT_QUERY_TITLE = 'potential media bugs'
_DEFAULT_QUERY = ('video -has:Feature -has:Owner -label:nomedia '
                  'status:Unconfirmed OR audio -has:Feature -has:Owner '
                  '-label:nomedia status:Unconfirmed')
_DEFAULT_OUTPUT_FILENAME = 'output.csv'
_DETAULT_MAX_COMMENTS = 1000

_INTERVAL_UNIT_CHOICES = ('hours', 'days', 'weeks')

# URLs in this list are excluded from URL extraction from bug
# content/comments. Each list element should not contain the url ending in
# '/'. For example, the element should be 'http://www.google.com' but not
# 'http://www.google.com/'
_URL_EXCLUSION_LIST = ('http://www.youtube.com/html5',
                       'http://www.google.com')
_ISSUE_ELEMENT_IN_EMAIL_CHOICES = ('issue_id', 'author', 'status', 'state',
                                   'content', 'comments', 'labels', 'urls',
                                   'mstone')


def ParseArgs():
  """Returns options dictionary from parsed command line arguments."""
  parser = optparse.OptionParser()

  parser.add_option('-e', '--email-entries',
                    help=('A comma-separated list of issue entries that are '
                          'sent in the email content. '
                          'Possible strings are %s. Default: %%default.' %
                          ', '.join(_ISSUE_ELEMENT_IN_EMAIL_CHOICES)),
                    default=','.join(_DEFAULT_ISSUE_ELEMENT_IN_EMAIL))
  parser.add_option('-l', '--max-comments',
                    help=('The maximum number of comments returned for each '
                          'issue in a reverse chronological order. '
                          'Default: %default.'),
                    type='int', default=_DETAULT_MAX_COMMENTS)
  parser.add_option('-o', '--output-filename',
                    help=('Filename for result output in CSV format. '
                          'Default: %default.'),
                    default=_DEFAULT_OUTPUT_FILENAME, metavar='FILE')
  parser.add_option('-p', '--project-name', default=_DEFAULT_PROJECT_NAME,
                    help='Project name string. Default: %default')
  parser.add_option('-q', '--query', default=_DEFAULT_QUERY,
                    help=('Query to be used to find bugs. The detail can be '
                          'found in Chromium Issue tracker page '
                          'http://code.google.com/p/chromium/issues/searchtips.'
                          ' Default: "%default".'))
  parser.add_option('-r', '--receiver-email-address',
                    help="Receiver's email address (Required).")
  parser.add_option('-s', '--sender-email-address',
                    help="Sender's email address (Required).")
  parser.add_option('-t', '--query-title',
                    default=_DEFAULT_QUERY_TITLE, dest='query_title',
                    help=('Query title string used in the subject of the '
                          'result email. Default: %default.'))
  parser.add_option('-u', '--interval_unit', default=_DEFAULT_INTERVAL_UNIT,
                    choices=_INTERVAL_UNIT_CHOICES,
                    help=('Unit name for |interval_value|. Valid options are '
                          '%s. Default: %%default' % (
                              ', '.join(_INTERVAL_UNIT_CHOICES))))
  parser.add_option('-v', '--interval-value', type='int',
                    help=('Interval value to find bugs. '
                          'The script looks for bugs during '
                          'that interval (up to now). This option is used in '
                          'conjunction with |--interval_unit| option. '
                          'The script looks for all bugs if this is not '
                          'specified.'))

  options = parser.parse_args()[0]

  options.email_entries = options.email_entries.split(',')
  options.email_entries = [entry for entry in options.email_entries
                           if entry in _ISSUE_ELEMENT_IN_EMAIL_CHOICES]
  if not options.email_entries:
    logging.warning('No issue elements in email in option. '
                    'Default email entries will be used.')
    options.email_entries = _DEFAULT_ISSUE_ELEMENT_IN_EMAIL
  logging.info('The following is the issue elements in email: %s ' + (
      ', '.join(options.email_entries)))
  return options


class BugHunter(object):
  """This class queries issue trackers and e-mails the results."""

  _ISSUE_SEARCH_LINK_BASE = ('http://code.google.com/p/chromium/issues/list?'
                             'can=2&colspec=ID+Pri+Mstone+ReleaseBlock+Area'
                             '+Feature+Status+Owner+Summary&cells=tiles'
                             '&sort=-id')
  # TODO(imasaki): Convert these into template library.
  _EMAIL_ISSUE_TEMPLATE = ('<li><a href="http://crbug.com/%(issue_id)s">'
                           '%(issue_id)s %(title)s</a> ')
  _EMAIL_SUBJECT_TEMPLATE = ('BugHunter found %(n_issues)d %(query_title)s '
                             'bug%(plural)s%(time_msg)s!')
  _EMAIL_MSG_TEMPLATE = ('<a href="%(link_base)s&q=%(unquote_query_text)s">'
                         'Used Query</a>: %(query_text)s<br><br>'
                         'The number of issues : %(n_issues)d<br>'
                         '<ul>%(issues)s</ul>')

  def __init__(self, options):
    """Sets up initial state for Bug Hunter.

    Args:
      options: Command-line options.
    """
    self._client = gdata.projecthosting.client.ProjectHostingClient()
    self._options = options
    self._issue_template = BugHunter._EMAIL_ISSUE_TEMPLATE
    for entry in options.email_entries:
      self._issue_template += '%%(%s)s ' % entry
    self._issue_template += '</li>'

  def GetComments(self, issue_id, max_comments):
    """Get comments for a issue.

    Args:
      issue_id: Issue id for each issue in the issue tracker.
      max_comments: The maximum number of comments to be returned. The comments
        are returned in a reverse chronological order.

    Returns:
      A list of (author name, comments, updated time) tuples.
    """
    comments_feed = self._client.get_comments(self._options.project_name,
                                              issue_id)
    comment_list = [(comment.content.text, comment.author[0].name.text,
                     comment.updated.text)
                    for comment
                    in list(reversed(comments_feed.entry))[0:max_comments]]
    return comment_list

  def GetIssues(self):
    """Get issues from issue tracker and return them.

    Returns:
      A list of issues in descending order by issue_id. Each element in the
        list is a dictionary where the keys are 'issue_id', 'title', 'author',
        'status', 'state', 'content', 'comments', 'labels', 'urls'.
        Returns an empty list when there is no matching issue.
    """
    min_time = None
    if self._options.interval_value:
      # Issue Tracker Data API uses RFC 3339 timestamp format, For example:
      # 2005-08-09T10:57:00-08:00
      # (http://code.google.com/p/support/wiki/IssueTrackerAPIPython)
      delta = datetime.timedelta(
          **{self._options.interval_unit: self._options.interval_value})
      dt = datetime.datetime.now() - delta
      min_time = rfc3339.rfc3339(dt)

    query = gdata.projecthosting.client.Query(text_query=self._options.query,
                                              max_results=1000,
                                              published_min=min_time)

    feed = self._client.get_issues(self._options.project_name, query=query)
    if not feed.entry:
      logging.info('No issues available to match query %s.',
                   self._options.query)
      return []
    issues = []
    for entry in feed.entry:
      # The fully qualified id is a URL. We just want the number.
      issue_id = entry.id.text.split('/')[-1]
      if not issue_id.isdigit():
        logging.warning('Issue_id is not correct: %s. Skipping.', issue_id)
        continue
      label_list = [label.text for label in entry.label]
      comments = ''
      if 'comments' in self._options.email_entries:
        comments = ''.join(
            [''.join(comment) if not comment else ''
             for comment
             in self.GetComments(issue_id, self._options.max_comments)])
      content = BugHunterUtils.StripHTML(entry.content.text)
      url_list = list(
          set(re.findall(r'(https?://\S+)', content + comments)))
      url_list = [url for url in url_list
                  if not url.rstrip('/') in _URL_EXCLUSION_LIST]
      mstone = ''
      r = re.compile(r'Mstone-(\d*)')
      for label in label_list:
        m = r.search(label)
        if m:
          mstone = m.group(1)
      issues.append(
          {'issue_id': issue_id, 'title': entry.title.text,
           'author': entry.author[0].name.text,
           'status': entry.status.text if entry.status is not None else '',
           'state': entry.state.text if entry.state is not None else '',
           'content': content, 'mstone': mstone, 'comments': comments,
           'labels': label_list, 'urls': url_list})
    return sorted(issues, key=itemgetter('issue_id'), reverse=True)

  def _SetUpEmailSubjectMsg(self, issues):
    """Set up email subject and its content.

    Args:
      issues: Please refer to the return value in GetIssues().

    Returns:
      A tuple of two strings (email subject and email content).
    """
    time_msg = ''
    if self._options.interval_value:
      time_msg = ' in the past %s %s%s' % (
          self._options.interval_value, self._options.interval_unit[:-1],
          's' if self._options.interval_value > 1 else '')
    subject = BugHunter._EMAIL_SUBJECT_TEMPLATE % {
        'n_issues': len(issues),
        'query_title': self._options.query_title,
        'plural': 's' if len(issues) > 1 else '',
        'time_msg': time_msg}
    content = BugHunter._EMAIL_MSG_TEMPLATE % {
        'link_base': BugHunter._ISSUE_SEARCH_LINK_BASE,
        'unquote_query_text': urllib.quote(self._options.query),
        'query_text': self._options.query,
        'n_issues': len(issues),
        'issues': ''.join(
            [self._issue_template % issue for issue in issues])}
    return (subject, content)

  def SendResultEmail(self, issues):
    """Send result email.

    Args:
      issues: Please refer to the return value in GetIssues().
    """
    subject, content = self._SetUpEmailSubjectMsg(issues)
    BugHunterUtils.SendEmail(
        content, self._options.sender_email_address,
        self._options.receiver_email_address, subject)

  def WriteIssuesToFileInCSV(self, issues, filename):
    """Write issues to a file in CSV format.

    Args:
      issues: Please refer to the return value in GetIssues().
      filename: File name for CSV file.
    """
    with open(filename, 'w') as f:
      writer = csv.writer(f)
      # Write header first.
      writer.writerow(issues[0].keys())
      for issue in issues:
        writer.writerow(
            [unicode(value).encode('utf-8') for value in issue.values()])


class BugHunterUtils(object):
  """Utility class for Bug Hunter."""

  @staticmethod
  def StripHTML(string_with_html):
    """Strip HTML tags from string.

    Args:
      string_with_html: A string with HTML tags.

    Returns:
      A string without HTML tags.
    """
    return re.sub('<[^<]+?>', '', string_with_html)

  @staticmethod
  def SendEmail(message, sender_email_address, receivers_email_address,
                subject):
    """Send email using localhost's mail server.

    Args:
      message: Email message to be sent.
      sender_email_address: Sender's email address.
      receivers_email_address: Receiver's email address.
      subject: Email subject.

    Returns:
      True if successful; False, otherwise.
    """
    try:
      html = '<html><head></head><body>%s</body></html>' % message
      msg = MIMEMultipart('alternative')
      msg['Subject'] = subject
      msg['From'] = sender_email_address
      msg['To'] = receivers_email_address
      msg.attach(MIMEText(html.encode('utf-8'), 'html', _charset='utf-8'))
      smtp_obj = smtplib.SMTP('localhost')
      smtp_obj.sendmail(sender_email_address, receivers_email_address,
                        msg.as_string())
      logging.info('Successfully sent email.')
      smtp_obj.quit()
      return True
    except smtplib.SMTPException:
      logging.exception('Authentication failed, unable to send email.')
    except (socket.gaierror, socket.error, socket.herror):
      logging.exception('Unable to send email.')
    return False


def Main():
  ops = ParseArgs()
  bh = BugHunter(ops)
  issues = bh.GetIssues()
  if issues and ops.sender_email_address and ops.receiver_email_address:
    bh.SendResultEmail(issues)
  if issues:
    bh.WriteIssuesToFileInCSV(issues, ops.output_filename)


if __name__ == '__main__':
  Main()