# Home | History | Annotate | Download | only in contrib
      1 #!/usr/bin/python
      2 
      3 # Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
      4 # Use of this source code is governed by a BSD-style license that can be
      5 # found in the LICENSE file.
      6 
      7 """
      8 This script crawls crbug. Sort-of.
      9 Invocation:
     10     Get all bugs with labels, strings (in summary and/or comments):
     11         crbug_crawler.py --labels 'one two three'
     12                          --queries '"first query" "second query"'
     13 
     14     Get baddest open bugs of all time:
     15         crbug_crawler.py --reap
     16 
     17 Tips:
     18     - Label based queries will return faster than text queries.
     19     - contrib/crbug_shell.py is a wrapper that allows you to incrementally
     20         filter search results using this script.
     21 """
     22 
     23 import argparse
     24 import cmd
     25 import logging
     26 import sys
     27 import shlex
     28 
     29 import common
     30 from autotest_lib.client.common_lib import global_config
     31 from autotest_lib.server.cros.dynamic_suite import reporting
     32 
     33 
     34 def _parse_args(args):
     35     if not args:
     36         import crbug_crawler
     37         logging.error('Improper usage of crbug_crawler: %s',
     38                 crbug_crawler.__doc__)
     39         sys.exit(1)
     40 
     41     description = ('Usage: crbug_crawler.py --reap')
     42     parser = argparse.ArgumentParser(description=description)
     43     parser.add_argument('--quiet', help=('Turn off logging noise.'),
     44             action='store_true', default=False)
     45     parser.add_argument('--num', help='Number of issues to output.', default=10,
     46             type=int)
     47     parser.add_argument('--queries',
     48                         help=('Search query. Eg: --queries "%s %s"' %
     49                               ('build_Root', 'login')),
     50                         default='')
     51     parser.add_argument('--labels',
     52                         help=('Search labels. Eg: --labels "%s %s"' %
     53                               ('autofiled', 'Pri-1')), default=None)
     54     parser.add_argument('--reap', help=('Top autofiled bugs ordered by count.'),
     55             action='store_true', default=False)
     56     return parser.parse_args(args)
     57 
     58 
     59 class Update(object):
     60     """Class encapsulating fields of an update to a bug.
     61     """
     62     open_statuses = ['Unconfirmed', 'Untriaged', 'Available', 'Assigned',
     63                      'Started', 'ExternalDependency']
     64     closed_statuses = ['Fixed', 'Verified', 'Duplicate', 'WontFix', 'Archived']
     65 
     66     def __init__(self, comment='', labels='', status=''):
     67         self.comment = comment
     68         self.labels = labels if labels else []
     69         self.status = status
     70 
     71 
     72     def __str__(self):
     73         msg = 'status: %s' % self.status
     74         if self.labels:
     75             msg = '%s labels: %s' % (msg, self.labels)
     76         if self.comment:
     77             msg = '%s comment: %s' % (msg, self.comment)
     78         return msg
     79 
     80 
     81 class UpdateManager(object):
     82     """Update manager that allows you to revert status updates.
     83 
     84     This class keeps track of the last update applied and is capable
     85     of reverting it.
     86     """
     87 
     88     def __init__(self, autocommit=False):
     89         """Initialize update manager.
     90 
     91         @param autocommit: If False just print out the update instead
     92             of committing it.
     93         """
     94         self.history = {}
     95         self.present = {}
     96         self.reporter = reporting.Reporter()
     97         self.phapi_lib = self.reporter.get_bug_tracker_client()
     98         self.autocommit = autocommit
     99 
    100 
    101     def revert(self):
    102         """Only manages status reverts as of now.
    103         """
    104         for issue_id, update in self.history.iteritems():
    105             logging.warning('You will have to manually update %s and %s on %s',
    106                     self.present[issue_id].labels,
    107                     self.present[issue_id].comment, issue_id)
    108             # Create a new update with just the status.
    109             self.update(issue_id, Update(status=update.status))
    110 
    111 
    112     def update(self, old_issue, update):
    113         """Record the state of an issue before updating it.
    114 
    115         @param old_issue: The issue to update. If an id is specified an
    116             issue is constructed. If an issue object (as defined in phapi_lib
    117             Issue)is passed in, it is used directly.
    118         @param update: The Update object to apply to the issue.
    119         """
    120         if type(old_issue) == int:
    121             old_issue = self.phapi_lib.get_tracker_issue_by_id(old_issue)
    122         old_update = Update(
    123                 labels=old_issue.labels, status=old_issue.status)
    124 
    125         if not update.status:
    126             update.status = old_update.status
    127         elif (update.status not in Update.open_statuses and
    128               update.status not in Update.closed_statuses):
    129             raise ValueError('Unknown status %s' % update.status)
    130 
    131         if not self.autocommit:
    132             logging.warning('Would have applied the following update: '
    133                     '%s -> %s', old_update, update)
    134             return
    135 
    136         self.history[old_issue.id] = old_update
    137         self.reporter.modify_bug_report(
    138                 issue_id=old_issue.id, comment=update.comment,
    139                 label_update=update.labels,
    140                 status=update.status)
    141         self.present[old_issue.id] = update
    142 
    143 
    144 class Crawler(object):
    145     """Class capable of crawling crbug.
    146 
    147     This class applies filters to issues it crawls and caches them locally.
    148     """
    149 
    150     # The limit at which we ask for confirmation to proceed with the crawl.
    151     PROMPT_LIMIT = 2000
    152 
    153     def __init__(self):
    154         self.reporter = reporting.Reporter()
    155         self.phapi_client = self.reporter.get_bug_tracker_client()
    156         self.issues = None
    157         self.all_autofiled_query = 'ANCHOR  TestFailure'
    158         self.all_autofiled_label = 'autofiled'
    159         self.prompted = False
    160 
    161 
    162     def fuzzy_search(self, query='', label='', fast=True):
    163         """Returns all issues using one query and/or one label.
    164 
    165         @param query: A string representing the query.
    166         @param label: A string representing the label.
    167         @param fast: If true, don't bother fetching comments.
    168 
    169         @return: A list of issues matching the query. If fast is
    170             specified the issues won't have comments.
    171         """
    172         if not query and not label:
    173             raise ValueError('Require query or labels to make a tracker query, '
    174                     'try query = "%s" or one of the predefined labels %s' %
    175                     (self.fuzzy_search_anchor(),
    176                      self.reporter._PREDEFINED_LABELS))
    177         if type(label) != str:
    178             raise ValueError('The crawler only supports one label per query, '
    179                     'and it must be a string. you supplied %s' % label)
    180         return self.phapi_client.get_tracker_issues_by_text(
    181                 query, label=label, full_text=not fast)
    182 
    183 
    184     @staticmethod
    185     def _get_autofiled_count(issue):
    186         """Return the autofiled count.
    187 
    188         @param issue: An issue object that has labels.
    189 
    190         @return: An integer representing the autofiled count.
    191         """
    192         for label in issue.labels:
    193             if 'autofiled-count-' in label:
    194                 return int(label.replace('autofiled-count-', ''))
    195 
    196         # Force bugs without autofiled-count to sink
    197         return 0
    198 
    199 
    200     def _prompt_crawl(self, new_issues, start_index):
    201         """Warn the user that a crawl is getting large.
    202 
    203         This method prompts for a y/n answer in case the user wants to abort the
    204         crawl and specify another set of labels/queries.
    205 
    206         @param new_issues: A list of issues used with the start_index to
    207             determine the number of issues already processed.
    208         @param start_index: The start index of the next crawl iteration.
    209         """
    210         logging.warning('Found %s issues, Crawling issues starting from %s',
    211                 len(new_issues), start_index)
    212         if start_index > self.PROMPT_LIMIT and not self.prompted:
    213             logging.warning('Already crawled %s issues, it is possible that'
    214                     'you\'ve specified a very general label. If this is the '
    215                     'case consider re-rodering the labels so they start with '
    216                     'the rarest. Continue crawling [y/n]?',
    217                     start_index + len(new_issues))
    218             self.prompted = raw_input() == 'y'
    219             if not self.prompted:
    220                 sys.exit(0)
    221 
    222 
    223     def exhaustive_crawl(self, query='', label='', fast=True):
    224         """Perform an exhaustive crawl using one label and query string.
    225 
    226         @param query: A string representing one query.
    227         @param lable: A string representing one label.
    228 
    229         @return A list of issues sorted by descending autofiled count.
    230         """
    231         start_index = 0
    232         self.phapi_client.set_max_results(200)
    233         logging.warning('Performing an exhaustive crawl with label %s query %s',
    234                 label, query)
    235         vague_issues = []
    236         new_issues = self.fuzzy_search(query=query, label=label, fast=fast)
    237         while new_issues:
    238             vague_issues += new_issues
    239             start_index += len(new_issues) + 1
    240             self.phapi_client.set_start_index(start_index)
    241             new_issues = self.fuzzy_search(query=query, label=label,
    242                     fast=fast)
    243             self._prompt_crawl(new_issues, start_index)
    244 
    245         # Subsequent calls will clear the issues cache with new results.
    246         self.phapi_client.set_start_index(1)
    247         return sorted(vague_issues, reverse=True,
    248                       key=lambda issue: self._get_autofiled_count(issue))
    249 
    250 
    251     @staticmethod
    252     def filter_labels(issues, labels):
    253         """Takes a list of labels and returns matching issues.
    254 
    255         @param issues: A list of issues to parse for labels.
    256         @param labels: A list of labels to match.
    257 
    258         @return: A list of matching issues. The issues must contain
    259             all the labels specified.
    260         """
    261         if not labels:
    262             return issues
    263         matching_issues = set([])
    264         labels = set(labels)
    265         for issue in issues:
    266             issue_labels = set(issue.labels)
    267             if issue_labels.issuperset(labels):
    268                 matching_issues.add(issue)
    269         return matching_issues
    270 
    271 
    272     @classmethod
    273     def does_query_match(cls, issue, query):
    274         """Check if a query matches the given issue.
    275 
    276         @param issue: The issue to check.
    277         @param query: The query to check against.
    278 
    279         @return: True if the query matches, false otherwise.
    280         """
    281         if query in issue.title or query in issue.summary:
    282             return True
    283         # We can only search comments if the issue is a complete issue
    284         # i.e as defined in phapi_lib.Issue.
    285         try:
    286             if any(query in comment for comment in issue.comments):
    287                 return True
    288         except (AttributeError, TypeError):
    289             pass
    290         return False
    291 
    292 
    293     @classmethod
    294     def filter_queries(cls, issues, queries):
    295         """Take a list of queries and returns matching issues.
    296 
    297         @param issues: A list of issues to parse. If the issues contain
    298             comments and a query is not in the issues title or summmary,
    299             the comments are parsed for a substring match.
    300         @param queries: A list of queries to parse the issues for.
    301             This method looks for an exact substring match within each issue.
    302 
    303         @return: A list of matching issues.
    304         """
    305         if not queries:
    306             return issues
    307         matching_issues = set([])
    308         for issue in issues:
    309             # For each query, check if it's in the title, description or
    310             # comments. If a query isn't in any of these, discard the issue.
    311             for query in queries:
    312                 if cls.does_query_match(issue, query):
    313                     matching_issues.add(issue)
    314                 else:
    315                     if issue in matching_issues:
    316                         logging.warning('%s: %s\n \tPassed a subset of the '
    317                                 'queries but failed query %s',
    318                                 issue.id, issue.title, query)
    319                         matching_issues.remove(issue)
    320                     break
    321         return matching_issues
    322 
    323 
    324     def filter_issues(self, queries='', labels=None, fast=True):
    325         """Run the queries, labels filters by crawling crbug.
    326 
    327         @param queries: A space seperated string of queries, usually passed
    328             through the command line.
    329         @param labels: A space seperated string of labels, usually passed
    330             through the command line.
    331         @param fast: If specified, skip creating comments for issues since this
    332             can be a slow process. This value is only a suggestion, since it is
    333             ignored if multiple queries are specified.
    334         """
    335         queries = shlex.split(queries)
    336         labels = shlex.split(labels) if labels else None
    337 
    338         # We'll need comments to filter multiple queries.
    339         if len(queries) > 1:
    340             fast = False
    341         matching_issues = self.exhaustive_crawl(
    342                 query=queries.pop(0) if queries else '',
    343                 label=labels.pop(0) if labels else '', fast=fast)
    344         matching_issues = self.filter_labels(matching_issues, labels)
    345         matching_issues = self.filter_queries(matching_issues, queries)
    346         self.issues = list(matching_issues)
    347 
    348 
    349     def dump_issues(self, limit=None):
    350         """Print issues.
    351         """
    352         if limit and limit < len(self.issues):
    353             issues = self.issues[:limit]
    354         else:
    355             issues = self.issues
    356         #TODO: Modify formatting, include some paging etc.
    357         for issue in issues:
    358             try:
    359                 print ('[%s] %s crbug.com/%s %s' %
    360                        (self._get_autofiled_count(issue),
    361                         issue.status, issue.id, issue.title))
    362             except UnicodeEncodeError as e:
    363                 print "Unicdoe error decoding issue id %s" % issue.id
    364                 continue
    365 
    366 
    367 def _update_test(args):
    368     """A simple update test, to record usage.
    369     """
    370     updater = UpdateManager(autocommit=True)
    371     for issue in issues:
    372         updater.update(issue,
    373                        Update(comment='this is bogus', labels=['bogus'],
    374                               status='Assigned'))
    375     updater.revert()
    376 
    377 
    378 def configure_logging(quiet=False):
    379     """Configure logging.
    380 
    381     @param quiet: True to turn off warning messages.
    382     """
    383     logging.basicConfig()
    384     logger = logging.getLogger()
    385     level = logging.WARNING
    386     if quiet:
    387         level = logging.ERROR
    388     logger.setLevel(level)
    389 
    390 
    391 def main(args):
    392     crawler = Crawler()
    393     if args.reap:
    394         if args.queries or args.labels:
    395             logging.error('Query based ranking of bugs not supported yet.')
    396             return
    397         queries = ''
    398         labels = crawler.all_autofiled_label
    399     else:
    400         queries = args.queries
    401         labels = args.labels
    402     crawler.filter_issues(queries=queries, labels=labels,
    403             fast=False if queries else True)
    404     crawler.dump_issues(int(args.num))
    405     logging.warning('\nThis is a truncated list of %s results, use --num %s '
    406             'to get them all. If you want more informative results/better '
    407             'querying capabilities try crbug_shell.py.',
    408             args.num, len(crawler.issues))
    409 
    410 
if __name__ == '__main__':
    # Parse CLI flags, set log verbosity accordingly, then run the crawl.
    args = _parse_args(sys.argv[1:])
    configure_logging(args.quiet)
    main(args)
    415 
    416