Home | History | Annotate | Download | only in site_utils
      1 #!/usr/bin/python
      2 #
      3 # Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
      4 # Use of this source code is governed by a BSD-style license that can be
      5 # found in the LICENSE file.
      6 
      7 
      8 """Tool for running suites of tests and waiting for completion.
      9 
     10 The desired test suite will be scheduled with autotest. By default,
     11 this tool will block until the job is complete, printing a summary
     12 at the end.  Error conditions result in exceptions.
     13 
     14 This is intended for use only with Chrome OS test suites that leverage the
     15 dynamic suite infrastructure in server/cros/dynamic_suite.py.
     16 
     17 This script exits with one of the following codes:
     18 0 - OK: Suite finished successfully
     19 1 - ERROR: Test(s) failed, or hits its own timeout
     20 2 - WARNING: Test(s) raised a warning or passed on retry, none failed/timed out.
     21 3 - INFRA_FAILURE: Infrastructure related issues, e.g.
     22     * Lab is down
     23     * Too many duts (defined as a constant) in repair failed status
     24     * Suite job issues, like bug in dynamic suite,
     25       user aborted the suite, lose a drone/all devservers/rpc server,
     26       0 tests ran, etc.
     27     * provision failed
     28       TODO(fdeng): crbug.com/413918, reexamine treating all provision
     29                    failures as INFRA failures.
     30 4 - SUITE_TIMEOUT: Suite timed out, some tests ran,
     31     none failed by the time the suite job was aborted. This will cover,
     32     but not limited to, the following cases:
     33     * A devserver failure that manifests as a timeout
     34     * No DUTs available midway through a suite
     35     * Provision/Reset/Cleanup took longer time than expected for new image
     36     * A regression in scheduler tick time.
     37 5 - BOARD_NOT_AVAILABLE: If there is no host for the requested board/pool.
     38 6 - INVALID_OPTIONS: If options are not valid.
     39 """
     40 
     41 import argparse
     42 import ast
     43 import collections
     44 from collections import namedtuple
     45 from datetime import datetime
     46 from datetime import timedelta
     47 import functools
     48 import getpass
     49 import json
     50 import logging
     51 import os
     52 import re
     53 import sys
     54 import time
     55 import warnings
     56 
     57 import common
     58 from chromite.lib import buildbot_annotations as annotations
     59 
     60 from autotest_lib.client.common_lib import control_data
     61 from autotest_lib.client.common_lib import error
     62 from autotest_lib.client.common_lib import global_config, enum
     63 from autotest_lib.client.common_lib import priorities
     64 from autotest_lib.client.common_lib import time_utils
     65 from autotest_lib.client.common_lib.cros import retry
     66 from autotest_lib.frontend.afe import rpc_client_lib
     67 from autotest_lib.frontend.afe.json_rpc import proxy
     68 from autotest_lib.server import site_utils
     69 from autotest_lib.server import utils
     70 from autotest_lib.server.cros import provision
     71 from autotest_lib.server.cros.dynamic_suite import constants
     72 from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
     73 from autotest_lib.server.cros.dynamic_suite import reporting_utils
     74 from autotest_lib.server.cros.dynamic_suite import tools
     75 from autotest_lib.site_utils import diagnosis_utils
     76 from autotest_lib.site_utils import job_overhead
     77 
# Shared handle to the autotest global configuration; the module-level
# defaults below are read from it once at import time.
CONFIG = global_config.global_config

# Value of the 'hostname' key in the SERVER section of the global config.
_DEFAULT_AUTOTEST_INSTANCE = CONFIG.get_config_value(
        'SERVER', 'hostname', type=str)
# %-style pattern read from CROS/log_url_pattern.  LogLink fills it with
# (server address including protocol, job string) to build a log URL.
_URL_PATTERN = CONFIG.get_config_value('CROS', 'log_url_pattern', type=str)

# Return code that will be sent back to autotest_rpc_server.py
# NOTE(review): the member order here mirrors the exit codes 0-6 listed in
# the module docstring; presumably enum.Enum numbers members from 0 in
# declaration order -- confirm before reordering.
RETURN_CODES = enum.Enum(
        'OK', 'ERROR', 'WARNING', 'INFRA_FAILURE', 'SUITE_TIMEOUT',
        'BOARD_NOT_AVAILABLE', 'INVALID_OPTIONS')

# Minimum RPC timeout setting for calls expected to take long time, e.g.,
# create_suite_job. If default socket time (socket.getdefaulttimeout()) is
# None or greater than this value, the default will be used.
# The value here is set to be the same as the timeout for the RetryingAFE object
# so long running RPCs can wait long enough before being aborted.
_MIN_RPC_TIMEOUT = 600

# Number of days back to search for existing job.
_SEARCH_JOB_MAX_DAYS = 14

# NOTE(review): presumably the name of the provision suite, which gets
# special handling elsewhere in this file -- confirm against its users.
_PROVISION_SUITE = 'provision'
    100 
    101 
    102 @functools.total_ordering
    103 class _ReturnResult(object):
    104     """Represents overall result of run_suite operation.
    105 
    106     _ReturnResult instances sort based on priority (the order in
    107     _RETURN_RESULTS).
    108 
    109     Furthermore, _ReturnResult instances can be combined by bitwise or
    110     ("union"), which returns the instance with the higher priority
    111     between the two (the instance with higher priority is a "superset"
    112     of the other).
    113 
    114     Do not create new instances of this; use _RETURN_RESULTS instead.
    115     """
    116 
    117     def __init__(self, return_code, message):
    118         self.return_code = return_code
    119         self.message = message
    120 
    121     def __repr__(self):
    122         return '<{cls} {key}, {this.return_code}, {this.message}>'.format(
    123             cls=type(self).__name__,
    124             key=self._getkey(),
    125             this=self)
    126 
    127     def __gt__(self, other):
    128         if isinstance(other, type(self)):
    129             return self._getkey() > other._getkey()
    130         else:
    131             return NotImplemented
    132 
    133     def __eq__(self, other):
    134         if isinstance(other, type(self)):
    135             return (self.return_code == other.return_code
    136                     and self.message == other.message)
    137         else:
    138             return NotImplemented
    139 
    140     def __hash__(self):
    141         return hash(self.return_code) ^ hash(self.message)
    142 
    143     def __or__(self, other):
    144         if isinstance(other, type(self)):
    145             if self > other:
    146                 return self
    147             else:
    148                 return other
    149         else:
    150             return NotImplemented
    151 
    152     def _getkey(self):
    153         """Return sort key."""
    154         return _RETURN_RESULTS_LIST.index(self)
    155 
    156     def suite_result(self, output_dict=None):
    157         """Make a SuiteResult using this _ReturnResult.
    158 
    159         @param output_dict: output_dict to merge into SuiteResult.
    160         """
    161         if output_dict is None:
    162             output_dict = dict()
    163         else:
    164             output_dict = output_dict.copy()
    165         if self.message:
    166             output_dict['return_message'] = self.message
    167         return SuiteResult(self.return_code, output_dict)
    168 
    169 
# Registry of every possible overall result, keyed by a short name.
# The insertion order is significant: a _ReturnResult's sort key is its
# position in this sequence, so later entries take priority over earlier
# ones when results are compared or combined with `|`.
_RETURN_RESULTS = collections.OrderedDict([
    ('ok', _ReturnResult(RETURN_CODES.OK, '')),

    ('test_warning', _ReturnResult(
        RETURN_CODES.WARNING, 'Test job raised warning.')),
    ('suite_warning', _ReturnResult(
        RETURN_CODES.WARNING, 'Suite job raised warning.')),
    ('test_retry', _ReturnResult(
        RETURN_CODES.WARNING, 'Tests were retried.')),

    ('test_aborted_prestart', _ReturnResult(
        RETURN_CODES.SUITE_TIMEOUT,
        'Tests were aborted before running; suite must have timed out.')),
    # This really indicates a user action or an infra failure. But, suite
    # timeouts cause similar failures in the individual tests, so we must
    # classify these lower than suite_timeout. In case of a suite_timeout, the
    # result from the suite job will promote the result to suite_timeout.
    ('test_aborted_mystery',
     _ReturnResult(
             RETURN_CODES.SUITE_TIMEOUT,
             'Tests were aborted after running, but before timeout; '
             'Test was manually aborted or parsing results failed: '
             'crbug.com/796348.')),
    ('suite_timeout', _ReturnResult(
        RETURN_CODES.SUITE_TIMEOUT, 'Suite job timed out.')),

    ('test_views_missing', _ReturnResult(
        RETURN_CODES.INFRA_FAILURE, 'No test views found.')),
    ('suite_failed', _ReturnResult(
        RETURN_CODES.INFRA_FAILURE, 'Suite job failed.')),
    ('provision_failed', _ReturnResult(
        RETURN_CODES.INFRA_FAILURE, 'Provisioning failed.')),

    ('test_failure', _ReturnResult(
        RETURN_CODES.ERROR, 'Tests failed.')),
])
# Priority-ordered list of the results above; _ReturnResult._getkey() looks
# up an instance's index in this list.
_RETURN_RESULTS_LIST = list(_RETURN_RESULTS.values())
    207 
    208 
    209 def bool_str(x):
    210     """Boolean string type for option arguments.
    211 
    212     @param x: string representation of boolean value.
    213 
    214     """
    215     if x == 'True':
    216         return True
    217     elif x == 'False':
    218         return False
    219     else:
    220         raise argparse.ArgumentTypeError(
    221             '%s is not one of True or False' % (x,))
    222 
    223 
    224 def _get_priority_value(x):
    225     """Convert a priority representation to its int value.
    226 
    227     Priorities can be described either by an int value (possibly as a string)
    228     or a name string.  This function coerces both forms to an int value.
    229 
    230     This function is intended for casting command line arguments during
    231     parsing.
    232 
    233     @param x: priority value as an int, int string, or name string
    234 
    235     @returns: int value of priority
    236     """
    237     try:
    238         return int(x)
    239     except ValueError:
    240         try:
    241             return priorities.Priority.get_value(x)
    242         except AttributeError:
    243             raise argparse.ArgumentTypeError(
    244                 'Unknown priority level %s.  Try one of %s.'
    245                 % (x, ', '.join(priorities.Priority.names)))
    246 
    247 
    248 def make_parser():
    249     """Make ArgumentParser instance for run_suite.py."""
    250     parser = argparse.ArgumentParser(
    251         usage="%(prog)s [options]")
    252     parser.add_argument("-b", "--board", dest="board")
    253     parser.add_argument(
    254             "--model",
    255             help="The device model to run tests against. For non-unified "
    256                  "builds, model and board are synonymous, but board is more "
    257                  "accurate in some cases. Only pass this option if your build "
    258                  "is a unified build.",
    259     )
    260     parser.add_argument("-i", "--build", dest="build")
    261     parser.add_argument(
    262         "-w", "--web", dest="web", default=None,
    263         help="Address of a webserver to receive suite requests.")
    264     parser.add_argument(
    265         '--cheets_build', dest='cheets_build', default=None,
    266         help='ChromeOS Android build to be installed on dut.')
    267     parser.add_argument(
    268         '--firmware_rw_build', dest='firmware_rw_build', default=None,
    269         help='Firmware build to be installed in dut RW firmware.')
    270     parser.add_argument(
    271         '--firmware_ro_build', dest='firmware_ro_build', default=None,
    272         help='Firmware build to be installed in dut RO firmware.')
    273     parser.add_argument(
    274         '--test_source_build', dest='test_source_build', default=None,
    275         help=('Build that contains the test code, '
    276               'e.g., it can be the value of `--build`, '
    277               '`--firmware_rw_build` or `--firmware_ro_build` '
    278               'arguments. Default is None, that is, use the test '
    279               'code from `--build` (CrOS image)'))
    280     #  This should just be a boolean flag, but the autotest "proxy" code
    281     #  can't handle flags that don't take arguments.
    282     parser.add_argument(
    283         "-n", "--no_wait", dest="no_wait", default=False, type=bool_str,
    284         help='Must pass "True" or "False" if used.')
    285     # If you really want no pool, --pool="" will do it. USE WITH CARE.
    286     parser.add_argument("-p", "--pool", dest="pool", default="suites")
    287     parser.add_argument("-s", "--suite_name", dest="name")
    288     parser.add_argument("-a", "--afe_timeout_mins", type=int,
    289                         dest="afe_timeout_mins", default=30)
    290     parser.add_argument("-t", "--timeout_mins", type=int,
    291                         dest="timeout_mins", default=1440)
    292     parser.add_argument("-x", "--max_runtime_mins", type=int,
    293                         dest="max_runtime_mins", default=1440)
    294     parser.add_argument("-d", "--delay_sec", type=int,
    295                         dest="delay_sec", default=10)
    296     parser.add_argument("-m", "--mock_job_id", dest="mock_job_id",
    297                         help="Attach to existing job id for already running "
    298                         "suite, and creates report.")
    299     # NOTE(akeshet): This looks similar to --no_wait, but behaves differently.
    300     # --no_wait is passed in to the suite rpc itself and affects the suite,
    301     # while this does not.
    302     parser.add_argument("-c", "--create_and_return", dest="create_and_return",
    303                         action="store_true",
    304                         help="Create the suite and print the job id, then "
    305                         "finish immediately.")
    306     parser.add_argument("-u", "--num", dest="num", type=int, default=None,
    307                         help="Deprecated, does nothing.")
    308     #  Same boolean flag issue applies here.
    309     parser.add_argument(
    310         "-f", "--file_bugs", dest="file_bugs", default=False, type=bool_str,
    311         help=('File bugs on test failures. Must pass "True" or '
    312               '"False" if used.'))
    313     parser.add_argument("-l", "--bypass_labstatus", dest="bypass_labstatus",
    314                         action="store_true", help='Bypass lab status check.')
    315     # We allow either a number or a string for the priority.  This way, if you
    316     # know what you're doing, one can specify a custom priority level between
    317     # other levels.
    318     parser.add_argument("-r", "--priority", dest="priority",
    319                         type=_get_priority_value,
    320                         default=priorities.Priority.DEFAULT,
    321                         action="store",
    322                         help="Priority of suite. Either numerical value, or "
    323                         "one of (" + ", ".join(priorities.Priority.names)
    324                         + ").")
    325     parser.add_argument(
    326         '--retry', dest='retry', default=False, type=bool_str, action='store',
    327         help='Enable test retry.  Must pass "True" or "False" if used.')
    328     parser.add_argument('--max_retries', dest='max_retries', default=None,
    329                         type=int, action='store', help='Maximum retries'
    330                         'allowed at suite level. No limit if not specified.')
    331     parser.add_argument('--minimum_duts', dest='minimum_duts', type=int,
    332                         default=0, action='store',
    333                         help='Check that the pool has at least such many '
    334                         'healthy machines, otherwise suite will not run. '
    335                         'Default to 0.')
    336     parser.add_argument('--suite_min_duts', dest='suite_min_duts', type=int,
    337                         default=0, action='store',
    338                         help='Preferred minimum number of machines. Scheduler '
    339                         'will prioritize on getting such many machines for '
    340                         'the suite when it is competing with another suite '
    341                         'that has a higher priority but already got minimum '
    342                         'machines it needs. Default to 0.')
    343     parser.add_argument("--suite_args", dest="suite_args",
    344                         type=ast.literal_eval,
    345                         default=None, action="store",
    346                         help="A dict of args passed to the suite control file.")
    347     parser.add_argument('--offload_failures_only',
    348                         dest='offload_failures_only', type=bool_str,
    349                         action='store', default=False,
    350                         help='Only enable gs_offloading for failed tests. '
    351                         'Successful tests will be deleted. Must pass "True"'
    352                         ' or "False" if used.')
    353     parser.add_argument('--use_suite_attr', dest='use_suite_attr',
    354                         action='store_true', default=False,
    355                         help='Advanced. Run the suite based on ATTRIBUTES of '
    356                         'control files, rather than SUITE.')
    357     parser.add_argument('--json_dump', dest='json_dump', action='store_true',
    358                         default=False,
    359                         help='Dump the output of run_suite to stdout.')
    360     parser.add_argument(
    361         '--run_prod_code', dest='run_prod_code',
    362         action='store_true', default=False,
    363         help='Run the test code that lives in prod aka the test '
    364         'code currently on the lab servers.')
    365     parser.add_argument(
    366         '--delay_minutes', type=int, default=0,
    367         help=('Delay the creation of test jobs for a given '
    368               'number of minutes. This argument can be used to '
    369               'force provision jobs being delayed, which helps '
    370               'to distribute loads across devservers.'))
    371     parser.add_argument(
    372         '--skip_duts_check', dest='skip_duts_check', action='store_true',
    373         default=False, help='If True, skip minimum available DUTs check')
    374     parser.add_argument(
    375         '--job_keyvals', dest='job_keyvals', type=ast.literal_eval,
    376         action='store', default=None,
    377         help='A dict of job keyvals to be inject to suite control file')
    378     parser.add_argument(
    379         '--test_args', dest='test_args', type=ast.literal_eval,
    380         action='store', default=None,
    381         help=('A dict of args passed all the way to each individual test that '
    382               'will be actually ran.'))
    383     parser.add_argument(
    384         '--require_logfile', action='store_true',
    385         help=('Stream logs of run_suite.py to a local file named '
    386               'run_suite-<build name>.log.'))
    387 
    388     # Used for monitoring purposes, to measure no-op swarming proxy latency.
    389     parser.add_argument('--do_nothing', action='store_true',
    390                         help=argparse.SUPPRESS)
    391 
    392     # Used when lab/job status checking is needed. Currently its only user is
    393     # suite scheduler v2.
    394     parser.add_argument(
    395         '--pre_check', action='store_true',
    396         help=('Check lab and job status before kicking off a suite. Used by '
    397               'suite scheduler v2.'))
    398 
    399     # TODO(crbug.com/763207): This is to support calling old moblab RPC
    400     # with ToT code.  This does not need to be supported after M62.
    401     parser.add_argument('--oldrpc', action='store_true',
    402                         help='Use old AFE RPC.')
    403 
    404     return parser
    405 
    406 
    407 def verify_and_clean_options(options):
    408     """Verify the validity of options.
    409 
    410     @param options: The parsed options to verify.
    411 
    412     @returns: True if verification passes, False otherwise.
    413 
    414     """
    415     if options.mock_job_id and (
    416             not options.build or not options.name or not options.board):
    417         print ('When using -m, need to specify build, board and suite '
    418                'name which you have used for creating the original job')
    419         return False
    420     else:
    421         if not options.build:
    422             print 'Need to specify which build to use'
    423             return False
    424         if not options.board:
    425             print 'Need to specify board'
    426             return False
    427         if not options.name:
    428             print 'Need to specify suite name'
    429             return False
    430     if options.num is not None:
    431         warnings.warn('-u/--num option is deprecated; it does nothing.')
    432     del options.num
    433     if not options.retry and options.max_retries is not None:
    434         print 'max_retries can only be used with --retry=True'
    435         return False
    436     if options.use_suite_attr and options.suite_args is not None:
    437         print ('The new suite control file cannot parse the suite_args: %s.'
    438                'Please not specify any suite_args here.' % options.suite_args)
    439         return False
    440     if options.no_wait and options.retry:
    441         print 'Test retry is not available when using --no_wait=True'
    442     # Default to use the test code in CrOS build.
    443     if not options.test_source_build and options.build:
    444         options.test_source_build = options.build
    445     return True
    446 
    447 
    448 def change_options_for_suite_attr(options):
    449     """Change options to be prepared to run the suite_attr_wrapper.
    450 
    451     If specify 'use_suite_attr' from the cmd line, it indicates to run the
    452     new style suite control file, suite_attr_wrapper. Then, change the
    453     options.name to 'suite_attr_wrapper', change the options.suite_args to
    454     include the arguments needed by suite_attr_wrapper.
    455 
    456     @param options: The verified options.
    457 
    458     @returns: The changed options.
    459 
    460     """
    461     # Convert the suite_name to attribute boolean expression.
    462     if type(options.name) is str:
    463         attr_filter_val = 'suite:%s' % options.name
    464     else:
    465         attr_filter_val = ' or '.join(['suite:%s' % x for x in options.name])
    466 
    467     # change the suite_args to be a dict of arguments for suite_attr_wrapper
    468     # if suite_args is not None, store the values in 'other_args' of the dict
    469     args_dict = {}
    470     args_dict['attr_filter'] = attr_filter_val
    471     options.suite_args = args_dict
    472     options.name = 'suite_attr_wrapper'
    473 
    474     return options
    475 
    476 
    477 class TestResult(object):
    478 
    479     """Represents the result of a TestView."""
    480 
    481     def __init__(self, test_view, retry_count=0):
    482         """Initialize instance.
    483 
    484         @param test_view: TestView instance.
    485         @param retry_count: Retry count for test.  Optional.
    486         """
    487         self.name = test_view.get_testname()
    488         self.status = test_view['status']
    489         self.reason = test_view['reason']
    490         self.retry_count = retry_count
    491 
    492     _PRETTY_STATUS_MAP = {
    493         'GOOD':    '[ PASSED ]',
    494         'TEST_NA': '[  INFO  ]',
    495     }
    496 
    497     @property
    498     def _pretty_status(self):
    499         """Pretty status string."""
    500         return self._PRETTY_STATUS_MAP.get(self.status, '[ FAILED ]')
    501 
    502     def log_using(self, log_function, name_column_width):
    503         """Log the test result using the given log function.
    504 
    505         @param log_function: Log function to use.  Example: logging.info
    506         @param name_column_width: Width of name column for formatting.
    507         """
    508         padded_name = self.name.ljust(name_column_width)
    509         log_function('%s%s', padded_name, self._pretty_status)
    510         if self.status != 'GOOD':
    511             log_function('%s  %s: %s', padded_name, self.status, self.reason)
    512         if self.retry_count > 0:
    513             log_function('%s  retry_count: %s', padded_name, self.retry_count)
    514 
    515 
    516 def get_original_suite_name(suite_name, suite_args):
    517     """Get the original suite name when running suite_attr_wrapper.
    518 
    519     @param suite_name: the name of the suite launched in afe. When it is
    520                        suite_attr_wrapper, the suite that actually running is
    521                        specified in the suite_args.
    522     @param suite_args: dict of suite args from argument parsing.
    523 
    524     @returns: the original suite name.
    525 
    526     """
    527     if suite_name == 'suite_attr_wrapper':
    528         attrs = suite_args.get('attr_filter', '')
    529         suite_list = ([x[6:] for x in re.split('[() ]', attrs)
    530                        if x and x.startswith('suite:')])
    531         return suite_list[0] if suite_list else suite_name
    532     return suite_name
    533 
    534 
    535 class LogLink(object):
    536     """Information needed to record a link in the logs.
    537 
    538     Depending on context and the information provided at
    539     construction time, the link may point to either to log files for
    540     a job, or to a bug filed for a failure in the job.
    541 
    542     @var anchor  The link text.
    543     @var url     The link url.
    544     @var bug_id  Id of a bug to link to, or None.
    545     """
    546 
    547     # A list of tests that don't get retried so skip the dashboard.
    548     _SKIP_RETRY_DASHBOARD = ['provision']
    549 
    550     _BUG_LINK_PREFIX = 'Auto-Bug'
    551     _LOG_LINK_PREFIX = 'Test-Logs'
    552 
    553 
    554     def __init__(self, anchor, server, job_string, bug_info=None, reason=None,
    555                  retry_count=0, testname=None, sponge_url=None):
    556         """Initialize the LogLink by generating the log URL.
    557 
    558         @param anchor      The link text.
    559         @param server      The hostname of the server this suite ran on.
    560         @param job_string  The job whose logs we'd like to link to.
    561         @param bug_info    Info about the bug, if one was filed.
    562         @param reason      A string representing the reason of failure if any.
    563         @param retry_count How many times the test has been retried.
    564         @param testname    Optional Arg that supplies the testname.
    565         @param sponge_url  url to Sponge result.
    566         """
    567         self.anchor = anchor
    568         self.url = _URL_PATTERN % (rpc_client_lib.add_protocol(server),
    569                                    job_string)
    570         self.reason = reason
    571         self.retry_count = retry_count
    572         self.testname = testname
    573         self.sponge_url = sponge_url
    574         if bug_info:
    575             self.bug_id, self.bug_count = bug_info
    576         else:
    577             self.bug_id = None
    578             self.bug_count = None
    579 
    580 
    581     @property
    582     def bug_url(self):
    583         """URL of associated bug."""
    584         if self.bug_id:
    585             return reporting_utils.link_crbug(self.bug_id)
    586         else:
    587             return None
    588 
    589 
    590     @property
    591     def _bug_count_text(self):
    592         """Return bug count as human friendly text."""
    593         if self.bug_count is None:
    594             bug_info = 'unknown number of reports'
    595         elif self.bug_count == 1:
    596             bug_info = 'new report'
    597         else:
    598             bug_info = '%s reports' % self.bug_count
    599         return bug_info
    600 
    601 
    602     def GenerateBuildbotLinks(self):
    603         """Generate a link formatted to meet buildbot expectations.
    604 
    605         If there is a bug associated with this link, report a link to the bug
    606         and a link to the job logs; otherwise report a link to the job logs.
    607 
    608         @return A generator of links formatted for the buildbot log annotator.
    609         """
    610         if self.bug_url:
    611             yield self._get_link_to_bug()
    612         yield self._get_link_to_job_logs()
    613 
    614 
    615     def _get_link_to_bug(self):
    616         """Return buildbot link to bug.
    617 
    618         @return A link formatted for the buildbot log annotator.
    619         """
    620         info_strings = self._get_info_strings()
    621         info_strings.append(self._bug_count_text)
    622         anchor_text = self._format_anchor_text(self._BUG_LINK_PREFIX,
    623                                                info_strings)
    624         return annotations.StepLink(anchor_text, self.bug_url)
    625 
    626 
    627     def _get_link_to_job_logs(self):
    628         """Return buildbot link to job logs.
    629 
    630         @return A link formatted for the buildbot log annotator.
    631         """
    632         anchor_text = self._format_anchor_text(self._LOG_LINK_PREFIX,
    633                                                self._get_info_strings())
    634         return annotations.StepLink(anchor_text, self.url)
    635 
    636 
    637     def _get_info_strings(self):
    638         """Return a list of info strings for _format_anchor_text()."""
    639         info_strings = []
    640         if self.retry_count > 0:
    641             info_strings.append('retry_count: %d' % self.retry_count)
    642         if self.reason:
    643             info_strings.append(self.reason)
    644         return info_strings
    645 
    646 
    647     def _format_anchor_text(self, prefix, info_strings):
    648         """Format anchor text given a prefix and info strings.
    649 
    650         @param prefix        The prefix of the anchor text.
    651         @param info_strings  Iterable of strings.
    652         @return A anchor_text with the right prefix and info strings.
    653         """
    654         return '[{prefix}]: {anchor}: {info}'.format(
    655             prefix=prefix,
    656             anchor=self.anchor.strip(),
    657             info=', '.join(info_strings))
    658 
    659     @property
    660     def text_link(self):
    661         """Link to the job's logs, for consumption by a human.
    662 
    663         @return A link formatted for human readability.
    664         """
    665         return '%s %s' % (self.anchor, self.url)
    666 
    def GenerateRetryLink(self):
        """Generate a link to the retry (flake) dashboard for this test.

        No link is produced when the test name is empty or is listed in
        _SKIP_RETRY_DASHBOARD.

        @return A link formatted for the buildbot log annotator, or None.
        """
        name = self.testname
        if name and name not in self._SKIP_RETRY_DASHBOARD:
            return annotations.StepLink(
                text='[Flake-Dashboard]: %s' % name,
                url=reporting_utils.link_retry_url(name))
        return None
    677 
    def GenerateHistoryLink(self):
        """Generate a link to the test history dashboard for this test.

        No link is produced when the test name is empty or is listed in
        _SKIP_RETRY_DASHBOARD.

        @return A link formatted for the buildbot log annotator, or None.
        """
        name = self.testname
        if name and name not in self._SKIP_RETRY_DASHBOARD:
            return annotations.StepLink(
                text='[Test-History]: %s' % name,
                url=reporting_utils.link_test_history(name))
        return None
    688 
    689 
    690 class Timings(object):
    691     """Timings for important events during a suite.
    692 
    693     All timestamps are datetime.datetime objects.
    694 
    695     @var suite_job_id: the afe job id of the suite job for which
    696                        we are recording the timing for.
    697     @var download_start_time: the time the devserver starts staging
    698                               the build artifacts. Recorded in create_suite_job.
    699     @var payload_end_time: the time when the artifacts only necessary to start
    700                            installsing images onto DUT's are staged.
    701                            Recorded in create_suite_job.
    702     @var artifact_end_time: the remaining artifacts are downloaded after we kick
    703                             off the reimaging job, at which point we record
    704                             artifact_end_time. Recorded in dynamic_suite.py.
    705     @var suite_start_time: the time the suite started.
    706     @var tests_start_time: the time the first test started running.
    707     @var tests_end_time: the time the last test finished running.
    708     """
    709 
    710     def __init__(self, suite_job_id):
    711         self.suite_job_id = suite_job_id
    712         # Timings related to staging artifacts on devserver.
    713         self.download_start_time = None
    714         self.payload_end_time = None
    715         self.artifact_end_time = None
    716 
    717         # The test_start_time, but taken off the view that corresponds to the
    718         # suite instead of an individual test.
    719         self.suite_start_time = None
    720 
    721         # Earliest and Latest tests in the set of TestViews passed to us.
    722         self.tests_start_time = None
    723         self.tests_end_time = None
    724 
    725 
    726     def RecordTiming(self, view):
    727         """Given a test report view, extract and record pertinent time info.
    728 
    729         get_detailed_test_views() returns a list of entries that provide
    730         info about the various parts of a suite run.  This method can take
    731         any one of these entries and look up timestamp info we might want
    732         and record it.
    733 
    734         If timestamps are unavailable, datetime.datetime.min/max will be used.
    735 
    736         @param view: A TestView object.
    737         """
    738         start_candidate = datetime.min
    739         end_candidate = datetime.max
    740         if view['test_started_time']:
    741             start_candidate = time_utils.time_string_to_datetime(
    742                     view['test_started_time'])
    743         if view['test_finished_time']:
    744             end_candidate = time_utils.time_string_to_datetime(
    745                     view['test_finished_time'])
    746 
    747         if view.get_testname() == TestView.SUITE_JOB:
    748             self.suite_start_time = start_candidate
    749         else:
    750             self._UpdateFirstTestStartTime(start_candidate)
    751             self._UpdateLastTestEndTime(end_candidate)
    752         if view['afe_job_id'] == self.suite_job_id and 'job_keyvals' in view:
    753             keyvals = view['job_keyvals']
    754             self.download_start_time = time_utils.time_string_to_datetime(
    755                     keyvals.get(constants.DOWNLOAD_STARTED_TIME),
    756                     handle_type_error=True)
    757 
    758             self.payload_end_time = time_utils.time_string_to_datetime(
    759                     keyvals.get(constants.PAYLOAD_FINISHED_TIME),
    760                     handle_type_error=True)
    761 
    762             self.artifact_end_time = time_utils.time_string_to_datetime(
    763                     keyvals.get(constants.ARTIFACT_FINISHED_TIME),
    764                     handle_type_error=True)
    765 
    766 
    767     def _UpdateFirstTestStartTime(self, candidate):
    768         """Update self.tests_start_time, iff candidate is an earlier time.
    769 
    770         @param candidate: a datetime.datetime object.
    771         """
    772         if not self.tests_start_time or candidate < self.tests_start_time:
    773             self.tests_start_time = candidate
    774 
    775 
    776     def _UpdateLastTestEndTime(self, candidate):
    777         """Update self.tests_end_time, iff candidate is a later time.
    778 
    779         @param candidate: a datetime.datetime object.
    780         """
    781         if not self.tests_end_time or candidate > self.tests_end_time:
    782             self.tests_end_time = candidate
    783 
    784 
    785     def __str__(self):
    786         return ('\n'
    787                 'Suite timings:\n'
    788                 'Downloads started at %s\n'
    789                 'Payload downloads ended at %s\n'
    790                 'Suite started at %s\n'
    791                 'Artifact downloads ended (at latest) at %s\n'
    792                 'Testing started at %s\n'
    793                 'Testing ended at %s\n' % (self.download_start_time,
    794                                            self.payload_end_time,
    795                                            self.suite_start_time,
    796                                            self.artifact_end_time,
    797                                            self.tests_start_time,
    798                                            self.tests_end_time))
    799 
    800 
    801 def instance_for_pool(pool_name):
    802     """
    803     Return the hostname of the server that should be used to service a suite
    804     for the specified pool.
    805 
    806     @param pool_name: The pool (without 'pool:' to schedule the suite against.
    807     @return: The correct host that should be used to service this suite run.
    808     """
    809     return CONFIG.get_config_value(
    810             'POOL_INSTANCE_SHARDING', pool_name,
    811             default=_DEFAULT_AUTOTEST_INSTANCE)
    812 
    813 
    814 class TestView(object):
    815     """Represents a test view and provides a set of helper functions."""
    816 
    817 
    818     SUITE_JOB = 'Suite job'
    819 
    820 
    821     def __init__(self, view, afe_job, suite_name, build, user,
    822                  solo_test_run=False):
    823         """Init a TestView object representing a tko test view.
    824 
    825         @param view: A dictionary representing a tko test view.
    826         @param afe_job: An instance of frontend.afe.models.Job
    827                         representing the job that kicked off the test.
    828         @param suite_name: The name of the suite
    829                            that the test belongs to.
    830         @param build: The build for which the test is run.
    831         @param user: The user for which the test is run.
    832         @param solo_test_run: This is a solo test run not part of a suite.
    833         """
    834         self.view = view
    835         self.afe_job = afe_job
    836         self.suite_name = suite_name
    837         self.build = build
    838         self.is_suite_view = afe_job.parent_job is None and not solo_test_run
    839         # This is the test name that will be shown in the output.
    840         self.testname = None
    841         self.user = user
    842 
    843         # The case that a job was aborted before it got a chance to run
    844         # usually indicates suite has timed out (unless aborted by user).
    845         # In this case, the abort reason will be None.
    846         # Update the reason with proper information.
    847         if (self.is_relevant_suite_view() and
    848                 not self.get_testname() == self.SUITE_JOB and
    849                 self.view['status'] == 'ABORT' and
    850                 not self.view['reason']):
    851             self.view['reason'] = 'Timed out, did not run.'
    852 
    853 
    854     def __getitem__(self, key):
    855         """Overload __getitem__ so that we can still use []
    856 
    857         @param key: A key of the tko test view.
    858 
    859         @returns: The value of an attribute in the view.
    860 
    861         """
    862         return self.view[key]
    863 
    864 
    865     def __iter__(self):
    866         """Overload __iter__ so that it supports 'in' operator."""
    867         return iter(self.view)
    868 
    869 
    870     def get_testname(self):
    871         """Get test name that should be shown in the output.
    872 
    873         Formalize the test_name we got from the test view.
    874 
    875         Remove 'build/suite' prefix if any.
    876 
    877         If one runs a test in control file via the following code,
    878            job.runtest('my_Test', tag='tag')
    879         for most of the cases, view['test_name'] would look like 'my_Test.tag'.
    880         If this is the case, this method will just return the original
    881         test name, i.e. 'my_Test.tag'.
    882 
    883         There are four special cases.
    884         1) A test view is for the suite job's SERVER_JOB.
    885            In this case, this method will return 'Suite job'.
    886 
    887         2) A test view is of a child job or a solo test run not part of a
    888            suite, and for a SERVER_JOB or CLIENT_JOB.
    889            In this case, we will take the job name, remove the build/suite
    890            prefix from the job name, and append the rest to 'SERVER_JOB'
    891            or 'CLIENT_JOB' as a prefix. So the names returned by this
    892            method will look like:
    893              'dummy_Pass_SERVER_JOB'
    894              'dummy_Fail_SERVER_JOB'
    895 
    896         3) A test view is of a suite job and its status is ABORT.
    897            In this case, the view['test_name'] is the child job's name.
    898            For instance,
    899              'lumpy-release/R35-5712.0.0/dummy/dummy_Pass'
    900              'lumpy-release/R35-5712.0.0/dummy/dummy_Fail'
    901            The above names will be converted to the following:
    902              'dummy_Pass'
    903              'dummy_Fail'
    904 
    905         4) A test view's status is of a suite job and its status is TEST_NA.
    906            In this case, the view['test_name'] is the NAME field of the control
    907            file. For instance,
    908              'dummy_Pass'
    909              'dummy_Fail'
    910            This method will not modify these names.
    911 
    912         @returns: Test name after normalization.
    913 
    914         """
    915         if self.testname is not None:
    916             return self.testname
    917 
    918         if (self.is_suite_view and
    919                 self.view['test_name'].startswith('SERVER_JOB')):
    920             # Rename suite job's SERVER_JOB to 'Suite job'.
    921             self.testname = self.SUITE_JOB
    922             return self.testname
    923 
    924         if (self.view['test_name'].startswith('SERVER_JOB') or
    925                 self.view['test_name'].startswith('CLIENT_JOB')):
    926             # Append job name as a prefix for SERVER_JOB and CLIENT_JOB
    927             testname= '%s_%s' % (self.view['job_name'], self.view['test_name'])
    928         else:
    929             testname = self.view['test_name']
    930         # Remove the build and suite name from testname if any.
    931         self.testname = tools.get_test_name(
    932                 self.build, self.suite_name, testname)
    933         return self.testname
    934 
    935 
    936     def is_relevant_suite_view(self):
    937         """Checks whether this is a suite view we should care about.
    938 
    939         @returns: True if it is relevant. False otherwise.
    940         """
    941         return (self.get_testname() == self.SUITE_JOB or
    942                 (self.is_suite_view and
    943                     not self.view['test_name'].startswith('CLIENT_JOB') and
    944                     not self.view['subdir']))
    945 
    946 
    947     def is_test(self):
    948         """Return whether the view is for an actual test.
    949 
    950         @returns True if the view is for an actual test.
    951                  False if the view is for SERVER_JOB or CLIENT_JOB.
    952 
    953         """
    954         return not (self.view['test_name'].startswith('SERVER_JOB') or
    955                 self.view['test_name'].startswith('CLIENT_JOB'))
    956 
    957 
    958     def is_retry(self):
    959         """Check whether the view is for a retry.
    960 
    961         @returns: True, if the view is for a retry; False otherwise.
    962 
    963         """
    964         return self.view['job_keyvals'].get('retry_original_job_id') is not None
    965 
    966 
    967     def hit_timeout(self):
    968         """Check whether the corresponding job has hit its own timeout.
    969 
    970         Note this method should not be called for those test views
    971         that belongs to a suite job and are determined as irrelevant
    972         by is_relevant_suite_view.  This is because they are associated
    973         to the suite job, whose job start/finished time make no sense
    974         to an irrelevant test view.
    975 
    976         @returns: True if the corresponding afe job has hit timeout.
    977                   False otherwise.
    978         """
    979         if (self.is_relevant_suite_view() and
    980                 self.get_testname() != self.SUITE_JOB):
    981             # Any relevant suite test view except SUITE_JOB
    982             # did not hit its own timeout because it was not ever run.
    983             return False
    984         start = (datetime.strptime(
    985                 self.view['job_started_time'], time_utils.TIME_FMT)
    986                 if self.view['job_started_time'] else None)
    987         end = (datetime.strptime(
    988                 self.view['job_finished_time'], time_utils.TIME_FMT)
    989                 if self.view['job_finished_time'] else None)
    990         if not start or not end:
    991             return False
    992         else:
    993             return ((end - start).total_seconds()/60.0
    994                         > self.afe_job.max_runtime_mins)
    995 
    996 
    997     def is_aborted(self):
    998         """Check if the view was aborted.
    999 
   1000         For suite job and child job test views, we check job keyval
   1001         'aborted_by' and test status.
   1002 
   1003         For relevant suite job test views, we only check test status
   1004         because the suite job keyval won't make sense to individual
   1005         test views.
   1006 
   1007         @returns: True if the test was as aborted, False otherwise.
   1008 
   1009         """
   1010 
   1011         if (self.is_relevant_suite_view() and
   1012                 self.get_testname() != self.SUITE_JOB):
   1013             return self.view['status'] == 'ABORT'
   1014         else:
   1015             return (bool(self.view['job_keyvals'].get('aborted_by')) and
   1016                     self.view['status'] in ['ABORT', 'RUNNING'])
   1017 
   1018 
   1019     def is_in_fail_status(self):
   1020         """Check if the given test's status corresponds to a failure.
   1021 
   1022         @returns: True if the test's status is FAIL or ERROR. False otherwise.
   1023 
   1024         """
   1025         # All the statuses tests can have when they fail.
   1026         return self.view['status'] in ['FAIL', 'ERROR', 'ABORT']
   1027 
   1028 
   1029     def is_provision(self):
   1030         """Check whether this is a provision test."""
   1031         return self.get_testname() == 'provision'
   1032 
   1033 
   1034     def get_buildbot_link_reason(self):
   1035         """Generate the buildbot link reason for the test.
   1036 
   1037         @returns: A string representing the reason.
   1038 
   1039         """
   1040         return ('%s: %s' % (self.view['status'], self.view['reason'])
   1041                 if self.view['reason'] else self.view['status'])
   1042 
   1043 
   1044     def get_job_id_owner_str(self):
   1045         """Generate the job_id_owner string for a test.
   1046 
   1047         @returns: A string which looks like 135036-username
   1048 
   1049         """
   1050         return '%s-%s' % (self.view['afe_job_id'], self.user)
   1051 
   1052 
   1053     def get_bug_info(self, suite_job_keyvals):
   1054         """Get the bug info from suite_job_keyvals.
   1055 
   1056         If a bug has been filed for the test, its bug info (bug id and counts)
   1057         will be stored in the suite job's keyvals. This method attempts to
   1058         retrieve bug info of the test from |suite_job_keyvals|. It will return
   1059         None if no bug info is found. No need to check bug info if the view is
   1060         SUITE_JOB.
   1061 
   1062         @param suite_job_keyvals: The job keyval dictionary of the suite job.
   1063                 All the bug info about child jobs are stored in
   1064                 suite job's keyvals.
   1065 
   1066         @returns: None if there is no bug info, or a pair with the
   1067                   id of the bug, and the count of the number of
   1068                   times the bug has been seen.
   1069 
   1070         """
   1071         if self.get_testname() == self.SUITE_JOB:
   1072             return None
   1073         if (self.view['test_name'].startswith('SERVER_JOB') or
   1074                 self.view['test_name'].startswith('CLIENT_JOB')):
   1075             # Append job name as a prefix for SERVER_JOB and CLIENT_JOB
   1076             testname= '%s_%s' % (self.view['job_name'], self.view['test_name'])
   1077         else:
   1078             testname = self.view['test_name']
   1079 
   1080         return tools.get_test_failure_bug_info(
   1081                 suite_job_keyvals, self.view['afe_job_id'],
   1082                 testname)
   1083 
   1084 
   1085     def should_display_buildbot_link(self):
   1086         """Check whether a buildbot link should show for this view.
   1087 
   1088         For suite job view, show buildbot link if it fails.
   1089         For normal test view,
   1090             show buildbot link if it is a retry
   1091             show buildbot link if it hits its own timeout.
   1092             show buildbot link if it fails. This doesn't
   1093             include the case where it was aborted but has
   1094             not hit its own timeout (most likely it was aborted because
   1095             suite has timed out).
   1096 
   1097         @returns: True if we should show the buildbot link.
   1098                   False otherwise.
   1099         """
   1100         is_bad_status = (self.view['status'] != 'GOOD' and
   1101                          self.view['status'] != 'TEST_NA')
   1102         if self.get_testname() == self.SUITE_JOB:
   1103             return is_bad_status
   1104         else:
   1105             if self.is_retry():
   1106                 return True
   1107             if is_bad_status:
   1108                 return not self.is_aborted() or self.hit_timeout()
   1109 
   1110 
   1111     def get_control_file_attributes(self):
   1112         """Get the attributes from the control file of the test.
   1113 
   1114         @returns: A list of test attribute or None.
   1115         """
   1116         control_file = self.afe_job.control_file
   1117         attributes = None
   1118         if control_file:
   1119             cd = control_data.parse_control_string(control_file)
   1120             attributes = list(cd.attributes)
   1121         return attributes
   1122 
   1123 
   1124     def override_afe_job_id(self, afe_job_id):
   1125         """Overrides the AFE job id for the test.
   1126 
   1127         @param afe_job_id: The new AFE job id to use.
   1128         """
   1129         self.view['afe_job_id'] = afe_job_id
   1130 
   1131 
   1132 def log_buildbot_links(log_func, links):
   1133     """Output buildbot links to log.
   1134 
   1135     @param log_func: Logging function to use.
   1136     @param links: Iterable of LogLink instances.
   1137     """
   1138     for link in links:
   1139         for generated_link in link.GenerateBuildbotLinks():
   1140             log_func(generated_link)
   1141         retry_link = link.GenerateRetryLink()
   1142         if retry_link:
   1143             log_func(retry_link)
   1144         history_link = link.GenerateHistoryLink()
   1145         if history_link:
   1146             log_func(history_link)
   1147 
   1148 
   1149 class _ReturnCodeComputer(object):
   1150     """This is responsible for returning the _ReturnResult for a suite."""
   1151 
   1152     def __call__(self, test_views):
   1153         """Compute the exit code based on test results."""
   1154         result = _RETURN_RESULTS['ok']
   1155 
   1156         for v in test_views:
   1157             if v.get_testname() == TestView.SUITE_JOB:
   1158                 result |= self._get_suite_result(v)
   1159             else:
   1160                 result |= self._get_test_result(v)
   1161         return result
   1162 
   1163     def _get_suite_result(self, test_view):
   1164         """Return the _ReturnResult for the given suite job."""
   1165         # The order of checking each case is important.
   1166         if test_view.is_aborted() and test_view.hit_timeout():
   1167             return _RETURN_RESULTS['suite_timeout']
   1168         elif test_view.is_in_fail_status():
   1169             return _RETURN_RESULTS['suite_failed']
   1170         elif test_view['status'] == 'WARN':
   1171             return _RETURN_RESULTS['suite_warning']
   1172         else:
   1173             return _RETURN_RESULTS['ok']
   1174 
   1175     def _get_test_result(self, test_view):
   1176         """Return the _ReturnResult for the given test job."""
   1177         # The order of checking each case is important.
   1178         if test_view.is_aborted() and test_view.is_relevant_suite_view():
   1179             # The test was aborted before started
   1180             # This gurantees that the suite has timed out.
   1181             return _RETURN_RESULTS['test_aborted_prestart']
   1182         elif test_view.is_aborted() and not test_view.hit_timeout():
   1183             # The test was aborted, but
   1184             # not due to a timeout. This is most likely
   1185             # because the suite has timed out, but may
   1186             # also because it was aborted by the user.
   1187             # Since suite timing out is determined by checking
   1188             # the suite job view, we simply ignore this view here.
   1189             return _RETURN_RESULTS['test_aborted_mystery']
   1190         elif test_view.is_in_fail_status():  # The test job failed
   1191             if test_view.is_provision():
   1192                 return _RETURN_RESULTS['provision_failed']
   1193             else:
   1194                 return _RETURN_RESULTS['test_failure']
   1195         elif test_view['status'] == 'WARN':
   1196             return _RETURN_RESULTS['test_warning']
   1197         elif test_view.is_retry():
   1198             # The test is a passing retry.
   1199             return _RETURN_RESULTS['test_retry']
   1200         else:
   1201             return _RETURN_RESULTS['ok']
   1202 
   1203 
   1204 class _ProvisionReturnCodeComputer(_ReturnCodeComputer):
   1205     """This is used for returning the _ReturnResult for provision suites."""
   1206 
   1207     def __init__(self, num_required):
   1208         """Initialize instance.
   1209 
   1210         num_required is the number of passing provision jobs needed.
   1211         """
   1212         super(_ProvisionReturnCodeComputer, self).__init__()
   1213         self._num_required = num_required
   1214         self._num_successful = 0
   1215 
   1216     def __call__(self, test_views):
   1217         result = super(_ProvisionReturnCodeComputer, self).__call__(test_views)
   1218         if self._num_successful >= self._num_required:
   1219             logging.info('Return result upgraded from %r'
   1220                          ' due to enough ok provisions',
   1221                          result)
   1222             return _RETURN_RESULTS['ok']
   1223         else:
   1224             return result
   1225 
   1226     def _get_test_result(self, test_view):
   1227         result = (super(_ProvisionReturnCodeComputer, self)
   1228                   ._get_test_result(test_view))
   1229         if result in {_RETURN_RESULTS[s] for s in ('ok', 'test_retry')}:
   1230             self._num_successful += 1
   1231         return result
   1232 
   1233 
   1234 class ResultCollector(object):
   1235     """Collect test results of a suite or a single test run.
   1236 
   1237     Once a suite job has finished, use this class to collect test results.
   1238     `run` is the core method that is to be called first. Then the caller
   1239     could retrieve information like return code, return message, is_aborted,
   1240     and timings by accessing the collector's public attributes. And output
   1241     the test results and links by calling the 'output_*' methods.
   1242 
   1243     Here is a overview of what `run` method does.
   1244 
   1245     1) Collect the suite job's results from tko_test_view_2.
   1246     For the suite job, we only pull test views without a 'subdir'.
   1247     A NULL subdir indicates that the test was _not_ executed. This could be
   1248     that no child job was scheduled for this test or the child job got
   1249     aborted before starts running.
   1250     (Note 'SERVER_JOB'/'CLIENT_JOB' are handled specially)
   1251 
   1252     2) Collect the child jobs' results from tko_test_view_2.
   1253     For child jobs, we pull all the test views associated with them.
   1254     (Note 'SERVER_JOB'/'CLIENT_JOB' are handled specially)
   1255 
   1256     3) Generate web and buildbot links.
   1257     4) Compute timings of the suite run.
   1258     5) Compute the return code based on test results.
   1259 
   1260     @var _instance_server: The hostname of the server that is used
   1261                            to service the suite.
   1262     @var _afe: The afe rpc client.
   1263     @var _tko: The tko rpc client.
   1264     @var _build: The build for which the suite is run,
   1265                  e.g. 'lumpy-release/R35-5712.0.0'
   1266     @var _board: The target board for which the suite is run,
   1267                  e.g., 'lumpy', 'link'.
   1268     @var _suite_name: The suite name, e.g. 'bvt', 'dummy'.
   1269     @var _suite_job_id: The job id of the suite for which we are going to
   1270                         collect results.
   1271     @var _original_suite_name: The suite name we record timing would be
   1272                                different from _suite_name when running
   1273                                suite_attr_wrapper.
   1274     @var _return_code_function: Called to return what the overall result of
   1275                                 the suite is.
   1276     @var _suite_views: A list of TestView objects, representing relevant
   1277                        test views of the suite job.
   1278     @var _child_views: A list of TestView objects, representing test views
   1279                        of the child jobs.
   1280     @var _test_views: A list of TestView objects, representing all test views
   1281                       from _suite_views and _child_views.
   1282     @var _web_links: A list of web links pointing to the results of jobs.
   1283     @var buildbot_links: A list of buildbot links for non-passing tests.
   1284     @var _solo_test_run: True if this is a single test run.
   1285     @var return_result: The _ReturnResult of the suite run.
   1286     @var is_aborted: Whether the suite was aborted or not.
   1287                      True, False or None (aborting status is unknown yet)
   1288     @var timings: A Timing object that records the suite's timings.
   1289 
   1290     """
   1291 
   1292 
   1293     def __init__(self, instance_server, afe, tko, build, board,
   1294                  suite_name, suite_job_id,
   1295                  return_code_function,
   1296                  original_suite_name=None,
   1297                  user=None, solo_test_run=False):
   1298         self._instance_server = instance_server
   1299         self._afe = afe
   1300         self._tko = tko
   1301         self._build = build
   1302         self._board = board
   1303         self._suite_name = suite_name
   1304         self._suite_job_id = suite_job_id
   1305         self._original_suite_name = original_suite_name or suite_name
   1306         self._return_code_function = return_code_function
   1307         self._suite_views = []
   1308         self._child_views = []
   1309         self._test_views = []
   1310         self._retry_counts = {}
   1311         self._missing_results = {}
   1312         self._web_links = []
   1313         self.buildbot_links = []
   1314         self._num_child_jobs = 0
   1315         self.return_result = None
   1316         self.is_aborted = None
   1317         self.timings = None
   1318         self._user = user or getpass.getuser()
   1319         self._solo_test_run = solo_test_run
   1320 
   1321 
   1322     def _fetch_relevant_test_views_of_suite(self):
   1323         """Fetch relevant test views of the suite job.
   1324 
   1325         For the suite job, there will be a test view for SERVER_JOB, and views
   1326         for results of its child jobs. For example, assume we've created
   1327         a suite job (afe_job_id: 40) that runs dummy_Pass, dummy_Fail,
   1328         dummy_Pass.bluetooth. Assume dummy_Pass was aborted before running while
   1329         dummy_Path.bluetooth got TEST_NA as no duts have bluetooth.
   1330         So the suite job's test views would look like
   1331         _____________________________________________________________________
   1332         test_idx| job_idx|test_name           |subdir      |afe_job_id|status
   1333         10      | 1000   |SERVER_JOB          |----        |40        |GOOD
   1334         11      | 1000   |dummy_Pass          |NULL        |40        |ABORT
   1335         12      | 1000   |dummy_Fail.Fail     |41-onwer/...|40        |FAIL
   1336         13      | 1000   |dummy_Fail.Error    |42-owner/...|40        |ERROR
   1337         14      | 1000   |dummy_Pass.bluetooth|NULL        |40        |TEST_NA
   1338 
   1339         For a suite job, we only care about
   1340         a) The test view for the suite job's SERVER_JOB
   1341         b) The test views for real tests without a subdir. A NULL subdir
   1342            indicates that a test didn't get executed.
   1343         So, for the above example, we only keep test views whose test_idxs
   1344         are 10, 11, 14.
   1345 
   1346         @returns: A list of TestView objects, representing relevant
   1347                   test views of the suite job.
   1348 
   1349         """
   1350         suite_job = self._afe.get_jobs(id=self._suite_job_id)[0]
   1351         views = self._tko.run(call='get_detailed_test_views',
   1352                               afe_job_id=self._suite_job_id)
   1353         relevant_views = []
   1354         for v in views:
   1355             v = TestView(v, suite_job, self._suite_name, self._build, self._user,
   1356                          solo_test_run=self._solo_test_run)
   1357             if v.is_relevant_suite_view():
   1358                 # If the test doesn't have results in TKO and is being
   1359                 # displayed in the suite view instead of the child view,
   1360                 # then afe_job_id is incorrect and from the suite.
   1361                 # Override it based on the AFE job id which was missing
   1362                 # results.
   1363                 # TODO: This is likely inaccurate if a test has multiple
   1364                 # tries which all fail TKO parse stage.
   1365                 if v['test_name'] in self._missing_results:
   1366                     v.override_afe_job_id(
   1367                             self._missing_results[v['test_name']][0])
   1368                 relevant_views.append(v)
   1369         return relevant_views
   1370 
   1371 
   1372     def _compute_retry_count(self, view):
   1373         """Return how many times the test has been retried.
   1374 
   1375         @param view: A TestView instance.
   1376         @returns: An int value indicating the retry count.
   1377 
   1378         """
   1379         old_job = view['job_keyvals'].get('retry_original_job_id')
   1380         count = 0
   1381         while old_job:
   1382             count += 1
   1383             views = self._tko.run(
   1384                 call='get_detailed_test_views', afe_job_id=old_job)
   1385             old_job = (views[0]['job_keyvals'].get('retry_original_job_id')
   1386                        if views else None)
   1387         return count
   1388 
   1389 
   1390     def _fetch_test_views_of_child_jobs(self, jobs=None):
   1391         """Fetch test views of child jobs.
   1392 
   1393         @returns: A tuple (child_views, retry_counts, missing_results)
   1394                   child_views is list of TestView objects, representing
   1395                   all valid views.
   1396                   retry_counts is a dictionary that maps test_idx to retry
   1397                   counts. It only stores retry counts that are greater than 0.
   1398                   missing_results is a dictionary that maps test names to
   1399                   lists of job ids.
   1400 
   1401         """
   1402         child_views = []
   1403         retry_counts = {}
   1404         missing_results = {}
   1405         child_jobs = jobs or self._afe.get_jobs(parent_job_id=self._suite_job_id)
   1406         if child_jobs:
   1407             self._num_child_jobs = len(child_jobs)
   1408         for job in child_jobs:
   1409             views = [TestView(v, job, self._suite_name, self._build, self._user)
   1410                      for v in self._tko.run(
   1411                          call='get_detailed_test_views', afe_job_id=job.id,
   1412                          invalid=0)]
   1413             if len(views) == 0:
   1414                 missing_results.setdefault(job.name, []).append(job.id)
   1415             contains_test_failure = any(
   1416                     v.is_test() and v['status'] != 'GOOD' for v in views)
   1417             for v in views:
   1418                 if (v.is_test() or
   1419                         v['status'] != 'GOOD' and not contains_test_failure):
   1420                     # For normal test view, just keep it.
   1421                     # For SERVER_JOB or CLIENT_JOB, only keep it
   1422                     # if it fails and no other test failure.
   1423                     child_views.append(v)
   1424                     retry_count = self._compute_retry_count(v)
   1425                     if retry_count > 0:
   1426                         retry_counts[v['test_idx']] = retry_count
   1427         return child_views, retry_counts, missing_results
   1428 
   1429 
   1430     def _generate_web_and_buildbot_links(self):
   1431         """Generate web links and buildbot links."""
   1432         # TODO(fdeng): If a job was aborted before it reaches Running
   1433         # state, we read the test view from the suite job
   1434         # and thus this method generates a link pointing to the
   1435         # suite job's page for the aborted job. Need a fix.
   1436         self._web_links = []
   1437         self.buildbot_links = []
   1438 
   1439         # Bug info are stored in the suite job's keyvals.
   1440         if self._solo_test_run:
   1441             suite_job_keyvals = {}
   1442         elif not self._suite_views:
   1443             suite_job_keyvals = {}
   1444         else:
   1445             suite_job_keyvals = self._suite_views[0]['job_keyvals']
   1446 
   1447         for v in self._test_views:
   1448             retry_count = self._retry_counts.get(v['test_idx'], 0)
   1449             bug_info = v.get_bug_info(suite_job_keyvals)
   1450             job_id_owner = v.get_job_id_owner_str()
   1451             link = LogLink(
   1452                     anchor=v.get_testname(),
   1453                     server=self._instance_server,
   1454                     job_string=job_id_owner,
   1455                     bug_info=bug_info, retry_count=retry_count,
   1456                     testname=v.get_testname(),
   1457                     sponge_url=suite_job_keyvals.get('sponge_url'))
   1458             self._web_links.append(link)
   1459 
   1460             if v.should_display_buildbot_link():
   1461                 link.reason = v.get_buildbot_link_reason()
   1462                 self.buildbot_links.append(link)
   1463 
   1464 
   1465     def _record_timings(self):
   1466         """Record suite timings."""
   1467         self.timings = Timings(self._suite_job_id)
   1468         for v in self._test_views:
   1469             self.timings.RecordTiming(v)
   1470 
   1471 
   1472     def _compute_return_code(self):
   1473         """Compute the exit code based on test results."""
   1474         self.return_result = self._return_code_function(self._test_views)
   1475 
   1476 
   1477     def _make_test_results(self):
   1478         """Make TestResults for collected tests.
   1479 
   1480         @returns: List of TestResult instances.
   1481         """
   1482         test_results = []
   1483         for test_view in self._test_views:
   1484             test_result = TestResult(
   1485                 test_view=test_view,
   1486                 retry_count=self._retry_counts.get(test_view['test_idx'], 0))
   1487             test_results.append(test_result)
   1488         return test_results
   1489 
   1490 
   1491     def output_results(self):
   1492         """Output test results, timings and web links."""
   1493         # Output test results
   1494         test_results = self._make_test_results()
   1495         if len(test_results) == 0:
   1496             max_name_length = 0
   1497         else:
   1498             max_name_length = max(len(t.name) for t in test_results)
   1499         for test_result in test_results:
   1500             test_result.log_using(logging.info, max_name_length + 3)
   1501         # Output suite timings
   1502         logging.info(self.timings)
   1503         # Output links to test logs
   1504         logging.info('\nLinks to test logs:')
   1505         for link in self._web_links:
   1506             logging.info(link.text_link)
   1507         logging.info('\n')
   1508 
   1509 
   1510     def get_results_dict(self):
   1511         """Write test results, timings and web links into a dict.
   1512 
   1513         @returns: A dict of results in the format like:
   1514                   {
   1515                   'tests': {
   1516                         'test_1': {'status': 'PASSED', 'attributes': [1,2], ...}
   1517                         'test_2': {'status': 'FAILED', 'attributes': [1],...}
   1518                   }
   1519                   'suite_timings': {
   1520                         'download_start': '1998-07-17 00:00:00',
   1521                         'payload_download_end': '1998-07-17 00:00:05',
   1522                         ...
   1523                   }
   1524                   }
   1525         """
   1526         output_dict = {}
   1527         tests_dict = output_dict.setdefault('tests', {})
   1528         for v in self._test_views:
   1529             test_name = v.get_testname()
   1530             test_info = tests_dict.setdefault(test_name, {})
   1531             test_info.update({
   1532                 'status': v['status'],
   1533                 'attributes': v.get_control_file_attributes() or list(),
   1534                 'reason': v['reason'],
   1535                 'retry_count': self._retry_counts.get(v['test_idx'], 0),
   1536                 })
   1537             # For aborted test, the control file will not be parsed and thus
   1538             # fail to get the attributes info. Therefore, the subsystems the
   1539             # abort test testing will be missing. For this case, we will assume
   1540             # the aborted test will test all subsystems, set subsystem:default.
   1541             if (test_info['status'] == 'ABORT' and
   1542                 not any('subsystem:' in a for a in test_info['attributes'])):
   1543                 test_info['attributes'].append('subsystem:default')
   1544 
   1545         # Write the links to test logs into the |tests_dict| of |output_dict|.
   1546         # For test whose status is not 'GOOD', the link is also buildbot_link.
   1547         for link in self._web_links:
   1548             test_name = link.anchor.strip()
   1549             test_info = tests_dict.get(test_name)
   1550             if test_info:
   1551                 test_info['link_to_logs'] = link.url
   1552                 test_info['sponge_url'] = link.sponge_url
   1553                 # Write the retry dashboard link into the dict.
   1554                 if link in self.buildbot_links and link.testname:
   1555                     test_info['retry_dashboard_link'] \
   1556                         = reporting_utils.link_retry_url(link.testname)
   1557                     # Always write the wmatrix link for compatibility.
   1558                     test_info['wmatrix_link'] \
   1559                         = reporting_utils.link_wmatrix_retry_url(link.testname)
   1560                 # Write the bug url into the dict.
   1561                 if link.bug_id:
   1562                     test_info['bug_url'] = link.bug_url
   1563 
   1564         # Write the suite timings into |output_dict|
   1565         timings = self.timings
   1566         if timings is not None:
   1567             time_dict = output_dict.setdefault('suite_timings', {})
   1568             time_dict.update({
   1569                 'download_start' : str(timings.download_start_time),
   1570                 'payload_download_end' : str(timings.payload_end_time),
   1571                 'suite_start' : str(timings.suite_start_time),
   1572                 'artifact_download_end' : str(timings.artifact_end_time),
   1573                 'tests_start' : str(timings.tests_start_time),
   1574                 'tests_end' : str(timings.tests_end_time),
   1575                 })
   1576 
   1577         output_dict['suite_job_id'] = self._suite_job_id
   1578 
   1579         return output_dict
   1580 
   1581 
   1582     def run(self):
   1583         """Collect test results.
   1584 
   1585         This method goes through the following steps:
   1586             Fetch relevent test views of the suite job.
   1587             Fetch test views of child jobs
   1588             Check whether the suite was aborted.
   1589             Generate links.
   1590             Calculate suite timings.
   1591             Compute return code based on the test result.
   1592 
   1593         """
   1594         if self._solo_test_run:
   1595             self._test_views, self._retry_counts, self._missing_results = (
   1596                   self._fetch_test_views_of_child_jobs(
   1597                           jobs=self._afe.get_jobs(id=self._suite_job_id)))
   1598         else:
   1599             self._child_views, self._retry_counts, self._missing_results = (
   1600                     self._fetch_test_views_of_child_jobs())
   1601             self._suite_views = self._fetch_relevant_test_views_of_suite()
   1602             self._test_views = self._suite_views + self._child_views
   1603         # For hostless job in Starting status, there is no test view associated.
   1604         # This can happen when a suite job in Starting status is aborted. When
   1605         # the scheduler hits some limit, e.g., max_hostless_jobs_per_drone,
   1606         # max_jobs_started_per_cycle, a suite job can stays in Starting status.
   1607         if not self._test_views:
   1608             self.return_result = _RETURN_RESULTS['test_views_missing']
   1609             return
   1610         self.is_aborted = any([view['job_keyvals'].get('aborted_by')
   1611                                for view in self._suite_views])
   1612         self._generate_web_and_buildbot_links()
   1613         self._record_timings()
   1614         self._compute_return_code()
   1615 
   1616 
   1617     def gather_timing_stats(self):
   1618         """Collect timing related statistics."""
   1619         # Record suite runtime in metadata db.
   1620         # Some failure modes can leave times unassigned, report sentinel value
   1621         # in that case.
   1622         runtime_in_secs = -1
   1623         if (self.timings.tests_end_time is not None and
   1624             self.timings.suite_start_time is not None):
   1625             runtime_in_secs = (self.timings.tests_end_time -
   1626                     self.timings.suite_start_time).total_seconds()
   1627 
   1628         job_overhead.record_suite_runtime(self._suite_job_id, self._suite_name,
   1629                 self._board, self._build, self._num_child_jobs, runtime_in_secs)
   1630 
   1631 
   1632 
   1633 def _make_builds_from_options(options):
   1634     """Create a dict of builds for creating a suite job.
   1635 
   1636     The returned dict maps version label prefixes to build names.  Together,
   1637     each key-value pair describes a complete label.
   1638 
   1639     @param options: SimpleNamespace from argument parsing.
   1640 
   1641     @return: dict mapping version label prefixes to build names
   1642     """
   1643     builds = {}
   1644     build_prefix = None
   1645     if options.build:
   1646         build_prefix = provision.get_version_label_prefix(options.build)
   1647         builds[build_prefix] = options.build
   1648     if options.cheets_build:
   1649         builds[provision.CROS_ANDROID_VERSION_PREFIX] = options.cheets_build
   1650         if build_prefix == provision.CROS_VERSION_PREFIX:
   1651             builds[build_prefix] += provision.CHEETS_SUFFIX
   1652     if options.firmware_rw_build:
   1653         builds[provision.FW_RW_VERSION_PREFIX] = options.firmware_rw_build
   1654     if options.firmware_ro_build:
   1655         builds[provision.FW_RO_VERSION_PREFIX] = options.firmware_ro_build
   1656     return builds
   1657 
   1658 
   1659 def _make_child_deps_from_options(options):
   1660     """Creates a list of extra dependencies for child jobs.
   1661 
   1662     @param options: Parsed arguments to run_suite.
   1663 
   1664     @returns: A list of label strings if any dependencies should be added. None
   1665             otherwise.
   1666     """
   1667     if not options.model:
   1668         return ()
   1669     return ['model:%s' % options.model]
   1670 
   1671 
   1672 @retry.retry(error.StageControlFileFailure, timeout_min=10)
   1673 def create_suite(afe, options):
   1674     """Create a suite with retries.
   1675 
   1676     @param afe: The afe object to insert the new suite job into.
   1677     @param options: The options to use in creating the suite.
   1678 
   1679     @return: The afe_job_id of the new suite job.
   1680     """
   1681     logging.info('%s Submitted create_suite_job rpc',
   1682                  diagnosis_utils.JobTimer.format_time(datetime.now()))
   1683 
   1684     # TODO(crbug.com/763207): This is to support calling old moblab RPC
   1685     # with ToT code.  This does not need to be supported after M62.
   1686     if options.oldrpc:
   1687         suite_args = options.suite_args
   1688         if 'tests' in suite_args:
   1689             # This is for test_that_wrapper
   1690             suite_args = ' '.join([':lab:'] + suite_args['tests'])
   1691         else:
   1692             # This is for suite_attr_wrapper
   1693             suite_args = repr(suite_args)
   1694         options.suite_args = suite_args
   1695 
   1696     return afe.run(
   1697         'create_suite_job',
   1698         name=options.name,
   1699         board=options.board,
   1700         builds=_make_builds_from_options(options),
   1701         test_source_build=options.test_source_build,
   1702         check_hosts=not options.no_wait,
   1703         pool=options.pool,
   1704         file_bugs=options.file_bugs,
   1705         priority=options.priority,
   1706         suite_args=options.suite_args,
   1707         wait_for_results=not options.no_wait,
   1708         timeout_mins=options.timeout_mins + options.delay_minutes,
   1709         max_runtime_mins=options.max_runtime_mins + options.delay_minutes,
   1710         job_retry=options.retry,
   1711         max_retries=options.max_retries,
   1712         suite_min_duts=options.suite_min_duts,
   1713         offload_failures_only=options.offload_failures_only,
   1714         run_prod_code=options.run_prod_code,
   1715         delay_minutes=options.delay_minutes,
   1716         job_keyvals=options.job_keyvals,
   1717         test_args=options.test_args,
   1718         child_dependencies=_make_child_deps_from_options(options),
   1719     )
   1720 
   1721 
   1722 class SuiteResult(namedtuple('SuiteResult', ['return_code', 'output_dict'])):
   1723     """Result of running a suite to return."""
   1724 
   1725     def __new__(cls, return_code, output_dict=None):
   1726         if output_dict is None:
   1727             output_dict = dict()
   1728         else:
   1729             output_dict = output_dict.copy()
   1730         output_dict['return_code'] = return_code
   1731         return super(SuiteResult, cls).__new__(cls, return_code, output_dict)
   1732 
   1733 
   1734 def _run_suite(options):
   1735     """
   1736     run_suite script without exception handling.
   1737 
   1738     @param options: The parsed options.
   1739 
   1740     @returns: A tuple contains the return_code of run_suite and the dictionary
   1741               of the output.
   1742 
   1743     """
   1744     # If indicate to use the new style suite control file, convert the args
   1745     if options.use_suite_attr:
   1746         options = change_options_for_suite_attr(options)
   1747 
   1748     log_name = _get_log_name(options)
   1749     utils.setup_logging(logfile=log_name)
   1750 
   1751     if not options.bypass_labstatus and not options.web:
   1752         utils.check_lab_status(options.build)
   1753 
   1754     afe = _create_afe(options)
   1755     instance_server = afe.server
   1756 
   1757     rpc_helper = diagnosis_utils.RPCHelper(afe)
   1758     is_real_time = True
   1759     if options.mock_job_id:
   1760         job_id = int(options.mock_job_id)
   1761         existing_job = afe.get_jobs(id=job_id, finished=True)
   1762         if existing_job:
   1763             is_real_time = False
   1764         else:
   1765             existing_job = afe.get_jobs(id=job_id)
   1766         if existing_job:
   1767             job_created_on = time_utils.date_string_to_epoch_time(
   1768                     existing_job[0].created_on)
   1769         else:
   1770             raise utils.TestLabException('Failed to retrieve job: %d' % job_id)
   1771     else:
   1772         try:
   1773             rpc_helper.check_dut_availability(options.board, options.pool,
   1774                                               options.minimum_duts,
   1775                                               options.skip_duts_check)
   1776             job_id = create_suite(afe, options)
   1777             job_created_on = time.time()
   1778         except (error.CrosDynamicSuiteException,
   1779                 error.RPCException, proxy.JSONRPCException) as e:
   1780             logging.exception('Error Message: %s', e)
   1781             return SuiteResult(RETURN_CODES.INFRA_FAILURE,
   1782                                {'return_message': str(e)})
   1783         except AttributeError as e:
   1784             logging.exception('Error Message: %s', e)
   1785             return SuiteResult(RETURN_CODES.INVALID_OPTIONS)
   1786 
   1787     job_timer = diagnosis_utils.JobTimer(
   1788             job_created_on, float(options.timeout_mins))
   1789     job_url = reporting_utils.link_job(job_id,
   1790                                        instance_server=instance_server)
   1791     logging.info('%s Created suite job: %s',
   1792                  job_timer.format_time(job_timer.job_created_time),
   1793                  job_url)
   1794     logging.info(annotations.StepLink(
   1795         text='Link to suite',
   1796         url=job_url))
   1797 
   1798     if options.create_and_return:
   1799         msg = '--create_and_return was specified, terminating now.'
   1800         logging.info(msg)
   1801         return SuiteResult(RETURN_CODES.OK, {'return_message': msg})
   1802 
   1803     if options.no_wait:
   1804         return _handle_job_nowait(job_id, options, instance_server)
   1805     else:
   1806         return _handle_job_wait(afe, job_id, options, job_timer, is_real_time)
   1807 
   1808 
   1809 def _get_log_name(options):
   1810     """Return local log file's name.
   1811 
   1812     @param options:         Parsed options.
   1813 
   1814     @return log_name, a string file name.
   1815     """
   1816     if options.require_logfile:
   1817         # options.build is verified to exist in verify_options.
   1818         # convert build name from containing / to containing only _.
   1819         log_name = 'run_suite-%s.log' % options.build.replace('/', '_')
   1820         log_dir = os.path.join(common.autotest_dir, 'logs')
   1821         if os.path.exists(log_dir):
   1822             log_name = os.path.join(log_dir, log_name)
   1823 
   1824         return log_name
   1825     else:
   1826         return None
   1827 
   1828 
   1829 def _create_afe(options):
   1830     """Return an afe instance based on options.
   1831 
   1832     @param options          Parsed options.
   1833 
   1834     @return afe, an AFE instance.
   1835     """
   1836     instance_server = (options.web if options.web else
   1837                        instance_for_pool(options.pool))
   1838     afe = frontend_wrappers.RetryingAFE(server=instance_server,
   1839                                         timeout_min=options.afe_timeout_mins,
   1840                                         delay_sec=options.delay_sec)
   1841     logging.info('Autotest instance created: %s', instance_server)
   1842     return afe
   1843 
   1844 
   1845 def _handle_job_wait(afe, job_id, options, job_timer, is_real_time):
   1846     """Handle suite job synchronously.
   1847 
   1848     @param afe              AFE instance.
   1849     @param job_id           Suite job id.
   1850     @param options          Parsed options.
   1851     @param job_timer        JobTimer for suite job.
   1852     @param is_real_time     Whether or not to handle job timeout.
   1853 
   1854     @return SuiteResult of suite job.
   1855     """
   1856     rpc_helper = diagnosis_utils.RPCHelper(afe)
   1857     instance_server = afe.server
   1858     while not afe.get_jobs(id=job_id, finished=True):
   1859         _poke_buildbot_with_output(afe, job_id, job_timer)
   1860         if job_timer.debug_output_timer.poll():
   1861             logging.info('The suite job has another %s till timeout.',
   1862                          job_timer.timeout_hours - job_timer.elapsed_time())
   1863         time.sleep(10)
   1864     logging.info('%s Suite job is finished.',
   1865                  diagnosis_utils.JobTimer.format_time(datetime.now()))
   1866     # For most cases, ResultCollector should be able to determine whether
   1867     # a suite has timed out by checking information in the test view.
   1868     # However, occationally tko parser may fail on parsing the
   1869     # job_finished time from the job's keyval file. So we add another
   1870     # layer of timeout check in run_suite. We do the check right after
   1871     # the suite finishes to make it as accurate as possible.
   1872     # There is a minor race condition here where we might have aborted
   1873     # for some reason other than a timeout, and the job_timer thinks
   1874     # it's a timeout because of the jitter in waiting for results.
   1875     # The consequence would be that run_suite exits with code
   1876     # SUITE_TIMEOUT while it should  have returned INFRA_FAILURE
   1877     # instead, which should happen very rarely.
   1878     # Note the timeout will have no sense when using -m option.
   1879     is_suite_timeout = job_timer.is_suite_timeout()
   1880 
   1881     # Extract the original suite name to record timing.
   1882     original_suite_name = get_original_suite_name(options.name,
   1883                                                   options.suite_args)
   1884     # Start collecting test results.
   1885     logging.info('%s Start collecting test results and dump them to json.',
   1886                  diagnosis_utils.JobTimer.format_time(datetime.now()))
   1887     TKO = frontend_wrappers.RetryingTKO(server=instance_server,
   1888                                         timeout_min=options.afe_timeout_mins,
   1889                                         delay_sec=options.delay_sec)
   1890     # TODO(crbug.com/672348): It needs to be possible for provision
   1891     # suite to pass if only a few tests fail.  Otherwise, a single
   1892     # failing test will be reported as failure even if the suite reports
   1893     # success.
   1894     if options.name == _PROVISION_SUITE:
   1895         # TODO(crbug.com/672348): Creating the suite job requires that
   1896         # suite_args contains num_required.
   1897         return_code_function = _ProvisionReturnCodeComputer(
   1898             num_required=options.suite_args['num_required'])
   1899     else:
   1900         return_code_function = _ReturnCodeComputer()
   1901     collector = ResultCollector(instance_server=instance_server,
   1902                                 afe=afe, tko=TKO, build=options.build,
   1903                                 board=options.board,
   1904                                 suite_name=options.name,
   1905                                 suite_job_id=job_id,
   1906                                 return_code_function=return_code_function,
   1907                                 original_suite_name=original_suite_name)
   1908     collector.run()
   1909     # Dump test outputs into json.
   1910     output_dict = collector.get_results_dict()
   1911     output_dict['autotest_instance'] = instance_server
   1912     if not options.json_dump:
   1913         collector.output_results()
   1914     result = collector.return_result
   1915     if is_real_time:
   1916         # Do not record stats if the suite was aborted (either by a user
   1917         # or through the golo rpc).
   1918         # Also do not record stats if is_aborted is None, indicating
   1919         # aborting status is unknown yet.
   1920         if collector.is_aborted == False:
   1921             logging.info('%s Gathering timing stats for the suite job.',
   1922                          diagnosis_utils.JobTimer.format_time(datetime.now()))
   1923             collector.gather_timing_stats()
   1924 
   1925         if collector.is_aborted == True and is_suite_timeout:
   1926             # There are two possible cases when a suite times out.
   1927             # 1. the suite job was aborted due to timing out
   1928             # 2. the suite job succeeded, but some child jobs
   1929             #    were already aborted before the suite job exited.
   1930             # The case 2 was handled by ResultCollector,
   1931             # here we handle case 1.
   1932             result |= _RETURN_RESULTS['suite_timeout']
   1933         logging.info('\n %s Attempting to display pool info: %s',
   1934                      diagnosis_utils.JobTimer.format_time(datetime.now()),
   1935                      options.pool)
   1936         try:
   1937             # Add some jitter to make up for any latency in
   1938             # aborting the suite or checking for results.
   1939             cutoff = job_timer.timeout_hours + timedelta(hours=0.3)
   1940             rpc_helper.diagnose_pool(
   1941                     options.board, options.pool, cutoff)
   1942         except proxy.JSONRPCException:
   1943             logging.warning('Unable to display pool info.')
   1944 
   1945     # And output return message.
   1946     if result.message:
   1947         logging.info('Reason: %s', result.message)
   1948 
   1949     logging.info('\n %s Output below this line is for buildbot consumption:',
   1950                  diagnosis_utils.JobTimer.format_time(datetime.now()))
   1951     log_buildbot_links(logging.info, collector.buildbot_links)
   1952     return result.suite_result(output_dict)
   1953 
   1954 
   1955 def _handle_job_nowait(job_id, options, instance_server):
   1956     """Handle suite job asynchronously.
   1957 
   1958     @param job_id           Suite job id.
   1959     @param options          Parsed options.
   1960     @param instance_server  Autotest instance hostname.
   1961 
   1962     @return SuiteResult of suite job.
   1963     """
   1964     logging.info('Created suite job: %r', job_id)
   1965     link = LogLink(options.name, instance_server,
   1966                    '%s-%s' % (job_id, getpass.getuser()))
   1967     for generate_link in link.GenerateBuildbotLinks():
   1968         logging.info(generate_link)
   1969     logging.info('--no_wait specified; Exiting.')
   1970     return SuiteResult(RETURN_CODES.OK,
   1971                        {'return_message': '--no_wait specified; Exiting.'})
   1972 
   1973 
   1974 def _should_run(options):
   1975     """Check whether the suite should be run based on lab/job status checking.
   1976 
   1977     @param options          Parsed options.
   1978     """
   1979     try:
   1980         site_utils.check_lab_status(options.test_source_build)
   1981     except site_utils.TestLabException as ex:
   1982         logging.exception('Lab is closed or build is blocked. Skipping '
   1983                           'suite %s, board %s, build %s:  %s',
   1984                           options.name, options.board,
   1985                           options.test_source_build, str(ex))
   1986         return False
   1987 
   1988     start_time = str(datetime.now() -
   1989                      timedelta(days=_SEARCH_JOB_MAX_DAYS))
   1990     afe = _create_afe(options)
   1991     afe_job_id = afe.get_jobs(
   1992             name__istartswith=options.test_source_build,
   1993             name__iendswith='control.'+options.name,
   1994             created_on__gte=start_time,
   1995             min_rpc_timeout=_MIN_RPC_TIMEOUT)
   1996     if afe_job_id:
   1997         logging.info('Found duplicate suite %s scheduled in past.',
   1998                      afe_job_id)
   1999         return False
   2000 
   2001     return True
   2002 
   2003 
   2004 def _poke_buildbot_with_output(afe, job_id, job_timer):
   2005     """Poke buildbot so it doesn't timeout from silence.
   2006 
   2007     @param afe              AFE instance.
   2008     @param job_id           Suite job id.
   2009     @param job_timer        JobTimer for suite job.
   2010     """
   2011     rpc_helper = diagnosis_utils.RPCHelper(afe)
   2012     # Note that this call logs output, preventing buildbot's
   2013     # 9000 second silent timeout from kicking in. Let there be no
   2014     # doubt, this is a hack. The timeout is from upstream buildbot and
   2015     # this is the easiest work around.
   2016     if job_timer.first_past_halftime():
   2017         rpc_helper.diagnose_job(job_id, afe.server)
   2018 
   2019 
   2020 
   2021 def _run_task(options):
   2022     """Perform this script's function minus setup.
   2023 
   2024     Boilerplate like argument parsing, logging, output formatting happen
   2025     elsewhere.
   2026 
   2027     Returns a SuiteResult instance.
   2028 
   2029     TODO(ayatane): The try/except should be moved into _run_suite().
   2030     Good luck trying to figure out which function calls are supposed to
   2031     raise which of the exceptions.
   2032     """
   2033     try:
   2034         return _run_suite(options)
   2035     except diagnosis_utils.BoardNotAvailableError as e:
   2036         result = SuiteResult(
   2037             RETURN_CODES.BOARD_NOT_AVAILABLE,
   2038             {'return_message': 'Skipping testing: %s' % e.message})
   2039         logging.info(result.output_dict['return_message'])
   2040         return result
   2041     except utils.TestLabException as e:
   2042         result = SuiteResult(
   2043             RETURN_CODES.INFRA_FAILURE,
   2044             {'return_message': 'TestLabException: %s' % e})
   2045         logging.exception(result.output_dict['return_message'])
   2046         return result
   2047 
   2048 
   2049 class _ExceptionHandler(object):
   2050     """Global exception handler replacement."""
   2051 
   2052     def __init__(self, dump_json):
   2053         """Initialize instance.
   2054 
   2055         @param dump_json: Whether to print a JSON dump of the result dict to
   2056                           stdout.
   2057         """
   2058         self._should_dump_json = dump_json
   2059 
   2060     def __call__(self, exc_type, value, traceback):
   2061         if self._should_dump_json:
   2062             _dump_json({'return_message': ('Unhandled run_suite exception: %s'
   2063                                            % value)})
   2064         sys.exit(RETURN_CODES.INFRA_FAILURE)
   2065 
   2066 
   2067 def main():
   2068     """Entry point."""
   2069     utils.verify_not_root_user()
   2070 
   2071     parser = make_parser()
   2072     options = parser.parse_args()
   2073     if options.do_nothing:
   2074         return 0
   2075 
   2076     sys.exceptionhandler = _ExceptionHandler(dump_json=options.json_dump)
   2077     if options.json_dump:
   2078         logging.disable(logging.CRITICAL)
   2079 
   2080     options_okay = verify_and_clean_options(options)
   2081     # Set StreamHandler first to capture error messages if suite is not run.
   2082     utils.setup_logging()
   2083     if not options_okay:
   2084         parser.print_help()
   2085         result = SuiteResult(RETURN_CODES.INVALID_OPTIONS)
   2086     elif options.pre_check and not _should_run(options):
   2087         logging.info('Suite %s-%s is terminated: Lab is closed, OR build is '
   2088                      'blocked, OR this suite has already been kicked off '
   2089                      'once in past %d days.',
   2090                      options.test_source_build, options.name,
   2091                      _SEARCH_JOB_MAX_DAYS)
   2092         result = SuiteResult(
   2093             RETURN_CODES.ERROR,
   2094             {'return_message': ("Lab is closed OR other reason"
   2095                                 " (see code, it's complicated)")})
   2096     else:
   2097         result = _run_task(options)
   2098 
   2099     if options.json_dump:
   2100         _dump_json(result.output_dict)
   2101 
   2102     logging.info('Will return from run_suite with status: %s',
   2103                   RETURN_CODES.get_string(result.return_code))
   2104     return result.return_code
   2105 
   2106 
   2107 def _dump_json(obj):
   2108     """Write obj JSON to stdout."""
   2109     output_json = json.dumps(obj, sort_keys=True)
   2110     sys.stdout.write('#JSON_START#%s#JSON_END#' % output_json.strip())
   2111 
   2112 
   2113 if __name__ == "__main__":
   2114     sys.exit(main())
   2115