Home | History | Annotate | Download | only in site_utils
      1 #!/usr/bin/python
      2 #
      3 # Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
      4 # Use of this source code is governed by a BSD-style license that can be
      5 # found in the LICENSE file.
      6 
      7 
      8 """Tool for running suites of tests and waiting for completion.
      9 
     10 The desired test suite will be scheduled with autotest. By default,
     11 this tool will block until the job is complete, printing a summary
     12 at the end.  Error conditions result in exceptions.
     13 
This is intended for use only with Chrome OS test suites that leverage the
     15 dynamic suite infrastructure in server/cros/dynamic_suite.py.
     16 
     17 This script exits with one of the following codes:
     18 0 - OK: Suite finished successfully
     19 1 - ERROR: Test(s) failed, or hits its own timeout
     20 2 - WARNING: Test(s) raised a warning or passed on retry, none failed/timed out.
     21 3 - INFRA_FAILURE: Infrastructure related issues, e.g.
     22     * Lab is down
     23     * Too many duts (defined as a constant) in repair failed status
     24     * Suite job issues, like bug in dynamic suite,
     25       user aborted the suite, lose a drone/all devservers/rpc server,
     26       0 tests ran, etc.
     27     * provision failed
     28       TODO(fdeng): crbug.com/413918, reexamine treating all provision
     29                    failures as INFRA failures.
     30 4 - SUITE_TIMEOUT: Suite timed out, some tests ran,
     31     none failed by the time the suite job was aborted. This will cover,
     32     but not limited to, the following cases:
     33     * A devserver failure that manifests as a timeout
     34     * No DUTs available midway through a suite
     35     * Provision/Reset/Cleanup took longer time than expected for new image
     36     * A regression in scheduler tick time.
5 - BOARD_NOT_AVAILABLE: If there is no host for the requested board/pool.
6 - INVALID_OPTIONS: If options are not valid.
     39 """
     40 
     41 import argparse
     42 import ast
     43 import collections
     44 from datetime import datetime
     45 from datetime import timedelta
     46 import functools
     47 import getpass
     48 import logging
     49 import os
     50 import re
     51 import sys
     52 import time
     53 import warnings
     54 
     55 import common
     56 from chromite.lib import buildbot_annotations as annotations
     57 from chromite.lib import gs
     58 from chromite.lib import osutils
     59 
     60 from django.core import exceptions as django_exceptions
     61 
     62 try:
     63     from suite_scheduler import config_reader
     64     from suite_scheduler import skylab
     65 except ImportError:
     66     # For unittest
     67     config_reader = None
     68     skylab = None
     69 
     70 from autotest_lib.client.common_lib import control_data
     71 from autotest_lib.client.common_lib import error
     72 from autotest_lib.client.common_lib import global_config
     73 from autotest_lib.client.common_lib import priorities
     74 from autotest_lib.client.common_lib import time_utils
     75 from autotest_lib.client.common_lib.cros import retry
     76 from autotest_lib.frontend.afe import rpc_client_lib
     77 from autotest_lib.frontend.afe.json_rpc import proxy
     78 from autotest_lib.server import site_utils
     79 from autotest_lib.server import utils
     80 from autotest_lib.server.cros.dynamic_suite import constants
     81 from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
     82 from autotest_lib.server.cros.dynamic_suite import reporting_utils
     83 from autotest_lib.server.cros.dynamic_suite import suite_common
     84 from autotest_lib.server.cros.dynamic_suite import tools
     85 try:
     86     from autotest_lib.site_utils import diagnosis_utils
     87 except django_exceptions.ImproperlyConfigured as e:
     88     if 'Error loading MySQLdb module: libmariadbclient' in str(e):
     89         logging.error('Unable to import a necessary MySQLdb module. This is '
     90                       'commonly caused by running a command inside[outside] '
     91                       'of the chroot but having autotest utility packages '
     92                       'that were build outside[inside] the chroot. '
     93                       'Please re-run utils/build_externals.py inside[outside] '
     94                       'of the chroot accordingly.')
     95     raise
     96 from autotest_lib.site_utils import run_suite_common
     97 
# Module-wide handle on the autotest global configuration object; the values
# below are read from it once, at import time.
CONFIG = global_config.global_config

# [SERVER] hostname from the global config.
_DEFAULT_AUTOTEST_INSTANCE = CONFIG.get_config_value(
        'SERVER', 'hostname', type=str)
# %-style pattern for log URLs; LogLink fills it with a
# (server URL, job string) pair.
_URL_PATTERN = CONFIG.get_config_value('CROS', 'log_url_pattern', type=str)
# [CROS] enable_run_suite_trampoline; False when the option is absent.
_ENABLE_RUN_SUITE_TRAMPOLINE = CONFIG.get_config_value(
        'CROS', 'enable_run_suite_trampoline', type=bool, default=False)

# Google Storage location (gs://<bucket>/<file>) of the migration config file.
_MIGRATION_CONFIG_FILE = 'migration_config.ini'
_MIGRATION_CONFIG_BUCKET = 'suite-scheduler.google.com.a.appspot.com'
_TRAMPOLINE_CONFIG = 'gs://%s/%s' % (_MIGRATION_CONFIG_BUCKET,
                                     _MIGRATION_CONFIG_FILE)

# Minimum RPC timeout setting for calls expected to take long time, e.g.,
# create_suite_job. If default socket time (socket.getdefaulttimeout()) is
# None or greater than this value, the default will be used.
# The value here is set to be the same as the timeout for the RetryingAFE object
# so long running RPCs can wait long enough before being aborted.
# NOTE(review): unit is presumably seconds (compared against the socket
# default timeout) -- confirm at the use site.
_MIN_RPC_TIMEOUT = 600

# Number of days back to search for existing job.
_SEARCH_JOB_MAX_DAYS = 14

# Suite name literal; presumably identifies the provision suite -- its use is
# outside this part of the file.
_PROVISION_SUITE = 'provision'
    122 
    123 
    124 @functools.total_ordering
    125 class _ReturnResult(object):
    126     """Represents overall result of run_suite operation.
    127 
    128     _ReturnResult instances sort based on priority (the order in
    129     _RETURN_RESULTS).
    130 
    131     Furthermore, _ReturnResult instances can be combined by bitwise or
    132     ("union"), which returns the instance with the higher priority
    133     between the two (the instance with higher priority is a "superset"
    134     of the other).
    135 
    136     Do not create new instances of this; use _RETURN_RESULTS instead.
    137     """
    138 
    139     def __init__(self, return_code, message):
    140         self.return_code = return_code
    141         self.message = message
    142 
    143     def __repr__(self):
    144         return '<{cls} {key}, {this.return_code}, {this.message}>'.format(
    145             cls=type(self).__name__,
    146             key=self._getkey(),
    147             this=self)
    148 
    149     def __gt__(self, other):
    150         if isinstance(other, type(self)):
    151             return self._getkey() > other._getkey()
    152         else:
    153             return NotImplemented
    154 
    155     def __eq__(self, other):
    156         if isinstance(other, type(self)):
    157             return (self.return_code == other.return_code
    158                     and self.message == other.message)
    159         else:
    160             return NotImplemented
    161 
    162     def __hash__(self):
    163         return hash(self.return_code) ^ hash(self.message)
    164 
    165     def __or__(self, other):
    166         if isinstance(other, type(self)):
    167             if self > other:
    168                 return self
    169             else:
    170                 return other
    171         else:
    172             return NotImplemented
    173 
    174     def _getkey(self):
    175         """Return sort key."""
    176         return _RETURN_RESULTS_LIST.index(self)
    177 
    178     def suite_result(self, output_dict=None):
    179         """Make a SuiteResult using this _ReturnResult.
    180 
    181         @param output_dict: output_dict to merge into SuiteResult.
    182         """
    183         if output_dict is None:
    184             output_dict = dict()
    185         else:
    186             output_dict = output_dict.copy()
    187         if self.message:
    188             output_dict['return_message'] = self.message
    189         return run_suite_common.SuiteResult(self.return_code, output_dict)
    190 
    191 
# All possible overall results, keyed by a short name.  The order matters:
# entries are listed from lowest to highest priority, and
# _ReturnResult._getkey() uses an entry's position in _RETURN_RESULTS_LIST as
# its sort key, so `a | b` yields whichever result appears later here.
_RETURN_RESULTS = collections.OrderedDict([
    ('ok', _ReturnResult(run_suite_common.RETURN_CODES.OK, '')),

    ('test_warning', _ReturnResult(
        run_suite_common.RETURN_CODES.WARNING, 'Test job raised warning.')),
    ('suite_warning', _ReturnResult(
        run_suite_common.RETURN_CODES.WARNING, 'Suite job raised warning.')),
    ('test_retry', _ReturnResult(
        run_suite_common.RETURN_CODES.WARNING, 'Tests were retried.')),

    ('test_aborted_prestart', _ReturnResult(
        run_suite_common.RETURN_CODES.SUITE_TIMEOUT,
        'Tests were aborted before running; suite must have timed out.')),
    # This really indicates a user action or an infra failure. But, suite
    # timeouts cause similar failures in the individual tests, so we must
    # classify these lower than suite_timeout. In case of a suite_timeout, the
    # result from the suite job will promote the result to suite_timeout.
    ('test_aborted_mystery',
     _ReturnResult(
             run_suite_common.RETURN_CODES.SUITE_TIMEOUT,
             'Tests were aborted after running, but before timeout; '
             'Test was manually aborted or parsing results failed: '
             'crbug.com/796348.')),
    ('suite_timeout', _ReturnResult(
        run_suite_common.RETURN_CODES.SUITE_TIMEOUT, 'Suite job timed out.')),

    ('test_views_missing', _ReturnResult(
        run_suite_common.RETURN_CODES.INFRA_FAILURE, 'No test views found.')),
    ('suite_failed', _ReturnResult(
        run_suite_common.RETURN_CODES.INFRA_FAILURE, 'Suite job failed.')),
    ('provision_failed', _ReturnResult(
        run_suite_common.RETURN_CODES.INFRA_FAILURE, 'Provisioning failed.')),

    ('test_failure', _ReturnResult(
        run_suite_common.RETURN_CODES.ERROR, 'Tests failed.')),
])
# Priority-ordered list view of the results above; index == sort key.
_RETURN_RESULTS_LIST = list(_RETURN_RESULTS.values())
    229 
    230 
    231 def bool_str(x):
    232     """Boolean string type for option arguments.
    233 
    234     @param x: string representation of boolean value.
    235 
    236     """
    237     if x == 'True':
    238         return True
    239     elif x == 'False':
    240         return False
    241     else:
    242         raise argparse.ArgumentTypeError(
    243             '%s is not one of True or False' % (x,))
    244 
    245 
    246 def _get_priority_value(x):
    247     """Convert a priority representation to its int value.
    248 
    249     Priorities can be described either by an int value (possibly as a string)
    250     or a name string.  This function coerces both forms to an int value.
    251 
    252     This function is intended for casting command line arguments during
    253     parsing.
    254 
    255     @param x: priority value as an int, int string, or name string
    256 
    257     @returns: int value of priority
    258     """
    259     try:
    260         return int(x)
    261     except ValueError:
    262         try:
    263             return priorities.Priority.get_value(x)
    264         except AttributeError:
    265             raise argparse.ArgumentTypeError(
    266                 'Unknown priority level %s.  Try one of %s.'
    267                 % (x, ', '.join(priorities.Priority.names)))
    268 
    269 
    270 def make_parser():
    271     """Make ArgumentParser instance for run_suite.py."""
    272     parser = argparse.ArgumentParser(
    273         usage="%(prog)s [options]")
    274     parser.add_argument("-b", "--board", dest="board")
    275     parser.add_argument(
    276             "--model",
    277             help="The device model to run tests against. For non-unified "
    278                  "builds, model and board are synonymous, but board is more "
    279                  "accurate in some cases. Only pass this option if your build "
    280                  "is a unified build.",
    281     )
    282     parser.add_argument("-i", "--build", dest="build")
    283     parser.add_argument(
    284         "-w", "--web", dest="web", default=None,
    285         help="Address of a webserver to receive suite requests.")
    286     parser.add_argument(
    287         '--cheets_build', dest='cheets_build', default=None,
    288         help='ChromeOS Android build to be installed on dut.')
    289     parser.add_argument(
    290         '--firmware_rw_build', dest='firmware_rw_build', default=None,
    291         help='Firmware build to be installed in dut RW firmware.')
    292     parser.add_argument(
    293         '--firmware_ro_build', dest='firmware_ro_build', default=None,
    294         help='Firmware build to be installed in dut RO firmware.')
    295     parser.add_argument(
    296         '--test_source_build', dest='test_source_build', default=None,
    297         help=('Build that contains the test code, '
    298               'e.g., it can be the value of `--build`, '
    299               '`--firmware_rw_build` or `--firmware_ro_build` '
    300               'arguments. Default is None, that is, use the test '
    301               'code from `--build` (CrOS image)'))
    302     #  This should just be a boolean flag, but the autotest "proxy" code
    303     #  can't handle flags that don't take arguments.
    304     parser.add_argument(
    305         "-n", "--no_wait", dest="no_wait", default=False, type=bool_str,
    306         help='Must pass "True" or "False" if used.')
    307     # If you really want no pool, --pool="" will do it. USE WITH CARE.
    308     parser.add_argument("-p", "--pool", dest="pool", default="suites")
    309     parser.add_argument("-s", "--suite_name", dest="name")
    310     parser.add_argument("-a", "--afe_timeout_mins", type=int,
    311                         dest="afe_timeout_mins", default=30)
    312     parser.add_argument("-t", "--timeout_mins", type=int,
    313                         dest="timeout_mins", default=1440)
    314     parser.add_argument("-x", "--max_runtime_mins", type=int,
    315                         dest="max_runtime_mins", default=1440)
    316     parser.add_argument("-d", "--delay_sec", type=int,
    317                         dest="delay_sec", default=10)
    318     parser.add_argument("-m", "--mock_job_id", dest="mock_job_id",
    319                         help="Attach to existing job id for already running "
    320                         "suite, and creates report.")
    321     # NOTE(akeshet): This looks similar to --no_wait, but behaves differently.
    322     # --no_wait is passed in to the suite rpc itself and affects the suite,
    323     # while this does not.
    324     parser.add_argument("-c", "--create_and_return", dest="create_and_return",
    325                         action="store_true",
    326                         help="Create the suite and print the job id, then "
    327                         "finish immediately.")
    328     parser.add_argument("-u", "--num", dest="num", type=int, default=None,
    329                         help="Deprecated, does nothing.")
    330     #  Same boolean flag issue applies here.
    331     parser.add_argument(
    332         "-f", "--file_bugs", dest="file_bugs", default=False, type=bool_str,
    333         help=('File bugs on test failures. Must pass "True" or '
    334               '"False" if used.'))
    335     parser.add_argument("-l", "--bypass_labstatus", dest="bypass_labstatus",
    336                         action="store_true", help='Bypass lab status check.')
    337     # We allow either a number or a string for the priority.  This way, if you
    338     # know what you're doing, one can specify a custom priority level between
    339     # other levels.
    340     parser.add_argument("-r", "--priority", dest="priority",
    341                         type=_get_priority_value,
    342                         default=priorities.Priority.DEFAULT,
    343                         action="store",
    344                         help="Priority of suite. Either numerical value, or "
    345                         "one of (" + ", ".join(priorities.Priority.names)
    346                         + ").")
    347     parser.add_argument(
    348         '--retry', dest='retry', default=False, type=bool_str, action='store',
    349         help='Enable test retry.  Must pass "True" or "False" if used.')
    350     parser.add_argument('--max_retries', dest='max_retries', default=None,
    351                         type=int, action='store', help='Maximum retries'
    352                         'allowed at suite level. No limit if not specified.')
    353     parser.add_argument('--minimum_duts', dest='minimum_duts', type=int,
    354                         default=0, action='store',
    355                         help='Check that the pool has at least such many '
    356                         'healthy machines, otherwise suite will not run. '
    357                         'Default to 0.')
    358     parser.add_argument('--suite_min_duts', dest='suite_min_duts', type=int,
    359                         default=0, action='store',
    360                         help='Preferred minimum number of machines. Scheduler '
    361                         'will prioritize on getting such many machines for '
    362                         'the suite when it is competing with another suite '
    363                         'that has a higher priority but already got minimum '
    364                         'machines it needs. Default to 0.')
    365     parser.add_argument("--suite_args", dest="suite_args",
    366                         type=ast.literal_eval,
    367                         default=None, action="store",
    368                         help="A dict of args passed to the suite control file.")
    369     parser.add_argument('--offload_failures_only',
    370                         dest='offload_failures_only', type=bool_str,
    371                         action='store', default=False,
    372                         help='Only enable gs_offloading for failed tests. '
    373                         'Successful tests will be deleted. Must pass "True"'
    374                         ' or "False" if used.')
    375     parser.add_argument('--use_suite_attr', dest='use_suite_attr',
    376                         action='store_true', default=False,
    377                         help='Advanced. Run the suite based on ATTRIBUTES of '
    378                         'control files, rather than SUITE.')
    379     parser.add_argument('--json_dump', dest='json_dump', action='store_true',
    380                         default=False,
    381                         help='Dump the output of run_suite to stdout.')
    382     parser.add_argument(
    383         '--run_prod_code', dest='run_prod_code',
    384         action='store_true', default=False,
    385         help='Run the test code that lives in prod aka the test '
    386         'code currently on the lab servers.')
    387     parser.add_argument(
    388         '--delay_minutes', type=int, default=0,
    389         help=('Delay the creation of test jobs for a given '
    390               'number of minutes. This argument can be used to '
    391               'force provision jobs being delayed, which helps '
    392               'to distribute loads across devservers.'))
    393     parser.add_argument(
    394         '--skip_duts_check', dest='skip_duts_check', action='store_true',
    395         default=False, help='If True, skip minimum available DUTs check')
    396     parser.add_argument(
    397         '--job_keyvals', dest='job_keyvals', type=ast.literal_eval,
    398         action='store', default=None,
    399         help='A dict of job keyvals to be inject to suite control file')
    400     parser.add_argument(
    401         '--test_args', dest='test_args', type=ast.literal_eval,
    402         action='store', default=None,
    403         help=('A dict of args passed all the way to each individual test that '
    404               'will be actually ran.'))
    405     parser.add_argument(
    406         '--require_logfile', action='store_true',
    407         help=('Stream logs of run_suite.py to a local file named '
    408               'run_suite-<build name>.log.'))
    409 
    410     # Used for monitoring purposes, to measure no-op swarming proxy latency.
    411     parser.add_argument('--do_nothing', action='store_true',
    412                         help=argparse.SUPPRESS)
    413 
    414     # Used when lab/job status checking is needed. Currently its only user is
    415     # suite scheduler v2.
    416     parser.add_argument(
    417         '--pre_check', action='store_true',
    418         help=('Check lab and job status before kicking off a suite. Used by '
    419               'suite scheduler v2.'))
    420 
    421     # TODO(crbug.com/763207): This is to support calling old moblab RPC
    422     # with ToT code.  This does not need to be supported after M62.
    423     parser.add_argument('--oldrpc', action='store_true',
    424                         help='Use old AFE RPC.')
    425 
    426     return parser
    427 
    428 
    429 def verify_and_clean_options(options):
    430     """Verify the validity of options.
    431 
    432     @param options: The parsed options to verify.
    433 
    434     @returns: True if verification passes, False otherwise.
    435 
    436     """
    437     if options.mock_job_id and (
    438             not options.build or not options.name or not options.board):
    439         print ('When using -m, need to specify build, board and suite '
    440                'name which you have used for creating the original job')
    441         return False
    442     else:
    443         if not options.build:
    444             print 'Need to specify which build to use'
    445             return False
    446         if not options.board:
    447             print 'Need to specify board'
    448             return False
    449         if not options.name:
    450             print 'Need to specify suite name'
    451             return False
    452     if options.num is not None:
    453         warnings.warn('-u/--num option is deprecated; it does nothing.')
    454     del options.num
    455     if not options.retry and options.max_retries is not None:
    456         print 'max_retries can only be used with --retry=True'
    457         return False
    458     if options.use_suite_attr and options.suite_args is not None:
    459         print ('The new suite control file cannot parse the suite_args: %s.'
    460                'Please not specify any suite_args here.' % options.suite_args)
    461         return False
    462     if options.no_wait and options.retry:
    463         print 'Test retry is not available when using --no_wait=True'
    464     # Default to use the test code in CrOS build.
    465     if not options.test_source_build and options.build:
    466         options.test_source_build = options.build
    467     options.child_dependencies = _make_child_dependencies(options)
    468     base_dependencies = ('board:%s' % options.board,
    469                          'pool:%s' % options.pool)
    470     options.dependencies = base_dependencies + options.child_dependencies
    471     return True
    472 
    473 
    474 def change_options_for_suite_attr(options):
    475     """Change options to be prepared to run the suite_attr_wrapper.
    476 
    477     If specify 'use_suite_attr' from the cmd line, it indicates to run the
    478     new style suite control file, suite_attr_wrapper. Then, change the
    479     options.name to 'suite_attr_wrapper', change the options.suite_args to
    480     include the arguments needed by suite_attr_wrapper.
    481 
    482     @param options: The verified options.
    483 
    484     @returns: The changed options.
    485 
    486     """
    487     # Convert the suite_name to attribute boolean expression.
    488     if type(options.name) is str:
    489         attr_filter_val = 'suite:%s' % options.name
    490     else:
    491         attr_filter_val = ' or '.join(['suite:%s' % x for x in options.name])
    492 
    493     # change the suite_args to be a dict of arguments for suite_attr_wrapper
    494     # if suite_args is not None, store the values in 'other_args' of the dict
    495     args_dict = {}
    496     args_dict['attr_filter'] = attr_filter_val
    497     options.suite_args = args_dict
    498     options.name = 'suite_attr_wrapper'
    499 
    500     return options
    501 
    502 
    503 class TestResult(object):
    504 
    505     """Represents the result of a TestView."""
    506 
    507     def __init__(self, test_view, retry_count=0):
    508         """Initialize instance.
    509 
    510         @param test_view: TestView instance.
    511         @param retry_count: Retry count for test.  Optional.
    512         """
    513         self.name = test_view.get_testname()
    514         self.status = test_view['status']
    515         self.reason = test_view['reason']
    516         self.retry_count = retry_count
    517 
    518     _PRETTY_STATUS_MAP = {
    519         'GOOD':    '[ PASSED ]',
    520         'TEST_NA': '[  INFO  ]',
    521     }
    522 
    523     @property
    524     def _pretty_status(self):
    525         """Pretty status string."""
    526         return self._PRETTY_STATUS_MAP.get(self.status, '[ FAILED ]')
    527 
    528     def log_using(self, log_function, name_column_width):
    529         """Log the test result using the given log function.
    530 
    531         @param log_function: Log function to use.  Example: logging.info
    532         @param name_column_width: Width of name column for formatting.
    533         """
    534         padded_name = self.name.ljust(name_column_width)
    535         log_function('%s%s', padded_name, self._pretty_status)
    536         if self.status != 'GOOD':
    537             log_function('%s  %s: %s', padded_name, self.status, self.reason)
    538         if self.retry_count > 0:
    539             log_function('%s  retry_count: %s', padded_name, self.retry_count)
    540 
    541 
    542 def get_original_suite_name(suite_name, suite_args):
    543     """Get the original suite name when running suite_attr_wrapper.
    544 
    545     @param suite_name: the name of the suite launched in afe. When it is
    546                        suite_attr_wrapper, the suite that actually running is
    547                        specified in the suite_args.
    548     @param suite_args: dict of suite args from argument parsing.
    549 
    550     @returns: the original suite name.
    551 
    552     """
    553     if suite_name == 'suite_attr_wrapper':
    554         attrs = suite_args.get('attr_filter', '')
    555         suite_list = ([x[6:] for x in re.split('[() ]', attrs)
    556                        if x and x.startswith('suite:')])
    557         return suite_list[0] if suite_list else suite_name
    558     return suite_name
    559 
    560 
    561 class LogLink(object):
    562     """Information needed to record a link in the logs.
    563 
    Depending on context and the information provided at
    construction time, the link may point either to log files for
    a job or to a bug filed for a failure in the job.
    567 
    568     @var anchor  The link text.
    569     @var url     The link url.
    570     @var bug_id  Id of a bug to link to, or None.
    571     """
    572 
    573     # A list of tests that don't get retried so skip the dashboard.
    574     _SKIP_RETRY_DASHBOARD = ['provision']
    575 
    576     _BUG_LINK_PREFIX = 'Auto-Bug'
    577     _LOG_LINK_PREFIX = 'Test-Logs'
    578 
    579 
    def __init__(self, anchor, server, job_string, bug_info=None, reason=None,
                 retry_count=0, testname=None, sponge_url=None):
        """Build the log URL and record the link's metadata.

        @param anchor      The link text.
        @param server      The hostname of the server this suite ran on.
        @param job_string  The job whose logs we'd like to link to.
        @param bug_info    Info about the bug, if one was filed; unpacked
                           as a (bug_id, bug_count) pair.
        @param reason      A string representing the reason of failure if any.
        @param retry_count How many times the test has been retried.
        @param testname    Optional Arg that supplies the testname.
        @param sponge_url  url to Sponge result.
        """
        self.anchor = anchor
        server_url = rpc_client_lib.add_protocol(server)
        self.url = _URL_PATTERN % (server_url, job_string)
        self.reason = reason
        self.retry_count = retry_count
        self.testname = testname
        self.sponge_url = sponge_url
        # Without bug info both bug attributes are None.
        self.bug_id, self.bug_count = bug_info if bug_info else (None, None)
    605 
    606 
    607     @property
    608     def bug_url(self):
    609         """URL of associated bug."""
    610         if self.bug_id:
    611             return reporting_utils.link_crbug(self.bug_id)
    612         else:
    613             return None
    614 
    615 
    616     @property
    617     def _bug_count_text(self):
    618         """Return bug count as human friendly text."""
    619         if self.bug_count is None:
    620             bug_info = 'unknown number of reports'
    621         elif self.bug_count == 1:
    622             bug_info = 'new report'
    623         else:
    624             bug_info = '%s reports' % self.bug_count
    625         return bug_info
    626 
    627 
    def GenerateBuildbotLinks(self):
        """Yield buildbot-formatted links for this job.

        A link to the job logs is always produced; when a bug is associated
        with this link, a link to the bug is produced first.

        @return A generator of links formatted for the buildbot log annotator.
        """
        if not self.bug_url:
            yield self._get_link_to_job_logs()
            return
        yield self._get_link_to_bug()
        yield self._get_link_to_job_logs()
    639 
    640 
    def _get_link_to_bug(self):
        """Build the buildbot link that points at the associated bug.

        The anchor text carries the usual info strings followed by the
        bug count description.

        @return A link formatted for the buildbot log annotator.
        """
        details = self._get_info_strings()
        details += [self._bug_count_text]
        label = self._format_anchor_text(self._BUG_LINK_PREFIX, details)
        return annotations.StepLink(label, self.bug_url)
    651 
    652 
    def _get_link_to_job_logs(self):
        """Build the buildbot link that points at the job's logs.

        @return A link formatted for the buildbot log annotator.
        """
        label = self._format_anchor_text(self._LOG_LINK_PREFIX,
                                         self._get_info_strings())
        log_link = annotations.StepLink(label, self.url)
        return log_link
    661 
    662 
    663     def _get_info_strings(self):
    664         """Return a list of info strings for _format_anchor_text()."""
    665         info_strings = []
    666         if self.retry_count > 0:
    667             info_strings.append('retry_count: %d' % self.retry_count)
    668         if self.reason:
    669             info_strings.append(self.reason)
    670         return info_strings
    671 
    672 
    673     def _format_anchor_text(self, prefix, info_strings):
    674         """Format anchor text given a prefix and info strings.
    675 
    676         @param prefix        The prefix of the anchor text.
    677         @param info_strings  Iterable of strings.
    678         @return A anchor_text with the right prefix and info strings.
    679         """
    680         return '[{prefix}]: {anchor}: {info}'.format(
    681             prefix=prefix,
    682             anchor=self.anchor.strip(),
    683             info=', '.join(info_strings))
    684 
    @property
    def text_link(self):
        """Plain-text link to the job's logs, meant to be read by a human.

        @return self.anchor and self.url separated by a single space.
        """
        parts = (self.anchor, self.url)
        return '%s %s' % parts
    692 
    def GenerateRetryLink(self):
        """Generate a link to the retry dashboard.

        No dashboard link is produced yet: every path through this method
        currently returns None (see the TODO below).

        @return None.
        """
        skip_dashboard = (not self.testname or
                          self.testname in self._SKIP_RETRY_DASHBOARD)
        if skip_dashboard:
            return None

        # TODO(xixuan): Return the right flake dashboard later.
        return None
    703 
    def GenerateHistoryLink(self):
        """Generate a link to the test history dashboard.

        No link is generated when self.testname is empty or is listed in
        self._SKIP_RETRY_DASHBOARD.

        @return A link formatted for the buildbot log annotator, or None.
        """
        name = self.testname
        if name and name not in self._SKIP_RETRY_DASHBOARD:
            return annotations.StepLink(
                text='[Test-History]: %s' % name,
                url=reporting_utils.link_test_history(name))
        return None
    714 
    715 
    716 class Timings(object):
    717     """Timings for important events during a suite.
    718 
    719     All timestamps are datetime.datetime objects.
    720 
    721     @var suite_job_id: the afe job id of the suite job for which
    722                        we are recording the timing for.
    723     @var download_start_time: the time the devserver starts staging
    724                               the build artifacts. Recorded in create_suite_job.
    725     @var payload_end_time: the time when the artifacts only necessary to start
    726                            installsing images onto DUT's are staged.
    727                            Recorded in create_suite_job.
    728     @var artifact_end_time: the remaining artifacts are downloaded after we kick
    729                             off the reimaging job, at which point we record
    730                             artifact_end_time. Recorded in dynamic_suite.py.
    731     @var suite_start_time: the time the suite started.
    732     @var tests_start_time: the time the first test started running.
    733     @var tests_end_time: the time the last test finished running.
    734     """
    735 
    736     def __init__(self, suite_job_id):
    737         self.suite_job_id = suite_job_id
    738         # Timings related to staging artifacts on devserver.
    739         self.download_start_time = None
    740         self.payload_end_time = None
    741         self.artifact_end_time = None
    742 
    743         # The test_start_time, but taken off the view that corresponds to the
    744         # suite instead of an individual test.
    745         self.suite_start_time = None
    746 
    747         # Earliest and Latest tests in the set of TestViews passed to us.
    748         self.tests_start_time = None
    749         self.tests_end_time = None
    750 
    751 
    752     def RecordTiming(self, view):
    753         """Given a test report view, extract and record pertinent time info.
    754 
    755         get_detailed_test_views() returns a list of entries that provide
    756         info about the various parts of a suite run.  This method can take
    757         any one of these entries and look up timestamp info we might want
    758         and record it.
    759 
    760         If timestamps are unavailable, datetime.datetime.min/max will be used.
    761 
    762         @param view: A TestView object.
    763         """
    764         start_candidate = datetime.min
    765         end_candidate = datetime.max
    766         if view['test_started_time']:
    767             start_candidate = time_utils.time_string_to_datetime(
    768                     view['test_started_time'])
    769         if view['test_finished_time']:
    770             end_candidate = time_utils.time_string_to_datetime(
    771                     view['test_finished_time'])
    772 
    773         if view.get_testname() == TestView.SUITE_JOB:
    774             self.suite_start_time = start_candidate
    775         else:
    776             self._UpdateFirstTestStartTime(start_candidate)
    777             self._UpdateLastTestEndTime(end_candidate)
    778         if view['afe_job_id'] == self.suite_job_id and 'job_keyvals' in view:
    779             keyvals = view['job_keyvals']
    780             self.download_start_time = time_utils.time_string_to_datetime(
    781                     keyvals.get(constants.DOWNLOAD_STARTED_TIME),
    782                     handle_type_error=True)
    783 
    784             self.payload_end_time = time_utils.time_string_to_datetime(
    785                     keyvals.get(constants.PAYLOAD_FINISHED_TIME),
    786                     handle_type_error=True)
    787 
    788             self.artifact_end_time = time_utils.time_string_to_datetime(
    789                     keyvals.get(constants.ARTIFACT_FINISHED_TIME),
    790                     handle_type_error=True)
    791 
    792 
    793     def _UpdateFirstTestStartTime(self, candidate):
    794         """Update self.tests_start_time, iff candidate is an earlier time.
    795 
    796         @param candidate: a datetime.datetime object.
    797         """
    798         if not self.tests_start_time or candidate < self.tests_start_time:
    799             self.tests_start_time = candidate
    800 
    801 
    802     def _UpdateLastTestEndTime(self, candidate):
    803         """Update self.tests_end_time, iff candidate is a later time.
    804 
    805         @param candidate: a datetime.datetime object.
    806         """
    807         if not self.tests_end_time or candidate > self.tests_end_time:
    808             self.tests_end_time = candidate
    809 
    810 
    811     def __str__(self):
    812         return ('\n'
    813                 'Suite timings:\n'
    814                 'Downloads started at %s\n'
    815                 'Payload downloads ended at %s\n'
    816                 'Suite started at %s\n'
    817                 'Artifact downloads ended (at latest) at %s\n'
    818                 'Testing started at %s\n'
    819                 'Testing ended at %s\n' % (self.download_start_time,
    820                                            self.payload_end_time,
    821                                            self.suite_start_time,
    822                                            self.artifact_end_time,
    823                                            self.tests_start_time,
    824                                            self.tests_end_time))
    825 
    826 
    827 def instance_for_pool(pool_name):
    828     """
    829     Return the hostname of the server that should be used to service a suite
    830     for the specified pool.
    831 
    832     @param pool_name: The pool (without 'pool:' to schedule the suite against.
    833     @return: The correct host that should be used to service this suite run.
    834     """
    835     return CONFIG.get_config_value(
    836             'POOL_INSTANCE_SHARDING', pool_name,
    837             default=_DEFAULT_AUTOTEST_INSTANCE)
    838 
    839 
    840 class TestView(object):
    841     """Represents a test view and provides a set of helper functions."""
    842 
    843 
    844     SUITE_JOB = 'Suite job'
    845 
    846 
    847     def __init__(self, view, afe_job, suite_name, build, user,
    848                  solo_test_run=False):
    849         """Init a TestView object representing a tko test view.
    850 
    851         @param view: A dictionary representing a tko test view.
    852         @param afe_job: An instance of frontend.afe.models.Job
    853                         representing the job that kicked off the test.
    854         @param suite_name: The name of the suite
    855                            that the test belongs to.
    856         @param build: The build for which the test is run.
    857         @param user: The user for which the test is run.
    858         @param solo_test_run: This is a solo test run not part of a suite.
    859         """
    860         self.view = view
    861         self.afe_job = afe_job
    862         self.suite_name = suite_name
    863         self.build = build
    864         self.is_suite_view = afe_job.parent_job is None and not solo_test_run
    865         # This is the test name that will be shown in the output.
    866         self.testname = None
    867         self.user = user
    868 
    869         # The case that a job was aborted before it got a chance to run
    870         # usually indicates suite has timed out (unless aborted by user).
    871         # In this case, the abort reason will be None.
    872         # Update the reason with proper information.
    873         if (self.is_relevant_suite_view() and
    874                 not self.get_testname() == self.SUITE_JOB and
    875                 self.view['status'] == 'ABORT' and
    876                 not self.view['reason']):
    877             self.view['reason'] = 'Timed out, did not run.'
    878 
    879 
    880     def __getitem__(self, key):
    881         """Overload __getitem__ so that we can still use []
    882 
    883         @param key: A key of the tko test view.
    884 
    885         @returns: The value of an attribute in the view.
    886 
    887         """
    888         return self.view[key]
    889 
    890 
    891     def __iter__(self):
    892         """Overload __iter__ so that it supports 'in' operator."""
    893         return iter(self.view)
    894 
    895 
    896     def get_testname(self):
    897         """Get test name that should be shown in the output.
    898 
    899         Formalize the test_name we got from the test view.
    900 
    901         Remove 'build/suite' prefix if any.
    902 
    903         If one runs a test in control file via the following code,
    904            job.runtest('my_Test', tag='tag')
    905         for most of the cases, view['test_name'] would look like 'my_Test.tag'.
    906         If this is the case, this method will just return the original
    907         test name, i.e. 'my_Test.tag'.
    908 
    909         There are four special cases.
    910         1) A test view is for the suite job's SERVER_JOB.
    911            In this case, this method will return 'Suite job'.
    912 
    913         2) A test view is of a child job or a solo test run not part of a
    914            suite, and for a SERVER_JOB or CLIENT_JOB.
    915            In this case, we will take the job name, remove the build/suite
    916            prefix from the job name, and append the rest to 'SERVER_JOB'
    917            or 'CLIENT_JOB' as a prefix. So the names returned by this
    918            method will look like:
    919              'dummy_Pass_SERVER_JOB'
    920              'dummy_Fail_SERVER_JOB'
    921 
    922         3) A test view is of a suite job and its status is ABORT.
    923            In this case, the view['test_name'] is the child job's name.
    924            For instance,
    925              'lumpy-release/R35-5712.0.0/dummy/dummy_Pass'
    926              'lumpy-release/R35-5712.0.0/dummy/dummy_Fail'
    927            The above names will be converted to the following:
    928              'dummy_Pass'
    929              'dummy_Fail'
    930 
    931         4) A test view's status is of a suite job and its status is TEST_NA.
    932            In this case, the view['test_name'] is the NAME field of the control
    933            file. For instance,
    934              'dummy_Pass'
    935              'dummy_Fail'
    936            This method will not modify these names.
    937 
    938         @returns: Test name after normalization.
    939 
    940         """
    941         if self.testname is not None:
    942             return self.testname
    943 
    944         if (self.is_suite_view and
    945                 self.view['test_name'].startswith('SERVER_JOB')):
    946             # Rename suite job's SERVER_JOB to 'Suite job'.
    947             self.testname = self.SUITE_JOB
    948             return self.testname
    949 
    950         if (self.view['test_name'].startswith('SERVER_JOB') or
    951                 self.view['test_name'].startswith('CLIENT_JOB')):
    952             # Append job name as a prefix for SERVER_JOB and CLIENT_JOB
    953             testname= '%s_%s' % (self.view['job_name'], self.view['test_name'])
    954         else:
    955             testname = self.view['test_name']
    956         # Remove the build and suite name from testname if any.
    957         self.testname = tools.get_test_name(
    958                 self.build, self.suite_name, testname)
    959         return self.testname
    960 
    961 
    962     def is_relevant_suite_view(self):
    963         """Checks whether this is a suite view we should care about.
    964 
    965         @returns: True if it is relevant. False otherwise.
    966         """
    967         return (self.get_testname() == self.SUITE_JOB or
    968                 (self.is_suite_view and
    969                     not self.view['test_name'].startswith('CLIENT_JOB') and
    970                     not self.view['subdir']))
    971 
    972 
    973     def is_test(self):
    974         """Return whether the view is for an actual test.
    975 
    976         @returns True if the view is for an actual test.
    977                  False if the view is for SERVER_JOB or CLIENT_JOB.
    978 
    979         """
    980         return not (self.view['test_name'].startswith('SERVER_JOB') or
    981                 self.view['test_name'].startswith('CLIENT_JOB'))
    982 
    983 
    984     def is_retry(self):
    985         """Check whether the view is for a retry.
    986 
    987         @returns: True, if the view is for a retry; False otherwise.
    988 
    989         """
    990         return self.view['job_keyvals'].get('retry_original_job_id') is not None
    991 
    992 
    993     def hit_timeout(self):
    994         """Check whether the corresponding job has hit its own timeout.
    995 
    996         Note this method should not be called for those test views
    997         that belongs to a suite job and are determined as irrelevant
    998         by is_relevant_suite_view.  This is because they are associated
    999         to the suite job, whose job start/finished time make no sense
   1000         to an irrelevant test view.
   1001 
   1002         @returns: True if the corresponding afe job has hit timeout.
   1003                   False otherwise.
   1004         """
   1005         if (self.is_relevant_suite_view() and
   1006                 self.get_testname() != self.SUITE_JOB):
   1007             # Any relevant suite test view except SUITE_JOB
   1008             # did not hit its own timeout because it was not ever run.
   1009             return False
   1010         start = (datetime.strptime(
   1011                 self.view['job_started_time'], time_utils.TIME_FMT)
   1012                 if self.view['job_started_time'] else None)
   1013         end = (datetime.strptime(
   1014                 self.view['job_finished_time'], time_utils.TIME_FMT)
   1015                 if self.view['job_finished_time'] else None)
   1016         if not start or not end:
   1017             return False
   1018         else:
   1019             return ((end - start).total_seconds()/60.0
   1020                         > self.afe_job.max_runtime_mins)
   1021 
   1022 
   1023     def is_aborted(self):
   1024         """Check if the view was aborted.
   1025 
   1026         For suite job and child job test views, we check job keyval
   1027         'aborted_by' and test status.
   1028 
   1029         For relevant suite job test views, we only check test status
   1030         because the suite job keyval won't make sense to individual
   1031         test views.
   1032 
   1033         @returns: True if the test was as aborted, False otherwise.
   1034 
   1035         """
   1036 
   1037         if (self.is_relevant_suite_view() and
   1038                 self.get_testname() != self.SUITE_JOB):
   1039             return self.view['status'] == 'ABORT'
   1040         else:
   1041             return (bool(self.view['job_keyvals'].get('aborted_by')) and
   1042                     self.view['status'] in ['ABORT', 'RUNNING'])
   1043 
   1044 
   1045     def is_in_fail_status(self):
   1046         """Check if the given test's status corresponds to a failure.
   1047 
   1048         @returns: True if the test's status is FAIL or ERROR. False otherwise.
   1049 
   1050         """
   1051         # All the statuses tests can have when they fail.
   1052         return self.view['status'] in ['FAIL', 'ERROR', 'ABORT']
   1053 
   1054 
   1055     def is_provision(self):
   1056         """Check whether this is a provision test."""
   1057         return self.get_testname() == 'provision'
   1058 
   1059 
   1060     def get_buildbot_link_reason(self):
   1061         """Generate the buildbot link reason for the test.
   1062 
   1063         @returns: A string representing the reason.
   1064 
   1065         """
   1066         return ('%s: %s' % (self.view['status'], self.view['reason'])
   1067                 if self.view['reason'] else self.view['status'])
   1068 
   1069 
   1070     def get_job_id_owner_str(self):
   1071         """Generate the job_id_owner string for a test.
   1072 
   1073         @returns: A string which looks like 135036-username
   1074 
   1075         """
   1076         return '%s-%s' % (self.view['afe_job_id'], self.user)
   1077 
   1078 
   1079     def get_bug_info(self, suite_job_keyvals):
   1080         """Get the bug info from suite_job_keyvals.
   1081 
   1082         If a bug has been filed for the test, its bug info (bug id and counts)
   1083         will be stored in the suite job's keyvals. This method attempts to
   1084         retrieve bug info of the test from |suite_job_keyvals|. It will return
   1085         None if no bug info is found. No need to check bug info if the view is
   1086         SUITE_JOB.
   1087 
   1088         @param suite_job_keyvals: The job keyval dictionary of the suite job.
   1089                 All the bug info about child jobs are stored in
   1090                 suite job's keyvals.
   1091 
   1092         @returns: None if there is no bug info, or a pair with the
   1093                   id of the bug, and the count of the number of
   1094                   times the bug has been seen.
   1095 
   1096         """
   1097         if self.get_testname() == self.SUITE_JOB:
   1098             return None
   1099         if (self.view['test_name'].startswith('SERVER_JOB') or
   1100                 self.view['test_name'].startswith('CLIENT_JOB')):
   1101             # Append job name as a prefix for SERVER_JOB and CLIENT_JOB
   1102             testname= '%s_%s' % (self.view['job_name'], self.view['test_name'])
   1103         else:
   1104             testname = self.view['test_name']
   1105 
   1106         return tools.get_test_failure_bug_info(
   1107                 suite_job_keyvals, self.view['afe_job_id'],
   1108                 testname)
   1109 
   1110 
   1111     def should_display_buildbot_link(self):
   1112         """Check whether a buildbot link should show for this view.
   1113 
   1114         For suite job view, show buildbot link if it fails.
   1115         For normal test view,
   1116             show buildbot link if it is a retry
   1117             show buildbot link if it hits its own timeout.
   1118             show buildbot link if it fails. This doesn't
   1119             include the case where it was aborted but has
   1120             not hit its own timeout (most likely it was aborted because
   1121             suite has timed out).
   1122 
   1123         @returns: True if we should show the buildbot link.
   1124                   False otherwise.
   1125         """
   1126         is_bad_status = (self.view['status'] != 'GOOD' and
   1127                          self.view['status'] != 'TEST_NA')
   1128         if self.get_testname() == self.SUITE_JOB:
   1129             return is_bad_status
   1130         else:
   1131             if self.is_retry():
   1132                 return True
   1133             if is_bad_status:
   1134                 return not self.is_aborted() or self.hit_timeout()
   1135 
   1136 
   1137     def get_control_file_attributes(self):
   1138         """Get the attributes from the control file of the test.
   1139 
   1140         @returns: A list of test attribute or None.
   1141         """
   1142         control_file = self.afe_job.control_file
   1143         attributes = None
   1144         if control_file:
   1145             cd = control_data.parse_control_string(control_file)
   1146             attributes = list(cd.attributes)
   1147         return attributes
   1148 
   1149 
   1150     def override_afe_job_id(self, afe_job_id):
   1151         """Overrides the AFE job id for the test.
   1152 
   1153         @param afe_job_id: The new AFE job id to use.
   1154         """
   1155         self.view['afe_job_id'] = afe_job_id
   1156 
   1157 
   1158 def log_buildbot_links(log_func, links):
   1159     """Output buildbot links to log.
   1160 
   1161     @param log_func: Logging function to use.
   1162     @param links: Iterable of LogLink instances.
   1163     """
   1164     for link in links:
   1165         for generated_link in link.GenerateBuildbotLinks():
   1166             log_func(generated_link)
   1167         retry_link = link.GenerateRetryLink()
   1168         if retry_link:
   1169             log_func(retry_link)
   1170         history_link = link.GenerateHistoryLink()
   1171         if history_link:
   1172             log_func(history_link)
   1173 
   1174 
   1175 class _ReturnCodeComputer(object):
   1176     """This is responsible for returning the _ReturnResult for a suite."""
   1177 
   1178     def __call__(self, test_views):
   1179         """Compute the exit code based on test results."""
   1180         result = _RETURN_RESULTS['ok']
   1181 
   1182         for v in test_views:
   1183             if v.get_testname() == TestView.SUITE_JOB:
   1184                 result |= self._get_suite_result(v)
   1185             else:
   1186                 result |= self._get_test_result(v)
   1187         return result
   1188 
   1189     def _get_suite_result(self, test_view):
   1190         """Return the _ReturnResult for the given suite job."""
   1191         # The order of checking each case is important.
   1192         if test_view.is_aborted() and test_view.hit_timeout():
   1193             return _RETURN_RESULTS['suite_timeout']
   1194         elif test_view.is_in_fail_status():
   1195             return _RETURN_RESULTS['suite_failed']
   1196         elif test_view['status'] == 'WARN':
   1197             return _RETURN_RESULTS['suite_warning']
   1198         else:
   1199             return _RETURN_RESULTS['ok']
   1200 
   1201     def _get_test_result(self, test_view):
   1202         """Return the _ReturnResult for the given test job."""
   1203         # The order of checking each case is important.
   1204         if test_view.is_aborted() and test_view.is_relevant_suite_view():
   1205             # The test was aborted before started
   1206             # This gurantees that the suite has timed out.
   1207             return _RETURN_RESULTS['test_aborted_prestart']
   1208         elif test_view.is_aborted() and not test_view.hit_timeout():
   1209             # The test was aborted, but
   1210             # not due to a timeout. This is most likely
   1211             # because the suite has timed out, but may
   1212             # also because it was aborted by the user.
   1213             # Since suite timing out is determined by checking
   1214             # the suite job view, we simply ignore this view here.
   1215             return _RETURN_RESULTS['test_aborted_mystery']
   1216         elif test_view.is_in_fail_status():  # The test job failed
   1217             if test_view.is_provision():
   1218                 return _RETURN_RESULTS['provision_failed']
   1219             else:
   1220                 return _RETURN_RESULTS['test_failure']
   1221         elif test_view['status'] == 'WARN':
   1222             return _RETURN_RESULTS['test_warning']
   1223         elif test_view.is_retry():
   1224             # The test is a passing retry.
   1225             return _RETURN_RESULTS['test_retry']
   1226         else:
   1227             return _RETURN_RESULTS['ok']
   1228 
   1229 
   1230 class _ProvisionReturnCodeComputer(_ReturnCodeComputer):
   1231     """This is used for returning the _ReturnResult for provision suites."""
   1232 
   1233     def __init__(self, num_required):
   1234         """Initialize instance.
   1235 
   1236         num_required is the number of passing provision jobs needed.
   1237         """
   1238         super(_ProvisionReturnCodeComputer, self).__init__()
   1239         self._num_required = num_required
   1240         self._num_successful = 0
   1241 
   1242     def __call__(self, test_views):
   1243         result = super(_ProvisionReturnCodeComputer, self).__call__(test_views)
   1244         if self._num_successful >= self._num_required:
   1245             logging.info('Return result upgraded from %r'
   1246                          ' due to enough ok provisions',
   1247                          result)
   1248             return _RETURN_RESULTS['ok']
   1249         else:
   1250             return result
   1251 
   1252     def _get_test_result(self, test_view):
   1253         result = (super(_ProvisionReturnCodeComputer, self)
   1254                   ._get_test_result(test_view))
   1255         if result in {_RETURN_RESULTS[s] for s in ('ok', 'test_retry')}:
   1256             self._num_successful += 1
   1257         return result
   1258 
   1259 
   1260 class ResultCollector(object):
   1261     """Collect test results of a suite or a single test run.
   1262 
   1263     Once a suite job has finished, use this class to collect test results.
   1264     `run` is the core method that is to be called first. Then the caller
   1265     could retrieve information like return code, return message, is_aborted,
   1266     and timings by accessing the collector's public attributes. And output
   1267     the test results and links by calling the 'output_*' methods.
   1268 
    Here is an overview of what the `run` method does.
   1270 
   1271     1) Collect the suite job's results from tko_test_view_2.
   1272     For the suite job, we only pull test views without a 'subdir'.
   1273     A NULL subdir indicates that the test was _not_ executed. This could be
   1274     that no child job was scheduled for this test or the child job got
   1275     aborted before starts running.
   1276     (Note 'SERVER_JOB'/'CLIENT_JOB' are handled specially)
   1277 
   1278     2) Collect the child jobs' results from tko_test_view_2.
   1279     For child jobs, we pull all the test views associated with them.
   1280     (Note 'SERVER_JOB'/'CLIENT_JOB' are handled specially)
   1281 
   1282     3) Generate web and buildbot links.
   1283     4) Compute timings of the suite run.
   1284     5) Compute the return code based on test results.
   1285 
   1286     @var _instance_server: The hostname of the server that is used
   1287                            to service the suite.
   1288     @var _afe: The afe rpc client.
   1289     @var _tko: The tko rpc client.
   1290     @var _build: The build for which the suite is run,
   1291                  e.g. 'lumpy-release/R35-5712.0.0'
   1292     @var _suite_name: The suite name, e.g. 'bvt', 'dummy'.
   1293     @var _suite_job_id: The job id of the suite for which we are going to
   1294                         collect results.
   1295     @var _original_suite_name: The suite name we record timing would be
   1296                                different from _suite_name when running
   1297                                suite_attr_wrapper.
   1298     @var _return_code_function: Called to return what the overall result of
   1299                                 the suite is.
   1300     @var _suite_views: A list of TestView objects, representing relevant
   1301                        test views of the suite job.
   1302     @var _child_views: A list of TestView objects, representing test views
   1303                        of the child jobs.
   1304     @var _test_views: A list of TestView objects, representing all test views
   1305                       from _suite_views and _child_views.
   1306     @var _web_links: A list of web links pointing to the results of jobs.
   1307     @var buildbot_links: A list of buildbot links for non-passing tests.
   1308     @var _solo_test_run: True if this is a single test run.
   1309     @var return_result: The _ReturnResult of the suite run.
   1310     @var is_aborted: Whether the suite was aborted or not.
   1311                      True, False or None (aborting status is unknown yet)
   1312     @var timings: A Timing object that records the suite's timings.
   1313 
   1314     """
   1315 
   1316 
   1317     def __init__(self, instance_server, afe, tko, build,
   1318                  suite_name, suite_job_id, return_code_function,
   1319                  original_suite_name=None,
   1320                  user=None, solo_test_run=False):
   1321         self._instance_server = instance_server
   1322         self._afe = afe
   1323         self._tko = tko
   1324         self._build = build
   1325         self._suite_name = suite_name
   1326         self._suite_job_id = suite_job_id
   1327         self._original_suite_name = original_suite_name or suite_name
   1328         self._return_code_function = return_code_function
   1329         self._suite_views = []
   1330         self._child_views = []
   1331         self._test_views = []
   1332         self._retry_counts = {}
   1333         self._missing_results = {}
   1334         self._web_links = []
   1335         self.buildbot_links = []
   1336         self._num_child_jobs = 0
   1337         self.return_result = None
   1338         self.is_aborted = None
   1339         self.timings = None
   1340         self._user = user or getpass.getuser()
   1341         self._solo_test_run = solo_test_run
   1342 
   1343 
   1344     def _fetch_relevant_test_views_of_suite(self):
   1345         """Fetch relevant test views of the suite job.
   1346 
   1347         For the suite job, there will be a test view for SERVER_JOB, and views
   1348         for results of its child jobs. For example, assume we've created
   1349         a suite job (afe_job_id: 40) that runs dummy_Pass, dummy_Fail,
   1350         dummy_Pass.bluetooth. Assume dummy_Pass was aborted before running while
   1351         dummy_Path.bluetooth got TEST_NA as no duts have bluetooth.
   1352         So the suite job's test views would look like
   1353         _____________________________________________________________________
   1354         test_idx| job_idx|test_name           |subdir      |afe_job_id|status
   1355         10      | 1000   |SERVER_JOB          |----        |40        |GOOD
   1356         11      | 1000   |dummy_Pass          |NULL        |40        |ABORT
   1357         12      | 1000   |dummy_Fail.Fail     |41-onwer/...|40        |FAIL
   1358         13      | 1000   |dummy_Fail.Error    |42-owner/...|40        |ERROR
   1359         14      | 1000   |dummy_Pass.bluetooth|NULL        |40        |TEST_NA
   1360 
   1361         For a suite job, we only care about
   1362         a) The test view for the suite job's SERVER_JOB
   1363         b) The test views for real tests without a subdir. A NULL subdir
   1364            indicates that a test didn't get executed.
   1365         So, for the above example, we only keep test views whose test_idxs
   1366         are 10, 11, 14.
   1367 
   1368         @returns: A list of TestView objects, representing relevant
   1369                   test views of the suite job.
   1370 
   1371         """
   1372         suite_job = self._afe.get_jobs(id=self._suite_job_id)[0]
   1373         views = self._tko.run(call='get_detailed_test_views',
   1374                               afe_job_id=self._suite_job_id)
   1375         relevant_views = []
   1376         for v in views:
   1377             v = TestView(v, suite_job, self._suite_name, self._build, self._user,
   1378                          solo_test_run=self._solo_test_run)
   1379             if v.is_relevant_suite_view():
   1380                 # If the test doesn't have results in TKO and is being
   1381                 # displayed in the suite view instead of the child view,
   1382                 # then afe_job_id is incorrect and from the suite.
   1383                 # Override it based on the AFE job id which was missing
   1384                 # results.
   1385                 # TODO: This is likely inaccurate if a test has multiple
   1386                 # tries which all fail TKO parse stage.
   1387                 if v['test_name'] in self._missing_results:
   1388                     v.override_afe_job_id(
   1389                             self._missing_results[v['test_name']][0])
   1390                 relevant_views.append(v)
   1391         return relevant_views
   1392 
   1393 
   1394     def _compute_retry_count(self, view):
   1395         """Return how many times the test has been retried.
   1396 
   1397         @param view: A TestView instance.
   1398         @returns: An int value indicating the retry count.
   1399 
   1400         """
   1401         old_job = view['job_keyvals'].get('retry_original_job_id')
   1402         count = 0
   1403         while old_job:
   1404             count += 1
   1405             views = self._tko.run(
   1406                 call='get_detailed_test_views', afe_job_id=old_job)
   1407             old_job = (views[0]['job_keyvals'].get('retry_original_job_id')
   1408                        if views else None)
   1409         return count
   1410 
   1411 
   1412     def _fetch_test_views_of_child_jobs(self, jobs=None):
   1413         """Fetch test views of child jobs.
   1414 
   1415         @returns: A tuple (child_views, retry_counts, missing_results)
   1416                   child_views is list of TestView objects, representing
   1417                   all valid views.
   1418                   retry_counts is a dictionary that maps test_idx to retry
   1419                   counts. It only stores retry counts that are greater than 0.
   1420                   missing_results is a dictionary that maps test names to
   1421                   lists of job ids.
   1422 
   1423         """
   1424         child_views = []
   1425         retry_counts = {}
   1426         missing_results = {}
   1427         child_jobs = jobs or self._afe.get_jobs(parent_job_id=self._suite_job_id)
   1428         if child_jobs:
   1429             self._num_child_jobs = len(child_jobs)
   1430         for job in child_jobs:
   1431             views = [TestView(v, job, self._suite_name, self._build, self._user)
   1432                      for v in self._tko.run(
   1433                          call='get_detailed_test_views', afe_job_id=job.id,
   1434                          invalid=0)]
   1435             if len(views) == 0:
   1436                 missing_results.setdefault(job.name, []).append(job.id)
   1437             contains_test_failure = any(
   1438                     v.is_test() and v['status'] != 'GOOD' for v in views)
   1439             for v in views:
   1440                 if (v.is_test() or
   1441                         v['status'] != 'GOOD' and not contains_test_failure):
   1442                     # For normal test view, just keep it.
   1443                     # For SERVER_JOB or CLIENT_JOB, only keep it
   1444                     # if it fails and no other test failure.
   1445                     child_views.append(v)
   1446                     retry_count = self._compute_retry_count(v)
   1447                     if retry_count > 0:
   1448                         retry_counts[v['test_idx']] = retry_count
   1449         return child_views, retry_counts, missing_results
   1450 
   1451 
   1452     def _generate_web_and_buildbot_links(self):
   1453         """Generate web links and buildbot links."""
   1454         # TODO(fdeng): If a job was aborted before it reaches Running
   1455         # state, we read the test view from the suite job
   1456         # and thus this method generates a link pointing to the
   1457         # suite job's page for the aborted job. Need a fix.
   1458         self._web_links = []
   1459         self.buildbot_links = []
   1460 
   1461         # Bug info are stored in the suite job's keyvals.
   1462         if self._solo_test_run:
   1463             suite_job_keyvals = {}
   1464         elif not self._suite_views:
   1465             suite_job_keyvals = {}
   1466         else:
   1467             suite_job_keyvals = self._suite_views[0]['job_keyvals']
   1468 
   1469         for v in self._test_views:
   1470             retry_count = self._retry_counts.get(v['test_idx'], 0)
   1471             bug_info = v.get_bug_info(suite_job_keyvals)
   1472             job_id_owner = v.get_job_id_owner_str()
   1473             link = LogLink(
   1474                     anchor=v.get_testname(),
   1475                     server=self._instance_server,
   1476                     job_string=job_id_owner,
   1477                     bug_info=bug_info, retry_count=retry_count,
   1478                     testname=v.get_testname(),
   1479                     sponge_url=suite_job_keyvals.get('sponge_url'))
   1480             self._web_links.append(link)
   1481 
   1482             if v.should_display_buildbot_link():
   1483                 link.reason = v.get_buildbot_link_reason()
   1484                 self.buildbot_links.append(link)
   1485 
   1486 
   1487     def _record_timings(self):
   1488         """Record suite timings."""
   1489         self.timings = Timings(self._suite_job_id)
   1490         for v in self._test_views:
   1491             self.timings.RecordTiming(v)
   1492 
   1493 
   1494     def _compute_return_code(self):
   1495         """Compute the exit code based on test results."""
   1496         self.return_result = self._return_code_function(self._test_views)
   1497 
   1498 
   1499     def _make_test_results(self):
   1500         """Make TestResults for collected tests.
   1501 
   1502         @returns: List of TestResult instances.
   1503         """
   1504         test_results = []
   1505         for test_view in self._test_views:
   1506             test_result = TestResult(
   1507                 test_view=test_view,
   1508                 retry_count=self._retry_counts.get(test_view['test_idx'], 0))
   1509             test_results.append(test_result)
   1510         return test_results
   1511 
   1512 
   1513     def output_results(self):
   1514         """Output test results, timings and web links."""
   1515         # Output test results
   1516         test_results = self._make_test_results()
   1517         if len(test_results) == 0:
   1518             max_name_length = 0
   1519         else:
   1520             max_name_length = max(len(t.name) for t in test_results)
   1521         for test_result in test_results:
   1522             test_result.log_using(logging.info, max_name_length + 3)
   1523         # Output suite timings
   1524         logging.info(self.timings)
   1525         # Output links to test logs
   1526         logging.info('\nLinks to test logs:')
   1527         for link in self._web_links:
   1528             logging.info(link.text_link)
   1529         logging.info('\n')
   1530 
   1531 
   1532     def get_results_dict(self):
   1533         """Write test results, timings and web links into a dict.
   1534 
   1535         @returns: A dict of results in the format like:
   1536                   {
   1537                   'tests': {
   1538                         'test_1': {'status': 'PASSED', 'attributes': [1,2], ...}
   1539                         'test_2': {'status': 'FAILED', 'attributes': [1],...}
   1540                   }
   1541                   'suite_timings': {
   1542                         'download_start': '1998-07-17 00:00:00',
   1543                         'payload_download_end': '1998-07-17 00:00:05',
   1544                         ...
   1545                   }
   1546                   }
   1547         """
   1548         output_dict = {}
   1549         tests_dict = output_dict.setdefault('tests', {})
   1550         for v in self._test_views:
   1551             test_name = v.get_testname()
   1552             test_info = tests_dict.setdefault(test_name, {})
   1553             test_info.update({
   1554                 'status': v['status'],
   1555                 'attributes': v.get_control_file_attributes() or list(),
   1556                 'reason': v['reason'],
   1557                 'retry_count': self._retry_counts.get(v['test_idx'], 0),
   1558                 })
   1559             # For aborted test, the control file will not be parsed and thus
   1560             # fail to get the attributes info. Therefore, the subsystems the
   1561             # abort test testing will be missing. For this case, we will assume
   1562             # the aborted test will test all subsystems, set subsystem:default.
   1563             if (test_info['status'] == 'ABORT' and
   1564                 not any('subsystem:' in a for a in test_info['attributes'])):
   1565                 test_info['attributes'].append('subsystem:default')
   1566 
   1567         # Write the links to test logs into the |tests_dict| of |output_dict|.
   1568         # For test whose status is not 'GOOD', the link is also buildbot_link.
   1569         for link in self._web_links:
   1570             test_name = link.anchor.strip()
   1571             test_info = tests_dict.get(test_name)
   1572             if test_info:
   1573                 test_info['link_to_logs'] = link.url
   1574                 test_info['sponge_url'] = link.sponge_url
   1575                 # Write the retry dashboard link into the dict.
   1576                 if link in self.buildbot_links and link.testname:
   1577                     test_info['retry_dashboard_link'] \
   1578                         = reporting_utils.link_retry_url(link.testname)
   1579                     # Always write the wmatrix link for compatibility.
   1580                     test_info['wmatrix_link'] \
   1581                         = reporting_utils.link_wmatrix_retry_url(link.testname)
   1582                 # Write the bug url into the dict.
   1583                 if link.bug_id:
   1584                     test_info['bug_url'] = link.bug_url
   1585 
   1586         # Write the suite timings into |output_dict|
   1587         timings = self.timings
   1588         if timings is not None:
   1589             time_dict = output_dict.setdefault('suite_timings', {})
   1590             time_dict.update({
   1591                 'download_start' : str(timings.download_start_time),
   1592                 'payload_download_end' : str(timings.payload_end_time),
   1593                 'suite_start' : str(timings.suite_start_time),
   1594                 'artifact_download_end' : str(timings.artifact_end_time),
   1595                 'tests_start' : str(timings.tests_start_time),
   1596                 'tests_end' : str(timings.tests_end_time),
   1597                 })
   1598 
   1599         output_dict['suite_job_id'] = self._suite_job_id
   1600 
   1601         return output_dict
   1602 
   1603 
   1604     def run(self):
   1605         """Collect test results.
   1606 
   1607         This method goes through the following steps:
   1608             Fetch relevent test views of the suite job.
   1609             Fetch test views of child jobs
   1610             Check whether the suite was aborted.
   1611             Generate links.
   1612             Calculate suite timings.
   1613             Compute return code based on the test result.
   1614 
   1615         """
   1616         if self._solo_test_run:
   1617             self._test_views, self._retry_counts, self._missing_results = (
   1618                   self._fetch_test_views_of_child_jobs(
   1619                           jobs=self._afe.get_jobs(id=self._suite_job_id)))
   1620         else:
   1621             self._child_views, self._retry_counts, self._missing_results = (
   1622                     self._fetch_test_views_of_child_jobs())
   1623             self._suite_views = self._fetch_relevant_test_views_of_suite()
   1624             self._test_views = self._suite_views + self._child_views
   1625         # For hostless job in Starting status, there is no test view associated.
   1626         # This can happen when a suite job in Starting status is aborted. When
   1627         # the scheduler hits some limit, e.g., max_hostless_jobs_per_drone,
   1628         # max_jobs_started_per_cycle, a suite job can stays in Starting status.
   1629         if not self._test_views:
   1630             self.return_result = _RETURN_RESULTS['test_views_missing']
   1631             return
   1632         self.is_aborted = any([view['job_keyvals'].get('aborted_by')
   1633                                for view in self._suite_views])
   1634         self._generate_web_and_buildbot_links()
   1635         self._record_timings()
   1636         self._compute_return_code()
   1637 
   1638 
   1639     def gather_timing_stats(self):
   1640         """Collect timing related statistics."""
   1641         # Record suite runtime in metadata db.
   1642         # Some failure modes can leave times unassigned, report sentinel value
   1643         # in that case.
   1644         runtime_in_secs = -1
   1645         if (self.timings.tests_end_time is not None and
   1646             self.timings.suite_start_time is not None):
   1647             runtime_in_secs = (self.timings.tests_end_time -
   1648                     self.timings.suite_start_time).total_seconds()
   1649 
   1650 
   1651 def _make_child_dependencies(options):
   1652     """Creates a list of extra dependencies for child jobs.
   1653 
   1654     @param options: Parsed arguments to run_suite.
   1655 
   1656     @returns: A list of label strings if any dependencies should be added. None
   1657             otherwise.
   1658     """
   1659     if not options.model:
   1660         return ()
   1661     return ('model:%s' % options.model,)
   1662 
   1663 
   1664 @retry.retry(error.StageControlFileFailure, timeout_min=10)
   1665 def create_suite(afe, options):
   1666     """Create a suite with retries.
   1667 
   1668     @param afe: The afe object to insert the new suite job into.
   1669     @param options: The options to use in creating the suite.
   1670 
   1671     @return: The afe_job_id of the new suite job.
   1672     """
   1673     logging.info('%s Submitted create_suite_job rpc',
   1674                  diagnosis_utils.JobTimer.format_time(datetime.now()))
   1675 
   1676     # TODO(crbug.com/763207): This is to support calling old moblab RPC
   1677     # with ToT code.  This does not need to be supported after M62.
   1678     if options.oldrpc:
   1679         suite_args = options.suite_args
   1680         if 'tests' in suite_args:
   1681             # This is for test_that_wrapper
   1682             suite_args = ' '.join([':lab:'] + suite_args['tests'])
   1683         else:
   1684             # This is for suite_attr_wrapper
   1685             suite_args = repr(suite_args)
   1686         options.suite_args = suite_args
   1687 
   1688     return afe.run(
   1689         'create_suite_job',
   1690         name=options.name,
   1691         board=options.board,
   1692         builds=suite_common.make_builds_from_options(options),
   1693         test_source_build=options.test_source_build,
   1694         check_hosts=not options.no_wait,
   1695         pool=options.pool,
   1696         file_bugs=options.file_bugs,
   1697         priority=options.priority,
   1698         suite_args=options.suite_args,
   1699         wait_for_results=not options.no_wait,
   1700         timeout_mins=options.timeout_mins + options.delay_minutes,
   1701         max_runtime_mins=options.max_runtime_mins + options.delay_minutes,
   1702         job_retry=options.retry,
   1703         max_retries=options.max_retries,
   1704         suite_min_duts=options.suite_min_duts,
   1705         offload_failures_only=options.offload_failures_only,
   1706         run_prod_code=options.run_prod_code,
   1707         delay_minutes=options.delay_minutes,
   1708         job_keyvals=options.job_keyvals,
   1709         test_args=options.test_args,
   1710         child_dependencies=options.child_dependencies,
   1711     )
   1712 
   1713 
   1714 def _run_suite(options):
   1715     """
   1716     run_suite script without exception handling.
   1717 
   1718     @param options: The parsed options.
   1719 
   1720     @returns: A tuple contains the return_code of run_suite and the dictionary
   1721               of the output.
   1722 
   1723     """
   1724     # If indicate to use the new style suite control file, convert the args
   1725     if options.use_suite_attr:
   1726         options = change_options_for_suite_attr(options)
   1727 
   1728     log_name = _get_log_name(options)
   1729     utils.setup_logging(logfile=log_name)
   1730 
   1731     if not options.bypass_labstatus and not options.web:
   1732         utils.check_lab_status(options.build)
   1733 
   1734     afe = _create_afe(options)
   1735     instance_server = afe.server
   1736 
   1737     rpc_helper = diagnosis_utils.RPCHelper(afe)
   1738     is_real_time = True
   1739     if options.mock_job_id:
   1740         job_id = int(options.mock_job_id)
   1741         existing_job = afe.get_jobs(id=job_id, finished=True)
   1742         if existing_job:
   1743             is_real_time = False
   1744         else:
   1745             existing_job = afe.get_jobs(id=job_id)
   1746         if existing_job:
   1747             job_created_on = time_utils.date_string_to_epoch_time(
   1748                     existing_job[0].created_on)
   1749         else:
   1750             raise utils.TestLabException('Failed to retrieve job: %d' % job_id)
   1751     else:
   1752         try:
   1753             rpc_helper.check_dut_availability(options.dependencies,
   1754                                               options.minimum_duts,
   1755                                               options.skip_duts_check)
   1756             job_id = create_suite(afe, options)
   1757             job_created_on = time.time()
   1758         except (error.CrosDynamicSuiteException,
   1759                 error.RPCException, proxy.JSONRPCException) as e:
   1760             logging.exception('Error Message: %s', e)
   1761             return run_suite_common.SuiteResult(
   1762                     run_suite_common.RETURN_CODES.INFRA_FAILURE,
   1763                     {'return_message': str(e)})
   1764         except AttributeError as e:
   1765             logging.exception('Error Message: %s', e)
   1766             return run_suite_common.SuiteResult(
   1767                     run_suite_common.RETURN_CODES.INVALID_OPTIONS)
   1768 
   1769     job_timer = diagnosis_utils.JobTimer(
   1770             job_created_on, float(options.timeout_mins))
   1771     job_url = reporting_utils.link_job(job_id,
   1772                                        instance_server=instance_server)
   1773     logging.info('%s Created suite job: %s',
   1774                  job_timer.format_time(job_timer.job_created_time),
   1775                  job_url)
   1776     logging.info(annotations.StepLink(
   1777         text='Link to suite',
   1778         url=job_url))
   1779 
   1780     if options.create_and_return:
   1781         msg = '--create_and_return was specified, terminating now.'
   1782         logging.info(msg)
   1783         return run_suite_common.SuiteResult(
   1784                 run_suite_common.RETURN_CODES.OK,
   1785                 {'return_message': msg})
   1786 
   1787     if options.no_wait:
   1788         return _handle_job_nowait(job_id, options, instance_server)
   1789     else:
   1790         return _handle_job_wait(afe, job_id, options, job_timer, is_real_time)
   1791 
   1792 
   1793 def _get_log_name(options):
   1794     """Return local log file's name.
   1795 
   1796     @param options:         Parsed options.
   1797 
   1798     @return log_name, a string file name.
   1799     """
   1800     if options.require_logfile:
   1801         # options.build is verified to exist in verify_options.
   1802         # convert build name from containing / to containing only _.
   1803         log_name = 'run_suite-%s.log' % options.build.replace('/', '_')
   1804         log_dir = os.path.join(common.autotest_dir, 'logs')
   1805         if os.path.exists(log_dir):
   1806             log_name = os.path.join(log_dir, log_name)
   1807 
   1808         return log_name
   1809     else:
   1810         return None
   1811 
   1812 
   1813 def _create_afe(options):
   1814     """Return an afe instance based on options.
   1815 
   1816     @param options          Parsed options.
   1817 
   1818     @return afe, an AFE instance.
   1819     """
   1820     instance_server = (options.web if options.web else
   1821                        instance_for_pool(options.pool))
   1822     afe = frontend_wrappers.RetryingAFE(server=instance_server,
   1823                                         timeout_min=options.afe_timeout_mins,
   1824                                         delay_sec=options.delay_sec)
   1825     logging.info('Autotest instance created: %s', instance_server)
   1826     return afe
   1827 
   1828 
   1829 def _handle_job_wait(afe, job_id, options, job_timer, is_real_time):
   1830     """Handle suite job synchronously.
   1831 
   1832     @param afe              AFE instance.
   1833     @param job_id           Suite job id.
   1834     @param options          Parsed options.
   1835     @param job_timer        JobTimer for suite job.
   1836     @param is_real_time     Whether or not to handle job timeout.
   1837 
   1838     @return SuiteResult of suite job.
   1839     """
   1840     rpc_helper = diagnosis_utils.RPCHelper(afe)
   1841     instance_server = afe.server
   1842     while not afe.get_jobs(id=job_id, finished=True):
   1843         _poke_buildbot_with_output(afe, job_id, job_timer)
   1844         if job_timer.debug_output_timer.poll():
   1845             logging.info('The suite job has another %s till timeout.',
   1846                          job_timer.timeout_hours - job_timer.elapsed_time())
   1847         time.sleep(10)
   1848     logging.info('%s Suite job is finished.',
   1849                  diagnosis_utils.JobTimer.format_time(datetime.now()))
   1850     # For most cases, ResultCollector should be able to determine whether
   1851     # a suite has timed out by checking information in the test view.
   1852     # However, occationally tko parser may fail on parsing the
   1853     # job_finished time from the job's keyval file. So we add another
   1854     # layer of timeout check in run_suite. We do the check right after
   1855     # the suite finishes to make it as accurate as possible.
   1856     # There is a minor race condition here where we might have aborted
   1857     # for some reason other than a timeout, and the job_timer thinks
   1858     # it's a timeout because of the jitter in waiting for results.
   1859     # The consequence would be that run_suite exits with code
   1860     # SUITE_TIMEOUT while it should  have returned INFRA_FAILURE
   1861     # instead, which should happen very rarely.
   1862     # Note the timeout will have no sense when using -m option.
   1863     is_suite_timeout = job_timer.is_suite_timeout()
   1864 
   1865     # Extract the original suite name to record timing.
   1866     original_suite_name = get_original_suite_name(options.name,
   1867                                                   options.suite_args)
   1868     # Start collecting test results.
   1869     logging.info('%s Start collecting test results and dump them to json.',
   1870                  diagnosis_utils.JobTimer.format_time(datetime.now()))
   1871     TKO = frontend_wrappers.RetryingTKO(server=instance_server,
   1872                                         timeout_min=options.afe_timeout_mins,
   1873                                         delay_sec=options.delay_sec)
   1874     # TODO(crbug.com/672348): It needs to be possible for provision
   1875     # suite to pass if only a few tests fail.  Otherwise, a single
   1876     # failing test will be reported as failure even if the suite reports
   1877     # success.
   1878     if options.name == _PROVISION_SUITE:
   1879         # TODO(crbug.com/672348): Creating the suite job requires that
   1880         # suite_args contains num_required.
   1881         return_code_function = _ProvisionReturnCodeComputer(
   1882             num_required=options.suite_args['num_required'])
   1883     else:
   1884         return_code_function = _ReturnCodeComputer()
   1885     collector = ResultCollector(instance_server=instance_server,
   1886                                 afe=afe, tko=TKO, build=options.build,
   1887                                 suite_name=options.name,
   1888                                 suite_job_id=job_id,
   1889                                 return_code_function=return_code_function,
   1890                                 original_suite_name=original_suite_name)
   1891     collector.run()
   1892     # Dump test outputs into json.
   1893     output_dict = collector.get_results_dict()
   1894     output_dict['autotest_instance'] = instance_server
   1895     if not options.json_dump:
   1896         collector.output_results()
   1897     result = collector.return_result
   1898     if is_real_time:
   1899         # Do not record stats if the suite was aborted (either by a user
   1900         # or through the golo rpc).
   1901         # Also do not record stats if is_aborted is None, indicating
   1902         # aborting status is unknown yet.
   1903         if collector.is_aborted == False:
   1904             logging.info('%s Gathering timing stats for the suite job.',
   1905                          diagnosis_utils.JobTimer.format_time(datetime.now()))
   1906             collector.gather_timing_stats()
   1907 
   1908         if collector.is_aborted == True and is_suite_timeout:
   1909             # There are two possible cases when a suite times out.
   1910             # 1. the suite job was aborted due to timing out
   1911             # 2. the suite job succeeded, but some child jobs
   1912             #    were already aborted before the suite job exited.
   1913             # The case 2 was handled by ResultCollector,
   1914             # here we handle case 1.
   1915             result |= _RETURN_RESULTS['suite_timeout']
   1916         logging.info('\n %s Attempting to display pool info: %s',
   1917                      diagnosis_utils.JobTimer.format_time(datetime.now()),
   1918                      options.pool)
   1919         try:
   1920             # Add some jitter to make up for any latency in
   1921             # aborting the suite or checking for results.
   1922             cutoff = job_timer.timeout_hours + timedelta(hours=0.3)
   1923             rpc_helper.diagnose_pool(options.dependencies, cutoff)
   1924         except proxy.JSONRPCException:
   1925             logging.warning('Unable to display pool info.')
   1926 
   1927     # And output return message.
   1928     if result.message:
   1929         logging.info('Reason: %s', result.message)
   1930 
   1931     logging.info('\n %s Output below this line is for buildbot consumption:',
   1932                  diagnosis_utils.JobTimer.format_time(datetime.now()))
   1933     log_buildbot_links(logging.info, collector.buildbot_links)
   1934     return result.suite_result(output_dict)
   1935 
   1936 
   1937 def _handle_job_nowait(job_id, options, instance_server):
   1938     """Handle suite job asynchronously.
   1939 
   1940     @param job_id           Suite job id.
   1941     @param options          Parsed options.
   1942     @param instance_server  Autotest instance hostname.
   1943 
   1944     @return SuiteResult of suite job.
   1945     """
   1946     logging.info('Created suite job: %r', job_id)
   1947     link = LogLink(options.name, instance_server,
   1948                    '%s-%s' % (job_id, getpass.getuser()))
   1949     for generate_link in link.GenerateBuildbotLinks():
   1950         logging.info(generate_link)
   1951     logging.info('--no_wait specified; Exiting.')
   1952     return run_suite_common.SuiteResult(
   1953             run_suite_common.RETURN_CODES.OK,
   1954             {'return_message': '--no_wait specified; Exiting.'})
   1955 
   1956 
   1957 def _should_run(options):
   1958     """Check whether the suite should be run based on lab/job status checking.
   1959 
   1960     @param options          Parsed options.
   1961     """
   1962     try:
   1963         site_utils.check_lab_status(options.test_source_build)
   1964     except site_utils.TestLabException as ex:
   1965         logging.exception('Lab is closed or build is blocked. Skipping '
   1966                           'suite %s, board %s, build %s:  %s',
   1967                           options.name, options.board,
   1968                           options.test_source_build, str(ex))
   1969         return False
   1970 
   1971     start_time = str(datetime.now() -
   1972                      timedelta(days=_SEARCH_JOB_MAX_DAYS))
   1973     afe = _create_afe(options)
   1974     afe_jobs = afe.get_jobs(
   1975             name__istartswith=options.test_source_build,
   1976             name__iendswith='control.'+options.name,
   1977             created_on__gte=start_time,
   1978             min_rpc_timeout=_MIN_RPC_TIMEOUT)
   1979     if options.model:
   1980         model_tag = 'model:%s' % options.model
   1981         filtered_jobs = [j for j in afe_jobs if model_tag in j.control_file]
   1982     else:
   1983         filtered_jobs = afe_jobs
   1984 
   1985     if filtered_jobs:
   1986         logging.info('Found duplicate suite %s scheduled in past.',
   1987                      filtered_jobs)
   1988         return False
   1989 
   1990     return True
   1991 
   1992 
   1993 def _poke_buildbot_with_output(afe, job_id, job_timer):
   1994     """Poke buildbot so it doesn't timeout from silence.
   1995 
   1996     @param afe              AFE instance.
   1997     @param job_id           Suite job id.
   1998     @param job_timer        JobTimer for suite job.
   1999     """
   2000     rpc_helper = diagnosis_utils.RPCHelper(afe)
   2001     # Note that this call logs output, preventing buildbot's
   2002     # 9000 second silent timeout from kicking in. Let there be no
   2003     # doubt, this is a hack. The timeout is from upstream buildbot and
   2004     # this is the easiest work around.
   2005     if job_timer.first_past_halftime():
   2006         rpc_helper.diagnose_job(job_id, afe.server)
   2007 
   2008 
   2009 
   2010 def _run_task(options):
   2011     """Perform this script's function minus setup.
   2012 
   2013     Boilerplate like argument parsing, logging, output formatting happen
   2014     elsewhere.
   2015 
   2016     Returns a SuiteResult instance.
   2017 
   2018     TODO(ayatane): The try/except should be moved into _run_suite().
   2019     Good luck trying to figure out which function calls are supposed to
   2020     raise which of the exceptions.
   2021     """
   2022     try:
   2023         return _run_suite(options)
   2024     except diagnosis_utils.DUTsNotAvailableError as e:
   2025         result = run_suite_common.SuiteResult(
   2026             run_suite_common.RETURN_CODES.BOARD_NOT_AVAILABLE,
   2027             {'return_message': 'Skipping testing: %s' % e.message})
   2028         logging.info(result.output_dict['return_message'])
   2029         return result
   2030     except utils.TestLabException as e:
   2031         result = run_suite_common.SuiteResult(
   2032             run_suite_common.RETURN_CODES.INFRA_FAILURE,
   2033             {'return_message': 'TestLabException: %s' % e})
   2034         logging.exception(result.output_dict['return_message'])
   2035         return result
   2036 
   2037 
   2038 class _ExceptionHandler(object):
   2039     """Global exception handler replacement."""
   2040 
   2041     def __init__(self, dump_json):
   2042         """Initialize instance.
   2043 
   2044         @param dump_json: Whether to print a JSON dump of the result dict to
   2045                           stdout.
   2046         """
   2047         self._should_dump_json = dump_json
   2048 
   2049     def __call__(self, exc_type, value, traceback):
   2050         if self._should_dump_json:
   2051             run_suite_common.dump_json(
   2052                     {'return_message': ('Unhandled run_suite exception: %s'
   2053                                         % value)})
   2054         sys.exit(run_suite_common.RETURN_CODES.INFRA_FAILURE)
   2055 
   2056 
   2057 def _check_if_use_skylab(options):
   2058     """Detect whether to run suite in skylab."""
   2059     if not _ENABLE_RUN_SUITE_TRAMPOLINE:
   2060         logging.info('trampoline to skylab is not enabled.')
   2061         return False
   2062 
   2063     task_info = 'suite:%s, board:%s, model:%s, pool:%s' % (
   2064             options.name, options.board, options.model, options.pool)
   2065     ctx = gs.GSContext()
   2066     with osutils.TempDir(prefix='trampoline_') as tempdir:
   2067         temp_file = os.path.join(tempdir, _MIGRATION_CONFIG_FILE)
   2068         ctx.Copy(_TRAMPOLINE_CONFIG, temp_file)
   2069         _migration_config = config_reader.MigrationConfig(
   2070                 config_reader.ConfigReader(temp_file))
   2071 
   2072         logging.info('Checking whether to run in skylab: Task(%s)', task_info)
   2073         if skylab.should_run_in_skylab(_migration_config,
   2074                                        options.board,
   2075                                        options.model,
   2076                                        options.name,
   2077                                        options.pool):
   2078             logging.info('Task (%s) Should run in skylab', task_info)
   2079             return True
   2080 
   2081     logging.info('Task (%s) Should run in autotest', task_info)
   2082     return False
   2083 
   2084 
   2085 def _run_with_skylab(options):
   2086     """Run suite inside skylab."""
   2087     # TODO(xixuan): Implement running suite in skylab.
   2088     return _RETURN_RESULTS['ok']
   2089 
   2090 
   2091 def _run_with_autotest(options):
   2092     """Run suite inside autotest."""
   2093     if options.pre_check and not _should_run(options):
   2094         logging.info('Suite %s-%s is terminated: Lab is closed, OR build is '
   2095                      'blocked, OR this suite has already been kicked off '
   2096                      'once in past %d days.',
   2097                      options.test_source_build, options.name,
   2098                      _SEARCH_JOB_MAX_DAYS)
   2099         result = run_suite_common.SuiteResult(
   2100             run_suite_common.RETURN_CODES.ERROR,
   2101             {'return_message': ("Lab is closed OR other reason"
   2102                                 " (see code, it's complicated)")})
   2103     else:
   2104         result = _run_task(options)
   2105 
   2106     if options.json_dump:
   2107         run_suite_common.dump_json(result.output_dict)
   2108 
   2109     return result
   2110 
   2111 
   2112 def main():
   2113     """Entry point."""
   2114     utils.verify_not_root_user()
   2115 
   2116     parser = make_parser()
   2117     options = parser.parse_args()
   2118     if options.do_nothing:
   2119         return 0
   2120 
   2121     sys.exceptionhandler = _ExceptionHandler(dump_json=options.json_dump)
   2122     if options.json_dump:
   2123         logging.disable(logging.CRITICAL)
   2124 
   2125     options_okay = verify_and_clean_options(options)
   2126     # Set StreamHandler first to capture error messages if suite is not run.
   2127     utils.setup_logging()
   2128     if not options_okay:
   2129         parser.print_help()
   2130         result = run_suite_common.SuiteResult(
   2131                 run_suite_common.RETURN_CODES.INVALID_OPTIONS)
   2132     else:
   2133         if _check_if_use_skylab(options):
   2134             result = _run_with_skylab(options)
   2135         else:
   2136             result = _run_with_autotest(options)
   2137 
   2138     logging.info('Will return from run_suite with status: %s',
   2139                   run_suite_common.RETURN_CODES.get_string(result.return_code))
   2140     return result.return_code
   2141 
   2142 
   2143 if __name__ == "__main__":
   2144     sys.exit(main())
   2145