Home | History | Annotate | Download | only in cros
      1 # Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
      2 # Use of this source code is governed by a BSD-style license that can be
      3 # found in the LICENSE file.
      4 
      5 from distutils import version
      6 import cStringIO
      7 import HTMLParser
      8 import httplib
      9 import json
     10 import logging
     11 import multiprocessing
     12 import os
     13 import re
     14 import socket
     15 import time
     16 import urllib2
     17 import urlparse
     18 
     19 from autotest_lib.client.bin import utils as bin_utils
     20 from autotest_lib.client.common_lib import android_utils
     21 from autotest_lib.client.common_lib import error
     22 from autotest_lib.client.common_lib import global_config
     23 from autotest_lib.client.common_lib import utils
     24 from autotest_lib.client.common_lib.cros import retry
     25 from autotest_lib.server import utils as server_utils
     26 # TODO(cmasone): redo this class using requests module; http://crosbug.com/30107
     27 
     28 try:
     29     from chromite.lib import metrics
     30 except ImportError:
     31     metrics = utils.metrics_mock
     32 
     33 
# Module-wide handle on the autotest global configuration.
CONFIG = global_config.global_config
# This file is generated at build time and specifies, per suite and per test,
# the DEPENDENCIES list specified in each control file.  It's a dict of dicts:
# {'bvt':   {'/path/to/autotest/control/site_tests/test1/control': ['dep1']}
#  'suite': {'/path/to/autotest/control/site_tests/test2/control': ['dep2']}
#  'power': {'/path/to/autotest/control/site_tests/test1/control': ['dep1'],
#            '/path/to/autotest/control/site_tests/test3/control': ['dep3']}
# }
DEPENDENCIES_FILE = 'test_suites/dependency_info'
# Number of seconds for caller to poll devserver's is_staged call to check if
# artifacts are staged.
_ARTIFACT_STAGE_POLLING_INTERVAL = 5
# Artifacts that should be staged when client calls devserver RPC to stage an
# image.
_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE = 'full_payload,test_suites,stateful'
# Artifacts that should be staged when client calls devserver RPC to stage an
# image with autotest artifact.
_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE_WITH_AUTOTEST = ('full_payload,test_suites,'
                                                   'control_files,stateful,'
                                                   'autotest_packages')
# Artifacts that should be staged when client calls devserver RPC to stage an
# Android build.
# NOTE: the parentheses do not create a tuple; like the constants above this
# is a single comma-separated string.
_BRILLO_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE = ('zip_images,vendor_partitions')
# Boolean read from the CROS section of the global config; when true, the
# devserver health checks in this module are skipped.
SKIP_DEVSERVER_HEALTH_CHECK = CONFIG.get_config_value(
        'CROS', 'skip_devserver_health_check', type=bool)
# Number of seconds for the call to get devserver load to time out.
TIMEOUT_GET_DEVSERVER_LOAD = 2.0

# Android artifact path in devserver. Every backslash is removed from the
# configured value.
ANDROID_BUILD_NAME_PATTERN = CONFIG.get_config_value(
        'CROS', 'android_build_name_pattern', type=str).replace('\\', '')

# Return value from a devserver RPC indicating the call succeeded.
SUCCESS = 'Success'

# The timeout, in minutes, for a given devserver ssh call.
DEVSERVER_SSH_TIMEOUT_MINS = 1

# Error messages for an invalid devserver response and for a devserver that
# is down.
ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE = 'Proxy Error'
ERR_MSG_FOR_DOWN_DEVSERVER = 'Service Unavailable'

# Error message for a devserver call that timed out.
ERR_MSG_FOR_TIMED_OUT_CALL = 'timeout'

# The timeout, in minutes, for waiting on devserver staging.
DEVSERVER_IS_STAGING_RETRY_MIN = 100

# The timeout, in minutes, for waiting on a DUT auto-update to finish.
DEVSERVER_IS_CROS_AU_FINISHED_TIMEOUT_MIN = 100

# The total number of times a devserver triggers a CrOS auto-update.
AU_RETRY_LIMIT = 2

# Number of seconds for caller to poll devserver's get_au_status call to
# check if cros auto-update is finished.
CROS_AU_POLLING_INTERVAL = 10

# Number of seconds for intervals between retrying auto-update calls.
CROS_AU_RETRY_INTERVAL = 20

# The file name for auto-update logs; takes two %s substitutions.
CROS_AU_LOG_FILENAME = 'CrOS_update_%s_%s.log'
     97 
# Provision error patterns.
# Do NOT change these classification strings: they are used for monitoring
# provision failures, and any change may mess up the stats.
# Each entry is a (regular expression, classification) pair. They are tried
# in order and the classification of the first matching pattern is used.
_EXCEPTION_PATTERNS = [
        # Raised when devserver portfile does not exist on host.
        (r".*Devserver portfile does not exist!.*$",
         '(1) Devserver portfile does not exist on host'),
        # Raised when devserver cannot copy packages to host.
        (r".*Could not copy .* to device.*$",
         '(2) Cannot copy packages to host'),
        # Raised when devserver fails to run specific commands on host.
        (r".*cwd=None, extra env=\{'LC_MESSAGES': 'C'\}.*$",
         '(3) Fail to run specific command on host'),
        # Raised when new build fails to boot on the host.
        (r'.*RootfsUpdateError: Build .* failed to boot on.*$',
         '(4) Build failed to boot on host'),
        # Raised when the auto-update process is timed out.
        (r'.*The CrOS auto-update process is timed out, '
         'thus will be terminated.*$',
         '(5) Auto-update is timed out'),
        # Raised when the host is not pingable.
        (r".*DeviceNotPingableError.*$",
         '(6) Host is not pingable during auto-update'),
        # Raised when hosts have unexpected status after rootfs update.
        (r'.*Update failed with unexpected update status: '
         'UPDATE_STATUS_IDLE.*$',
         '(7) Host has unexpected status: UPDATE_STATUS_IDLE after rootfs '
         'update'),
        # Raised when devserver returns non-json response to shard/drone.
        (r'.*No JSON object could be decoded.*$',
         '(8) Devserver returned non-json object'),
        # Raised when devserver loses the host's ssh connection.
        (r'.*SSHConnectionError\: .* port 22\: Connection timed out.*$',
         "(9) Devserver lost host's ssh connection"),
        # Raised when an error happens while writing files to the host.
        (r'.*Write failed\: Broken pipe.*$',
         "(10) Broken pipe while writing or connecting to host")]
    136 
# Boolean flag 'prefer_local_devserver' from the CROS config section;
# defaults to False when not configured.
PREFER_LOCAL_DEVSERVER = CONFIG.get_config_value(
        'CROS', 'prefer_local_devserver', type=bool, default=False)

# Boolean flag 'enable_ssh_connection_for_devserver' from the CROS config
# section; defaults to False when not configured.
ENABLE_SSH_CONNECTION_FOR_DEVSERVER = CONFIG.get_config_value(
        'CROS', 'enable_ssh_connection_for_devserver', type=bool,
        default=False)

# Directory to save auto-update logs
AUTO_UPDATE_LOG_DIR = 'autoupdate_logs'

# Default number of subnet mask bits used when looking for devservers in the
# same subnet as a DUT.
DEFAULT_SUBNET_MASKBIT = 19

# Metrics basepaths.
METRICS_PATH = 'chromeos/autotest'
PROVISION_PATH = METRICS_PATH + '/provision'
    152 
    153 
    154 class DevServerException(Exception):
    155     """Raised when the dev server returns a non-200 HTTP response."""
    156     pass
    157 
    158 
    159 class BadBuildException(DevServerException):
    160     """Raised when build failed to boot on DUT."""
    161     pass
    162 
    163 
    164 class RetryableProvisionException(DevServerException):
    165     """Raised when provision fails due to a retryable reason."""
    166     pass
    167 
    168 class DevServerOverloadException(Exception):
    169     """Raised when the dev server returns a 502 HTTP response."""
    170     pass
    171 
    172 class DevServerFailToLocateException(Exception):
    173     """Raised when fail to locate any devserver."""
    174     pass
    175 
    176 
    177 class DevServerExceptionClassifier(object):
    178     """A Class represents exceptions raised from DUT by calling auto_update."""
    179     def __init__(self, err, keep_full_trace=True):
    180         """
    181         @param err: A single string representing one time provision
    182             error happened in auto_update().
    183         @param keep_full_trace: True to keep the whole track trace of error.
    184             False when just keep the last line.
    185         """
    186         self._err = err if keep_full_trace else err.split('\n')[-1]
    187         self._classification = None
    188 
    189     def _classify(self):
    190         for err_pattern, classification in _EXCEPTION_PATTERNS:
    191             if re.match(err_pattern, self._err):
    192                 return classification
    193 
    194         return '(0) Unknown exception'
    195 
    196     @property
    197     def classification(self):
    198         """Classify the error
    199 
    200         @return: return a classified exception type (string) from
    201             _EXCEPTION_PATTERNS or 'Unknown exception'. Current patterns in
    202             _EXCEPTION_PATTERNS are very specific so that errors cannot match
    203             more than one pattern.
    204         """
    205         if not self._classification:
    206             self._classification = self._classify()
    207         return self._classification
    208 
    209     @property
    210     def summary(self):
    211         """Use one line to show the error message."""
    212         return ' '.join(self._err.splitlines())
    213 
    214     @property
    215     def classified_exception(self):
    216         """What kind of exception will be raised to higher.
    217 
    218         @return: return a special Exception when the raised error is an
    219             RootfsUpdateError. Otherwise, return general DevServerException.
    220         """
    221         # The classification of RootfsUpdateError in _EXCEPTION_PATTERNS starts
    222         # with "(4)"
    223         if self.classification.startswith('(4)'):
    224             return BadBuildException
    225 
    226         return DevServerException
    227 
    228 
    229 class MarkupStripper(HTMLParser.HTMLParser):
    230     """HTML parser that strips HTML tags, coded characters like &
    231 
    232     Works by, basically, not doing anything for any tags, and only recording
    233     the content of text nodes in an internal data structure.
    234     """
    235     def __init__(self):
    236         self.reset()
    237         self.fed = []
    238 
    239 
    240     def handle_data(self, d):
    241         """Consume content of text nodes, store it away."""
    242         self.fed.append(d)
    243 
    244 
    245     def get_data(self):
    246         """Concatenate and return all stored data."""
    247         return ''.join(self.fed)
    248 
    249 
    250 def _strip_http_message(message):
    251     """Strip the HTTP marker from the an HTTP message.
    252 
    253     @param message: A string returned by an HTTP call.
    254 
    255     @return: A string with HTTP marker being stripped.
    256     """
    257     strip = MarkupStripper()
    258     try:
    259         strip.feed(message.decode('utf_32'))
    260     except UnicodeDecodeError:
    261         strip.feed(message)
    262     return strip.get_data()
    263 
    264 
    265 def _get_image_storage_server():
    266     return CONFIG.get_config_value('CROS', 'image_storage_server', type=str)
    267 
    268 
    269 def _get_canary_channel_server():
    270     """
    271     Get the url of the canary-channel server,
    272     eg: gsutil://chromeos-releases/canary-channel/<board>/<release>
    273 
    274     @return: The url to the canary channel server.
    275     """
    276     return CONFIG.get_config_value('CROS', 'canary_channel_server', type=str)
    277 
    278 
    279 def _get_storage_server_for_artifacts(artifacts=None):
    280     """Gets the appropriate storage server for the given artifacts.
    281 
    282     @param artifacts: A list of artifacts we need to stage.
    283     @return: The address of the storage server that has these artifacts.
    284              The default image storage server if no artifacts are specified.
    285     """
    286     factory_artifact = global_config.global_config.get_config_value(
    287             'CROS', 'factory_artifact', type=str, default='')
    288     if artifacts and factory_artifact and factory_artifact in artifacts:
    289         return _get_canary_channel_server()
    290     return _get_image_storage_server()
    291 
    292 
    293 def _gs_or_local_archive_url_args(archive_url):
    294     """Infer the devserver call arguments to use with the given archive_url.
    295 
    296     @param archive_url: The archive url to include the in devserver RPC. This
    297             can either e a GS path or a local path.
    298     @return: A dict of arguments to include in the devserver call.
    299     """
    300     if not archive_url:
    301         return {}
    302     elif archive_url.startswith('gs://'):
    303         return {'archive_url': archive_url}
    304     else:
    305         # For a local path, we direct the devserver to move the files while
    306         # staging. This is the fastest way to stage local files, but deletes the
    307         # files from the source. This is OK because the files are available on
    308         # the devserver once staged.
    309         return {
    310                 'local_path': archive_url,
    311                 'delete_source': True,
    312         }
    313 
    314 
    315 def _reverse_lookup_from_config(address):
    316     """Look up hostname for the given IP address.
    317 
    318     This uses the hostname-address map from the config file.
    319 
    320     If multiple hostnames map to the same IP address, the first one
    321     defined in the configuration file takes precedence.
    322 
    323     @param address: IP address string
    324     @returns: hostname string, or original input if not found
    325     """
    326     for hostname, addr in _get_hostname_addr_map().iteritems():
    327         if addr == address:
    328             return hostname
    329     return address
    330 
    331 
    332 def _get_hostname_addr_map():
    333     """Get hostname address mapping from config.
    334 
    335     @return: dict mapping server hostnames to addresses
    336     """
    337     return CONFIG.get_section_as_dict('HOSTNAME_ADDR_MAP')
    338 
    339 
    340 def _get_dev_server_list():
    341     return CONFIG.get_config_value('CROS', 'dev_server', type=list, default=[])
    342 
    343 
    344 def _get_crash_server_list():
    345     return CONFIG.get_config_value('CROS', 'crash_server', type=list,
    346         default=[])
    347 
    348 
    349 def remote_devserver_call(timeout_min=DEVSERVER_IS_STAGING_RETRY_MIN,
    350                           exception_to_raise=DevServerException):
    351     """A decorator to use with remote devserver calls.
    352 
    353     This decorator converts urllib2.HTTPErrors into DevServerExceptions
    354     with any embedded error info converted into plain text. The method
    355     retries on urllib2.URLError or error.CmdError to avoid devserver flakiness.
    356     """
    357     #pylint: disable=C0111
    358 
    359     def inner_decorator(method):
    360         label = method.__name__ if hasattr(method, '__name__') else None
    361         def metrics_wrapper(*args, **kwargs):
    362             @retry.retry((urllib2.URLError, error.CmdError,
    363                           DevServerOverloadException),
    364                          timeout_min=timeout_min,
    365                          exception_to_raise=exception_to_raise,
    366                         label=label)
    367             def wrapper():
    368                 """This wrapper actually catches the HTTPError."""
    369                 try:
    370                     return method(*args, **kwargs)
    371                 except urllib2.HTTPError as e:
    372                     error_markup = e.read()
    373                     raise DevServerException(_strip_http_message(error_markup))
    374 
    375             try:
    376                 return wrapper()
    377             except Exception as e:
    378                 if ERR_MSG_FOR_TIMED_OUT_CALL in str(e):
    379                     dev_server = None
    380                     if args and isinstance(args[0], DevServer):
    381                         dev_server = args[0].hostname
    382                     elif 'devserver' in kwargs:
    383                         dev_server = get_hostname(kwargs['devserver'])
    384 
    385                     logging.debug('RPC call %s has timed out on devserver %s.',
    386                                   label, dev_server)
    387                     c = metrics.Counter(
    388                             'chromeos/autotest/devserver/call_timeout')
    389                     c.increment(fields={'dev_server': dev_server,
    390                                         'healthy': label})
    391 
    392                 raise
    393 
    394         return metrics_wrapper
    395 
    396     return inner_decorator
    397 
    398 
    399 def get_hostname(url):
    400     """Get the hostname portion of a URL
    401 
    402     schema://hostname:port/path
    403 
    404     @param url: a Url string
    405     @return: a hostname string
    406     """
    407     return urlparse.urlparse(url).hostname
    408 
    409 
    410 class DevServer(object):
    411     """Base class for all DevServer-like server stubs.
    412 
    413     This is the base class for interacting with all Dev Server-like servers.
    414     A caller should instantiate a sub-class of DevServer with:
    415 
    416     host = SubClassServer.resolve(build)
    417     server = SubClassServer(host)
    418     """
    419     _MIN_FREE_DISK_SPACE_GB = 20
    420     _MAX_APACHE_CLIENT_COUNT = 75
    421     # Threshold for the CPU load percentage for a devserver to be selected.
    422     MAX_CPU_LOAD = 80.0
    423     # Threshold for the network IO, set to 80MB/s
    424     MAX_NETWORK_IO = 1024 * 1024 * 80
    425     DISK_IO = 'disk_total_bytes_per_second'
    426     NETWORK_IO = 'network_total_bytes_per_second'
    427     CPU_LOAD = 'cpu_percent'
    428     FREE_DISK = 'free_disk'
    429     AU_PROCESS = 'au_process_count'
    430     STAGING_THREAD_COUNT = 'staging_thread_count'
    431     APACHE_CLIENT_COUNT = 'apache_client_count'
    432 
    433 
    434     def __init__(self, devserver):
    435         self._devserver = devserver
    436 
    437 
    438     def url(self):
    439         """Returns the url for this devserver."""
    440         return self._devserver
    441 
    442 
    @property
    def hostname(self):
        """Return the hostname part of the devserver URL.

        Note that this is likely parsed from the devserver URL from
        shadow_config.ini, meaning that the "hostname" part of the
        devserver URL is actually an IP address.

        @return hostname string
        """
        devserver_url = self.url()
        return get_hostname(devserver_url)
    454 
    455 
    @property
    def resolved_hostname(self):
        """Return the devserver hostname looked up from its IP address.

        Unlike the hostname property, this property attempts to look up
        the proper hostname from the devserver IP address using the
        hostname-address map in the config.  If lookup fails, then fall
        back to whatever the hostname property would have returned.

        @return hostname string
        """
        address = self.hostname
        return _reverse_lookup_from_config(address)
    468 
    469 
    470     @staticmethod
    471     def get_server_url(url):
    472         """Get the devserver url from a repo url, which includes build info.
    473 
    474         @param url: A job repo url.
    475 
    476         @return A devserver url, e.g., http://127.0.0.10:8080
    477         """
    478         res = urlparse.urlparse(url)
    479         if res.netloc:
    480             return res.scheme + '://' + res.netloc
    481 
    482 
    @classmethod
    def get_devserver_load_wrapper(cls, devserver, timeout_sec, output):
        """A wrapper function to call get_devserver_load in parallel.

        The load is put on |output| even when it could not be retrieved; a
        retrieved load is tagged with the devserver it belongs to.

        @param devserver: url of the devserver.
        @param timeout_sec: Number of seconds before time out the devserver
                            call.
        @param output: An output queue to save results to.
        """
        timeout_min = timeout_sec/60.0
        load = cls.get_devserver_load(devserver, timeout_min=timeout_min)
        if load:
            load['devserver'] = devserver
        output.put(load)
    496 
    497 
    @classmethod
    def get_devserver_load(cls, devserver,
                           timeout_min=DEVSERVER_SSH_TIMEOUT_MINS):
        """Fetch the load of |devserver| through its check_health RPC.

        @param devserver: url of the devserver.
        @param timeout_min: How long to wait in minutes before deciding the
                            the devserver is not up (float).

        @return: A dictionary of the devserver's load, decoded from the JSON
                 response; None if the call or the decoding failed (the
                 failure is logged).

        """
        call = cls._build_call(devserver, 'check_health')
        @remote_devserver_call(timeout_min=timeout_min)
        def get_load(devserver=devserver):
            """Inner method that makes the call."""
            return cls.run_call(call, timeout=timeout_min*60)

        try:
            # |devserver| is passed by keyword so that remote_devserver_call
            # can find it in kwargs when it reports a timed-out call.
            return json.loads(get_load(devserver=devserver))
        except Exception as e:
            # Deliberately best effort: any failure means "load unknown".
            logging.error('Devserver call failed: "%s", timeout: %s seconds,'
                          ' Error: %s', call, timeout_min * 60, e)
            return None
    521 
    522 
    @classmethod
    def is_free_disk_ok(cls, load):
        """Check if a devserver has enough free disk.

        @param load: A dict of the load of the devserver.

        @return: True if the devserver has enough free disk or disk check is
                 skipped in global config.

        """
        if SKIP_DEVSERVER_HEALTH_CHECK:
            logging.debug('devserver health check is skipped.')
            return True

        free_disk = load[cls.FREE_DISK]
        return not (free_disk < cls._MIN_FREE_DISK_SPACE_GB)
    539 
    540 
    @classmethod
    def is_apache_client_count_ok(cls, load):
        """Check if a devserver has enough Apache connections available.

        Apache server by default has maximum of 150 concurrent connections. If
        a devserver has too many live connections, it likely indicates the
        server is busy handling many long running download requests, e.g.,
        downloading stateful partitions. It is better not to add more requests
        to it.

        @param load: A dict of the load of the devserver.

        @return: True if the devserver has enough Apache connections available,
                 the client count was not collected, or the health check is
                 skipped in global config.

        """
        if SKIP_DEVSERVER_HEALTH_CHECK:
            logging.debug('devserver health check is skipped.')
            return True

        if cls.APACHE_CLIENT_COUNT not in load:
            logging.debug('Apache client count is not collected from devserver.')
            return True

        client_count = load[cls.APACHE_CLIENT_COUNT]
        return not (client_count > cls._MAX_APACHE_CLIENT_COUNT)
    566 
    567 
    @classmethod
    def devserver_healthy(cls, devserver,
                          timeout_min=DEVSERVER_SSH_TIMEOUT_MINS):
        """Returns True if the |devserver| is healthy to stage build.

        The devserver load is fetched and checked for Apache client count and
        free disk space. The outcome is always reported to the
        devserver_healthy counter, and the AU process count, when present in
        the load, to the devserver_au_count gauge.

        @param devserver: url of the devserver.
        @param timeout_min: How long to wait in minutes before deciding the
                            the devserver is not up (float).

        @return: True if devserver is healthy. Return False otherwise.

        """
        health_counter = metrics.Counter(
                'chromeos/autotest/devserver/devserver_healthy')
        reason = ''
        healthy = False
        load = cls.get_devserver_load(devserver, timeout_min=timeout_min)
        try:
            if not load:
                # Failed to get the load of devserver.
                reason = '(1) Failed to get load.'
                return False

            if not cls.is_apache_client_count_ok(load):
                reason = '(2) Apache client count too high.'
                logging.error('Devserver check_health failed. Live Apache client '
                              'count is too high: %d.',
                              load[cls.APACHE_CLIENT_COUNT])
                return False

            disk_ok = cls.is_free_disk_ok(load)
            if not disk_ok:
                reason = '(3) Disk space too low.'
                logging.error('Devserver check_health failed. Free disk space is '
                              'low. Only %dGB is available.',
                              load[cls.FREE_DISK])
            healthy = bool(disk_ok)
            return disk_ok
        finally:
            # Runs on every exit path so each health check is reported.
            health_fields = {'dev_server': cls(devserver).resolved_hostname,
                             'healthy': healthy,
                             'reason': reason}
            health_counter.increment(fields=health_fields)
            # Monitor how many AU processes the devserver is currently running.
            au_process_count = None
            if load is not None:
                au_process_count = load.get(DevServer.AU_PROCESS)
            if au_process_count:
                au_gauge = metrics.Gauge(
                        'chromeos/autotest/devserver/devserver_au_count')
                au_gauge.set(
                    au_process_count,
                    fields={'dev_server': cls(devserver).resolved_hostname})
    617 
    618 
    @staticmethod
    def _build_call(host, method, **kwargs):
        """Build a URL to |host| that calls |method|, passing |kwargs|.

        Builds a URL that calls |method| on the dev server defined by |host|,
        passing a set of key/value pairs built from the dict |kwargs|. The
        values are inserted as is, without URL-encoding.

        @param host: a string that is the host basename e.g. http://server:90.
        @param method: the dev server method to call.
        @param kwargs: a dict mapping arg names to arg values.
        @return the URL string.
        """
        # If the archive_url is a local path, the args expected by the devserver
        # are a little different, so replace it with the inferred arguments.
        archive_url = kwargs.pop('archive_url', None)
        kwargs.update(_gs_or_local_archive_url_args(archive_url))

        argstr = '&'.join("%s=%s" % pair for pair in kwargs.iteritems())
        return "%(host)s/%(method)s?%(argstr)s" % {
                'host': host, 'method': method, 'argstr': argstr}
    640 
    641 
    def build_call(self, method, **kwargs):
        """Builds a devserver RPC string that is used by 'run_call()'.

        @param method: remote devserver method to call.
        @param kwargs: a dict mapping arg names to arg values.

        @return the URL string for calling |method| on this devserver.
        """
        host = self._devserver
        return self._build_call(host, method, **kwargs)
    648 
    649 
    @classmethod
    def build_all_calls(cls, method, **kwargs):
        """Builds a list of URLs that makes RPC calls on all healthy devservers.

        For every healthy dev server, build a URL that calls |method| on it,
        passing a set of key/value pairs built from the dict |kwargs|.

        @param method: the dev server method to call.
        @param kwargs: a dict mapping arg names to arg values

        @return a list of URL strings, one per healthy devserver.
        """
        # Note we use cls.servers as servers is class specific.
        return [cls._build_call(server, method, **kwargs)
                for server in cls.servers()
                if cls.devserver_healthy(server)]
    669 
    670 
    @classmethod
    def run_call(cls, call, readline=False, timeout=None):
        """Invoke a given devserver call over HTTP.

        Open the URL with HTTP, and return the text of the response. Exceptions
        may be raised as for urllib2.urlopen().

        @param call: a url string that calls a method to a devserver.
        @param readline: whether read http response line by line. Ignored
                         when |timeout| is given.
        @param timeout: The timeout seconds for this urlopen call.

        @return the response text, or a list of right-stripped lines when
                |readline| is True and no |timeout| is given.
        """
        if timeout is not None:
            timed_response = utils.urlopen_socket_timeout(
                    call, timeout=timeout)
            return timed_response.read()

        response = urllib2.urlopen(call)
        if readline:
            return [line.rstrip() for line in response]
        return response.read()
    692 
    693 
    694     @staticmethod
    695     def servers():
    696         """Returns a list of servers that can serve as this type of server."""
    697         raise NotImplementedError()
    698 
    699 
    @classmethod
    def get_devservers_in_same_subnet(cls, ip, mask_bits=DEFAULT_SUBNET_MASKBIT,
                                      unrestricted_only=False):
        """Get the devservers in the same subnet of the given ip.

        @param ip: The IP address of a dut to look for devserver.
        @param mask_bits: Number of mask bits. Default is 19.
        @param unrestricted_only: Set to True to select from devserver in
                unrestricted subnet only. Default is False.

        @return: A list of devservers in the same subnet of the given ip.

        @raise DevServerFailToLocateException: if there is no candidate
                devserver to choose from.

        """
        if unrestricted_only:
            candidates = cls.get_unrestricted_devservers()
        else:
            candidates = cls.servers()

        # A candidate is a URL, e.g., http://10.1.1.10:8082, while the subnet
        # filter works on hostnames, so remember which URL each hostname came
        # from to return the full devserver path afterwards.
        host_url_pairs = [(get_hostname(server), server)
                          for server in candidates]
        url_by_host = dict(host_url_pairs)
        all_hosts = [host for host, _ in host_url_pairs]
        if not all_hosts:
            devserver_type = 'unrestricted only' if unrestricted_only else 'all'
            raise DevServerFailToLocateException(
                'Fail to locate a devserver for dut %s in %s devservers'
                % (ip, devserver_type))

        hosts_in_subnet = utils.get_servers_in_same_subnet(ip, mask_bits,
                                                           all_hosts)
        return [url_by_host[host] for host in hosts_in_subnet]
    733 
    734 
    735     @classmethod
    def get_unrestricted_devservers(
                cls, restricted_subnets=utils.RESTRICTED_SUBNETS):
        """Return the devservers that are outside every restricted subnet.

        @param restricted_subnets: A list of restricted subnets. When it is
                empty, every server from cls.servers() is returned.

        @return: A list of devservers not in any restricted subnet.

        """
        all_servers = cls.servers()
        if not restricted_subnets:
            return all_servers

        # Keep a server only when its host name maps to no restricted subnet.
        return [server for server in all_servers
                if not utils.get_restricted_subnet(get_hostname(server),
                                                   restricted_subnets)]
    755 
    756 
    757     @classmethod
    def get_healthy_devserver(cls, build, devservers, ban_list=None):
        """Get a healthy devserver instance from the list of devservers.

        Candidates are tried one at a time; the next one to try is picked by
        hash(build) modulo the number of candidates that are still untried.
        The list passed in by the caller is left untouched.

        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514).
        @param devservers: The devserver list to be chosen out a healthy one.
                May be empty or None.
        @param ban_list: The blacklist of devservers we don't want to choose.
                Default is None.

        @return: A DevServer object of a healthy devserver. Return None if no
                healthy devserver is found.

        """
        logging.debug('Pick one healthy devserver from %r', devservers)
        # Work on a private copy: candidates are popped as they are tried, and
        # draining the caller's list would make it impossible for the caller
        # to report afterwards which devservers were considered.
        candidates = list(devservers) if devservers else []
        while candidates:
            hash_index = hash(build) % len(candidates)
            devserver = candidates.pop(hash_index)
            logging.debug('Check health for %s', devserver)
            if ban_list and devserver in ban_list:
                continue

            if cls.devserver_healthy(devserver):
                logging.debug('Pick %s', devserver)
                return cls(devserver)
        return None
    781 
    782 
    783     @classmethod
    def get_available_devservers(cls, hostname=None,
                                 prefer_local_devserver=PREFER_LOCAL_DEVSERVER,
                                 restricted_subnets=utils.RESTRICTED_SUBNETS):
        """Work out which devservers may be used for the given hostname.

        @param hostname: Hostname of a DUT to choose devserver for.
        @param prefer_local_devserver: When True and the host is not in a
                restricted subnet, return the devservers found by
                get_devservers_in_same_subnet for the host's IP.
        @param restricted_subnets: Restricted subnet settings, used to check
                whether the host is inside a restricted subnet and passed on
                to get_unrestricted_devservers.

        @return: A tuple of (devservers, can_retry), devservers is a list of
                 devservers that's available for the given hostname. can_retry
                 is a flag that indicate if caller can retry the selection of
                 devserver if no devserver in the returned devservers can be
                 used. For example, if hostname is in a restricted subnet,
                 can_retry will be False.
        """
        logging.info('Getting devservers for host: %s', hostname)
        host_ip = bin_utils.get_ip_address(hostname) if hostname else None
        if hostname and not host_ip:
            logging.error('Failed to get IP address of %s. Will pick a '
                          'devserver without subnet constraint.', hostname)

        # Without an IP there is no subnet to match against.
        if not host_ip:
            return cls.get_unrestricted_devservers(restricted_subnets), False

        # A DUT inside a restricted subnet may only use the devservers of that
        # subnet, and the caller is not allowed to retry with others.
        if restricted_subnets:
            subnet_ip, mask_bits = _get_subnet_for_host_ip(
                    host_ip, restricted_subnets=restricted_subnets)
            if subnet_ip:
                logging.debug('The host %s (%s) is in a restricted subnet. '
                              'Try to locate a devserver inside subnet '
                              '%s:%d.', hostname, host_ip, subnet_ip,
                              mask_bits)
                restricted_devservers = cls.get_devservers_in_same_subnet(
                        subnet_ip, mask_bits)
                return restricted_devservers, False

        if not prefer_local_devserver:
            return cls.get_unrestricted_devservers(restricted_subnets), False

        # The host is not in a restricted subnet and local devservers are
        # preferred: offer the unrestricted devservers in the host's subnet.
        # can_retry is True so the caller can pick a different devserver if
        # all devservers in the same subnet are down.
        local_devservers = cls.get_devservers_in_same_subnet(
                host_ip, DEFAULT_SUBNET_MASKBIT, True)
        return local_devservers, True
    833 
    834 
    835     @classmethod
    def resolve(cls, build, hostname=None, ban_list=None):
        """Resolves a build to a devserver instance.

        Candidates come from get_available_devservers(hostname). If none of
        them is healthy and that call allowed a retry, the selection is
        repeated once without a hostname.

        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514).
        @param hostname: The hostname of dut that requests a devserver. It's
                         used to make sure a devserver in the same subnet is
                         preferred.
        @param ban_list: The blacklist of devservers shouldn't be chosen.

        @return: The devserver instance returned by get_healthy_devserver.

        @raise DevServerException: If no devserver is available.
        """
        tried_devservers = set()
        devservers, can_retry = cls.get_available_devservers(hostname)
        if devservers:
            tried_devservers |= set(devservers)

        devserver = cls.get_healthy_devserver(build, devservers,
                                              ban_list=ban_list)

        if not devserver and can_retry:
            # Find available devservers without dut location constrain.
            devservers, _ = cls.get_available_devservers(hostname=None)
            # Record the candidates before the health check:
            # get_healthy_devserver may consume the list it is handed, which
            # would leave nothing to report in the error message below.
            if devservers:
                tried_devservers |= set(devservers)
            devserver = cls.get_healthy_devserver(build, devservers,
                                                  ban_list=ban_list)
        if devserver:
            return devserver

        # No healthy devserver: describe the subnet that was searched, log it,
        # count it and raise.
        subnet = 'unrestricted subnet'
        if hostname is not None:
            host_ip = bin_utils.get_ip_address(hostname)
            if host_ip:
                subnet_ip, mask_bits = _get_subnet_for_host_ip(host_ip)
                subnet = '%s/%s' % (str(subnet_ip), str(mask_bits))

        error_msg = ('All devservers in subnet: %s are currently down: '
                     '%s. (dut hostname: %s)' %
                     (subnet, tried_devservers, hostname))
        logging.error(error_msg)
        c = metrics.Counter(
                'chromeos/autotest/devserver/subnet_without_devservers')
        c.increment(fields={'subnet': subnet, 'hostname': str(hostname)})
        raise DevServerException(error_msg)
    880 
    881 
    882     @classmethod
    def random(cls):
        """Return a random devserver that's available.

        Devserver election in the `resolve` method is based on a hash of the
        build that a caller wants to stage, so that different callers asking
        for the same build get the same devserver while different builds are
        spread across all devservers. That helps to reduce the duplication of
        builds across all devservers.
        This function gets a random devserver by handing `resolve` a pseudo
        build name made from the current time.

        @return: Whatever `resolve` returns for the pseudo build name.
        """
        pseudo_build = str(time.time())
        return cls.resolve(build=pseudo_build)
    896 
    897 
    898 class CrashServer(DevServer):
    899     """Class of DevServer that symbolicates crash dumps."""
    900 
    901     @staticmethod
    902     def servers():
    903         return _get_crash_server_list()
    904 
    905 
    906     @remote_devserver_call()
    907     def symbolicate_dump(self, minidump_path, build):
    908         """Ask the devserver to symbolicate the dump at minidump_path.
    909 
    910         Stage the debug symbols for |build| and, if that works, ask the
    911         devserver to symbolicate the dump at |minidump_path|.
    912 
    913         @param minidump_path: the on-disk path of the minidump.
    914         @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
    915                       whose debug symbols are needed for symbolication.
    916         @return The contents of the stack trace
    917         @raise DevServerException upon any return code that's not HTTP OK.
    918         """
    919         try:
    920             import requests
    921         except ImportError:
    922             logging.warning("Can't 'import requests' to connect to dev server.")
    923             return ''
    924         f = {'dev_server': self.resolved_hostname}
    925         c = metrics.Counter('chromeos/autotest/crashserver/symbolicate_dump')
    926         c.increment(fields=f)
    927         # Symbolicate minidump.
    928         m = 'chromeos/autotest/crashserver/symbolicate_dump_duration'
    929         with metrics.SecondsTimer(m, fields=f):
    930             call = self.build_call('symbolicate_dump',
    931                                    archive_url=_get_image_storage_server() + build)
    932             request = requests.post(
    933                     call, files={'minidump': open(minidump_path, 'rb')})
    934             if request.status_code == requests.codes.OK:
    935                 return request.text
    936 
    937         error_fd = cStringIO.StringIO(request.text)
    938         raise urllib2.HTTPError(
    939                 call, request.status_code, request.text, request.headers,
    940                 error_fd)
    941 
    942 
    943     @classmethod
    944     def get_available_devservers(cls, hostname):
    945         """Get all available crash servers.
    946 
    947         Crash server election doesn't need to count the location of hostname.
    948 
    949         @param hostname: Hostname of a DUT to choose devserver for.
    950 
    951         @return: A tuple of (all crash servers, False). can_retry is set to
    952                  False, as all crash servers are returned. There is no point to
    953                  retry.
    954         """
    955         return cls.servers(), False
    956 
    957 
    958 class ImageServerBase(DevServer):
    959     """Base class for devservers used to stage builds.
    960 
    961     CrOS and Android builds are staged in different ways as they have different
    962     sets of artifacts. This base class abstracts the shared functions between
    963     the two types of ImageServer.
    964     """
    965 
    966     @classmethod
    def servers(cls):
        """Return the servers that can be used as this type of devserver.

        @return: The result of _get_dev_server_list().
        """
        return _get_dev_server_list()
    972 
    973 
    def _get_image_url(self, image):
        """Build the url of the directory for this image on the devserver.

        The url is made from the CROS/image_url_pattern config value, filled
        in with this devserver's url and the translated image name, with
        every 'update' in the result replaced by 'static'.

        @param image: the image that was fetched.

        @return: The url string.
        """
        translated_image = self.translate(image)
        pattern = CONFIG.get_config_value('CROS', 'image_url_pattern',
                                          type=str)
        image_url = pattern % (self.url(), translated_image)
        return image_url.replace('update', 'static')
    983 
    984 
    985     @staticmethod
    def create_metadata(server_name, image, artifacts=None, files=None):
        """Build the metadata dictionary describing the staged items.

        The metadata can be send to metadata db along with stats.

        @param server_name: name of the devserver, e.g 172.22.33.44.
        @param image: The name of the image.
        @param artifacts: A list of artifacts. Stored space-separated under
                'artifacts' when not empty.
        @param files: A list of files. Stored space-separated under 'files'
                when not empty.

        @return A metadata dictionary.

        """
        metadata = dict(devserver=server_name, image=image, _type='devserver')
        # Optional entries are only recorded when there is something to list.
        for key, items in (('artifacts', artifacts), ('files', files)):
            if items:
                metadata[key] = ' '.join(items)
        return metadata
   1007 
   1008 
   1009     @classmethod
   1010     def run_ssh_call(cls, call, readline=False, timeout=None):
   1011         """Construct an ssh-based rpc call, and execute it.
   1012 
   1013         @param call: a url string that calls a method to a devserver.
   1014         @param readline: whether read http response line by line.
   1015         @param timeout: The timeout seconds for ssh call.
   1016 
   1017         @return the results of this call.
   1018         """
   1019         hostname = get_hostname(call)
   1020         ssh_call = 'ssh %s \'curl "%s"\'' % (hostname, utils.sh_escape(call))
   1021         timeout_seconds = timeout if timeout else DEVSERVER_SSH_TIMEOUT_MINS*60
   1022         try:
   1023             result = utils.run(ssh_call, timeout=timeout_seconds)
   1024         except error.CmdError as e:
   1025             logging.debug('Error occurred with exit_code %d when executing the '
   1026                           'ssh call: %s.', e.result_obj.exit_status,
   1027                           e.result_obj.stderr)
   1028             c = metrics.Counter('chromeos/autotest/devserver/ssh_failure')
   1029             c.increment(fields={'dev_server': hostname})
   1030             raise
   1031         response = result.stdout
   1032 
   1033         # If the curl command's returned HTTP response contains certain
   1034         # exception string, raise the DevServerException of the response.
   1035         if 'DownloaderException' in response:
   1036             raise DevServerException(_strip_http_message(response))
   1037 
   1038         if readline:
   1039             # Remove line terminators and trailing whitespace
   1040             response = response.splitlines()
   1041             return [line.rstrip() for line in response]
   1042 
   1043         return response
   1044 
   1045 
   1046     @classmethod
   1047     def run_call(cls, call, readline=False, timeout=None):
   1048         """Invoke a given devserver call using urllib.open or ssh.
   1049 
   1050         Open the URL with HTTP or SSH-based HTTP, and return the text of the
   1051         response. Exceptions may be raised as for urllib2.urlopen() or
   1052         utils.run().
   1053 
   1054         @param call: a url string that calls a method to a devserver.
   1055         @param readline: whether read http response line by line.
   1056         @param timeout: The timeout seconds for urlopen call or ssh call.
   1057 
   1058         @return the results of this call.
   1059         """
   1060         server_name = get_hostname(call)
   1061         is_in_restricted_subnet = utils.get_restricted_subnet(
   1062                 server_name, utils.RESTRICTED_SUBNETS)
   1063         _EMPTY_SENTINEL_VALUE = object()
   1064         def kickoff_call():
   1065             """Invoke a given devserver call using urllib.open or ssh.
   1066 
   1067             @param call: a url string that calls a method to a devserver.
   1068             @param is_in_restricted_subnet: whether the devserver is in subnet.
   1069             @param readline: whether read http response line by line.
   1070             @param timeout: The timeout seconds for urlopen call or ssh call.
   1071             """
   1072             if (not ENABLE_SSH_CONNECTION_FOR_DEVSERVER or
   1073                 not is_in_restricted_subnet):
   1074                 response = super(ImageServerBase, cls).run_call(
   1075                         call, readline=readline, timeout=timeout)
   1076             else:
   1077                 response = cls.run_ssh_call(
   1078                         call, readline=readline, timeout=timeout)
   1079             # Retry if devserver service is temporarily down, e.g. in a
   1080             # devserver push.
   1081             if ERR_MSG_FOR_DOWN_DEVSERVER in response:
   1082                 return False
   1083 
   1084             # Don't return response directly since it may be empty string,
   1085             # which causes poll_for_condition to retry.
   1086             return _EMPTY_SENTINEL_VALUE if not response else response
   1087 
   1088         try:
   1089             response = bin_utils.poll_for_condition(
   1090                     kickoff_call,
   1091                     exception=bin_utils.TimeoutError(),
   1092                     timeout=60,
   1093                     sleep_interval=5)
   1094             return '' if response is _EMPTY_SENTINEL_VALUE else response
   1095         except bin_utils.TimeoutError:
   1096             return ERR_MSG_FOR_DOWN_DEVSERVER
   1097 
   1098 
   1099     @classmethod
   1100     def download_file(cls, remote_file, local_file, timeout=None):
   1101         """Download file from devserver.
   1102 
   1103         The format of remote_file should be:
   1104             http://devserver_ip:8082/static/board/...
   1105 
   1106         @param remote_file: The URL of the file on devserver that need to be
   1107             downloaded.
   1108         @param local_file: The path of the file saved to local.
   1109         @param timeout: The timeout seconds for this call.
   1110         """
   1111         response = cls.run_call(remote_file, timeout=timeout)
   1112         with open(local_file, 'w') as out_log:
   1113             out_log.write(response)
   1114 
   1115 
   1116     def _poll_is_staged(self, **kwargs):
   1117         """Polling devserver.is_staged until all artifacts are staged.
   1118 
   1119         @param kwargs: keyword arguments to make is_staged devserver call.
   1120 
   1121         @return: True if all artifacts are staged in devserver.
   1122         """
   1123         call = self.build_call('is_staged', **kwargs)
   1124 
   1125         def all_staged():
   1126             """Call devserver.is_staged rpc to check if all files are staged.
   1127 
   1128             @return: True if all artifacts are staged in devserver. False
   1129                      otherwise.
   1130             @rasies DevServerException, the exception is a wrapper of all
   1131                     exceptions that were raised when devserver tried to download
   1132                     the artifacts. devserver raises an HTTPError or a CmdError
   1133                     when an exception was raised in the code. Such exception
   1134                     should be re-raised here to stop the caller from waiting.
   1135                     If the call to devserver failed for connection issue, a
   1136                     URLError exception is raised, and caller should retry the
   1137                     call to avoid such network flakiness.
   1138 
   1139             """
   1140             try:
   1141                 result = self.run_call(call)
   1142                 logging.debug('whether artifact is staged: %r', result)
   1143                 return result == 'True'
   1144             except urllib2.HTTPError as e:
   1145                 error_markup = e.read()
   1146                 raise DevServerException(_strip_http_message(error_markup))
   1147             except urllib2.URLError as e:
   1148                 # Could be connection issue, retry it.
   1149                 # For example: <urlopen error [Errno 111] Connection refused>
   1150                 logging.error('URLError happens in is_stage: %r', e)
   1151                 return False
   1152             except error.CmdError as e:
   1153                 # Retry if SSH failed to connect to the devserver.
   1154                 logging.warning('CmdError happens in is_stage: %r, will retry', e)
   1155                 return False
   1156 
   1157         bin_utils.poll_for_condition(
   1158                 all_staged,
   1159                 exception=bin_utils.TimeoutError(),
   1160                 timeout=DEVSERVER_IS_STAGING_RETRY_MIN * 60,
   1161                 sleep_interval=_ARTIFACT_STAGE_POLLING_INTERVAL)
   1162 
   1163         return True
   1164 
   1165 
   1166     def _call_and_wait(self, call_name, error_message,
   1167                        expected_response=SUCCESS, **kwargs):
   1168         """Helper method to make a urlopen call, and wait for artifacts staged.
   1169 
   1170         @param call_name: name of devserver rpc call.
   1171         @param error_message: Error message to be thrown if response does not
   1172                               match expected_response.
   1173         @param expected_response: Expected response from rpc, default to
   1174                                   |Success|. If it's set to None, do not compare
   1175                                   the actual response. Any response is consider
   1176                                   to be good.
   1177         @param kwargs: keyword arguments to make is_staged devserver call.
   1178 
   1179         @return: The response from rpc.
   1180         @raise DevServerException upon any return code that's expected_response.
   1181 
   1182         """
   1183         call = self.build_call(call_name, async=True, **kwargs)
   1184         try:
   1185             response = self.run_call(call)
   1186             logging.debug('response for RPC: %r', response)
   1187             if ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE in response:
   1188                 logging.debug('Proxy error happens in RPC call, '
   1189                               'will retry in 30 seconds')
   1190                 time.sleep(30)
   1191                 raise DevServerOverloadException()
   1192         except httplib.BadStatusLine as e:
   1193             logging.error(e)
   1194             raise DevServerException('Received Bad Status line, Devserver %s '
   1195                                      'might have gone down while handling '
   1196                                      'the call: %s' % (self.url(), call))
   1197 
   1198         if expected_response and not response == expected_response:
   1199                 raise DevServerException(error_message)
   1200 
   1201         # `os_type` is needed in build a devserver call, but not needed for
   1202         # wait_for_artifacts_staged, since that method is implemented by
   1203         # each ImageServerBase child class.
   1204         if 'os_type' in kwargs:
   1205             del kwargs['os_type']
   1206         self.wait_for_artifacts_staged(**kwargs)
   1207         return response
   1208 
   1209 
   1210     def _stage_artifacts(self, build, artifacts, files, archive_url, **kwargs):
   1211         """Tell the devserver to download and stage |artifacts| from |image|
   1212         specified by kwargs.
   1213 
   1214         This is the main call point for staging any specific artifacts for a
   1215         given build. To see the list of artifacts one can stage see:
   1216 
   1217         ~src/platfrom/dev/artifact_info.py.
   1218 
   1219         This is maintained along with the actual devserver code.
   1220 
   1221         @param artifacts: A list of artifacts.
   1222         @param files: A list of files to stage.
   1223         @param archive_url: Optional parameter that has the archive_url to stage
   1224                 this artifact from. Default is specified in autotest config +
   1225                 image.
   1226         @param kwargs: keyword arguments that specify the build information, to
   1227                 make stage devserver call.
   1228 
   1229         @raise DevServerException upon any return code that's not HTTP OK.
   1230         """
   1231         if not archive_url:
   1232             archive_url = _get_storage_server_for_artifacts(artifacts) + build
   1233 
   1234         artifacts_arg = ','.join(artifacts) if artifacts else ''
   1235         files_arg = ','.join(files) if files else ''
   1236         error_message = ("staging %s for %s failed;"
   1237                          "HTTP OK not accompanied by 'Success'." %
   1238                          ('artifacts=%s files=%s ' % (artifacts_arg, files_arg),
   1239                           build))
   1240 
   1241         staging_info = ('build=%s, artifacts=%s, files=%s, archive_url=%s' %
   1242                         (build, artifacts, files, archive_url))
   1243         logging.info('Staging artifacts on devserver %s: %s',
   1244                      self.url(), staging_info)
   1245         success = False
   1246         try:
   1247             arguments = {'archive_url': archive_url,
   1248                          'artifacts': artifacts_arg,
   1249                          'files': files_arg}
   1250             if kwargs:
   1251                 arguments.update(kwargs)
   1252             # TODO(akeshet): canonicalize artifacts_arg before using it as a
   1253             # metric field (as it stands it is a not-very-well-controlled
   1254             # string).
   1255             f = {'artifacts': artifacts_arg,
   1256                  'dev_server': self.resolved_hostname}
   1257             with metrics.SecondsTimer(
   1258                     'chromeos/autotest/devserver/stage_artifact_duration',
   1259                     fields=f):
   1260                 self.call_and_wait(call_name='stage', error_message=error_message,
   1261                                    **arguments)
   1262             logging.info('Finished staging artifacts: %s', staging_info)
   1263             success = True
   1264         except (bin_utils.TimeoutError, error.TimeoutException):
   1265             logging.error('stage_artifacts timed out: %s', staging_info)
   1266             raise DevServerException(
   1267                     'stage_artifacts timed out: %s' % staging_info)
   1268         finally:
   1269             f = {'success': success,
   1270                  'artifacts': artifacts_arg,
   1271                  'dev_server': self.resolved_hostname}
   1272             metrics.Counter('chromeos/autotest/devserver/stage_artifact'
   1273                             ).increment(fields=f)
   1274 
   1275 
   1276     def call_and_wait(self, *args, **kwargs):
   1277         """Helper method to make a urlopen call, and wait for artifacts staged.
   1278 
   1279         This method needs to be overridden in the subclass to implement the
   1280         logic to call _call_and_wait.
   1281         """
   1282         raise NotImplementedError
   1283 
   1284 
   1285     def _trigger_download(self, build, artifacts, files, synchronous=True,
   1286                           **kwargs_build_info):
   1287         """Tell the devserver to download and stage image specified in
   1288         kwargs_build_info.
   1289 
   1290         Tells the devserver to fetch |image| from the image storage server
   1291         named by _get_image_storage_server().
   1292 
   1293         If |synchronous| is True, waits for the entire download to finish
   1294         staging before returning. Otherwise only the artifacts necessary
   1295         to start installing images onto DUT's will be staged before returning.
   1296         A caller can then call finish_download to guarantee the rest of the
   1297         artifacts have finished staging.
   1298 
   1299         @param synchronous: if True, waits until all components of the image are
   1300                staged before returning.
   1301         @param kwargs_build_info: Dictionary of build information.
   1302                 For CrOS, it is None as build is the CrOS image name.
   1303                 For Android, it is {'target': target,
   1304                                     'build_id': build_id,
   1305                                     'branch': branch}
   1306 
   1307         @raise DevServerException upon any return code that's not HTTP OK.
   1308 
   1309         """
   1310         if kwargs_build_info:
   1311             archive_url = None
   1312         else:
   1313             archive_url = _get_image_storage_server() + build
   1314         error_message = ("trigger_download for %s failed;"
   1315                          "HTTP OK not accompanied by 'Success'." % build)
   1316         kwargs = {'archive_url': archive_url,
   1317                   'artifacts': artifacts,
   1318                   'files': files,
   1319                   'error_message': error_message}
   1320         if kwargs_build_info:
   1321             kwargs.update(kwargs_build_info)
   1322 
   1323         logging.info('trigger_download starts for %s', build)
   1324         try:
   1325             response = self.call_and_wait(call_name='stage', **kwargs)
   1326             logging.info('trigger_download finishes for %s', build)
   1327         except (bin_utils.TimeoutError, error.TimeoutException):
   1328             logging.error('trigger_download timed out for %s.', build)
   1329             raise DevServerException(
   1330                     'trigger_download timed out for %s.' % build)
   1331         was_successful = response == SUCCESS
   1332         if was_successful and synchronous:
   1333             self._finish_download(build, artifacts, files, **kwargs_build_info)
   1334 
   1335 
   1336     def _finish_download(self, build, artifacts, files, **kwargs_build_info):
   1337         """Tell the devserver to finish staging image specified in
   1338         kwargs_build_info.
   1339 
   1340         If trigger_download is called with synchronous=False, it will return
   1341         before all artifacts have been staged. This method contacts the
   1342         devserver and blocks until all staging is completed and should be
   1343         called after a call to trigger_download.
   1344 
   1345         @param kwargs_build_info: Dictionary of build information.
   1346                 For CrOS, it is None as build is the CrOS image name.
   1347                 For Android, it is {'target': target,
   1348                                     'build_id': build_id,
   1349                                     'branch': branch}
   1350 
   1351         @raise DevServerException upon any return code that's not HTTP OK.
   1352         """
   1353         archive_url = _get_image_storage_server() + build
   1354         error_message = ("finish_download for %s failed;"
   1355                          "HTTP OK not accompanied by 'Success'." % build)
   1356         kwargs = {'archive_url': archive_url,
   1357                   'artifacts': artifacts,
   1358                   'files': files,
   1359                   'error_message': error_message}
   1360         if kwargs_build_info:
   1361             kwargs.update(kwargs_build_info)
   1362         try:
   1363             self.call_and_wait(call_name='stage', **kwargs)
   1364         except (bin_utils.TimeoutError, error.TimeoutException):
   1365             logging.error('finish_download timed out for %s', build)
   1366             raise DevServerException(
   1367                     'finish_download timed out for %s.' % build)
   1368 
   1369 
   1370     @remote_devserver_call()
   1371     def locate_file(self, file_name, artifacts, build, build_info):
   1372         """Locate a file with the given file_name on devserver.
   1373 
   1374         This method calls devserver RPC `locate_file` to look up a file with
   1375         the given file name inside specified build artifacts.
   1376 
   1377         @param file_name: Name of the file to look for a file.
   1378         @param artifacts: A list of artifact names to search for the file.
   1379         @param build: Name of the build. For Android, it's None as build_info
   1380                 should be used.
   1381         @param build_info: Dictionary of build information.
   1382                 For CrOS, it is None as build is the CrOS image name.
   1383                 For Android, it is {'target': target,
   1384                                     'build_id': build_id,
   1385                                     'branch': branch}
   1386 
   1387         @return: A devserver url to the file.
   1388         @raise DevServerException upon any return code that's not HTTP OK.
   1389         """
   1390         if not build and not build_info:
   1391             raise DevServerException('You must specify build information to '
   1392                                      'look for file %s in artifacts %s.' %
   1393                                      (file_name, artifacts))
   1394         kwargs = {'file_name': file_name,
   1395                   'artifacts': artifacts}
   1396         if build_info:
   1397             build_path = '%(branch)s/%(target)s/%(build_id)s' % build_info
   1398             kwargs.update(build_info)
   1399             # Devserver treats Android and Brillo build in the same way as they
   1400             # are both retrieved from Launch Control and have similar build
   1401             # artifacts. Therefore, os_type for devserver calls is `android` for
   1402             # both Android and Brillo builds.
   1403             kwargs['os_type'] = 'android'
   1404         else:
   1405             build_path = build
   1406             kwargs['build'] = build
   1407         call = self.build_call('locate_file', async=False, **kwargs)
   1408         try:
   1409             file_path = self.run_call(call)
   1410             return os.path.join(self.url(), 'static', build_path, file_path)
   1411         except httplib.BadStatusLine as e:
   1412             logging.error(e)
   1413             raise DevServerException('Received Bad Status line, Devserver %s '
   1414                                      'might have gone down while handling '
   1415                                      'the call: %s' % (self.url(), call))
   1416 
   1417 
   1418     @remote_devserver_call()
   1419     def list_control_files(self, build, suite_name=''):
   1420         """Ask the devserver to list all control files for |build|.
   1421 
   1422         @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
   1423                       whose control files the caller wants listed.
   1424         @param suite_name: The name of the suite for which we require control
   1425                            files.
   1426         @return None on failure, or a list of control file paths
   1427                 (e.g. server/site_tests/autoupdate/control)
   1428         @raise DevServerException upon any return code that's not HTTP OK.
   1429         """
   1430         build = self.translate(build)
   1431         call = self.build_call('controlfiles', build=build,
   1432                                suite_name=suite_name)
   1433         return self.run_call(call, readline=True)
   1434 
   1435 
   1436     @remote_devserver_call()
   1437     def get_control_file(self, build, control_path):
   1438         """Ask the devserver for the contents of a control file.
   1439 
   1440         @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
   1441                       whose control file the caller wants to fetch.
   1442         @param control_path: The file to fetch
   1443                              (e.g. server/site_tests/autoupdate/control)
   1444         @return The contents of the desired file.
   1445         @raise DevServerException upon any return code that's not HTTP OK.
   1446         """
   1447         build = self.translate(build)
   1448         call = self.build_call('controlfiles', build=build,
   1449                                control_path=control_path)
   1450         return self.run_call(call)
   1451 
   1452 
   1453     @remote_devserver_call()
   1454     def list_suite_controls(self, build, suite_name=''):
   1455         """Ask the devserver to list contents of all control files for |build|.
   1456 
   1457         @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
   1458                       whose control files' contents the caller wants returned.
   1459         @param suite_name: The name of the suite for which we require control
   1460                            files.
   1461         @return None on failure, or a dict of contents of all control files
   1462             (e.g. {'path1': "#Copyright controls ***", ...,
   1463                 pathX': "#Copyright controls ***"}
   1464         @raise DevServerException upon any return code that's not HTTP OK.
   1465         """
   1466         build = self.translate(build)
   1467         call = self.build_call('list_suite_controls', build=build,
   1468                                suite_name=suite_name)
   1469         return json.load(cStringIO.StringIO(self.run_call(call)))
   1470 
   1471 
   1472 class ImageServer(ImageServerBase):
   1473     """Class for DevServer that handles RPCs related to CrOS images.
   1474 
   1475     The calls to devserver to stage artifacts, including stage and download, are
   1476     made in async mode. That is, when caller makes an RPC |stage| to request
   1477     devserver to stage certain artifacts, devserver handles the call and starts
   1478     staging artifacts in a new thread, and return |Success| without waiting for
   1479     staging being completed. When caller receives message |Success|, it polls
   1480     devserver's is_staged call until all artifacts are staged.
   1481     Such mechanism is designed to prevent cherrypy threads in devserver being
   1482     running out, as staging artifacts might take long time, and cherrypy starts
   1483     with a fixed number of threads that handle devserver rpc.
   1484     """
   1485 
   1486     class ArtifactUrls(object):
   1487         """A container for URLs of staged artifacts.
   1488 
   1489         Attributes:
   1490             full_payload: URL for downloading a staged full release update
   1491             mton_payload: URL for downloading a staged M-to-N release update
   1492             nton_payload: URL for downloading a staged N-to-N release update
   1493 
   1494         """
   1495         def __init__(self, full_payload=None, mton_payload=None,
   1496                      nton_payload=None):
   1497             self.full_payload = full_payload
   1498             self.mton_payload = mton_payload
   1499             self.nton_payload = nton_payload
   1500 
   1501 
   1502     def wait_for_artifacts_staged(self, archive_url, artifacts='', files=''):
   1503         """Polling devserver.is_staged until all artifacts are staged.
   1504 
   1505         @param archive_url: Google Storage URL for the build.
   1506         @param artifacts: Comma separated list of artifacts to download.
   1507         @param files: Comma separated list of files to download.
   1508         @return: True if all artifacts are staged in devserver.
   1509         """
   1510         kwargs = {'archive_url': archive_url,
   1511                   'artifacts': artifacts,
   1512                   'files': files}
   1513         return self._poll_is_staged(**kwargs)
   1514 
   1515 
   1516     @remote_devserver_call()
   1517     def call_and_wait(self, call_name, archive_url, artifacts, files,
   1518                       error_message, expected_response=SUCCESS):
   1519         """Helper method to make a urlopen call, and wait for artifacts staged.
   1520 
   1521         @param call_name: name of devserver rpc call.
   1522         @param archive_url: Google Storage URL for the build..
   1523         @param artifacts: Comma separated list of artifacts to download.
   1524         @param files: Comma separated list of files to download.
   1525         @param expected_response: Expected response from rpc, default to
   1526                                   |Success|. If it's set to None, do not compare
   1527                                   the actual response. Any response is consider
   1528                                   to be good.
   1529         @param error_message: Error message to be thrown if response does not
   1530                               match expected_response.
   1531 
   1532         @return: The response from rpc.
   1533         @raise DevServerException upon any return code that's expected_response.
   1534 
   1535         """
   1536         kwargs = {'archive_url': archive_url,
   1537                   'artifacts': artifacts,
   1538                   'files': files}
   1539         return self._call_and_wait(call_name, error_message,
   1540                                    expected_response, **kwargs)
   1541 
   1542 
   1543     @remote_devserver_call()
   1544     def stage_artifacts(self, image=None, artifacts=None, files='',
   1545                         archive_url=None):
   1546         """Tell the devserver to download and stage |artifacts| from |image|.
   1547 
   1548          This is the main call point for staging any specific artifacts for a
   1549         given build. To see the list of artifacts one can stage see:
   1550 
   1551         ~src/platfrom/dev/artifact_info.py.
   1552 
   1553         This is maintained along with the actual devserver code.
   1554 
   1555         @param image: the image to fetch and stage.
   1556         @param artifacts: A list of artifacts.
   1557         @param files: A list of files to stage.
   1558         @param archive_url: Optional parameter that has the archive_url to stage
   1559                 this artifact from. Default is specified in autotest config +
   1560                 image.
   1561 
   1562         @raise DevServerException upon any return code that's not HTTP OK.
   1563         """
   1564         if not artifacts and not files:
   1565             raise DevServerException('Must specify something to stage.')
   1566         image = self.translate(image)
   1567         self._stage_artifacts(image, artifacts, files, archive_url)
   1568 
   1569 
   1570     @remote_devserver_call(timeout_min=DEVSERVER_SSH_TIMEOUT_MINS)
   1571     def list_image_dir(self, image):
   1572         """List the contents of the image stage directory, on the devserver.
   1573 
   1574         @param image: The image name, eg: <board>-<branch>/<Milestone>-<build>.
   1575 
   1576         @raise DevServerException upon any return code that's not HTTP OK.
   1577         """
   1578         image = self.translate(image)
   1579         logging.info('Requesting contents from devserver %s for image %s',
   1580                      self.url(), image)
   1581         archive_url = _get_storage_server_for_artifacts() + image
   1582         call = self.build_call('list_image_dir', archive_url=archive_url)
   1583         response = self.run_call(call, readline=True)
   1584         for line in response:
   1585             logging.info(line)
   1586 
   1587 
   1588     def trigger_download(self, image, synchronous=True):
   1589         """Tell the devserver to download and stage |image|.
   1590 
   1591         Tells the devserver to fetch |image| from the image storage server
   1592         named by _get_image_storage_server().
   1593 
   1594         If |synchronous| is True, waits for the entire download to finish
   1595         staging before returning. Otherwise only the artifacts necessary
   1596         to start installing images onto DUT's will be staged before returning.
   1597         A caller can then call finish_download to guarantee the rest of the
   1598         artifacts have finished staging.
   1599 
   1600         @param image: the image to fetch and stage.
   1601         @param synchronous: if True, waits until all components of the image are
   1602                staged before returning.
   1603 
   1604         @raise DevServerException upon any return code that's not HTTP OK.
   1605 
   1606         """
   1607         image = self.translate(image)
   1608         artifacts = _ARTIFACTS_TO_BE_STAGED_FOR_IMAGE
   1609         self._trigger_download(image, artifacts, files='',
   1610                                synchronous=synchronous)
   1611 
   1612 
   1613     @remote_devserver_call()
   1614     def setup_telemetry(self, build):
   1615         """Tell the devserver to setup telemetry for this build.
   1616 
   1617         The devserver will stage autotest and then extract the required files
   1618         for telemetry.
   1619 
   1620         @param build: the build to setup telemetry for.
   1621 
   1622         @returns path on the devserver that telemetry is installed to.
   1623         """
   1624         build = self.translate(build)
   1625         archive_url = _get_image_storage_server() + build
   1626         call = self.build_call('setup_telemetry', archive_url=archive_url)
   1627         try:
   1628             response = self.run_call(call)
   1629         except httplib.BadStatusLine as e:
   1630             logging.error(e)
   1631             raise DevServerException('Received Bad Status line, Devserver %s '
   1632                                      'might have gone down while handling '
   1633                                      'the call: %s' % (self.url(), call))
   1634         return response
   1635 
   1636 
   1637     def finish_download(self, image):
   1638         """Tell the devserver to finish staging |image|.
   1639 
   1640         If trigger_download is called with synchronous=False, it will return
   1641         before all artifacts have been staged. This method contacts the
   1642         devserver and blocks until all staging is completed and should be
   1643         called after a call to trigger_download.
   1644 
   1645         @param image: the image to fetch and stage.
   1646         @raise DevServerException upon any return code that's not HTTP OK.
   1647         """
   1648         image = self.translate(image)
   1649         artifacts = _ARTIFACTS_TO_BE_STAGED_FOR_IMAGE_WITH_AUTOTEST
   1650         self._finish_download(image, artifacts, files='')
   1651 
   1652 
   1653     def get_update_url(self, image):
   1654         """Returns the url that should be passed to the updater.
   1655 
   1656         @param image: the image that was fetched.
   1657         """
   1658         image = self.translate(image)
   1659         url_pattern = CONFIG.get_config_value('CROS', 'image_url_pattern',
   1660                                               type=str)
   1661         return (url_pattern % (self.url(), image))
   1662 
   1663 
   1664     def get_staged_file_url(self, filename, image):
   1665         """Returns the url of a staged file for this image on the devserver."""
   1666         return '/'.join([self._get_image_url(image), filename])
   1667 
   1668 
   1669     def get_full_payload_url(self, image):
   1670         """Returns a URL to a staged full payload.
   1671 
   1672         @param image: the image that was fetched.
   1673 
   1674         @return A fully qualified URL that can be used for downloading the
   1675                 payload.
   1676 
   1677         """
   1678         return self._get_image_url(image) + '/update.gz'
   1679 
   1680 
   1681     def get_test_image_url(self, image):
   1682         """Returns a URL to a staged test image.
   1683 
   1684         @param image: the image that was fetched.
   1685 
   1686         @return A fully qualified URL that can be used for downloading the
   1687                 image.
   1688 
   1689         """
   1690         return self._get_image_url(image) + '/chromiumos_test_image.bin'
   1691 
   1692 
   1693     @remote_devserver_call()
   1694     def get_dependencies_file(self, build):
   1695         """Ask the dev server for the contents of the suite dependencies file.
   1696 
   1697         Ask the dev server at |self._dev_server| for the contents of the
   1698         pre-processed suite dependencies file (at DEPENDENCIES_FILE)
   1699         for |build|.
   1700 
   1701         @param build: The build (e.g. x86-mario-release/R21-2333.0.0)
   1702                       whose dependencies the caller is interested in.
   1703         @return The contents of the dependencies file, which should eval to
   1704                 a dict of dicts, as per bin_utils/suite_preprocessor.py.
   1705         @raise DevServerException upon any return code that's not HTTP OK.
   1706         """
   1707         build = self.translate(build)
   1708         call = self.build_call('controlfiles',
   1709                                build=build, control_path=DEPENDENCIES_FILE)
   1710         return self.run_call(call)
   1711 
   1712 
   1713     @remote_devserver_call()
   1714     def get_latest_build_in_gs(self, board):
   1715         """Ask the devservers for the latest offical build in Google Storage.
   1716 
   1717         @param board: The board for who we want the latest official build.
   1718         @return A string of the returned build rambi-release/R37-5868.0.0
   1719         @raise DevServerException upon any return code that's not HTTP OK.
   1720         """
   1721         call = self.build_call(
   1722                 'xbuddy_translate/remote/%s/latest-official' % board,
   1723                 image_dir=_get_image_storage_server())
   1724         image_name = self.run_call(call)
   1725         return os.path.dirname(image_name)
   1726 
   1727 
   1728     def translate(self, build_name):
   1729         """Translate the build name if it's in LATEST format.
   1730 
   1731         If the build name is in the format [builder]/LATEST, return the latest
   1732         build in Google Storage otherwise return the build name as is.
   1733 
   1734         @param build_name: build_name to check.
   1735 
   1736         @return The actual build name to use.
   1737         """
   1738         match = re.match(r'([\w-]+)-(\w+)/LATEST', build_name, re.I)
   1739         if not match:
   1740             return build_name
   1741         translated_build = self.get_latest_build_in_gs(match.groups()[0])
   1742         logging.debug('Translated relative build %s to %s', build_name,
   1743                       translated_build)
   1744         return translated_build
   1745 
   1746 
   1747     @classmethod
   1748     @remote_devserver_call()
   1749     def get_latest_build(cls, target, milestone=''):
   1750         """Ask all the devservers for the latest build for a given target.
   1751 
   1752         @param target: The build target, typically a combination of the board
   1753                        and the type of build e.g. x86-mario-release.
   1754         @param milestone:  For latest build set to '', for builds only in a
   1755                            specific milestone set to a str of format Rxx
   1756                            (e.g. R16). Default: ''. Since we are dealing with a
   1757                            webserver sending an empty string, '', ensures that
   1758                            the variable in the URL is ignored as if it was set
   1759                            to None.
   1760         @return A string of the returned build e.g. R20-2226.0.0.
   1761         @raise DevServerException upon any return code that's not HTTP OK.
   1762         """
   1763         calls = cls.build_all_calls('latestbuild', target=target,
   1764                                     milestone=milestone)
   1765         latest_builds = []
   1766         for call in calls:
   1767             latest_builds.append(cls.run_call(call))
   1768 
   1769         return max(latest_builds, key=version.LooseVersion)
   1770 
   1771 
   1772     @remote_devserver_call()
   1773     def _kill_au_process_for_host(self, **kwargs):
   1774         """Kill the triggerred auto_update process if error happens in cros_au.
   1775 
   1776         @param kwargs: Arguments to make kill_au_proc devserver call.
   1777         """
   1778         call = self.build_call('kill_au_proc', **kwargs)
   1779         response = self.run_call(call)
   1780         if not response == 'True':
   1781             raise DevServerException(
   1782                     'Failed to kill the triggerred CrOS auto_update process'
   1783                     'on devserver %s, the response is %s' % (
   1784                             self.url(), response))
   1785 
   1786 
   1787     def kill_au_process_for_host(self, host_name, pid):
   1788         """Kill the triggerred auto_update process if error happens.
   1789 
   1790         Usually this function is used to clear all potential left au processes
   1791         of the given host name.
   1792 
   1793         If pid is specified, the devserver will further check the given pid to
   1794         make sure the process is killed. This is used for the case that the au
   1795         process has started in background, but then provision fails due to
   1796         some unknown issues very fast. In this case, when 'kill_au_proc' is
   1797         called, there's no corresponding background track log created for this
   1798         ongoing au process, which prevents this RPC call from killing this au
   1799         process.
   1800 
   1801         @param host_name: The DUT's hostname.
   1802         @param pid: The ongoing au process's pid.
   1803 
   1804         @return: True if successfully kill the auto-update process for host.
   1805         """
   1806         kwargs = {'host_name': host_name, 'pid': pid}
   1807         try:
   1808             self._kill_au_process_for_host(**kwargs)
   1809         except DevServerException:
   1810             return False
   1811 
   1812         return True
   1813 
   1814 
   1815     @remote_devserver_call()
   1816     def _clean_track_log(self, **kwargs):
   1817         """Clean track log for the current auto-update process."""
   1818         call = self.build_call('handler_cleanup', **kwargs)
   1819         self.run_call(call)
   1820 
   1821 
   1822     def clean_track_log(self, host_name, pid):
   1823         """Clean track log for the current auto-update process.
   1824 
   1825         @param host_name: The host name to be updated.
   1826         @param pid: The auto-update process id.
   1827 
   1828         @return: True if track log is successfully cleaned, False otherwise.
   1829         """
   1830         if not pid:
   1831             return False
   1832 
   1833         kwargs = {'host_name': host_name, 'pid': pid}
   1834         try:
   1835             self._clean_track_log(**kwargs)
   1836         except DevServerException as e:
   1837             logging.debug('Failed to clean track_status_file on '
   1838                           'devserver for host %s and process id %s: %s',
   1839                           host_name, pid, str(e))
   1840             return False
   1841 
   1842         return True
   1843 
   1844 
   1845     def _get_au_log_filename(self, log_dir, host_name, pid):
   1846         """Return the auto-update log's filename."""
   1847         return os.path.join(log_dir, CROS_AU_LOG_FILENAME % (
   1848                     host_name, pid))
   1849 
   1850     def _read_json_response_from_devserver(self, response):
   1851         """Reads the json response from the devserver.
   1852 
   1853         This is extracted to its own function so that it can be easily mocked.
   1854         @param response: the response for a devserver.
   1855         """
   1856         try:
   1857             return json.loads(response)
   1858         except ValueError as e:
   1859             logging.debug('Failed to load json response: %s', response)
   1860             raise DevServerException(e)
   1861 
   1862 
   1863     @remote_devserver_call()
   1864     def _collect_au_log(self, log_dir, **kwargs):
   1865         """Collect logs from devserver after cros-update process is finished.
   1866 
   1867         Collect the logs that recording the whole cros-update process, and
   1868         write it to sysinfo path of a job.
   1869 
   1870         The example log file name that is stored is like:
   1871             '1220-repair/sysinfo/CrOS_update_host_name_pid.log'
   1872 
   1873         @param host_name: the DUT's hostname.
   1874         @param pid: the auto-update process id on devserver.
   1875         @param log_dir: The directory to save the cros-update process log
   1876                         retrieved from devserver.
   1877         """
   1878         call = self.build_call('collect_cros_au_log', **kwargs)
   1879         response = self.run_call(call)
   1880         if not os.path.exists(log_dir):
   1881             os.mkdir(log_dir)
   1882         write_file = self._get_au_log_filename(
   1883                 log_dir, kwargs['host_name'], kwargs['pid'])
   1884         logging.debug('Saving auto-update logs into %s', write_file)
   1885 
   1886         au_logs = self._read_json_response_from_devserver(response)
   1887 
   1888         try:
   1889             for k, v in au_logs['host_logs'].items():
   1890                 log_name = '%s_%s_%s' % (k, kwargs['host_name'], kwargs['pid'])
   1891                 log_path = os.path.join(log_dir, log_name)
   1892                 with open(log_path, 'w') as out_log:
   1893                     out_log.write(v)
   1894         except IOError as e:
   1895             raise DevServerException('Failed to write auto-update hostlogs: '
   1896                                      '%s' % e)
   1897 
   1898         try:
   1899             with open(write_file, 'w') as out_log:
   1900                 out_log.write(au_logs['cros_au_log'])
   1901         except:
   1902             raise DevServerException('Failed to write auto-update logs into '
   1903                                      '%s' % write_file)
   1904 
   1905 
   1906     def collect_au_log(self, host_name, pid, log_dir):
   1907         """Collect logs from devserver after cros-update process is finished.
   1908 
   1909         @param host_name: the DUT's hostname.
   1910         @param pid: the auto-update process id on devserver.
   1911         @param log_dir: The directory to save the cros-update process log
   1912                         retrieved from devserver.
   1913 
   1914         @return: True if auto-update log is successfully collected, False
   1915           otherwise.
   1916         """
   1917         if not pid:
   1918             return False
   1919 
   1920         kwargs = {'host_name': host_name, 'pid': pid}
   1921         try:
   1922             self._collect_au_log(log_dir, **kwargs)
   1923         except DevServerException as e:
   1924             logging.debug('Failed to collect auto-update log on '
   1925                           'devserver for host %s and process id %s: %s',
   1926                           host_name, pid, str(e))
   1927             return False
   1928 
   1929         return True
   1930 
   1931 
   1932     @remote_devserver_call()
   1933     def _trigger_auto_update(self, **kwargs):
   1934         """Trigger auto-update by calling devserver.cros_au.
   1935 
   1936         @param kwargs:  Arguments to make cros_au devserver call.
   1937 
   1938         @return: a tuple indicates whether the RPC call cros_au succeeds and
   1939           the auto-update process id running on devserver.
   1940         """
   1941         host_name = kwargs['host_name']
   1942         call = self.build_call('cros_au', async=True, **kwargs)
   1943         try:
   1944             response = self.run_call(call)
   1945             logging.info(
   1946                 'Received response from devserver for cros_au call: %r',
   1947                 response)
   1948         except httplib.BadStatusLine as e:
   1949             logging.error(e)
   1950             raise DevServerException('Received Bad Status line, Devserver %s '
   1951                                      'might have gone down while handling '
   1952                                      'the call: %s' % (self.url(), call))
   1953 
   1954         return response
   1955 
   1956 
   1957     def _check_for_auto_update_finished(self, pid, wait=True, **kwargs):
   1958         """Polling devserver.get_au_status to get current auto-update status.
   1959 
   1960         The current auto-update status is used to identify whether the update
   1961         process is finished.
   1962 
   1963         @param pid:    The background process id for auto-update in devserver.
   1964         @param kwargs: keyword arguments to make get_au_status devserver call.
   1965         @param wait:   Should the check wait for completion.
   1966 
   1967         @return: True if auto-update is finished for a given dut.
   1968         """
   1969         logging.debug('Check the progress for auto-update process %r', pid)
   1970         kwargs['pid'] = pid
   1971         call = self.build_call('get_au_status', **kwargs)
   1972 
   1973         def all_finished():
   1974             """Call devserver.get_au_status rpc to check if auto-update
   1975                is finished.
   1976 
   1977             @return: True if auto-update is finished for a given dut. False
   1978                      otherwise.
   1979             @rasies  DevServerException, the exception is a wrapper of all
   1980                      exceptions that were raised when devserver tried to
   1981                      download the artifacts. devserver raises an HTTPError or
   1982                      a CmdError when an exception was raised in the code. Such
   1983                      exception should be re-raised here to stop the caller from
   1984                      waiting. If the call to devserver failed for connection
   1985                      issue, a URLError exception is raised, and caller should
   1986                      retry the call to avoid such network flakiness.
   1987 
   1988             """
   1989             try:
   1990                 au_status = self.run_call(call)
   1991                 response = json.loads(au_status)
   1992                 # This is a temp fix to fit both dict and tuple returning
   1993                 # values. The dict check will be removed after a corresponding
   1994                 # devserver CL is deployed.
   1995                 if isinstance(response, dict):
   1996                     if response.get('detailed_error_msg'):
   1997                         raise DevServerException(
   1998                                 response.get('detailed_error_msg'))
   1999 
   2000                     if response.get('finished'):
   2001                         logging.debug('CrOS auto-update is finished')
   2002                         return True
   2003                     else:
   2004                         logging.debug('Current CrOS auto-update status: %s',
   2005                                       response.get('status'))
   2006                         return False
   2007 
   2008                 if not response[0]:
   2009                     logging.debug('Current CrOS auto-update status: %s',
   2010                                   response[1])
   2011                     return False
   2012                 else:
   2013                     logging.debug('CrOS auto-update is finished')
   2014                     return True
   2015             except urllib2.HTTPError as e:
   2016                 error_markup = e.read()
   2017                 raise DevServerException(_strip_http_message(error_markup))
   2018             except urllib2.URLError as e:
   2019                 # Could be connection issue, retry it.
   2020                 # For example: <urlopen error [Errno 111] Connection refused>
   2021                 logging.warning('URLError (%r): Retrying connection to '
   2022                                 'devserver to check auto-update status.', e)
   2023                 return False
   2024             except error.CmdError:
   2025                 # Retry if SSH failed to connect to the devserver.
   2026                 logging.warning('CmdError: Retrying SSH connection to check '
   2027                                 'auto-update status.')
   2028                 return False
   2029             except socket.error as e:
   2030                 # Could be some temporary devserver connection issues.
   2031                 logging.warning('Socket Error (%r): Retrying connection to '
   2032                                 'devserver to check auto-update status.', e)
   2033                 return False
   2034             except ValueError as e:
   2035                 raise DevServerException(
   2036                         '%s (Got AU status: %r)' % (str(e), au_status))
   2037 
   2038         if wait:
   2039             bin_utils.poll_for_condition(
   2040                     all_finished,
   2041                     exception=bin_utils.TimeoutError(),
   2042                     timeout=DEVSERVER_IS_CROS_AU_FINISHED_TIMEOUT_MIN * 60,
   2043                     sleep_interval=CROS_AU_POLLING_INTERVAL)
   2044 
   2045             return True
   2046         else:
   2047             return all_finished()
   2048 
   2049 
   2050     def check_for_auto_update_finished(self, response, wait=True, **kwargs):
   2051         """Processing response of 'cros_au' and polling for auto-update status.
   2052 
   2053         Will wait for the whole auto-update process is finished.
   2054 
   2055         @param response: The response from RPC 'cros_au'
   2056         @param kwargs: keyword arguments to make get_au_status devserver call.
   2057 
   2058         @return: a tuple includes two elements.
   2059           finished: True if the operation has completed.
   2060           raised_error: None if everything works well or the raised error.
   2061           pid: the auto-update process id on devserver.
   2062         """
   2063 
   2064         pid = 0
   2065         raised_error = None
   2066         finished = False
   2067         try:
   2068             response = json.loads(response)
   2069             if response[0]:
   2070                 pid = response[1]
   2071                 # If provision is kicked off asynchronously, pid will be -1.
   2072                 # If provision is not successfully kicked off , pid continues
   2073                 # to be 0.
   2074                 if pid > 0:
   2075                     logging.debug('start process %r for auto_update in '
   2076                                   'devserver', pid)
   2077                     finished = self._check_for_auto_update_finished(
   2078                             pid, wait=wait, **kwargs)
   2079         except Exception as e:
   2080             logging.debug('Failed to trigger auto-update process on devserver')
   2081             finished = True
   2082             raised_error = e
   2083         finally:
   2084             return finished, raised_error, pid
   2085 
   2086 
   2087     def _check_error_message(self, error_patterns_to_check, error_msg):
   2088         """Detect whether specific error pattern exist in error message.
   2089 
   2090         @param error_patterns_to_check: the error patterns to check
   2091         @param error_msg: the error message which may include any error
   2092                           pattern.
   2093 
   2094         @return A boolean variable, True if error_msg contains any error
   2095             pattern in error_patterns_to_check, False otherwise.
   2096         """
   2097         for err in error_patterns_to_check:
   2098             if err in error_msg:
   2099                 return True
   2100 
   2101         return False
   2102 
   2103 
   2104     def _is_retryable(self, error_msg):
   2105         """Detect whether we will retry auto-update based on error_msg.
   2106 
   2107         @param error_msg: The given error message.
   2108 
   2109         @return A boolean variable which indicates whether we will retry
   2110             auto_update with another devserver based on the given error_msg.
   2111         """
   2112         # For now we just hard-code the error message we think it's suspicious.
   2113         # When we get more date about what's the json response when devserver
   2114         # is overloaded, we can update this part.
   2115         retryable_error_patterns = [ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE,
   2116                                     'is not pingable']
   2117         return self._check_error_message(retryable_error_patterns, error_msg)
   2118 
   2119 
   2120     def _should_use_original_payload(self, error_msg):
   2121         devserver_error_patterns = ['DevserverCannotStartError']
   2122         return self._check_error_message(devserver_error_patterns, error_msg)
   2123 
   2124 
   2125     def _parse_buildname_safely(self, build_name):
   2126         """Parse a given buildname safely.
   2127 
   2128         @param build_name: the build name to be parsed.
   2129 
   2130         @return: a tuple (board, build_type, milestone)
   2131         """
   2132         try:
   2133             board, build_type, milestone, _ = server_utils.ParseBuildName(
   2134                     build_name)
   2135         except server_utils.ParseBuildNameException:
   2136             logging.warning('Unable to parse build name %s for metrics. '
   2137                             'Continuing anyway.', build_name)
   2138             board, build_type, milestone = ('', '', '')
   2139 
   2140         return board, build_type, milestone
   2141 
   2142 
   2143     def _emit_auto_update_metrics(self, board, build_type, dut_host_name,
   2144                                   build_name, attempt,
   2145                                   success, failure_reason, duration):
   2146         """Send metrics for a single auto_update attempt.
   2147 
   2148         @param board: a field in metrics representing which board this
   2149             auto_update tries to update.
   2150         @param build_type: a field in metrics representing which build type this
   2151             auto_update tries to update.
   2152         @param dut_host_name: a field in metrics representing which DUT this
   2153             auto_update tries to update.
   2154         @param build_name: auto update build being updated to.
   2155         @param attempt: a field in metrics, representing which attempt/retry
   2156             this auto_update is.
   2157         @param success: a field in metrics, representing whether this
   2158             auto_update succeeds or not.
   2159         @param failure_reason: DevServerExceptionClassifier object to show
   2160             auto update failure reason, or None.
   2161         @param duration: auto update duration time, in seconds.
   2162         """
   2163         # The following is high cardinality, but sparse.
   2164         # Each DUT is of a single board type, and likely build type.
   2165         # The affinity also results in each DUT being attached to the same
   2166         # dev_server as well.
   2167         fields = {
   2168                 'board': board,
   2169                 'build_type': build_type,
   2170                 'dut_host_name': dut_host_name,
   2171                 'dev_server': self.resolved_hostname,
   2172                 'attempt': attempt,
   2173                 'success': success,
   2174         }
   2175 
   2176         # reset_after=True is required for String gauges events to ensure that
   2177         # the metrics are not repeatedly emitted until the server restarts.
   2178 
   2179         metrics.String(PROVISION_PATH + '/auto_update_build_by_devserver_dut',
   2180                        reset_after=True).set(build_name, fields=fields)
   2181 
   2182         if not success:
   2183             metrics.String(
   2184                 PROVISION_PATH +
   2185                 '/auto_update_failure_reason_by_devserver_dut',
   2186                 reset_after=True).set(
   2187                     failure_reason.classification if failure_reason else '',
   2188                     fields=fields)
   2189 
   2190         metrics.SecondsDistribution(
   2191                 PROVISION_PATH + '/auto_update_duration_by_devserver_dut').add(
   2192                         duration, fields=fields)
   2193 
   2194 
   2195     def _emit_provision_metrics(self, error_list, duration_list,
   2196                                 is_au_success, board, build_type, milestone,
   2197                                 dut_host_name, is_aue2etest,
   2198                                 total_duration, build_name):
   2199         """Send metrics for provision request.
   2200 
   2201         Provision represents potentially multiple auto update attempts.
   2202 
   2203         Please note: to avoid reaching or exceeding the monarch field
   2204         cardinality limit, we avoid a metric that includes both dut hostname
   2205         and other high cardinality fields.
   2206 
   2207         @param error_list: a list of DevServerExceptionClassifier objects to
   2208             show errors happened in provision. Usually it contains 1 ~
   2209             AU_RETRY_LIMIT objects since we only retry provision for several
   2210             times.
   2211         @param duration_list: a list of provision duration time, counted by
   2212             seconds.
   2213         @param is_au_success: a field in metrics, representing whether this
   2214             auto_update succeeds or not.
   2215         @param board: a field in metrics representing which board this
   2216             auto_update tries to update.
   2217         @param build_type: a field in metrics representing which build type this
   2218             auto_update tries to update.
   2219         @param milestone: a field in metrics representing which milestone this
   2220             auto_update tries to update.
   2221         @param dut_host_name: a field in metrics representing which DUT this
   2222             auto_update tries to update.
   2223         @param is_aue2etest: a field in metrics representing if provision was
   2224             done as part of the autoupdate_EndToEndTest.
   2225         """
   2226         # The following is high cardinality, but sparse.
   2227         # Each DUT is of a single board type, and likely build type.
   2228         # The affinity also results in each DUT being attached to the same
   2229         # dev_server as well.
   2230         fields = {
   2231                 'board': board,
   2232                 'build_type': build_type,
   2233                 'dut_host_name': dut_host_name,
   2234                 'dev_server': self.resolved_hostname,
   2235                 'success': is_au_success,
   2236         }
   2237 
   2238         # reset_after=True is required for String gauges events to ensure that
   2239         # the metrics are not repeatedly emitted until the server restarts.
   2240 
   2241         metrics.String(PROVISION_PATH + '/provision_build_by_devserver_dut',
   2242                        reset_after=True).set(build_name, fields=fields)
   2243 
   2244         if error_list:
   2245             metrics.String(
   2246                     PROVISION_PATH +
   2247                     '/provision_failure_reason_by_devserver_dut',
   2248                     reset_after=True).set(error_list[0].classification,
   2249                                           fields=fields)
   2250 
   2251         metrics.SecondsDistribution(
   2252                 PROVISION_PATH + '/provision_duration_by_devserver_dut').add(
   2253                         total_duration, fields=fields)
   2254 
   2255 
   2256     def _parse_buildname_from_gs_uri(self, uri):
   2257         """Get parameters needed for AU metrics when build_name is not known.
   2258 
   2259         autoupdate_EndToEndTest is run with two Google Storage URIs from the
   2260         gs://chromeos-releases bucket. URIs in this bucket do not have the
   2261         build_name in the format samus-release/R60-0000.0.0.
   2262 
   2263         We can get the milestone and board by checking the instructions.json
   2264         file contained in the bucket with the payloads.
   2265 
   2266         @param uri: The partial uri we received from autoupdate_EndToEndTest.
   2267         """
   2268         try:
   2269             # Get the instructions file that contains info about the build.
   2270             gs_file = 'gs://chromeos-releases/' + uri + '/*instructions.json'
   2271             files = bin_utils.gs_ls(gs_file)
   2272             for f in files:
   2273                 gs_folder, _, instruction_file = f.rpartition('/')
   2274                 self.stage_artifacts(image=uri,
   2275                                      files=[instruction_file],
   2276                                      archive_url=gs_folder)
   2277                 json_file = self.get_staged_file_url(instruction_file, uri)
   2278                 response = urllib2.urlopen(json_file)
   2279                 data = json.load(response)
   2280                 return data['board'], 'release', data['version']['milestone']
   2281         except (ValueError, error.CmdError, urllib2.URLError) as e:
   2282             logging.debug('Problem getting values for metrics: %s', e)
   2283             logging.warning('Unable to parse build name %s from AU test for '
   2284                             'metrics. Continuing anyway.', uri)
   2285 
   2286         return '', '', ''
   2287 
   2288 
   2289     def auto_update(self, host_name, build_name, original_board=None,
   2290                     original_release_version=None, log_dir=None,
   2291                     force_update=False, full_update=False,
   2292                     payload_filename=None, force_original=False,
   2293                     clobber_stateful=True, quick_provision=False):
   2294         """Auto-update a CrOS host.
   2295 
   2296         @param host_name: The hostname of the DUT to auto-update.
   2297         @param build_name:  The build name to be auto-updated on the DUT.
   2298         @param original_board: The original board of the DUT to auto-update.
   2299         @param original_release_version: The release version of the DUT's
   2300             current build.
   2301         @param log_dir: The log directory to store auto-update logs from
   2302             devserver.
   2303         @param force_update: Force an update even if the version installed
   2304                              is the same. Default: False.
   2305         @param full_update:  If True, do not run stateful update, directly
   2306                              force a full reimage. If False, try stateful
   2307                              update first if the dut is already installed
   2308                              with the same version.
   2309         @param payload_filename: Used to specify the exact file to
   2310                                  use for autoupdating. If None, the payload
   2311                                  will be determined by build_name. You
   2312                                  must have already staged this file before
   2313                                  passing it in here.
   2314         @param force_original: Whether to force stateful update with the
   2315                                original payload.
   2316         @param clobber_stateful: If True do a clean install of stateful.
   2317         @param quick_provision: Attempt to use quick provision path first.
   2318 
   2319         @return A set (is_success, pid) in which:
   2320             1. is_success indicates whether this auto_update succeeds.
   2321             2. pid is the process id of the successful autoupdate run.
   2322 
   2323         @raise DevServerException if auto_update fails and is not retryable.
   2324         @raise RetryableProvisionException if it fails and is retryable.
   2325         """
   2326         kwargs = {'host_name': host_name,
   2327                   'build_name': build_name,
   2328                   'force_update': force_update,
   2329                   'full_update': full_update,
   2330                   'clobber_stateful': clobber_stateful,
   2331                   'quick_provision': quick_provision}
   2332 
   2333         is_aue2etest = payload_filename is not None
   2334 
   2335         if is_aue2etest:
   2336             kwargs['payload_filename'] = payload_filename
   2337 
   2338         error_msg = 'CrOS auto-update failed for host %s: %s'
   2339         error_msg_attempt = 'Exception raised on auto_update attempt #%s:\n%s'
   2340         is_au_success = False
   2341         au_log_dir = os.path.join(log_dir,
   2342                                   AUTO_UPDATE_LOG_DIR) if log_dir else None
   2343         error_list = []
   2344         retry_with_another_devserver = False
   2345         duration_list = []
   2346 
   2347         if is_aue2etest:
   2348             board, build_type, milestone = self._parse_buildname_from_gs_uri(
   2349                 build_name)
   2350         else:
   2351             board, build_type, milestone = self._parse_buildname_safely(
   2352                 build_name)
   2353 
   2354         provision_start_time = time.time()
   2355         for au_attempt in range(AU_RETRY_LIMIT):
   2356             logging.debug('Start CrOS auto-update for host %s at %d time(s).',
   2357                           host_name, au_attempt + 1)
   2358             au_start_time = time.time()
   2359             failure_reason = None
   2360             # No matter _trigger_auto_update succeeds or fails, the auto-update
   2361             # track_status_file should be cleaned, and the auto-update execute
   2362             # log should be collected to directory sysinfo. Also, the error
   2363             # raised by _trigger_auto_update should be displayed.
   2364             try:
   2365                 # Try update with stateful.tgz of old release version in the
   2366                 # last try of auto-update.
   2367                 if force_original and original_release_version:
   2368                     # Monitor this case in monarch
   2369                     original_build = '%s/%s' % (original_board,
   2370                                                 original_release_version)
   2371                     c = metrics.Counter(
   2372                             'chromeos/autotest/provision/'
   2373                             'cros_update_with_original_build')
   2374                     f = {'dev_server': self.resolved_hostname,
   2375                          'board': board,
   2376                          'build_type': build_type,
   2377                          'milestone': milestone,
   2378                          'original_build': original_build}
   2379                     c.increment(fields=f)
   2380 
   2381                     logging.debug('Try updating stateful partition of the '
   2382                                   'host with the same version of its current '
   2383                                   'rootfs partition: %s', original_build)
   2384                     response = self._trigger_auto_update(
   2385                             original_build=original_build, **kwargs)
   2386                 else:
   2387                     response = self._trigger_auto_update(**kwargs)
   2388             except DevServerException as e:
   2389                 logging.debug(error_msg_attempt, au_attempt+1, str(e))
   2390                 failure_reason = DevServerExceptionClassifier(str(e))
   2391             else:
   2392                 _, raised_error, pid = self.check_for_auto_update_finished(
   2393                         response, **kwargs)
   2394 
   2395                 # Error happens in _collect_au_log won't be raised.
   2396                 if au_log_dir:
   2397                     is_collect_success = self.collect_au_log(
   2398                             kwargs['host_name'], pid, au_log_dir)
   2399                 else:
   2400                     is_collect_success = True
   2401 
   2402                 # Error happens in _clean_track_log won't be raised.
   2403                 if pid >= 0:
   2404                     is_clean_success = self.clean_track_log(
   2405                             kwargs['host_name'], pid)
   2406                 else:
   2407                     is_clean_success = True
   2408 
   2409                 # If any error is raised previously, log it and retry
   2410                 # auto-update. Otherwise, claim a successful CrOS auto-update.
   2411                 if (not raised_error and is_clean_success and
   2412                     is_collect_success):
   2413                     logging.debug('CrOS auto-update succeed for host %s',
   2414                                   host_name)
   2415                     is_au_success = True
   2416                     break
   2417                 else:
   2418                     if not self.kill_au_process_for_host(kwargs['host_name'],
   2419                                                          pid):
   2420                         logging.debug('Failed to kill auto_update process %d',
   2421                                       pid)
   2422                     if raised_error:
   2423                         error_str = str(raised_error)
   2424                         logging.debug(error_msg_attempt, au_attempt + 1,
   2425                                       error_str)
   2426                         if au_log_dir:
   2427                             logging.debug('Please see error details in log %s',
   2428                                           self._get_au_log_filename(
   2429                                                   au_log_dir,
   2430                                                   kwargs['host_name'],
   2431                                                   pid))
   2432                         failure_reason = DevServerExceptionClassifier(
   2433                             error_str, keep_full_trace=False)
   2434                         if self._is_retryable(error_str):
   2435                             retry_with_another_devserver = True
   2436 
   2437                         if self._should_use_original_payload(error_str):
   2438                             force_original = True
   2439 
   2440             finally:
   2441                 duration = int(time.time() - au_start_time)
   2442                 duration_list.append(duration)
   2443                 if failure_reason:
   2444                     error_list.append(failure_reason)
   2445                 self._emit_auto_update_metrics(board, build_type, host_name,
   2446                                                build_name, au_attempt + 1,
   2447                                                is_au_success, failure_reason,
   2448                                                duration)
   2449                 if retry_with_another_devserver:
   2450                     break
   2451 
   2452                 if not is_au_success and au_attempt < AU_RETRY_LIMIT - 1:
   2453                     time.sleep(CROS_AU_RETRY_INTERVAL)
   2454                     # Use the IP of DUT if the hostname failed.
   2455                     host_name_ip = socket.gethostbyname(host_name)
   2456                     kwargs['host_name'] = host_name_ip
   2457                     logging.debug(
   2458                             'AU failed, trying IP instead of hostname: %s',
   2459                             host_name_ip)
   2460 
   2461         total_duration = int(time.time() - provision_start_time)
   2462         self._emit_provision_metrics(error_list, duration_list, is_au_success,
   2463                                      board, build_type, milestone, host_name,
   2464                                      is_aue2etest, total_duration, build_name)
   2465 
   2466         if is_au_success:
   2467             return (is_au_success, pid)
   2468 
   2469         # If errors happen in the CrOS AU process, report the concatenation
   2470         # of the errors happening in first & second provision.
   2471         # If error happens in RPCs of cleaning track log, collecting
   2472         # auto-update logs, or killing auto-update processes, just report a
   2473         # common error here.
   2474         if error_list:
   2475             real_error = ', '.join(['%d) %s' % (i, e.summary)
   2476                                     for i, e in enumerate(error_list)])
   2477             if retry_with_another_devserver:
   2478                 raise RetryableProvisionException(
   2479                         error_msg % (host_name, real_error))
   2480             else:
   2481                 raise error_list[0].classified_exception(
   2482                     error_msg % (host_name, real_error))
   2483         else:
   2484             raise DevServerException(error_msg % (
   2485                         host_name, ('RPC calls after the whole auto-update '
   2486                                     'process failed.')))
   2487 
   2488 
   2489 class AndroidBuildServer(ImageServerBase):
   2490     """Class for DevServer that handles RPCs related to Android builds.
   2491 
   2492     The calls to devserver to stage artifacts, including stage and download, are
   2493     made in async mode. That is, when caller makes an RPC |stage| to request
   2494     devserver to stage certain artifacts, devserver handles the call and starts
   2495     staging artifacts in a new thread, and return |Success| without waiting for
   2496     staging being completed. When caller receives message |Success|, it polls
   2497     devserver's is_staged call until all artifacts are staged.
   2498     Such mechanism is designed to prevent cherrypy threads in devserver being
   2499     running out, as staging artifacts might take long time, and cherrypy starts
   2500     with a fixed number of threads that handle devserver rpc.
   2501     """
   2502 
   2503     def wait_for_artifacts_staged(self, target, build_id, branch,
   2504                                   archive_url=None, artifacts='', files=''):
   2505         """Polling devserver.is_staged until all artifacts are staged.
   2506 
   2507         @param target: Target of the android build to stage, e.g.,
   2508                        shamu-userdebug.
   2509         @param build_id: Build id of the android build to stage.
   2510         @param branch: Branch of the android build to stage.
   2511         @param archive_url: Google Storage URL for the build.
   2512         @param artifacts: Comma separated list of artifacts to download.
   2513         @param files: Comma separated list of files to download.
   2514 
   2515         @return: True if all artifacts are staged in devserver.
   2516         """
   2517         kwargs = {'target': target,
   2518                   'build_id': build_id,
   2519                   'branch': branch,
   2520                   'artifacts': artifacts,
   2521                   'files': files,
   2522                   'os_type': 'android'}
   2523         if archive_url:
   2524             kwargs['archive_url'] = archive_url
   2525         return self._poll_is_staged(**kwargs)
   2526 
   2527 
   2528     @remote_devserver_call()
   2529     def call_and_wait(self, call_name, target, build_id, branch, archive_url,
   2530                       artifacts, files, error_message,
   2531                       expected_response=SUCCESS):
   2532         """Helper method to make a urlopen call, and wait for artifacts staged.
   2533 
   2534         @param call_name: name of devserver rpc call.
   2535         @param target: Target of the android build to stage, e.g.,
   2536                        shamu-userdebug.
   2537         @param build_id: Build id of the android build to stage.
   2538         @param branch: Branch of the android build to stage.
   2539         @param archive_url: Google Storage URL for the CrOS build.
   2540         @param artifacts: Comma separated list of artifacts to download.
   2541         @param files: Comma separated list of files to download.
   2542         @param expected_response: Expected response from rpc, default to
   2543                                   |Success|. If it's set to None, do not compare
   2544                                   the actual response. Any response is consider
   2545                                   to be good.
   2546         @param error_message: Error message to be thrown if response does not
   2547                               match expected_response.
   2548 
   2549         @return: The response from rpc.
   2550         @raise DevServerException upon any return code that's expected_response.
   2551 
   2552         """
   2553         kwargs = {'target': target,
   2554                   'build_id': build_id,
   2555                   'branch': branch,
   2556                   'artifacts': artifacts,
   2557                   'files': files,
   2558                   'os_type': 'android'}
   2559         if archive_url:
   2560             kwargs['archive_url'] = archive_url
   2561         return self._call_and_wait(call_name, error_message, expected_response,
   2562                                    **kwargs)
   2563 
   2564 
   2565     @remote_devserver_call()
   2566     def stage_artifacts(self, target=None, build_id=None, branch=None,
   2567                         image=None, artifacts=None, files='', archive_url=None):
   2568         """Tell the devserver to download and stage |artifacts| from |image|.
   2569 
   2570          This is the main call point for staging any specific artifacts for a
   2571         given build. To see the list of artifacts one can stage see:
   2572 
   2573         ~src/platfrom/dev/artifact_info.py.
   2574 
   2575         This is maintained along with the actual devserver code.
   2576 
   2577         @param target: Target of the android build to stage, e.g.,
   2578                                shamu-userdebug.
   2579         @param build_id: Build id of the android build to stage.
   2580         @param branch: Branch of the android build to stage.
   2581         @param image: Name of a build to test, in the format of
   2582                       branch/target/build_id
   2583         @param artifacts: A list of artifacts.
   2584         @param files: A list of files to stage.
   2585         @param archive_url: Optional parameter that has the archive_url to stage
   2586                 this artifact from. Default is specified in autotest config +
   2587                 image.
   2588 
   2589         @raise DevServerException upon any return code that's not HTTP OK.
   2590         """
   2591         if image and not target and not build_id and not branch:
   2592             branch, target, build_id = utils.parse_launch_control_build(image)
   2593         if not target or not build_id or not branch:
   2594             raise DevServerException('Must specify all build info (target, '
   2595                                      'build_id and branch) to stage.')
   2596 
   2597         android_build_info = {'target': target,
   2598                               'build_id': build_id,
   2599                               'branch': branch}
   2600         if not artifacts and not files:
   2601             raise DevServerException('Must specify something to stage.')
   2602         if not all(android_build_info.values()):
   2603             raise DevServerException(
   2604                     'To stage an Android build, must specify target, build id '
   2605                     'and branch.')
   2606         build = ANDROID_BUILD_NAME_PATTERN % android_build_info
   2607         self._stage_artifacts(build, artifacts, files, archive_url,
   2608                               **android_build_info)
   2609 
   2610     def get_pull_url(self, target, build_id, branch):
   2611         """Get the url to pull files from the devserver.
   2612 
   2613         @param target: Target of the android build, e.g., shamu_userdebug
   2614         @param build_id: Build id of the android build.
   2615         @param branch: Branch of the android build.
   2616 
   2617         @return A url to pull files from the dev server given a specific
   2618                 android build.
   2619         """
   2620         return os.path.join(self.url(), 'static', branch, target, build_id)
   2621 
   2622 
   2623     def trigger_download(self, target, build_id, branch, artifacts=None,
   2624                          files='', os='android', synchronous=True):
   2625         """Tell the devserver to download and stage an Android build.
   2626 
   2627         Tells the devserver to fetch an Android build from the image storage
   2628         server named by _get_image_storage_server().
   2629 
   2630         If |synchronous| is True, waits for the entire download to finish
   2631         staging before returning. Otherwise only the artifacts necessary
   2632         to start installing images onto DUT's will be staged before returning.
   2633         A caller can then call finish_download to guarantee the rest of the
   2634         artifacts have finished staging.
   2635 
   2636         @param target: Target of the android build to stage, e.g.,
   2637                        shamu-userdebug.
   2638         @param build_id: Build id of the android build to stage.
   2639         @param branch: Branch of the android build to stage.
   2640         @param artifacts: A string of artifacts separated by comma. If None,
   2641                use the default artifacts for Android or Brillo build.
   2642         @param files: String of file seperated by commas.
   2643         @param os: OS artifacts to download (android/brillo).
   2644         @param synchronous: if True, waits until all components of the image are
   2645                staged before returning.
   2646 
   2647         @raise DevServerException upon any return code that's not HTTP OK.
   2648 
   2649         """
   2650         android_build_info = {'target': target,
   2651                               'build_id': build_id,
   2652                               'branch': branch}
   2653         build = ANDROID_BUILD_NAME_PATTERN % android_build_info
   2654         if not artifacts:
   2655             board = target.split('-')[0]
   2656             artifacts = (
   2657                 android_utils.AndroidArtifacts.get_artifacts_for_reimage(
   2658                         board, os))
   2659         self._trigger_download(build, artifacts, files=files,
   2660                                synchronous=synchronous, **android_build_info)
   2661 
   2662 
   2663     def finish_download(self, target, build_id, branch, os='android'):
   2664         """Tell the devserver to finish staging an Android build.
   2665 
   2666         If trigger_download is called with synchronous=False, it will return
   2667         before all artifacts have been staged. This method contacts the
   2668         devserver and blocks until all staging is completed and should be
   2669         called after a call to trigger_download.
   2670 
   2671         @param target: Target of the android build to stage, e.g.,
   2672                        shamu-userdebug.
   2673         @param build_id: Build id of the android build to stage.
   2674         @param branch: Branch of the android build to stage.
   2675         @param os: OS artifacts to download (android/brillo).
   2676 
   2677         @raise DevServerException upon any return code that's not HTTP OK.
   2678         """
   2679         android_build_info = {'target': target,
   2680                               'build_id': build_id,
   2681                               'branch': branch}
   2682         build = ANDROID_BUILD_NAME_PATTERN % android_build_info
   2683         board = target.split('-')[0]
   2684         artifacts = (
   2685                 android_utils.AndroidArtifacts.get_artifacts_for_reimage(
   2686                         board))
   2687         self._finish_download(build, artifacts, files='', **android_build_info)
   2688 
   2689 
   2690     def get_staged_file_url(self, filename, target, build_id, branch):
   2691         """Returns the url of a staged file for this image on the devserver.
   2692 
   2693         @param filename: Name of the file.
   2694         @param target: Target of the android build to stage, e.g.,
   2695                        shamu-userdebug.
   2696         @param build_id: Build id of the android build to stage.
   2697         @param branch: Branch of the android build to stage.
   2698 
   2699         @return: The url of a staged file for this image on the devserver.
   2700         """
   2701         android_build_info = {'target': target,
   2702                               'build_id': build_id,
   2703                               'branch': branch,
   2704                               'os_type': 'android'}
   2705         build = ANDROID_BUILD_NAME_PATTERN % android_build_info
   2706         return '/'.join([self._get_image_url(build), filename])
   2707 
   2708 
   2709     @remote_devserver_call()
   2710     def translate(self, build_name):
   2711         """Translate the build name if it's in LATEST format.
   2712 
   2713         If the build name is in the format [branch]/[target]/LATEST, return the
   2714         latest build in Launch Control otherwise return the build name as is.
   2715 
   2716         @param build_name: build_name to check.
   2717 
   2718         @return The actual build name to use.
   2719         """
   2720         branch, target, build_id = utils.parse_launch_control_build(build_name)
   2721         if build_id.upper() != 'LATEST':
   2722             return build_name
   2723         call = self.build_call('latestbuild', branch=branch, target=target,
   2724                                os_type='android')
   2725         translated_build_id = self.run_call(call)
   2726         translated_build = (ANDROID_BUILD_NAME_PATTERN %
   2727                             {'branch': branch,
   2728                              'target': target,
   2729                              'build_id': translated_build_id})
   2730         logging.debug('Translated relative build %s to %s', build_name,
   2731                       translated_build)
   2732         return translated_build
   2733 
   2734 
   2735 def _is_load_healthy(load):
   2736     """Check if devserver's load meets the minimum threshold.
   2737 
   2738     @param load: The devserver's load stats to check.
   2739 
   2740     @return: True if the load meets the minimum threshold. Return False
   2741              otherwise.
   2742 
   2743     """
   2744     # Threshold checks, including CPU load.
   2745     if load[DevServer.CPU_LOAD] > DevServer.MAX_CPU_LOAD:
   2746         logging.debug('CPU load of devserver %s is at %s%%, which is higher '
   2747                       'than the threshold of %s%%', load['devserver'],
   2748                       load[DevServer.CPU_LOAD], DevServer.MAX_CPU_LOAD)
   2749         return False
   2750     if load[DevServer.NETWORK_IO] > DevServer.MAX_NETWORK_IO:
   2751         logging.debug('Network IO of devserver %s is at %i Bps, which is '
   2752                       'higher than the threshold of %i bytes per second.',
   2753                       load['devserver'], load[DevServer.NETWORK_IO],
   2754                       DevServer.MAX_NETWORK_IO)
   2755         return False
   2756     return True
   2757 
   2758 
   2759 def _compare_load(devserver1, devserver2):
   2760     """Comparator function to compare load between two devservers.
   2761 
   2762     @param devserver1: A dictionary of devserver load stats to be compared.
   2763     @param devserver2: A dictionary of devserver load stats to be compared.
   2764 
   2765     @return: Negative value if the load of `devserver1` is less than the load
   2766              of `devserver2`. Return positive value otherwise.
   2767 
   2768     """
   2769     return int(devserver1[DevServer.DISK_IO] - devserver2[DevServer.DISK_IO])
   2770 
   2771 
   2772 def _get_subnet_for_host_ip(host_ip,
   2773                             restricted_subnets=utils.RESTRICTED_SUBNETS):
   2774     """Get the subnet for a given host IP.
   2775 
   2776     @param host_ip: the IP of a DUT.
   2777     @param restricted_subnets: A list of restriected subnets.
   2778 
   2779     @return: a (subnet_ip, mask_bits) tuple. If no matched subnet for the
   2780              host_ip, return (None, None).
   2781     """
   2782     for subnet_ip, mask_bits in restricted_subnets:
   2783         if utils.is_in_same_subnet(host_ip, subnet_ip, mask_bits):
   2784             return subnet_ip, mask_bits
   2785 
   2786     return None, None
   2787 
   2788 
   2789 def get_least_loaded_devserver(devserver_type=ImageServer, hostname=None):
   2790     """Get the devserver with the least load.
   2791 
   2792     Iterate through all devservers and get the one with least load.
   2793 
   2794     TODO(crbug.com/486278): Devserver with required build already staged should
   2795     take higher priority. This will need check_health call to be able to verify
   2796     existence of a given build/artifact. Also, in case all devservers are
   2797     overloaded, the logic here should fall back to the old behavior that randomly
   2798     selects a devserver based on the hash of the image name/url.
   2799 
   2800     @param devserver_type: Type of devserver to select from. Default is set to
   2801                            ImageServer.
   2802     @param hostname: Hostname of the dut that the devserver is used for. The
   2803             picked devserver needs to respect the location of the host if
   2804             `prefer_local_devserver` is set to True or `restricted_subnets` is
   2805             set.
   2806 
   2807     @return: Name of the devserver with the least load.
   2808 
   2809     """
   2810     logging.debug('Get the least loaded %r', devserver_type)
   2811     devservers, can_retry = devserver_type.get_available_devservers(
   2812             hostname)
   2813     # If no healthy devservers available and can_retry is False, return None.
   2814     # Otherwise, relax the constrain on hostname, allow all devservers to be
   2815     # available.
   2816     if not devserver_type.get_healthy_devserver('', devservers):
   2817         if not can_retry:
   2818             return None
   2819         else:
   2820             devservers, _ = devserver_type.get_available_devservers()
   2821 
   2822     # get_devserver_load call needs to be made in a new process to allow force
   2823     # timeout using signal.
   2824     output = multiprocessing.Queue()
   2825     processes = []
   2826     for devserver in devservers:
   2827         processes.append(multiprocessing.Process(
   2828                 target=devserver_type.get_devserver_load_wrapper,
   2829                 args=(devserver, TIMEOUT_GET_DEVSERVER_LOAD, output)))
   2830 
   2831     for p in processes:
   2832         p.start()
   2833     for p in processes:
   2834         p.join()
   2835     loads = [output.get() for p in processes]
   2836     # Filter out any load failed to be retrieved or does not support load check.
   2837     loads = [load for load in loads if load and DevServer.CPU_LOAD in load and
   2838              DevServer.is_free_disk_ok(load) and
   2839              DevServer.is_apache_client_count_ok(load)]
   2840     if not loads:
   2841         logging.debug('Failed to retrieve load stats from any devserver. No '
   2842                       'load balancing can be applied.')
   2843         return None
   2844     loads = [load for load in loads if _is_load_healthy(load)]
   2845     if not loads:
   2846         logging.error('No devserver has the capacity to be selected.')
   2847         return None
   2848     loads = sorted(loads, cmp=_compare_load)
   2849     return loads[0]['devserver']
   2850 
   2851 
   2852 def resolve(build, hostname=None, ban_list=None):
   2853     """Resolve a devserver can be used for given build and hostname.
   2854 
   2855     @param build: Name of a build to stage on devserver, e.g.,
   2856                   ChromeOS build: daisy-release/R50-1234.0.0
   2857                   Launch Control build: git_mnc_release/shamu-eng
   2858     @param hostname: Hostname of a devserver for, default is None, which means
   2859             devserver is not restricted by the network location of the host.
   2860     @param ban_list: The blacklist of devservers shouldn't be chosen.
   2861 
   2862     @return: A DevServer instance that can be used to stage given build for the
   2863              given host.
   2864     """
   2865     if utils.is_launch_control_build(build):
   2866         return AndroidBuildServer.resolve(build, hostname)
   2867     else:
   2868         return ImageServer.resolve(build, hostname, ban_list=ban_list)
   2869