Home | History | Annotate | Download | only in cros
      1 # Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
      2 # Use of this source code is governed by a BSD-style license that can be
      3 # found in the LICENSE file.
      4 
      5 from distutils import version
      6 import cStringIO
      7 import HTMLParser
      8 import httplib
      9 import json
     10 import logging
     11 import multiprocessing
     12 import os
     13 import re
     14 import socket
     15 import time
     16 import urllib2
     17 import urlparse
     18 
     19 from autotest_lib.client.bin import utils as bin_utils
     20 from autotest_lib.client.common_lib import android_utils
     21 from autotest_lib.client.common_lib import error
     22 from autotest_lib.client.common_lib import global_config
     23 from autotest_lib.client.common_lib import utils
     24 from autotest_lib.client.common_lib.cros import retry
     25 from autotest_lib.server import utils as server_utils
     26 # TODO(cmasone): redo this class using requests module; http://crosbug.com/30107
     27 
     28 try:
     29     from chromite.lib import metrics
     30 except ImportError:
     31     metrics = utils.metrics_mock
     32 
     33 
# Shared global_config object; the CROS settings below are read from it.
CONFIG = global_config.global_config
# This file is generated at build time and specifies, per suite and per test,
# the DEPENDENCIES list specified in each control file.  It's a dict of dicts:
# {'bvt':   {'/path/to/autotest/control/site_tests/test1/control': ['dep1']}
#  'suite': {'/path/to/autotest/control/site_tests/test2/control': ['dep2']}
#  'power': {'/path/to/autotest/control/site_tests/test1/control': ['dep1'],
#            '/path/to/autotest/control/site_tests/test3/control': ['dep3']}
# }
DEPENDENCIES_FILE = 'test_suites/dependency_info'
# Number of seconds for caller to poll devserver's is_staged call to check if
# artifacts are staged.
_ARTIFACT_STAGE_POLLING_INTERVAL = 5
# Artifacts that should be staged when client calls devserver RPC to stage an
# image.
_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE = 'full_payload,test_suites,stateful'
# Artifacts that should be staged when client calls devserver RPC to stage an
# image with autotest artifact.
_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE_WITH_AUTOTEST = ('full_payload,test_suites,'
                                                   'control_files,stateful,'
                                                   'autotest_packages')
# Artifacts that should be staged when client calls devserver RPC to stage an
# Android build.
_BRILLO_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE = ('zip_images,vendor_partitions')
# When True, DevServer.is_free_disk_ok() and
# DevServer.is_apache_client_count_ok() skip their checks and report healthy.
SKIP_DEVSERVER_HEALTH_CHECK = CONFIG.get_config_value(
        'CROS', 'skip_devserver_health_check', type=bool)
# Number of seconds for the call to get devserver load to time out.
TIMEOUT_GET_DEVSERVER_LOAD = 2.0

# Android artifact path in devserver. Backslashes in the configured value are
# removed before use.
ANDROID_BUILD_NAME_PATTERN = CONFIG.get_config_value(
        'CROS', 'android_build_name_pattern', type=str).replace('\\', '')

# Return value from a devserver RPC indicating the call succeeded.
SUCCESS = 'Success'

# Timeout, in minutes, for a given devserver ssh call. Also the default
# timeout of DevServer.get_devserver_load() and DevServer.devserver_healthy().
DEVSERVER_SSH_TIMEOUT_MINS = 1

# Error messages for an invalid devserver response and for a devserver that
# is down.
ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE = 'Proxy Error'
ERR_MSG_FOR_DOWN_DEVSERVER = 'Service Unavailable'

# Substring that remote_devserver_call() looks for in an exception's text to
# decide that a devserver call has timed out.
ERR_MSG_FOR_TIMED_OUT_CALL = 'timeout'

# Timeout, in minutes, for waiting on a devserver staging call. This is the
# default retry timeout of remote_devserver_call().
DEVSERVER_IS_STAGING_RETRY_MIN = 100

# Timeout, in minutes, for waiting for a DUT auto-update to finish.
DEVSERVER_IS_CROS_AU_FINISHED_TIMEOUT_MIN = 100

# The total number of times a devserver triggers CrOS auto-update.
AU_RETRY_LIMIT = 2

# Number of seconds for caller to poll devserver's get_au_status call to
# check if cros auto-update is finished.
CROS_AU_POLLING_INTERVAL = 10

# Number of seconds for intervals between retrying auto-update calls.
CROS_AU_RETRY_INTERVAL = 20

# The file name for auto-update logs; takes two %s substitutions.
CROS_AU_LOG_FILENAME = 'CrOS_update_%s_%s.log'
     97 
# Provision error patterns.
# People who see this should know that they shouldn't change these
# classification strings. These strings are used for monitoring provision
# failures. Any changes may mess up the stats.
#
# Each entry is a (regular expression, classification string) pair.
# DevServerExceptionClassifier._classify() tries the pairs in order with
# re.match() and returns the classification of the first pattern that matches.
# DevServerExceptionClassifier.classified_exception relies on the "(4)" prefix
# to recognize a bad build.
_EXCEPTION_PATTERNS = [
        # Raised when devserver portfile does not exist on host.
        (r".*Devserver portfile does not exist!.*$",
         '(1) Devserver portfile does not exist on host'),
        # Raised when devserver cannot copy packages to host.
        (r".*Could not copy .* to device.*$",
         '(2) Cannot copy packages to host'),
        # Raised when devserver fails to run specific commands on host.
        (r".*cwd=None, extra env=\{'LC_MESSAGES': 'C'\}.*$",
         '(3) Fail to run specific command on host'),
        # Raised when new build fails to boot on the host.
        (r'.*RootfsUpdateError: Build .* failed to boot on.*$',
         '(4) Build failed to boot on host'),
        # Raised when the auto-update process is timed out.
        (r'.*The CrOS auto-update process is timed out, '
         'thus will be terminated.*$',
         '(5) Auto-update is timed out'),
        # Raised when the host is not pingable.
        (r".*DeviceNotPingableError.*$",
         '(6) Host is not pingable during auto-update'),
        # Raised when hosts have unexpected status after rootfs update.
        (r'.*Update failed with unexpected update status: '
         'UPDATE_STATUS_IDLE.*$',
         '(7) Host has unexpected status: UPDATE_STATUS_IDLE after rootfs '
         'update'),
        # Raised when devserver returns non-json response to shard/drone.
        (r'.*No JSON object could be decoded.*$',
         '(8) Devserver returned non-json object'),
        # Raised when devserver loses host's ssh connection.
        (r'.*SSHConnectionError\: .* port 22\: Connection timed out.*$',
         "(9) Devserver lost host's ssh connection"),
        # Raised when an error happens while writing files to host.
        (r'.*Write failed\: Broken pipe.*$',
         "(10) Broken pipe while writing or connecting to host")]
    136 
# CROS/prefer_local_devserver setting; False when not configured.
PREFER_LOCAL_DEVSERVER = CONFIG.get_config_value(
        'CROS', 'prefer_local_devserver', type=bool, default=False)

# CROS/enable_ssh_connection_for_devserver setting; False when not configured.
ENABLE_SSH_CONNECTION_FOR_DEVSERVER = CONFIG.get_config_value(
        'CROS', 'enable_ssh_connection_for_devserver', type=bool,
        default=False)

# Directory to save auto-update logs
AUTO_UPDATE_LOG_DIR = 'autoupdate_logs'

# Default number of subnet mask bits used by
# DevServer.get_devservers_in_same_subnet().
DEFAULT_SUBNET_MASKBIT = 19

# Metrics basepaths.
METRICS_PATH = 'chromeos/autotest'
PROVISION_PATH = METRICS_PATH + '/provision'
    152 
    153 
    154 class DevServerException(Exception):
    155     """Raised when the dev server returns a non-200 HTTP response."""
    156     pass
    157 
    158 
    159 class BadBuildException(DevServerException):
    160     """Raised when build failed to boot on DUT."""
    161     pass
    162 
    163 
    164 class RetryableProvisionException(DevServerException):
    165     """Raised when provision fails due to a retryable reason."""
    166     pass
    167 
    168 class DevServerOverloadException(Exception):
    169     """Raised when the dev server returns a 502 HTTP response."""
    170     pass
    171 
    172 class DevServerFailToLocateException(Exception):
    173     """Raised when fail to locate any devserver."""
    174     pass
    175 
    176 
    177 class DevServerExceptionClassifier(object):
    178     """A Class represents exceptions raised from DUT by calling auto_update."""
    179     def __init__(self, err, keep_full_trace=True):
    180         """
    181         @param err: A single string representing one time provision
    182             error happened in auto_update().
    183         @param keep_full_trace: True to keep the whole track trace of error.
    184             False when just keep the last line.
    185         """
    186         self._err = err if keep_full_trace else err.split('\n')[-1]
    187         self._classification = None
    188 
    189     def _classify(self):
    190         for err_pattern, classification in _EXCEPTION_PATTERNS:
    191             if re.match(err_pattern, self._err):
    192                 return classification
    193 
    194         return '(0) Unknown exception'
    195 
    196     @property
    197     def classification(self):
    198         """Classify the error
    199 
    200         @return: return a classified exception type (string) from
    201             _EXCEPTION_PATTERNS or 'Unknown exception'. Current patterns in
    202             _EXCEPTION_PATTERNS are very specific so that errors cannot match
    203             more than one pattern.
    204         """
    205         if not self._classification:
    206             self._classification = self._classify()
    207         return self._classification
    208 
    209     @property
    210     def summary(self):
    211         """Use one line to show the error message."""
    212         return ' '.join(self._err.splitlines())
    213 
    214     @property
    215     def classified_exception(self):
    216         """What kind of exception will be raised to higher.
    217 
    218         @return: return a special Exception when the raised error is an
    219             RootfsUpdateError. Otherwise, return general DevServerException.
    220         """
    221         # The classification of RootfsUpdateError in _EXCEPTION_PATTERNS starts
    222         # with "(4)"
    223         if self.classification.startswith('(4)'):
    224             return BadBuildException
    225 
    226         return DevServerException
    227 
    228 
    229 class MarkupStripper(HTMLParser.HTMLParser):
    230     """HTML parser that strips HTML tags, coded characters like &
    231 
    232     Works by, basically, not doing anything for any tags, and only recording
    233     the content of text nodes in an internal data structure.
    234     """
    235     def __init__(self):
    236         self.reset()
    237         self.fed = []
    238 
    239 
    240     def handle_data(self, d):
    241         """Consume content of text nodes, store it away."""
    242         self.fed.append(d)
    243 
    244 
    245     def get_data(self):
    246         """Concatenate and return all stored data."""
    247         return ''.join(self.fed)
    248 
    249 
    250 def _strip_http_message(message):
    251     """Strip the HTTP marker from the an HTTP message.
    252 
    253     @param message: A string returned by an HTTP call.
    254 
    255     @return: A string with HTTP marker being stripped.
    256     """
    257     strip = MarkupStripper()
    258     try:
    259         strip.feed(message.decode('utf_32'))
    260     except UnicodeDecodeError:
    261         strip.feed(message)
    262     return strip.get_data()
    263 
    264 
    265 def _get_image_storage_server():
    266     return CONFIG.get_config_value('CROS', 'image_storage_server', type=str)
    267 
    268 
    269 def _get_canary_channel_server():
    270     """
    271     Get the url of the canary-channel server,
    272     eg: gsutil://chromeos-releases/canary-channel/<board>/<release>
    273 
    274     @return: The url to the canary channel server.
    275     """
    276     return CONFIG.get_config_value('CROS', 'canary_channel_server', type=str)
    277 
    278 
    279 def _get_storage_server_for_artifacts(artifacts=None):
    280     """Gets the appropriate storage server for the given artifacts.
    281 
    282     @param artifacts: A list of artifacts we need to stage.
    283     @return: The address of the storage server that has these artifacts.
    284              The default image storage server if no artifacts are specified.
    285     """
    286     factory_artifact = global_config.global_config.get_config_value(
    287             'CROS', 'factory_artifact', type=str, default='')
    288     if artifacts and factory_artifact and factory_artifact in artifacts:
    289         return _get_canary_channel_server()
    290     return _get_image_storage_server()
    291 
    292 
    293 def _gs_or_local_archive_url_args(archive_url):
    294     """Infer the devserver call arguments to use with the given archive_url.
    295 
    296     @param archive_url: The archive url to include the in devserver RPC. This
    297             can either e a GS path or a local path.
    298     @return: A dict of arguments to include in the devserver call.
    299     """
    300     if not archive_url:
    301         return {}
    302     elif archive_url.startswith('gs://'):
    303         return {'archive_url': archive_url}
    304     else:
    305         # For a local path, we direct the devserver to move the files while
    306         # staging. This is the fastest way to stage local files, but deletes the
    307         # files from the source. This is OK because the files are available on
    308         # the devserver once staged.
    309         return {
    310                 'local_path': archive_url,
    311                 'delete_source': True,
    312         }
    313 
    314 
    315 def _reverse_lookup_from_config(address):
    316     """Look up hostname for the given IP address.
    317 
    318     This uses the hostname-address map from the config file.
    319 
    320     If multiple hostnames map to the same IP address, the first one
    321     defined in the configuration file takes precedence.
    322 
    323     @param address: IP address string
    324     @returns: hostname string, or original input if not found
    325     """
    326     for hostname, addr in _get_hostname_addr_map().iteritems():
    327         if addr == address:
    328             return hostname
    329     return address
    330 
    331 
    332 def _get_hostname_addr_map():
    333     """Get hostname address mapping from config.
    334 
    335     @return: dict mapping server hostnames to addresses
    336     """
    337     return CONFIG.get_section_as_dict('HOSTNAME_ADDR_MAP')
    338 
    339 
    340 def _get_dev_server_list():
    341     return CONFIG.get_config_value('CROS', 'dev_server', type=list, default=[])
    342 
    343 
    344 def _get_crash_server_list():
    345     return CONFIG.get_config_value('CROS', 'crash_server', type=list,
    346         default=[])
    347 
    348 
def remote_devserver_call(timeout_min=DEVSERVER_IS_STAGING_RETRY_MIN,
                          exception_to_raise=DevServerException):
    """A decorator to use with remote devserver calls.

    This decorator converts urllib2.HTTPErrors into DevServerExceptions
    with any embedded error info converted into plain text. The method
    retries on urllib2.URLError, error.CmdError or DevServerOverloadException
    to avoid devserver flakiness.

    When the call ends with an exception whose text contains
    ERR_MSG_FOR_TIMED_OUT_CALL, a call_timeout counter is incremented before
    the exception is re-raised unchanged.

    @param timeout_min: Handed to retry.retry as its timeout_min argument.
    @param exception_to_raise: Handed to retry.retry as its
            exception_to_raise argument.

    @return: A decorator applying the behavior above to a method.
    """
    #pylint: disable=C0111

    def inner_decorator(method):
        # Name of the decorated callable, used as the retry label and in the
        # timeout log/metric below; None if the callable has no __name__.
        label = method.__name__ if hasattr(method, '__name__') else None
        def metrics_wrapper(*args, **kwargs):
            @retry.retry((urllib2.URLError, error.CmdError,
                          DevServerOverloadException),
                         timeout_min=timeout_min,
                         exception_to_raise=exception_to_raise,
                        label=label)
            def wrapper():
                """This wrapper actually catches the HTTPError."""
                try:
                    return method(*args, **kwargs)
                except urllib2.HTTPError as e:
                    # The HTTP error body carries the devserver's error page;
                    # reduce it to plain text for the exception message.
                    error_markup = e.read()
                    raise DevServerException(_strip_http_message(error_markup))

            try:
                return wrapper()
            except Exception as e:
                if ERR_MSG_FOR_TIMED_OUT_CALL in str(e):
                    # Work out which devserver was being called: either the
                    # DevServer instance the method is bound to, or an
                    # explicit 'devserver' keyword argument holding its url.
                    dev_server = None
                    if args and isinstance(args[0], DevServer):
                        dev_server = args[0].hostname
                    elif 'devserver' in kwargs:
                        dev_server = get_hostname(kwargs['devserver'])

                    logging.debug('RPC call %s has timed out on devserver %s.',
                                  label, dev_server)
                    c = metrics.Counter(
                            'chromeos/autotest/devserver/call_timeout')
                    # NOTE(review): the 'healthy' field carries the method
                    # label here, not a health flag -- confirm with the
                    # metric's consumers before renaming it.
                    c.increment(fields={'dev_server': dev_server,
                                        'healthy': label})

                # Always propagate the original exception to the caller.
                raise

        return metrics_wrapper

    return inner_decorator
    397 
    398 
    399 def get_hostname(url):
    400     """Get the hostname portion of a URL
    401 
    402     schema://hostname:port/path
    403 
    404     @param url: a Url string
    405     @return: a hostname string
    406     """
    407     return urlparse.urlparse(url).hostname
    408 
    409 
    410 def get_resolved_hostname(url):
    411     """Get the symbolic hostname from url.
    412 
    413     If the given `url` uses a numeric IP address, try and find a
    414     symbolic name from the hostname map in the config file.
    415 
    416     @param url  The URL with which to perform the conversion/lookup.
    417     """
    418     return _reverse_lookup_from_config(get_hostname(url))
    419 
    420 
class DevServer(object):
    """Base class for all DevServer-like server stubs.

    This is the base class for interacting with all Dev Server-like servers.
    A caller should instantiate a sub-class of DevServer with:

    host = SubClassServer.resolve(build)
    server = SubClassServer(host)
    """
    # Minimum free disk (compared against load[FREE_DISK]) for a devserver
    # to pass is_free_disk_ok().
    _MIN_FREE_DISK_SPACE_GB = 20
    # Maximum live Apache client count (compared against
    # load[APACHE_CLIENT_COUNT]) for a devserver to pass
    # is_apache_client_count_ok().
    _MAX_APACHE_CLIENT_COUNT = 75
    # Threshold for the CPU load percentage for a devserver to be selected.
    MAX_CPU_LOAD = 80.0
    # Threshold for the network IO, set to 80MB/s
    MAX_NETWORK_IO = 1024 * 1024 * 80
    # Keys of the load dict returned by get_devserver_load(). FREE_DISK,
    # AU_PROCESS and APACHE_CLIENT_COUNT are read by the health checks of this
    # class; the other names are presumably keys of the same dict -- TODO
    # confirm against the devserver's check_health response.
    DISK_IO = 'disk_total_bytes_per_second'
    NETWORK_IO = 'network_total_bytes_per_second'
    CPU_LOAD = 'cpu_percent'
    FREE_DISK = 'free_disk'
    AU_PROCESS = 'au_process_count'
    STAGING_THREAD_COUNT = 'staging_thread_count'
    APACHE_CLIENT_COUNT = 'apache_client_count'
    443 
    444 
    445     def __init__(self, devserver):
    446         self._devserver = devserver
    447 
    448 
    449     def url(self):
    450         """Returns the url for this devserver."""
    451         return self._devserver
    452 
    453 
    @property
    def hostname(self):
        """Return devserver hostname parsed from the devserver URL.

        Note that this is likely parsed from the devserver URL from
        shadow_config.ini, meaning that the "hostname" part of the
        devserver URL is actually an IP address.

        @return hostname string
        """
        devserver_url = self.url()
        return get_hostname(devserver_url)
    465 
    466 
    @property
    def resolved_hostname(self):
        """Return devserver hostname, resolved from its IP address.

        Unlike the hostname property, this property attempts to look up
        the proper hostname from the devserver IP address.  If lookup
        fails, then fall back to whatever the hostname property would
        have returned.

        @return hostname string
        """
        unresolved = self.hostname
        return _reverse_lookup_from_config(unresolved)
    479 
    480 
    481     @staticmethod
    482     def get_server_url(url):
    483         """Get the devserver url from a repo url, which includes build info.
    484 
    485         @param url: A job repo url.
    486 
    487         @return A devserver url, e.g., http://127.0.0.10:8080
    488         """
    489         res = urlparse.urlparse(url)
    490         if res.netloc:
    491             return res.scheme + '://' + res.netloc
    492 
    493 
    @classmethod
    def get_devserver_load_wrapper(cls, devserver, timeout_sec, output):
        """A wrapper function to call get_devserver_load in parallel.

        The load, tagged with the devserver url when it is available, is put
        on the output queue. A missing load is put on the queue as-is.

        @param devserver: url of the devserver.
        @param timeout_sec: Number of seconds before time out the devserver
                            call.
        @param output: An output queue to save results to.
        """
        # get_devserver_load takes its timeout in minutes.
        timeout_min = timeout_sec/60.0
        load = cls.get_devserver_load(devserver, timeout_min=timeout_min)
        if load:
            load['devserver'] = devserver
        output.put(load)
    507 
    508 
    @classmethod
    def get_devserver_load(cls, devserver,
                           timeout_min=DEVSERVER_SSH_TIMEOUT_MINS):
        """Query the check_health RPC of |devserver| and return its load.

        @param devserver: url of the devserver.
        @param timeout_min: How long to wait in minutes before deciding the
                            the devserver is not up (float).

        @return: A dictionary of the devserver's load, decoded from the JSON
                 response. None if the call or the decoding fails; the
                 failure is logged.

        """
        call = cls._build_call(devserver, 'check_health')
        timeout_sec = timeout_min*60

        # The keyword argument must stay named 'devserver': the decorator
        # reads it to report which devserver timed out.
        @remote_devserver_call(timeout_min=timeout_min)
        def get_load(devserver=devserver):
            """Inner method that makes the call."""
            return cls.run_call(call, timeout=timeout_sec)

        try:
            response = get_load(devserver=devserver)
            return json.load(cStringIO.StringIO(response))
        except Exception as e:
            logging.error('Devserver call failed: "%s", timeout: %s seconds,'
                          ' Error: %s', call, timeout_min * 60, e)
    532 
    533 
    @classmethod
    def is_free_disk_ok(cls, load):
        """Check if a devserver has enough free disk.

        @param load: A dict of the load of the devserver.

        @return: True if the devserver has enough free disk or disk check is
                 skipped in global config.

        """
        if SKIP_DEVSERVER_HEALTH_CHECK:
            logging.debug('devserver health check is skipped.')
            return True

        too_low = load[cls.FREE_DISK] < cls._MIN_FREE_DISK_SPACE_GB
        return not too_low
    550 
    551 
    @classmethod
    def is_apache_client_count_ok(cls, load):
        """Check if a devserver has enough Apache connections available.

        Apache server by default has maximum of 150 concurrent connections. If
        a devserver has too many live connections, it likely indicates the
        server is busy handling many long running download requests, e.g.,
        downloading stateful partitions. It is better not to add more requests
        to it.

        @param load: A dict of the load of the devserver.

        @return: True if the devserver has enough Apache connections available,
                 if the client count was not collected, or if the health check
                 is skipped in global config.

        """
        if SKIP_DEVSERVER_HEALTH_CHECK:
            logging.debug('devserver health check is skipped.')
            return True

        if cls.APACHE_CLIENT_COUNT not in load:
            logging.debug('Apache client count is not collected from devserver.')
            return True

        too_many = load[cls.APACHE_CLIENT_COUNT] > cls._MAX_APACHE_CLIENT_COUNT
        return not too_many
    577 
    578 
    @classmethod
    def devserver_healthy(cls, devserver,
                          timeout_min=DEVSERVER_SSH_TIMEOUT_MINS):
        """Returns True if the |devserver| is healthy to stage build.

        The outcome, with the reason for a failure, is always reported to the
        devserver_healthy counter. When the load reports running AU
        processes, their count is reported to a gauge as well.

        @param devserver: url of the devserver.
        @param timeout_min: How long to wait in minutes before deciding the
                            the devserver is not up (float).

        @return: True if devserver is healthy. Return False otherwise.

        """
        health_counter = metrics.Counter(
                'chromeos/autotest/devserver/devserver_healthy')
        failure_reason = ''
        is_healthy = False
        load = cls.get_devserver_load(devserver, timeout_min=timeout_min)
        try:
            if not load:
                # Failed to get the load of devserver.
                failure_reason = '(1) Failed to get load.'
                return False

            if not cls.is_apache_client_count_ok(load):
                failure_reason = '(2) Apache client count too high.'
                logging.error('Devserver check_health failed. Live Apache client '
                              'count is too high: %d.',
                              load[cls.APACHE_CLIENT_COUNT])
                return False

            has_enough_disk = cls.is_free_disk_ok(load)
            if not has_enough_disk:
                failure_reason = '(3) Disk space too low.'
                logging.error('Devserver check_health failed. Free disk space is '
                              'low. Only %dGB is available.',
                              load[cls.FREE_DISK])
            is_healthy = bool(has_enough_disk)
            return has_enough_disk
        finally:
            # Runs on every exit path above, so each check is counted once.
            health_counter.increment(
                    fields={'dev_server': cls(devserver).resolved_hostname,
                            'healthy': is_healthy,
                            'reason': failure_reason})
            # Monitor how many AU processes the devserver is currently running.
            if load is not None:
                au_process_count = load.get(DevServer.AU_PROCESS)
                if au_process_count:
                    au_gauge = metrics.Gauge(
                            'chromeos/autotest/devserver/devserver_au_count')
                    au_gauge.set(
                        au_process_count,
                        fields={'dev_server':
                                cls(devserver).resolved_hostname})
    628 
    629 
    @staticmethod
    def _build_call(host, method, **kwargs):
        """Build a URL to |host| that calls |method|, passing |kwargs|.

        Builds a URL that calls |method| on the dev server defined by |host|,
        passing a set of key/value pairs built from the dict |kwargs|. The
        values are inserted as-is, without URL-encoding.

        @param host: a string that is the host basename e.g. http://server:90.
        @param method: the dev server method to call.
        @param kwargs: a dict mapping arg names to arg values.
        @return the URL string.
        """
        # If the archive_url is a local path, the args expected by the devserver
        # are a little different.
        requested_archive = kwargs.pop('archive_url', None)
        kwargs.update(_gs_or_local_archive_url_args(requested_archive))

        argstr = '&'.join("%s=%s" % pair for pair in kwargs.iteritems())
        url_fields = {'host': host, 'method': method, 'argstr': argstr}
        return "%(host)s/%(method)s?%(argstr)s" % url_fields
    651 
    652 
    def build_call(self, method, **kwargs):
        """Builds a devserver RPC string that is used by 'run_call()'.

        @param method: remote devserver method to call.
        @param kwargs: a dict mapping arg names to arg values.

        @return the URL string for calling |method| on this devserver.
        """
        host = self._devserver
        return self._build_call(host, method, **kwargs)
    659 
    660 
    @classmethod
    def build_all_calls(cls, method, **kwargs):
        """Builds a list of URLs that makes RPC calls on all devservers.

        For every healthy devserver, build a URL that calls |method| on it,
        passing a set of key/value pairs built from the dict |kwargs|.

        @param method: the dev server method to call.
        @param kwargs: a dict mapping arg names to arg values

        @return a list of URL strings, one per healthy devserver.
        """
        # Note we use cls.servers as servers is class specific.
        return [cls._build_call(server, method, **kwargs)
                for server in cls.servers()
                if cls.devserver_healthy(server)]
    680 
    681 
    @classmethod
    def run_call(cls, call, readline=False, timeout=None):
        """Invoke a given devserver call by opening its URL.

        Open the URL with HTTP, and return the text of the response. Exceptions
        may be raised as for urllib2.urlopen().

        @param call: a url string that calls a method to a devserver.
        @param readline: whether read http response line by line. Only
                         honored when no timeout is given.
        @param timeout: The timeout seconds for this urlopen call.

        @return the results of this call: a list of right-stripped lines when
                reading line by line, the whole response text otherwise.
        """
        if timeout is not None:
            timed_response = utils.urlopen_socket_timeout(
                    call, timeout=timeout)
            return timed_response.read()

        response = urllib2.urlopen(call)
        if readline:
            return [line.rstrip() for line in response]
        return response.read()
    703 
    704 
    705     @staticmethod
    706     def servers():
    707         """Returns a list of servers that can serve as this type of server."""
    708         raise NotImplementedError()
    709 
    710 
    @classmethod
    def get_devservers_in_same_subnet(cls, ip, mask_bits=DEFAULT_SUBNET_MASKBIT,
                                      unrestricted_only=False):
        """Get the devservers in the same subnet of the given ip.

        @param ip: The IP address of a dut to look for devserver.
        @param mask_bits: Number of mask bits. Default is 19.
        @param unrestricted_only: Set to True to select from devserver in
                unrestricted subnet only. Default is False.

        @return: A list of devservers in the same subnet of the given ip.

        @raise DevServerFailToLocateException: if there is no candidate
                devserver to select from.

        """
        if unrestricted_only:
            candidates = cls.get_unrestricted_devservers()
        else:
            candidates = cls.servers()

        # A candidate is a URL, e.g., http://10.1.1.10:8082, while the subnet
        # filter below works on hostnames, so remember which URL each
        # hostname came from to return the full devserver path afterwards.
        url_by_host = {}
        hosts = []
        for candidate in candidates:
            host = get_hostname(candidate)
            url_by_host[host] = candidate
            hosts.append(host)

        if not hosts:
            devserver_type = 'unrestricted only' if unrestricted_only else 'all'
            raise DevServerFailToLocateException(
                'Fail to locate a devserver for dut %s in %s devservers'
                % (ip, devserver_type))

        hosts_in_subnet = utils.get_servers_in_same_subnet(ip, mask_bits,
                                                           hosts)
        return [url_by_host[host] for host in hosts_in_subnet]
    744 
    745 
    746     @classmethod
    def get_unrestricted_devservers(
                cls, restricted_subnets=utils.RESTRICTED_SUBNETS):
        """Get the devservers that are outside every restricted subnet.

        @param restricted_subnets: A list of restricted subnets. When empty,
                all servers from cls.servers() are returned unfiltered.

        @return: A list of devservers not in any restricted subnet.

        """
        if not restricted_subnets:
            return cls.servers()

        # A server is kept when no restricted subnet is reported for its
        # hostname.
        return [server for server in cls.servers()
                if not utils.get_restricted_subnet(get_hostname(server),
                                                   restricted_subnets)]
    766 
    767 
    768     @classmethod
    def get_healthy_devserver(cls, build, devservers, ban_list=None):
        """Get a healthy devserver instance from the list of devservers.

        Candidates are tried one at a time. The next candidate is always the
        one at index hash(build) % (number of remaining candidates), so the
        order of attempts is determined by |build|. Banned candidates are
        skipped without a health check.

        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514).
        @param devservers: The devserver list to be chosen out a healthy one.
                The list is not modified. May be None or empty.
        @param ban_list: The blacklist of devservers we don't want to choose.
                Default is None.

        @return: An instance of cls for the first healthy devserver found.
                Return None if no healthy devserver is found.

        """
        logging.debug('Pick one healthy devserver from %r', devservers)
        # Work on a private copy: candidates are popped as they are tried,
        # and the caller's list (which may be shared with other callers) must
        # not be emptied as a side effect.
        candidates = list(devservers) if devservers else []
        build_hash = hash(build)
        while candidates:
            devserver = candidates.pop(build_hash % len(candidates))
            logging.debug('Check health for %s', devserver)
            if ban_list and devserver in ban_list:
                continue

            if cls.devserver_healthy(devserver):
                logging.debug('Pick %s', devserver)
                return cls(devserver)
        return None
    792 
    793 
    794     @classmethod
    def get_available_devservers(cls, hostname=None,
                                 prefer_local_devserver=PREFER_LOCAL_DEVSERVER,
                                 restricted_subnets=utils.RESTRICTED_SUBNETS):
        """Get the devservers that may be used for the given hostname.

        @param hostname: Hostname of a DUT to choose devserver for.
        @param prefer_local_devserver: When True, and the host is not in a
                 restricted subnet, return the unrestricted devservers in the
                 same subnet as the host.
        @param restricted_subnets: A list of restricted subnets.

        @return: A tuple of (devservers, can_retry), devservers is a list of
                 devservers that's available for the given hostname. can_retry
                 is a flag that indicate if caller can retry the selection of
                 devserver if no devserver in the returned devservers can be
                 used. For example, if hostname is in a restricted subnet,
                 can_retry will be False.
        """
        logging.info('Getting devservers for host: %s',  hostname)
        host_ip = bin_utils.get_ip_address(hostname) if hostname else None

        # Without an IP there is no subnet to match against, so fall back to
        # every unrestricted devserver.
        if not host_ip:
            if hostname:
                logging.error('Failed to get IP address of %s. Will pick a '
                              'devserver without subnet constraint.', hostname)
            return cls.get_unrestricted_devservers(restricted_subnets), False

        # A DUT inside a restricted subnet may only use the devservers of
        # that subnet, and the caller is not allowed to retry elsewhere.
        if restricted_subnets:
            subnet_ip, mask_bits = _get_subnet_for_host_ip(
                    host_ip, restricted_subnets=restricted_subnets)
            if subnet_ip:
                logging.debug('The host %s (%s) is in a restricted subnet. '
                              'Try to locate a devserver inside subnet '
                              '%s:%d.', hostname, host_ip, subnet_ip,
                              mask_bits)
                return (cls.get_devservers_in_same_subnet(subnet_ip, mask_bits),
                        False)

        if not prefer_local_devserver:
            return cls.get_unrestricted_devservers(restricted_subnets), False

        # Prefer an unrestricted devserver in the host's own subnet. can_retry
        # is True so the caller can pick a different devserver if all
        # devservers in the same subnet are down.
        local_devservers = cls.get_devservers_in_same_subnet(
                host_ip, DEFAULT_SUBNET_MASKBIT, True)
        return local_devservers, True
    844 
    845 
    846     @classmethod
    def resolve(cls, build, hostname=None, ban_list=None):
        """Resolves a build to a devserver instance.

        First tries the devservers returned by get_available_devservers() for
        |hostname|. If none of them is healthy and retrying is allowed, tries
        again with the devservers returned without a hostname constraint.

        @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514).
        @param hostname: The hostname of dut that requests a devserver. It's
                         used to make sure a devserver in the same subnet is
                         preferred.
        @param ban_list: The blacklist of devservers shouldn't be chosen.

        @return: The devserver instance returned by get_healthy_devserver().

        @raise DevServerException: If no devserver is available.
        """
        devservers, can_retry = cls.get_available_devservers(hostname)
        # Record the candidates BEFORE the health check: the check may consume
        # entries from the list it is given, which would otherwise leave the
        # failure report below incomplete.
        tried_devservers = set(devservers or ())
        devserver = cls.get_healthy_devserver(build, devservers,
                                              ban_list=ban_list)

        if not devserver and can_retry:
            # Find available devservers without dut location constrain.
            devservers, _ = cls.get_available_devservers()
            tried_devservers |= set(devservers or ())
            devserver = cls.get_healthy_devserver(build, devservers,
                                                  ban_list=ban_list)
        if devserver:
            return devserver

        subnet = 'unrestricted subnet'
        if hostname is not None:
            host_ip = bin_utils.get_ip_address(hostname)
            if host_ip:
                subnet_ip, mask_bits = _get_subnet_for_host_ip(host_ip)
                # No subnet is reported for a host outside every restricted
                # subnet; keep the 'unrestricted subnet' label in that case
                # rather than reporting 'None/None'.
                if subnet_ip:
                    subnet = '%s/%s' % (str(subnet_ip), str(mask_bits))

        error_msg = ('All devservers in subnet: %s are currently down: '
                     '%s. (dut hostname: %s)' %
                     (subnet, tried_devservers, hostname))
        logging.error(error_msg)
        c = metrics.Counter(
                'chromeos/autotest/devserver/subnet_without_devservers')
        c.increment(fields={'subnet': subnet, 'hostname': str(hostname)})
        raise DevServerException(error_msg)
    891 
    892 
    893     @classmethod
    def random(cls):
        """Return a random devserver that's available.

        Devserver election in the `resolve` method is based on a hash of the
        build that a caller wants to stage, so that different callers asking
        for the same build get the same devserver while different builds are
        spread across all devservers. That helps to reduce the duplication of
        builds across all devservers.

        This function returns a random devserver by passing a pseudo build
        name, the current timestamp as a string, to the `resolve` method.
        """
        pseudo_build = str(time.time())
        return cls.resolve(build=pseudo_build)
    907 
    908 
    909 class CrashServer(DevServer):
    910     """Class of DevServer that symbolicates crash dumps."""
    911 
    912     @staticmethod
    913     def servers():
    914         return _get_crash_server_list()
    915 
    916 
    917     @remote_devserver_call()
    918     def symbolicate_dump(self, minidump_path, build):
    919         """Ask the devserver to symbolicate the dump at minidump_path.
    920 
    921         Stage the debug symbols for |build| and, if that works, ask the
    922         devserver to symbolicate the dump at |minidump_path|.
    923 
    924         @param minidump_path: the on-disk path of the minidump.
    925         @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
    926                       whose debug symbols are needed for symbolication.
    927         @return The contents of the stack trace
    928         @raise DevServerException upon any return code that's not HTTP OK.
    929         """
    930         try:
    931             import requests
    932         except ImportError:
    933             logging.warning("Can't 'import requests' to connect to dev server.")
    934             return ''
    935         f = {'dev_server': self.resolved_hostname}
    936         c = metrics.Counter('chromeos/autotest/crashserver/symbolicate_dump')
    937         c.increment(fields=f)
    938         # Symbolicate minidump.
    939         m = 'chromeos/autotest/crashserver/symbolicate_dump_duration'
    940         with metrics.SecondsTimer(m, fields=f):
    941             call = self.build_call('symbolicate_dump',
    942                                    archive_url=_get_image_storage_server() + build)
    943             request = requests.post(
    944                     call, files={'minidump': open(minidump_path, 'rb')})
    945             if request.status_code == requests.codes.OK:
    946                 return request.text
    947 
    948         error_fd = cStringIO.StringIO(request.text)
    949         raise urllib2.HTTPError(
    950                 call, request.status_code, request.text, request.headers,
    951                 error_fd)
    952 
    953 
    954     @classmethod
    955     def get_available_devservers(cls, hostname):
    956         """Get all available crash servers.
    957 
    958         Crash server election doesn't need to count the location of hostname.
    959 
    960         @param hostname: Hostname of a DUT to choose devserver for.
    961 
    962         @return: A tuple of (all crash servers, False). can_retry is set to
    963                  False, as all crash servers are returned. There is no point to
    964                  retry.
    965         """
    966         return cls.servers(), False
    967 
    968 
    969 class ImageServerBase(DevServer):
    970     """Base class for devservers used to stage builds.
    971 
    972     CrOS and Android builds are staged in different ways as they have different
    973     sets of artifacts. This base class abstracts the shared functions between
    974     the two types of ImageServer.
    975     """
    976 
    977     @classmethod
    def servers(cls):
        """Returns the devserver list provided by _get_dev_server_list()."""
        dev_servers = _get_dev_server_list()
        return dev_servers
    983 
    984 
    def _get_image_url(self, image):
        """Returns the url of the directory for this image on the devserver.

        The url is built from the CROS/image_url_pattern config value, filled
        in with this devserver's url and the translated image name. Every
        occurrence of 'update' in the result is then replaced by 'static'.

        @param image: the image that was fetched.
        """
        translated_image = self.translate(image)
        pattern = CONFIG.get_config_value('CROS', 'image_url_pattern',
                                          type=str)
        image_url = pattern % (self.url(), translated_image)
        return image_url.replace('update', 'static')
    994 
    995 
    996     @staticmethod
    def create_metadata(server_name, image, artifacts=None, files=None):
        """Build the metadata dictionary describing the staged items.

        The metadata can be send to metadata db along with stats.

        @param server_name: name of the devserver, e.g 172.22.33.44.
        @param image: The name of the image.
        @param artifacts: A list of artifacts. Omitted from the result when
                empty or None.
        @param files: A list of files. Omitted from the result when empty or
                None.

        @return A metadata dictionary. Lists are stored as space-separated
                strings.

        """
        metadata = dict(devserver=server_name, image=image, _type='devserver')
        for key, items in (('artifacts', artifacts), ('files', files)):
            if items:
                metadata[key] = ' '.join(items)
        return metadata
   1018 
   1019 
   1020     @classmethod
   1021     def run_ssh_call(cls, call, readline=False, timeout=None):
   1022         """Construct an ssh-based rpc call, and execute it.
   1023 
   1024         @param call: a url string that calls a method to a devserver.
   1025         @param readline: whether read http response line by line.
   1026         @param timeout: The timeout seconds for ssh call.
   1027 
   1028         @return the results of this call.
   1029         """
   1030         hostname = get_hostname(call)
   1031         ssh_call = 'ssh %s \'curl "%s"\'' % (hostname, utils.sh_escape(call))
   1032         timeout_seconds = timeout if timeout else DEVSERVER_SSH_TIMEOUT_MINS*60
   1033         try:
   1034             result = utils.run(ssh_call, timeout=timeout_seconds)
   1035         except error.CmdError as e:
   1036             logging.debug('Error occurred with exit_code %d when executing the '
   1037                           'ssh call: %s.', e.result_obj.exit_status,
   1038                           e.result_obj.stderr)
   1039             c = metrics.Counter('chromeos/autotest/devserver/ssh_failure')
   1040             c.increment(fields={'dev_server': hostname})
   1041             raise
   1042         response = result.stdout
   1043 
   1044         # If the curl command's returned HTTP response contains certain
   1045         # exception string, raise the DevServerException of the response.
   1046         if 'DownloaderException' in response:
   1047             raise DevServerException(_strip_http_message(response))
   1048 
   1049         if readline:
   1050             # Remove line terminators and trailing whitespace
   1051             response = response.splitlines()
   1052             return [line.rstrip() for line in response]
   1053 
   1054         return response
   1055 
   1056 
   1057     @classmethod
   1058     def run_call(cls, call, readline=False, timeout=None):
   1059         """Invoke a given devserver call using urllib.open or ssh.
   1060 
   1061         Open the URL with HTTP or SSH-based HTTP, and return the text of the
   1062         response. Exceptions may be raised as for urllib2.urlopen() or
   1063         utils.run().
   1064 
   1065         @param call: a url string that calls a method to a devserver.
   1066         @param readline: whether read http response line by line.
   1067         @param timeout: The timeout seconds for urlopen call or ssh call.
   1068 
   1069         @return the results of this call.
   1070         """
   1071         server_name = get_hostname(call)
   1072         is_in_restricted_subnet = utils.get_restricted_subnet(
   1073                 server_name, utils.RESTRICTED_SUBNETS)
   1074         _EMPTY_SENTINEL_VALUE = object()
   1075         def kickoff_call():
   1076             """Invoke a given devserver call using urllib.open or ssh.
   1077 
   1078             @param call: a url string that calls a method to a devserver.
   1079             @param is_in_restricted_subnet: whether the devserver is in subnet.
   1080             @param readline: whether read http response line by line.
   1081             @param timeout: The timeout seconds for urlopen call or ssh call.
   1082             """
   1083             if (not ENABLE_SSH_CONNECTION_FOR_DEVSERVER or
   1084                 not is_in_restricted_subnet):
   1085                 response = super(ImageServerBase, cls).run_call(
   1086                         call, readline=readline, timeout=timeout)
   1087             else:
   1088                 response = cls.run_ssh_call(
   1089                         call, readline=readline, timeout=timeout)
   1090             # Retry if devserver service is temporarily down, e.g. in a
   1091             # devserver push.
   1092             if ERR_MSG_FOR_DOWN_DEVSERVER in response:
   1093                 return False
   1094 
   1095             # Don't return response directly since it may be empty string,
   1096             # which causes poll_for_condition to retry.
   1097             return _EMPTY_SENTINEL_VALUE if not response else response
   1098 
   1099         try:
   1100             response = bin_utils.poll_for_condition(
   1101                     kickoff_call,
   1102                     exception=bin_utils.TimeoutError(),
   1103                     timeout=60,
   1104                     sleep_interval=5)
   1105             return '' if response is _EMPTY_SENTINEL_VALUE else response
   1106         except bin_utils.TimeoutError:
   1107             return ERR_MSG_FOR_DOWN_DEVSERVER
   1108 
   1109 
   1110     @classmethod
   1111     def download_file(cls, remote_file, local_file, timeout=None):
   1112         """Download file from devserver.
   1113 
   1114         The format of remote_file should be:
   1115             http://devserver_ip:8082/static/board/...
   1116 
   1117         @param remote_file: The URL of the file on devserver that need to be
   1118             downloaded.
   1119         @param local_file: The path of the file saved to local.
   1120         @param timeout: The timeout seconds for this call.
   1121         """
   1122         response = cls.run_call(remote_file, timeout=timeout)
   1123         with open(local_file, 'w') as out_log:
   1124             out_log.write(response)
   1125 
   1126 
   1127     def _poll_is_staged(self, **kwargs):
   1128         """Polling devserver.is_staged until all artifacts are staged.
   1129 
   1130         @param kwargs: keyword arguments to make is_staged devserver call.
   1131 
   1132         @return: True if all artifacts are staged in devserver.
   1133         """
   1134         call = self.build_call('is_staged', **kwargs)
   1135 
   1136         def all_staged():
   1137             """Call devserver.is_staged rpc to check if all files are staged.
   1138 
   1139             @return: True if all artifacts are staged in devserver. False
   1140                      otherwise.
   1141             @rasies DevServerException, the exception is a wrapper of all
   1142                     exceptions that were raised when devserver tried to download
   1143                     the artifacts. devserver raises an HTTPError or a CmdError
   1144                     when an exception was raised in the code. Such exception
   1145                     should be re-raised here to stop the caller from waiting.
   1146                     If the call to devserver failed for connection issue, a
   1147                     URLError exception is raised, and caller should retry the
   1148                     call to avoid such network flakiness.
   1149 
   1150             """
   1151             try:
   1152                 result = self.run_call(call)
   1153                 logging.debug('whether artifact is staged: %r', result)
   1154                 return result == 'True'
   1155             except urllib2.HTTPError as e:
   1156                 error_markup = e.read()
   1157                 raise DevServerException(_strip_http_message(error_markup))
   1158             except urllib2.URLError as e:
   1159                 # Could be connection issue, retry it.
   1160                 # For example: <urlopen error [Errno 111] Connection refused>
   1161                 logging.error('URLError happens in is_stage: %r', e)
   1162                 return False
   1163             except error.CmdError as e:
   1164                 # Retry if SSH failed to connect to the devserver.
   1165                 logging.warning('CmdError happens in is_stage: %r, will retry', e)
   1166                 return False
   1167 
   1168         bin_utils.poll_for_condition(
   1169                 all_staged,
   1170                 exception=bin_utils.TimeoutError(),
   1171                 timeout=DEVSERVER_IS_STAGING_RETRY_MIN * 60,
   1172                 sleep_interval=_ARTIFACT_STAGE_POLLING_INTERVAL)
   1173 
   1174         return True
   1175 
   1176 
   1177     def _call_and_wait(self, call_name, error_message,
   1178                        expected_response=SUCCESS, **kwargs):
   1179         """Helper method to make a urlopen call, and wait for artifacts staged.
   1180 
   1181         @param call_name: name of devserver rpc call.
   1182         @param error_message: Error message to be thrown if response does not
   1183                               match expected_response.
   1184         @param expected_response: Expected response from rpc, default to
   1185                                   |Success|. If it's set to None, do not compare
   1186                                   the actual response. Any response is consider
   1187                                   to be good.
   1188         @param kwargs: keyword arguments to make is_staged devserver call.
   1189 
   1190         @return: The response from rpc.
   1191         @raise DevServerException upon any return code that's expected_response.
   1192 
   1193         """
   1194         call = self.build_call(call_name, async=True, **kwargs)
   1195         try:
   1196             response = self.run_call(call)
   1197             logging.debug('response for RPC: %r', response)
   1198             if ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE in response:
   1199                 logging.debug('Proxy error happens in RPC call, '
   1200                               'will retry in 30 seconds')
   1201                 time.sleep(30)
   1202                 raise DevServerOverloadException()
   1203         except httplib.BadStatusLine as e:
   1204             logging.error(e)
   1205             raise DevServerException('Received Bad Status line, Devserver %s '
   1206                                      'might have gone down while handling '
   1207                                      'the call: %s' % (self.url(), call))
   1208 
   1209         if expected_response and not response == expected_response:
   1210                 raise DevServerException(error_message)
   1211 
   1212         # `os_type` is needed in build a devserver call, but not needed for
   1213         # wait_for_artifacts_staged, since that method is implemented by
   1214         # each ImageServerBase child class.
   1215         if 'os_type' in kwargs:
   1216             del kwargs['os_type']
   1217         self.wait_for_artifacts_staged(**kwargs)
   1218         return response
   1219 
   1220 
   1221     def _stage_artifacts(self, build, artifacts, files, archive_url, **kwargs):
   1222         """Tell the devserver to download and stage |artifacts| from |image|
   1223         specified by kwargs.
   1224 
   1225         This is the main call point for staging any specific artifacts for a
   1226         given build. To see the list of artifacts one can stage see:
   1227 
   1228         ~src/platfrom/dev/artifact_info.py.
   1229 
   1230         This is maintained along with the actual devserver code.
   1231 
   1232         @param artifacts: A list of artifacts.
   1233         @param files: A list of files to stage.
   1234         @param archive_url: Optional parameter that has the archive_url to stage
   1235                 this artifact from. Default is specified in autotest config +
   1236                 image.
   1237         @param kwargs: keyword arguments that specify the build information, to
   1238                 make stage devserver call.
   1239 
   1240         @raise DevServerException upon any return code that's not HTTP OK.
   1241         """
   1242         if not archive_url:
   1243             archive_url = _get_storage_server_for_artifacts(artifacts) + build
   1244 
   1245         artifacts_arg = ','.join(artifacts) if artifacts else ''
   1246         files_arg = ','.join(files) if files else ''
   1247         error_message = ("staging %s for %s failed;"
   1248                          "HTTP OK not accompanied by 'Success'." %
   1249                          ('artifacts=%s files=%s ' % (artifacts_arg, files_arg),
   1250                           build))
   1251 
   1252         staging_info = ('build=%s, artifacts=%s, files=%s, archive_url=%s' %
   1253                         (build, artifacts, files, archive_url))
   1254         logging.info('Staging artifacts on devserver %s: %s',
   1255                      self.url(), staging_info)
   1256         success = False
   1257         try:
   1258             arguments = {'archive_url': archive_url,
   1259                          'artifacts': artifacts_arg,
   1260                          'files': files_arg}
   1261             if kwargs:
   1262                 arguments.update(kwargs)
   1263             # TODO(akeshet): canonicalize artifacts_arg before using it as a
   1264             # metric field (as it stands it is a not-very-well-controlled
   1265             # string).
   1266             f = {'artifacts': artifacts_arg,
   1267                  'dev_server': self.resolved_hostname}
   1268             with metrics.SecondsTimer(
   1269                     'chromeos/autotest/devserver/stage_artifact_duration',
   1270                     fields=f):
   1271                 self.call_and_wait(call_name='stage', error_message=error_message,
   1272                                    **arguments)
   1273             logging.info('Finished staging artifacts: %s', staging_info)
   1274             success = True
   1275         except (bin_utils.TimeoutError, error.TimeoutException):
   1276             logging.error('stage_artifacts timed out: %s', staging_info)
   1277             raise DevServerException(
   1278                     'stage_artifacts timed out: %s' % staging_info)
   1279         finally:
   1280             f = {'success': success,
   1281                  'artifacts': artifacts_arg,
   1282                  'dev_server': self.resolved_hostname}
   1283             metrics.Counter('chromeos/autotest/devserver/stage_artifact'
   1284                             ).increment(fields=f)
   1285 
   1286 
   1287     def call_and_wait(self, *args, **kwargs):
   1288         """Helper method to make a urlopen call, and wait for artifacts staged.
   1289 
   1290         This method needs to be overridden in the subclass to implement the
   1291         logic to call _call_and_wait.
   1292         """
   1293         raise NotImplementedError
   1294 
   1295 
   1296     def _trigger_download(self, build, artifacts, files, synchronous=True,
   1297                           **kwargs_build_info):
   1298         """Tell the devserver to download and stage image specified in
   1299         kwargs_build_info.
   1300 
   1301         Tells the devserver to fetch |image| from the image storage server
   1302         named by _get_image_storage_server().
   1303 
   1304         If |synchronous| is True, waits for the entire download to finish
   1305         staging before returning. Otherwise only the artifacts necessary
   1306         to start installing images onto DUT's will be staged before returning.
   1307         A caller can then call finish_download to guarantee the rest of the
   1308         artifacts have finished staging.
   1309 
   1310         @param synchronous: if True, waits until all components of the image are
   1311                staged before returning.
   1312         @param kwargs_build_info: Dictionary of build information.
   1313                 For CrOS, it is None as build is the CrOS image name.
   1314                 For Android, it is {'target': target,
   1315                                     'build_id': build_id,
   1316                                     'branch': branch}
   1317 
   1318         @raise DevServerException upon any return code that's not HTTP OK.
   1319 
   1320         """
   1321         if kwargs_build_info:
   1322             archive_url = None
   1323         else:
   1324             archive_url = _get_image_storage_server() + build
   1325         error_message = ("trigger_download for %s failed;"
   1326                          "HTTP OK not accompanied by 'Success'." % build)
   1327         kwargs = {'archive_url': archive_url,
   1328                   'artifacts': artifacts,
   1329                   'files': files,
   1330                   'error_message': error_message}
   1331         if kwargs_build_info:
   1332             kwargs.update(kwargs_build_info)
   1333 
   1334         logging.info('trigger_download starts for %s', build)
   1335         try:
   1336             response = self.call_and_wait(call_name='stage', **kwargs)
   1337             logging.info('trigger_download finishes for %s', build)
   1338         except (bin_utils.TimeoutError, error.TimeoutException):
   1339             logging.error('trigger_download timed out for %s.', build)
   1340             raise DevServerException(
   1341                     'trigger_download timed out for %s.' % build)
   1342         was_successful = response == SUCCESS
   1343         if was_successful and synchronous:
   1344             self._finish_download(build, artifacts, files, **kwargs_build_info)
   1345 
   1346 
   1347     def _finish_download(self, build, artifacts, files, **kwargs_build_info):
   1348         """Tell the devserver to finish staging image specified in
   1349         kwargs_build_info.
   1350 
   1351         If trigger_download is called with synchronous=False, it will return
   1352         before all artifacts have been staged. This method contacts the
   1353         devserver and blocks until all staging is completed and should be
   1354         called after a call to trigger_download.
   1355 
   1356         @param kwargs_build_info: Dictionary of build information.
   1357                 For CrOS, it is None as build is the CrOS image name.
   1358                 For Android, it is {'target': target,
   1359                                     'build_id': build_id,
   1360                                     'branch': branch}
   1361 
   1362         @raise DevServerException upon any return code that's not HTTP OK.
   1363         """
   1364         archive_url = _get_image_storage_server() + build
   1365         error_message = ("finish_download for %s failed;"
   1366                          "HTTP OK not accompanied by 'Success'." % build)
   1367         kwargs = {'archive_url': archive_url,
   1368                   'artifacts': artifacts,
   1369                   'files': files,
   1370                   'error_message': error_message}
   1371         if kwargs_build_info:
   1372             kwargs.update(kwargs_build_info)
   1373         try:
   1374             self.call_and_wait(call_name='stage', **kwargs)
   1375         except (bin_utils.TimeoutError, error.TimeoutException):
   1376             logging.error('finish_download timed out for %s', build)
   1377             raise DevServerException(
   1378                     'finish_download timed out for %s.' % build)
   1379 
   1380 
   1381     @remote_devserver_call()
   1382     def locate_file(self, file_name, artifacts, build, build_info):
   1383         """Locate a file with the given file_name on devserver.
   1384 
   1385         This method calls devserver RPC `locate_file` to look up a file with
   1386         the given file name inside specified build artifacts.
   1387 
   1388         @param file_name: Name of the file to look for a file.
   1389         @param artifacts: A list of artifact names to search for the file.
   1390         @param build: Name of the build. For Android, it's None as build_info
   1391                 should be used.
   1392         @param build_info: Dictionary of build information.
   1393                 For CrOS, it is None as build is the CrOS image name.
   1394                 For Android, it is {'target': target,
   1395                                     'build_id': build_id,
   1396                                     'branch': branch}
   1397 
   1398         @return: A devserver url to the file.
   1399         @raise DevServerException upon any return code that's not HTTP OK.
   1400         """
   1401         if not build and not build_info:
   1402             raise DevServerException('You must specify build information to '
   1403                                      'look for file %s in artifacts %s.' %
   1404                                      (file_name, artifacts))
   1405         kwargs = {'file_name': file_name,
   1406                   'artifacts': artifacts}
   1407         if build_info:
   1408             build_path = '%(branch)s/%(target)s/%(build_id)s' % build_info
   1409             kwargs.update(build_info)
   1410             # Devserver treats Android and Brillo build in the same way as they
   1411             # are both retrieved from Launch Control and have similar build
   1412             # artifacts. Therefore, os_type for devserver calls is `android` for
   1413             # both Android and Brillo builds.
   1414             kwargs['os_type'] = 'android'
   1415         else:
   1416             build_path = build
   1417             kwargs['build'] = build
   1418         call = self.build_call('locate_file', async=False, **kwargs)
   1419         try:
   1420             file_path = self.run_call(call)
   1421             return os.path.join(self.url(), 'static', build_path, file_path)
   1422         except httplib.BadStatusLine as e:
   1423             logging.error(e)
   1424             raise DevServerException('Received Bad Status line, Devserver %s '
   1425                                      'might have gone down while handling '
   1426                                      'the call: %s' % (self.url(), call))
   1427 
   1428 
   1429     @remote_devserver_call()
   1430     def list_control_files(self, build, suite_name=''):
   1431         """Ask the devserver to list all control files for |build|.
   1432 
   1433         @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
   1434                       whose control files the caller wants listed.
   1435         @param suite_name: The name of the suite for which we require control
   1436                            files.
   1437         @return None on failure, or a list of control file paths
   1438                 (e.g. server/site_tests/autoupdate/control)
   1439         @raise DevServerException upon any return code that's not HTTP OK.
   1440         """
   1441         build = self.translate(build)
   1442         call = self.build_call('controlfiles', build=build,
   1443                                suite_name=suite_name)
   1444         return self.run_call(call, readline=True)
   1445 
   1446 
   1447     @remote_devserver_call()
   1448     def get_control_file(self, build, control_path):
   1449         """Ask the devserver for the contents of a control file.
   1450 
   1451         @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
   1452                       whose control file the caller wants to fetch.
   1453         @param control_path: The file to fetch
   1454                              (e.g. server/site_tests/autoupdate/control)
   1455         @return The contents of the desired file.
   1456         @raise DevServerException upon any return code that's not HTTP OK.
   1457         """
   1458         build = self.translate(build)
   1459         call = self.build_call('controlfiles', build=build,
   1460                                control_path=control_path)
   1461         return self.run_call(call)
   1462 
   1463 
   1464     @remote_devserver_call()
   1465     def list_suite_controls(self, build, suite_name=''):
   1466         """Ask the devserver to list contents of all control files for |build|.
   1467 
   1468         @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
   1469                       whose control files' contents the caller wants returned.
   1470         @param suite_name: The name of the suite for which we require control
   1471                            files.
   1472         @return None on failure, or a dict of contents of all control files
   1473             (e.g. {'path1': "#Copyright controls ***", ...,
   1474                 pathX': "#Copyright controls ***"}
   1475         @raise DevServerException upon any return code that's not HTTP OK.
   1476         """
   1477         build = self.translate(build)
   1478         call = self.build_call('list_suite_controls', build=build,
   1479                                suite_name=suite_name)
   1480         return json.load(cStringIO.StringIO(self.run_call(call)))
   1481 
   1482 
   1483 class ImageServer(ImageServerBase):
   1484     """Class for DevServer that handles RPCs related to CrOS images.
   1485 
   1486     The calls to devserver to stage artifacts, including stage and download, are
   1487     made in async mode. That is, when caller makes an RPC |stage| to request
   1488     devserver to stage certain artifacts, devserver handles the call and starts
   1489     staging artifacts in a new thread, and return |Success| without waiting for
   1490     staging being completed. When caller receives message |Success|, it polls
   1491     devserver's is_staged call until all artifacts are staged.
    This mechanism is designed to prevent the devserver from running out of
    cherrypy threads, as staging artifacts might take a long time and cherrypy
    starts with a fixed number of threads to handle devserver RPCs.
   1495     """
   1496 
   1497     class ArtifactUrls(object):
   1498         """A container for URLs of staged artifacts.
   1499 
   1500         Attributes:
   1501             full_payload: URL for downloading a staged full release update
   1502             mton_payload: URL for downloading a staged M-to-N release update
   1503             nton_payload: URL for downloading a staged N-to-N release update
   1504 
   1505         """
   1506         def __init__(self, full_payload=None, mton_payload=None,
   1507                      nton_payload=None):
   1508             self.full_payload = full_payload
   1509             self.mton_payload = mton_payload
   1510             self.nton_payload = nton_payload
   1511 
   1512 
   1513     def wait_for_artifacts_staged(self, archive_url, artifacts='', files=''):
   1514         """Polling devserver.is_staged until all artifacts are staged.
   1515 
   1516         @param archive_url: Google Storage URL for the build.
   1517         @param artifacts: Comma separated list of artifacts to download.
   1518         @param files: Comma separated list of files to download.
   1519         @return: True if all artifacts are staged in devserver.
   1520         """
   1521         kwargs = {'archive_url': archive_url,
   1522                   'artifacts': artifacts,
   1523                   'files': files}
   1524         return self._poll_is_staged(**kwargs)
   1525 
   1526 
   1527     @remote_devserver_call()
   1528     def call_and_wait(self, call_name, archive_url, artifacts, files,
   1529                       error_message, expected_response=SUCCESS):
   1530         """Helper method to make a urlopen call, and wait for artifacts staged.
   1531 
   1532         @param call_name: name of devserver rpc call.
   1533         @param archive_url: Google Storage URL for the build..
   1534         @param artifacts: Comma separated list of artifacts to download.
   1535         @param files: Comma separated list of files to download.
   1536         @param expected_response: Expected response from rpc, default to
   1537                                   |Success|. If it's set to None, do not compare
   1538                                   the actual response. Any response is consider
   1539                                   to be good.
   1540         @param error_message: Error message to be thrown if response does not
   1541                               match expected_response.
   1542 
   1543         @return: The response from rpc.
   1544         @raise DevServerException upon any return code that's expected_response.
   1545 
   1546         """
   1547         kwargs = {'archive_url': archive_url,
   1548                   'artifacts': artifacts,
   1549                   'files': files}
   1550         return self._call_and_wait(call_name, error_message,
   1551                                    expected_response, **kwargs)
   1552 
   1553 
   1554     @remote_devserver_call()
   1555     def stage_artifacts(self, image=None, artifacts=None, files='',
   1556                         archive_url=None):
   1557         """Tell the devserver to download and stage |artifacts| from |image|.
   1558 
   1559          This is the main call point for staging any specific artifacts for a
   1560         given build. To see the list of artifacts one can stage see:
   1561 
   1562         ~src/platfrom/dev/artifact_info.py.
   1563 
   1564         This is maintained along with the actual devserver code.
   1565 
   1566         @param image: the image to fetch and stage.
   1567         @param artifacts: A list of artifacts.
   1568         @param files: A list of files to stage.
   1569         @param archive_url: Optional parameter that has the archive_url to stage
   1570                 this artifact from. Default is specified in autotest config +
   1571                 image.
   1572 
   1573         @raise DevServerException upon any return code that's not HTTP OK.
   1574         """
   1575         if not artifacts and not files:
   1576             raise DevServerException('Must specify something to stage.')
   1577         image = self.translate(image)
   1578         self._stage_artifacts(image, artifacts, files, archive_url)
   1579 
   1580 
   1581     @remote_devserver_call(timeout_min=DEVSERVER_SSH_TIMEOUT_MINS)
   1582     def list_image_dir(self, image):
   1583         """List the contents of the image stage directory, on the devserver.
   1584 
   1585         @param image: The image name, eg: <board>-<branch>/<Milestone>-<build>.
   1586 
   1587         @raise DevServerException upon any return code that's not HTTP OK.
   1588         """
   1589         image = self.translate(image)
   1590         logging.info('Requesting contents from devserver %s for image %s',
   1591                      self.url(), image)
   1592         archive_url = _get_storage_server_for_artifacts() + image
   1593         call = self.build_call('list_image_dir', archive_url=archive_url)
   1594         response = self.run_call(call, readline=True)
   1595         for line in response:
   1596             logging.info(line)
   1597 
   1598 
   1599     def trigger_download(self, image, synchronous=True):
   1600         """Tell the devserver to download and stage |image|.
   1601 
   1602         Tells the devserver to fetch |image| from the image storage server
   1603         named by _get_image_storage_server().
   1604 
   1605         If |synchronous| is True, waits for the entire download to finish
   1606         staging before returning. Otherwise only the artifacts necessary
   1607         to start installing images onto DUT's will be staged before returning.
   1608         A caller can then call finish_download to guarantee the rest of the
   1609         artifacts have finished staging.
   1610 
   1611         @param image: the image to fetch and stage.
   1612         @param synchronous: if True, waits until all components of the image are
   1613                staged before returning.
   1614 
   1615         @raise DevServerException upon any return code that's not HTTP OK.
   1616 
   1617         """
   1618         image = self.translate(image)
   1619         artifacts = _ARTIFACTS_TO_BE_STAGED_FOR_IMAGE
   1620         self._trigger_download(image, artifacts, files='',
   1621                                synchronous=synchronous)
   1622 
   1623 
   1624     @remote_devserver_call()
   1625     def setup_telemetry(self, build):
   1626         """Tell the devserver to setup telemetry for this build.
   1627 
   1628         The devserver will stage autotest and then extract the required files
   1629         for telemetry.
   1630 
   1631         @param build: the build to setup telemetry for.
   1632 
   1633         @returns path on the devserver that telemetry is installed to.
   1634         """
   1635         build = self.translate(build)
   1636         archive_url = _get_image_storage_server() + build
   1637         call = self.build_call('setup_telemetry', archive_url=archive_url)
   1638         try:
   1639             response = self.run_call(call)
   1640         except httplib.BadStatusLine as e:
   1641             logging.error(e)
   1642             raise DevServerException('Received Bad Status line, Devserver %s '
   1643                                      'might have gone down while handling '
   1644                                      'the call: %s' % (self.url(), call))
   1645         return response
   1646 
   1647 
   1648     def finish_download(self, image):
   1649         """Tell the devserver to finish staging |image|.
   1650 
   1651         If trigger_download is called with synchronous=False, it will return
   1652         before all artifacts have been staged. This method contacts the
   1653         devserver and blocks until all staging is completed and should be
   1654         called after a call to trigger_download.
   1655 
   1656         @param image: the image to fetch and stage.
   1657         @raise DevServerException upon any return code that's not HTTP OK.
   1658         """
   1659         image = self.translate(image)
   1660         artifacts = _ARTIFACTS_TO_BE_STAGED_FOR_IMAGE_WITH_AUTOTEST
   1661         self._finish_download(image, artifacts, files='')
   1662 
   1663 
   1664     def get_update_url(self, image):
   1665         """Returns the url that should be passed to the updater.
   1666 
   1667         @param image: the image that was fetched.
   1668         """
   1669         image = self.translate(image)
   1670         url_pattern = CONFIG.get_config_value('CROS', 'image_url_pattern',
   1671                                               type=str)
   1672         return (url_pattern % (self.url(), image))
   1673 
   1674 
   1675     def get_staged_file_url(self, filename, image):
   1676         """Returns the url of a staged file for this image on the devserver."""
   1677         return '/'.join([self._get_image_url(image), filename])
   1678 
   1679 
   1680     def get_full_payload_url(self, image):
   1681         """Returns a URL to a staged full payload.
   1682 
   1683         @param image: the image that was fetched.
   1684 
   1685         @return A fully qualified URL that can be used for downloading the
   1686                 payload.
   1687 
   1688         """
   1689         return self._get_image_url(image) + '/update.gz'
   1690 
   1691 
   1692     def get_test_image_url(self, image):
   1693         """Returns a URL to a staged test image.
   1694 
   1695         @param image: the image that was fetched.
   1696 
   1697         @return A fully qualified URL that can be used for downloading the
   1698                 image.
   1699 
   1700         """
   1701         return self._get_image_url(image) + '/chromiumos_test_image.bin'
   1702 
   1703 
   1704     def get_recovery_image_url(self, image):
   1705         """Returns a URL to a staged recovery image.
   1706 
   1707         @param image: the image that was fetched.
   1708 
   1709         @return A fully qualified URL that can be used for downloading the
   1710                 image.
   1711 
   1712         """
   1713         return self._get_image_url(image) + '/recovery_image.bin'
   1714 
   1715 
   1716     @remote_devserver_call()
   1717     def get_dependencies_file(self, build):
   1718         """Ask the dev server for the contents of the suite dependencies file.
   1719 
   1720         Ask the dev server at |self._dev_server| for the contents of the
   1721         pre-processed suite dependencies file (at DEPENDENCIES_FILE)
   1722         for |build|.
   1723 
   1724         @param build: The build (e.g. x86-mario-release/R21-2333.0.0)
   1725                       whose dependencies the caller is interested in.
   1726         @return The contents of the dependencies file, which should eval to
   1727                 a dict of dicts, as per bin_utils/suite_preprocessor.py.
   1728         @raise DevServerException upon any return code that's not HTTP OK.
   1729         """
   1730         build = self.translate(build)
   1731         call = self.build_call('controlfiles',
   1732                                build=build, control_path=DEPENDENCIES_FILE)
   1733         return self.run_call(call)
   1734 
   1735 
   1736     @remote_devserver_call()
   1737     def get_latest_build_in_gs(self, board):
   1738         """Ask the devservers for the latest offical build in Google Storage.
   1739 
   1740         @param board: The board for who we want the latest official build.
   1741         @return A string of the returned build rambi-release/R37-5868.0.0
   1742         @raise DevServerException upon any return code that's not HTTP OK.
   1743         """
   1744         call = self.build_call(
   1745                 'xbuddy_translate/remote/%s/latest-official' % board,
   1746                 image_dir=_get_image_storage_server())
   1747         image_name = self.run_call(call)
   1748         return os.path.dirname(image_name)
   1749 
   1750 
   1751     def translate(self, build_name):
   1752         """Translate the build name if it's in LATEST format.
   1753 
   1754         If the build name is in the format [builder]/LATEST, return the latest
   1755         build in Google Storage otherwise return the build name as is.
   1756 
   1757         @param build_name: build_name to check.
   1758 
   1759         @return The actual build name to use.
   1760         """
   1761         match = re.match(r'([\w-]+)-(\w+)/LATEST', build_name, re.I)
   1762         if not match:
   1763             return build_name
   1764         translated_build = self.get_latest_build_in_gs(match.groups()[0])
   1765         logging.debug('Translated relative build %s to %s', build_name,
   1766                       translated_build)
   1767         return translated_build
   1768 
   1769 
   1770     @classmethod
   1771     @remote_devserver_call()
   1772     def get_latest_build(cls, target, milestone=''):
   1773         """Ask all the devservers for the latest build for a given target.
   1774 
   1775         @param target: The build target, typically a combination of the board
   1776                        and the type of build e.g. x86-mario-release.
   1777         @param milestone:  For latest build set to '', for builds only in a
   1778                            specific milestone set to a str of format Rxx
   1779                            (e.g. R16). Default: ''. Since we are dealing with a
   1780                            webserver sending an empty string, '', ensures that
   1781                            the variable in the URL is ignored as if it was set
   1782                            to None.
   1783         @return A string of the returned build e.g. R20-2226.0.0.
   1784         @raise DevServerException upon any return code that's not HTTP OK.
   1785         """
   1786         calls = cls.build_all_calls('latestbuild', target=target,
   1787                                     milestone=milestone)
   1788         latest_builds = []
   1789         for call in calls:
   1790             latest_builds.append(cls.run_call(call))
   1791 
   1792         return max(latest_builds, key=version.LooseVersion)
   1793 
   1794 
   1795     @remote_devserver_call()
   1796     def _kill_au_process_for_host(self, **kwargs):
   1797         """Kill the triggerred auto_update process if error happens in cros_au.
   1798 
   1799         @param kwargs: Arguments to make kill_au_proc devserver call.
   1800         """
   1801         call = self.build_call('kill_au_proc', **kwargs)
   1802         response = self.run_call(call)
   1803         if not response == 'True':
   1804             raise DevServerException(
   1805                     'Failed to kill the triggerred CrOS auto_update process'
   1806                     'on devserver %s, the response is %s' % (
   1807                             self.url(), response))
   1808 
   1809 
   1810     def kill_au_process_for_host(self, host_name, pid):
   1811         """Kill the triggerred auto_update process if error happens.
   1812 
   1813         Usually this function is used to clear all potential left au processes
   1814         of the given host name.
   1815 
   1816         If pid is specified, the devserver will further check the given pid to
   1817         make sure the process is killed. This is used for the case that the au
   1818         process has started in background, but then provision fails due to
   1819         some unknown issues very fast. In this case, when 'kill_au_proc' is
   1820         called, there's no corresponding background track log created for this
   1821         ongoing au process, which prevents this RPC call from killing this au
   1822         process.
   1823 
   1824         @param host_name: The DUT's hostname.
   1825         @param pid: The ongoing au process's pid.
   1826 
   1827         @return: True if successfully kill the auto-update process for host.
   1828         """
   1829         kwargs = {'host_name': host_name, 'pid': pid}
   1830         try:
   1831             self._kill_au_process_for_host(**kwargs)
   1832         except DevServerException:
   1833             return False
   1834 
   1835         return True
   1836 
   1837 
   1838     @remote_devserver_call()
   1839     def _clean_track_log(self, **kwargs):
   1840         """Clean track log for the current auto-update process."""
   1841         call = self.build_call('handler_cleanup', **kwargs)
   1842         self.run_call(call)
   1843 
   1844 
   1845     def clean_track_log(self, host_name, pid):
   1846         """Clean track log for the current auto-update process.
   1847 
   1848         @param host_name: The host name to be updated.
   1849         @param pid: The auto-update process id.
   1850 
   1851         @return: True if track log is successfully cleaned, False otherwise.
   1852         """
   1853         if not pid:
   1854             return False
   1855 
   1856         kwargs = {'host_name': host_name, 'pid': pid}
   1857         try:
   1858             self._clean_track_log(**kwargs)
   1859         except DevServerException as e:
   1860             logging.debug('Failed to clean track_status_file on '
   1861                           'devserver for host %s and process id %s: %s',
   1862                           host_name, pid, str(e))
   1863             return False
   1864 
   1865         return True
   1866 
   1867 
   1868     def _get_au_log_filename(self, log_dir, host_name, pid):
   1869         """Return the auto-update log's filename."""
   1870         return os.path.join(log_dir, CROS_AU_LOG_FILENAME % (
   1871                     host_name, pid))
   1872 
   1873     def _read_json_response_from_devserver(self, response):
   1874         """Reads the json response from the devserver.
   1875 
   1876         This is extracted to its own function so that it can be easily mocked.
   1877         @param response: the response for a devserver.
   1878         """
   1879         try:
   1880             return json.loads(response)
   1881         except ValueError as e:
   1882             logging.debug('Failed to load json response: %s', response)
   1883             raise DevServerException(e)
   1884 
   1885 
   1886     @remote_devserver_call()
   1887     def _collect_au_log(self, log_dir, **kwargs):
   1888         """Collect logs from devserver after cros-update process is finished.
   1889 
   1890         Collect the logs that recording the whole cros-update process, and
   1891         write it to sysinfo path of a job.
   1892 
   1893         The example log file name that is stored is like:
   1894             '1220-repair/sysinfo/CrOS_update_host_name_pid.log'
   1895 
   1896         @param host_name: the DUT's hostname.
   1897         @param pid: the auto-update process id on devserver.
   1898         @param log_dir: The directory to save the cros-update process log
   1899                         retrieved from devserver.
   1900         """
   1901         call = self.build_call('collect_cros_au_log', **kwargs)
   1902         response = self.run_call(call)
   1903         if not os.path.exists(log_dir):
   1904             os.mkdir(log_dir)
   1905         write_file = self._get_au_log_filename(
   1906                 log_dir, kwargs['host_name'], kwargs['pid'])
   1907         logging.debug('Saving auto-update logs into %s', write_file)
   1908 
   1909         au_logs = self._read_json_response_from_devserver(response)
   1910 
   1911         try:
   1912             for k, v in au_logs['host_logs'].items():
   1913                 log_name = '%s_%s_%s' % (k, kwargs['host_name'], kwargs['pid'])
   1914                 log_path = os.path.join(log_dir, log_name)
   1915                 with open(log_path, 'w') as out_log:
   1916                     out_log.write(v)
   1917         except IOError as e:
   1918             raise DevServerException('Failed to write auto-update hostlogs: '
   1919                                      '%s' % e)
   1920 
   1921         try:
   1922             with open(write_file, 'w') as out_log:
   1923                 out_log.write(au_logs['cros_au_log'])
   1924         except:
   1925             raise DevServerException('Failed to write auto-update logs into '
   1926                                      '%s' % write_file)
   1927 
   1928 
   1929     def collect_au_log(self, host_name, pid, log_dir):
   1930         """Collect logs from devserver after cros-update process is finished.
   1931 
   1932         @param host_name: the DUT's hostname.
   1933         @param pid: the auto-update process id on devserver.
   1934         @param log_dir: The directory to save the cros-update process log
   1935                         retrieved from devserver.
   1936 
   1937         @return: True if auto-update log is successfully collected, False
   1938           otherwise.
   1939         """
   1940         if not pid:
   1941             return False
   1942 
   1943         kwargs = {'host_name': host_name, 'pid': pid}
   1944         try:
   1945             self._collect_au_log(log_dir, **kwargs)
   1946         except DevServerException as e:
   1947             logging.debug('Failed to collect auto-update log on '
   1948                           'devserver for host %s and process id %s: %s',
   1949                           host_name, pid, str(e))
   1950             return False
   1951 
   1952         return True
   1953 
   1954 
   1955     @remote_devserver_call()
   1956     def _trigger_auto_update(self, **kwargs):
   1957         """Trigger auto-update by calling devserver.cros_au.
   1958 
   1959         @param kwargs:  Arguments to make cros_au devserver call.
   1960 
   1961         @return: a tuple indicates whether the RPC call cros_au succeeds and
   1962           the auto-update process id running on devserver.
   1963         """
   1964         host_name = kwargs['host_name']
   1965         call = self.build_call('cros_au', async=True, **kwargs)
   1966         try:
   1967             response = self.run_call(call)
   1968             logging.info(
   1969                 'Received response from devserver for cros_au call: %r',
   1970                 response)
   1971         except httplib.BadStatusLine as e:
   1972             logging.error(e)
   1973             raise DevServerException('Received Bad Status line, Devserver %s '
   1974                                      'might have gone down while handling '
   1975                                      'the call: %s' % (self.url(), call))
   1976 
   1977         return response
   1978 
   1979 
   1980     def _check_for_auto_update_finished(self, pid, wait=True, **kwargs):
   1981         """Polling devserver.get_au_status to get current auto-update status.
   1982 
   1983         The current auto-update status is used to identify whether the update
   1984         process is finished.
   1985 
   1986         @param pid:    The background process id for auto-update in devserver.
   1987         @param kwargs: keyword arguments to make get_au_status devserver call.
   1988         @param wait:   Should the check wait for completion.
   1989 
   1990         @return: True if auto-update is finished for a given dut.
   1991         """
   1992         logging.debug('Check the progress for auto-update process %r', pid)
   1993         kwargs['pid'] = pid
   1994         call = self.build_call('get_au_status', **kwargs)
   1995 
   1996         def all_finished():
   1997             """Call devserver.get_au_status rpc to check if auto-update
   1998                is finished.
   1999 
   2000             @return: True if auto-update is finished for a given dut. False
   2001                      otherwise.
   2002             @rasies  DevServerException, the exception is a wrapper of all
   2003                      exceptions that were raised when devserver tried to
   2004                      download the artifacts. devserver raises an HTTPError or
   2005                      a CmdError when an exception was raised in the code. Such
   2006                      exception should be re-raised here to stop the caller from
   2007                      waiting. If the call to devserver failed for connection
   2008                      issue, a URLError exception is raised, and caller should
   2009                      retry the call to avoid such network flakiness.
   2010 
   2011             """
   2012             try:
   2013                 au_status = self.run_call(call)
   2014                 response = json.loads(au_status)
   2015                 # This is a temp fix to fit both dict and tuple returning
   2016                 # values. The dict check will be removed after a corresponding
   2017                 # devserver CL is deployed.
   2018                 if isinstance(response, dict):
   2019                     if response.get('detailed_error_msg'):
   2020                         raise DevServerException(
   2021                                 response.get('detailed_error_msg'))
   2022 
   2023                     if response.get('finished'):
   2024                         logging.debug('CrOS auto-update is finished')
   2025                         return True
   2026                     else:
   2027                         logging.debug('Current CrOS auto-update status: %s',
   2028                                       response.get('status'))
   2029                         return False
   2030 
   2031                 if not response[0]:
   2032                     logging.debug('Current CrOS auto-update status: %s',
   2033                                   response[1])
   2034                     return False
   2035                 else:
   2036                     logging.debug('CrOS auto-update is finished')
   2037                     return True
   2038             except urllib2.HTTPError as e:
   2039                 error_markup = e.read()
   2040                 raise DevServerException(_strip_http_message(error_markup))
   2041             except urllib2.URLError as e:
   2042                 # Could be connection issue, retry it.
   2043                 # For example: <urlopen error [Errno 111] Connection refused>
   2044                 logging.warning('URLError (%r): Retrying connection to '
   2045                                 'devserver to check auto-update status.', e)
   2046                 return False
   2047             except error.CmdError:
   2048                 # Retry if SSH failed to connect to the devserver.
   2049                 logging.warning('CmdError: Retrying SSH connection to check '
   2050                                 'auto-update status.')
   2051                 return False
   2052             except socket.error as e:
   2053                 # Could be some temporary devserver connection issues.
   2054                 logging.warning('Socket Error (%r): Retrying connection to '
   2055                                 'devserver to check auto-update status.', e)
   2056                 return False
   2057             except ValueError as e:
   2058                 raise DevServerException(
   2059                         '%s (Got AU status: %r)' % (str(e), au_status))
   2060 
   2061         if wait:
   2062             bin_utils.poll_for_condition(
   2063                     all_finished,
   2064                     exception=bin_utils.TimeoutError(),
   2065                     timeout=DEVSERVER_IS_CROS_AU_FINISHED_TIMEOUT_MIN * 60,
   2066                     sleep_interval=CROS_AU_POLLING_INTERVAL)
   2067 
   2068             return True
   2069         else:
   2070             return all_finished()
   2071 
   2072 
   2073     def check_for_auto_update_finished(self, response, wait=True, **kwargs):
   2074         """Processing response of 'cros_au' and polling for auto-update status.
   2075 
   2076         Will wait for the whole auto-update process is finished.
   2077 
   2078         @param response: The response from RPC 'cros_au'
   2079         @param kwargs: keyword arguments to make get_au_status devserver call.
   2080 
   2081         @return: a tuple includes two elements.
   2082           finished: True if the operation has completed.
   2083           raised_error: None if everything works well or the raised error.
   2084           pid: the auto-update process id on devserver.
   2085         """
   2086 
   2087         pid = 0
   2088         raised_error = None
   2089         finished = False
   2090         try:
   2091             response = json.loads(response)
   2092             if response[0]:
   2093                 pid = response[1]
   2094                 # If provision is kicked off asynchronously, pid will be -1.
   2095                 # If provision is not successfully kicked off , pid continues
   2096                 # to be 0.
   2097                 if pid > 0:
   2098                     logging.debug('start process %r for auto_update in '
   2099                                   'devserver', pid)
   2100                     finished = self._check_for_auto_update_finished(
   2101                             pid, wait=wait, **kwargs)
   2102         except Exception as e:
   2103             logging.debug('Failed to trigger auto-update process on devserver')
   2104             finished = True
   2105             raised_error = e
   2106         finally:
   2107             return finished, raised_error, pid
   2108 
   2109 
   2110     def _check_error_message(self, error_patterns_to_check, error_msg):
   2111         """Detect whether specific error pattern exist in error message.
   2112 
   2113         @param error_patterns_to_check: the error patterns to check
   2114         @param error_msg: the error message which may include any error
   2115                           pattern.
   2116 
   2117         @return A boolean variable, True if error_msg contains any error
   2118             pattern in error_patterns_to_check, False otherwise.
   2119         """
   2120         for err in error_patterns_to_check:
   2121             if err in error_msg:
   2122                 return True
   2123 
   2124         return False
   2125 
   2126 
   2127     def _is_retryable(self, error_msg):
   2128         """Detect whether we will retry auto-update based on error_msg.
   2129 
   2130         @param error_msg: The given error message.
   2131 
   2132         @return A boolean variable which indicates whether we will retry
   2133             auto_update with another devserver based on the given error_msg.
   2134         """
   2135         # For now we just hard-code the error message we think it's suspicious.
   2136         # When we get more date about what's the json response when devserver
   2137         # is overloaded, we can update this part.
   2138         retryable_error_patterns = [ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE,
   2139                                     'is not pingable']
   2140         return self._check_error_message(retryable_error_patterns, error_msg)
   2141 
   2142 
   2143     def _should_use_original_payload(self, error_msg):
   2144         devserver_error_patterns = ['DevserverCannotStartError']
   2145         return self._check_error_message(devserver_error_patterns, error_msg)
   2146 
   2147 
   2148     def _parse_buildname_safely(self, build_name):
   2149         """Parse a given buildname safely.
   2150 
   2151         @param build_name: the build name to be parsed.
   2152 
   2153         @return: a tuple (board, build_type, milestone)
   2154         """
   2155         try:
   2156             board, build_type, milestone, _ = server_utils.ParseBuildName(
   2157                     build_name)
   2158         except server_utils.ParseBuildNameException:
   2159             logging.warning('Unable to parse build name %s for metrics. '
   2160                             'Continuing anyway.', build_name)
   2161             board, build_type, milestone = ('', '', '')
   2162 
   2163         return board, build_type, milestone
   2164 
   2165 
   2166     def _emit_auto_update_metrics(self, board, build_type, dut_host_name,
   2167                                   build_name, attempt,
   2168                                   success, failure_reason, duration):
   2169         """Send metrics for a single auto_update attempt.
   2170 
   2171         @param board: a field in metrics representing which board this
   2172             auto_update tries to update.
   2173         @param build_type: a field in metrics representing which build type this
   2174             auto_update tries to update.
   2175         @param dut_host_name: a field in metrics representing which DUT this
   2176             auto_update tries to update.
   2177         @param build_name: auto update build being updated to.
   2178         @param attempt: a field in metrics, representing which attempt/retry
   2179             this auto_update is.
   2180         @param success: a field in metrics, representing whether this
   2181             auto_update succeeds or not.
   2182         @param failure_reason: DevServerExceptionClassifier object to show
   2183             auto update failure reason, or None.
   2184         @param duration: auto update duration time, in seconds.
   2185         """
   2186         # The following is high cardinality, but sparse.
   2187         # Each DUT is of a single board type, and likely build type.
   2188         # The affinity also results in each DUT being attached to the same
   2189         # dev_server as well.
   2190         fields = {
   2191                 'board': board,
   2192                 'build_type': build_type,
   2193                 'dut_host_name': dut_host_name,
   2194                 'dev_server': self.resolved_hostname,
   2195                 'attempt': attempt,
   2196                 'success': success,
   2197         }
   2198 
   2199         # reset_after=True is required for String gauges events to ensure that
   2200         # the metrics are not repeatedly emitted until the server restarts.
   2201 
   2202         metrics.String(PROVISION_PATH + '/auto_update_build_by_devserver_dut',
   2203                        reset_after=True).set(build_name, fields=fields)
   2204 
   2205         if not success:
   2206             metrics.String(
   2207                 PROVISION_PATH +
   2208                 '/auto_update_failure_reason_by_devserver_dut',
   2209                 reset_after=True).set(
   2210                     failure_reason.classification if failure_reason else '',
   2211                     fields=fields)
   2212 
   2213         metrics.SecondsDistribution(
   2214                 PROVISION_PATH + '/auto_update_duration_by_devserver_dut').add(
   2215                         duration, fields=fields)
   2216 
   2217 
   2218     def _emit_provision_metrics(self, error_list, duration_list,
   2219                                 is_au_success, board, build_type, milestone,
   2220                                 dut_host_name, is_aue2etest,
   2221                                 total_duration, build_name):
   2222         """Send metrics for provision request.
   2223 
   2224         Provision represents potentially multiple auto update attempts.
   2225 
   2226         Please note: to avoid reaching or exceeding the monarch field
   2227         cardinality limit, we avoid a metric that includes both dut hostname
   2228         and other high cardinality fields.
   2229 
   2230         @param error_list: a list of DevServerExceptionClassifier objects to
   2231             show errors happened in provision. Usually it contains 1 ~
   2232             AU_RETRY_LIMIT objects since we only retry provision for several
   2233             times.
   2234         @param duration_list: a list of provision duration time, counted by
   2235             seconds.
   2236         @param is_au_success: a field in metrics, representing whether this
   2237             auto_update succeeds or not.
   2238         @param board: a field in metrics representing which board this
   2239             auto_update tries to update.
   2240         @param build_type: a field in metrics representing which build type this
   2241             auto_update tries to update.
   2242         @param milestone: a field in metrics representing which milestone this
   2243             auto_update tries to update.
   2244         @param dut_host_name: a field in metrics representing which DUT this
   2245             auto_update tries to update.
   2246         @param is_aue2etest: a field in metrics representing if provision was
   2247             done as part of the autoupdate_EndToEndTest.
   2248         """
   2249         # The following is high cardinality, but sparse.
   2250         # Each DUT is of a single board type, and likely build type.
   2251         # The affinity also results in each DUT being attached to the same
   2252         # dev_server as well.
   2253         fields = {
   2254                 'board': board,
   2255                 'build_type': build_type,
   2256                 'dut_host_name': dut_host_name,
   2257                 'dev_server': self.resolved_hostname,
   2258                 'success': is_au_success,
   2259         }
   2260 
   2261         # reset_after=True is required for String gauges events to ensure that
   2262         # the metrics are not repeatedly emitted until the server restarts.
   2263 
   2264         metrics.String(PROVISION_PATH + '/provision_build_by_devserver_dut',
   2265                        reset_after=True).set(build_name, fields=fields)
   2266 
   2267         if error_list:
   2268             metrics.String(
   2269                     PROVISION_PATH +
   2270                     '/provision_failure_reason_by_devserver_dut',
   2271                     reset_after=True).set(error_list[0].classification,
   2272                                           fields=fields)
   2273 
   2274         metrics.SecondsDistribution(
   2275                 PROVISION_PATH + '/provision_duration_by_devserver_dut').add(
   2276                         total_duration, fields=fields)
   2277 
   2278 
   2279     def _parse_buildname_from_gs_uri(self, uri):
   2280         """Get parameters needed for AU metrics when build_name is not known.
   2281 
   2282         autoupdate_EndToEndTest is run with two Google Storage URIs from the
   2283         gs://chromeos-releases bucket. URIs in this bucket do not have the
   2284         build_name in the format samus-release/R60-0000.0.0.
   2285 
   2286         We can get the milestone and board by checking the instructions.json
   2287         file contained in the bucket with the payloads.
   2288 
   2289         @param uri: The partial uri we received from autoupdate_EndToEndTest.
   2290         """
   2291         try:
   2292             # Get the instructions file that contains info about the build.
   2293             gs_file = 'gs://chromeos-releases/' + uri + '/*instructions.json'
   2294             files = bin_utils.gs_ls(gs_file)
   2295             for f in files:
   2296                 gs_folder, _, instruction_file = f.rpartition('/')
   2297                 self.stage_artifacts(image=uri,
   2298                                      files=[instruction_file],
   2299                                      archive_url=gs_folder)
   2300                 json_file = self.get_staged_file_url(instruction_file, uri)
   2301                 response = urllib2.urlopen(json_file)
   2302                 data = json.load(response)
   2303                 return data['board'], 'release', data['version']['milestone']
   2304         except (ValueError, error.CmdError, urllib2.URLError) as e:
   2305             logging.debug('Problem getting values for metrics: %s', e)
   2306             logging.warning('Unable to parse build name %s from AU test for '
   2307                             'metrics. Continuing anyway.', uri)
   2308 
   2309         return '', '', ''
   2310 
   2311 
    def auto_update(self, host_name, build_name, original_board=None,
                    original_release_version=None, log_dir=None,
                    force_update=False, full_update=False,
                    payload_filename=None, force_original=False,
                    clobber_stateful=True, quick_provision=False):
        """Auto-update a CrOS host.

        Up to AU_RETRY_LIMIT attempts are made. Per-attempt metrics are
        emitted after every attempt and provision metrics once at the end.
        The loop stops early on success, or as soon as an error is judged
        retryable with another devserver.

        @param host_name: The hostname of the DUT to auto-update.
        @param build_name:  The build name to be auto-updated on the DUT.
        @param original_board: The original board of the DUT to auto-update.
        @param original_release_version: The release version of the DUT's
            current build.
        @param log_dir: The log directory to store auto-update logs from
            devserver. If None, no auto-update logs are collected.
        @param force_update: Force an update even if the version installed
                             is the same. Default: False.
        @param full_update:  If True, do not run stateful update, directly
                             force a full reimage. If False, try stateful
                             update first if the dut is already installed
                             with the same version.
        @param payload_filename: Used to specify the exact file to
                                 use for autoupdating. If None, the payload
                                 will be determined by build_name. You
                                 must have already staged this file before
                                 passing it in here. A non-None value also
                                 marks the run as an autoupdate_EndToEndTest
                                 run for metrics purposes.
        @param force_original: Whether to force stateful update with the
                               original payload.
        @param clobber_stateful: If True do a clean install of stateful.
        @param quick_provision: Attempt to use quick provision path first.

        @return A tuple (is_success, pid) in which:
            1. is_success indicates whether this auto_update succeeds.
            2. pid is the process id of the successful autoupdate run.

        @raise RetryableProvisionException if it fails and the error is
            judged retryable with another devserver.
        @raise the exception built by the first recorded error's
            classified_exception if it fails with a non-retryable error
            (presumably a DevServerException subclass -- confirm against
            DevServerExceptionClassifier).
        @raise DevServerException if it fails without any recorded error,
            i.e. only the follow-up RPCs (log collection/cleanup) failed.
        """
        kwargs = {'host_name': host_name,
                  'build_name': build_name,
                  'force_update': force_update,
                  'full_update': full_update,
                  'clobber_stateful': clobber_stateful,
                  'quick_provision': quick_provision}

        # An explicit payload file is what identifies an AU end-to-end test.
        is_aue2etest = payload_filename is not None

        if is_aue2etest:
            kwargs['payload_filename'] = payload_filename

        error_msg = 'CrOS auto-update failed for host %s: %s'
        error_msg_attempt = 'Exception raised on auto_update attempt #%s:\n%s'
        is_au_success = False
        au_log_dir = os.path.join(log_dir,
                                  AUTO_UPDATE_LOG_DIR) if log_dir else None
        error_list = []
        retry_with_another_devserver = False
        duration_list = []

        # board/build_type/milestone are only used as metric fields.
        if is_aue2etest:
            board, build_type, milestone = self._parse_buildname_from_gs_uri(
                build_name)
        else:
            board, build_type, milestone = self._parse_buildname_safely(
                build_name)

        provision_start_time = time.time()
        for au_attempt in range(AU_RETRY_LIMIT):
            logging.debug('Start CrOS auto-update for host %s at %d time(s).',
                          host_name, au_attempt + 1)
            au_start_time = time.time()
            failure_reason = None
            # No matter _trigger_auto_update succeeds or fails, the auto-update
            # track_status_file should be cleaned, and the auto-update execute
            # log should be collected to directory sysinfo. Also, the error
            # raised by _trigger_auto_update should be displayed.
            try:
                # Update with the stateful.tgz of the DUT's current release
                # version whenever force_original is set (by the caller, or
                # by an earlier attempt that hit a devserver start error).
                if force_original and original_release_version:
                    # Monitor this case in monarch
                    original_build = '%s/%s' % (original_board,
                                                original_release_version)
                    c = metrics.Counter(
                            'chromeos/autotest/provision/'
                            'cros_update_with_original_build')
                    f = {'dev_server': self.resolved_hostname,
                         'board': board,
                         'build_type': build_type,
                         'milestone': milestone,
                         'original_build': original_build}
                    c.increment(fields=f)

                    logging.debug('Try updating stateful partition of the '
                                  'host with the same version of its current '
                                  'rootfs partition: %s', original_build)
                    response = self._trigger_auto_update(
                            original_build=original_build, **kwargs)
                else:
                    response = self._trigger_auto_update(**kwargs)
            except DevServerException as e:
                # The trigger RPC itself failed; record it and fall through
                # to the finally clause, which decides whether to retry.
                logging.debug(error_msg_attempt, au_attempt+1, str(e))
                failure_reason = DevServerExceptionClassifier(str(e))
            else:
                # The trigger RPC succeeded: wait for the update to finish.
                # Errors are returned in raised_error rather than raised.
                _, raised_error, pid = self.check_for_auto_update_finished(
                        response, **kwargs)

                # Errors in collect_au_log are reported through its return
                # value and are not raised.
                if au_log_dir:
                    is_collect_success = self.collect_au_log(
                            kwargs['host_name'], pid, au_log_dir)
                else:
                    is_collect_success = True

                # Errors in clean_track_log are reported through its return
                # value and are not raised.
                if pid >= 0:
                    is_clean_success = self.clean_track_log(
                            kwargs['host_name'], pid)
                else:
                    is_clean_success = True

                # If any error is raised previously, log it and retry
                # auto-update. Otherwise, claim a successful CrOS auto-update.
                if (not raised_error and is_clean_success and
                    is_collect_success):
                    logging.debug('CrOS auto-update succeed for host %s',
                                  host_name)
                    is_au_success = True
                    # The finally clause below still runs before the loop
                    # is left, so metrics are emitted for this attempt too.
                    break
                else:
                    if not self.kill_au_process_for_host(kwargs['host_name'],
                                                         pid):
                        logging.debug('Failed to kill auto_update process %d',
                                      pid)
                    if raised_error:
                        error_str = str(raised_error)
                        logging.debug(error_msg_attempt, au_attempt + 1,
                                      error_str)
                        if au_log_dir:
                            logging.debug('Please see error details in log %s',
                                          self._get_au_log_filename(
                                                  au_log_dir,
                                                  kwargs['host_name'],
                                                  pid))
                        failure_reason = DevServerExceptionClassifier(
                            error_str, keep_full_trace=False)
                        if self._is_retryable(error_str):
                            retry_with_another_devserver = True

                        # A devserver start failure switches the following
                        # attempts to the original-build stateful update.
                        if self._should_use_original_payload(error_str):
                            force_original = True

            finally:
                # Per-attempt bookkeeping; runs on success, failure and
                # break alike.
                duration = int(time.time() - au_start_time)
                duration_list.append(duration)
                if failure_reason:
                    error_list.append(failure_reason)
                self._emit_auto_update_metrics(board, build_type, host_name,
                                               build_name, au_attempt + 1,
                                               is_au_success, failure_reason,
                                               duration)
                # Stop retrying on this devserver; a
                # RetryableProvisionException is raised after the loop.
                # NOTE(review): a 'break' inside 'finally' also discards any
                # exception in flight from the try/else body -- confirm this
                # is intended.
                if retry_with_another_devserver:
                    break

                if not is_au_success and au_attempt < AU_RETRY_LIMIT - 1:
                    time.sleep(CROS_AU_RETRY_INTERVAL)
                    # Use the IP of DUT if the hostname failed.
                    # NOTE(review): socket.gethostbyname can raise on a
                    # resolution failure, which would escape from this
                    # finally clause -- verify that is acceptable.
                    host_name_ip = socket.gethostbyname(host_name)
                    kwargs['host_name'] = host_name_ip
                    logging.debug(
                            'AU failed, trying IP instead of hostname: %s',
                            host_name_ip)

        total_duration = int(time.time() - provision_start_time)
        self._emit_provision_metrics(error_list, duration_list, is_au_success,
                                     board, build_type, milestone, host_name,
                                     is_aue2etest, total_duration, build_name)

        if is_au_success:
            # pid is always bound here: success is only declared in the else
            # branch above, after pid has been assigned.
            return (is_au_success, pid)

        # If errors happen in the CrOS AU process, report the concatenation
        # of the errors recorded across all attempts (numbered from 0).
        # If error happens in RPCs of cleaning track log, collecting
        # auto-update logs, or killing auto-update processes, just report a
        # common error here.
        if error_list:
            real_error = ', '.join(['%d) %s' % (i, e.summary)
                                    for i, e in enumerate(error_list)])
            if retry_with_another_devserver:
                raise RetryableProvisionException(
                        error_msg % (host_name, real_error))
            else:
                # The exception type is chosen by the first recorded error.
                raise error_list[0].classified_exception(
                    error_msg % (host_name, real_error))
        else:
            raise DevServerException(error_msg % (
                        host_name, ('RPC calls after the whole auto-update '
                                    'process failed.')))
   2510 
   2511 
   2512 class AndroidBuildServer(ImageServerBase):
   2513     """Class for DevServer that handles RPCs related to Android builds.
   2514 
   2515     The calls to devserver to stage artifacts, including stage and download, are
   2516     made in async mode. That is, when caller makes an RPC |stage| to request
   2517     devserver to stage certain artifacts, devserver handles the call and starts
   2518     staging artifacts in a new thread, and return |Success| without waiting for
   2519     staging being completed. When caller receives message |Success|, it polls
   2520     devserver's is_staged call until all artifacts are staged.
   2521     Such mechanism is designed to prevent cherrypy threads in devserver being
   2522     running out, as staging artifacts might take long time, and cherrypy starts
   2523     with a fixed number of threads that handle devserver rpc.
   2524     """
   2525 
   2526     def wait_for_artifacts_staged(self, target, build_id, branch,
   2527                                   archive_url=None, artifacts='', files=''):
   2528         """Polling devserver.is_staged until all artifacts are staged.
   2529 
   2530         @param target: Target of the android build to stage, e.g.,
   2531                        shamu-userdebug.
   2532         @param build_id: Build id of the android build to stage.
   2533         @param branch: Branch of the android build to stage.
   2534         @param archive_url: Google Storage URL for the build.
   2535         @param artifacts: Comma separated list of artifacts to download.
   2536         @param files: Comma separated list of files to download.
   2537 
   2538         @return: True if all artifacts are staged in devserver.
   2539         """
   2540         kwargs = {'target': target,
   2541                   'build_id': build_id,
   2542                   'branch': branch,
   2543                   'artifacts': artifacts,
   2544                   'files': files,
   2545                   'os_type': 'android'}
   2546         if archive_url:
   2547             kwargs['archive_url'] = archive_url
   2548         return self._poll_is_staged(**kwargs)
   2549 
   2550 
   2551     @remote_devserver_call()
   2552     def call_and_wait(self, call_name, target, build_id, branch, archive_url,
   2553                       artifacts, files, error_message,
   2554                       expected_response=SUCCESS):
   2555         """Helper method to make a urlopen call, and wait for artifacts staged.
   2556 
   2557         @param call_name: name of devserver rpc call.
   2558         @param target: Target of the android build to stage, e.g.,
   2559                        shamu-userdebug.
   2560         @param build_id: Build id of the android build to stage.
   2561         @param branch: Branch of the android build to stage.
   2562         @param archive_url: Google Storage URL for the CrOS build.
   2563         @param artifacts: Comma separated list of artifacts to download.
   2564         @param files: Comma separated list of files to download.
   2565         @param expected_response: Expected response from rpc, default to
   2566                                   |Success|. If it's set to None, do not compare
   2567                                   the actual response. Any response is consider
   2568                                   to be good.
   2569         @param error_message: Error message to be thrown if response does not
   2570                               match expected_response.
   2571 
   2572         @return: The response from rpc.
   2573         @raise DevServerException upon any return code that's expected_response.
   2574 
   2575         """
   2576         kwargs = {'target': target,
   2577                   'build_id': build_id,
   2578                   'branch': branch,
   2579                   'artifacts': artifacts,
   2580                   'files': files,
   2581                   'os_type': 'android'}
   2582         if archive_url:
   2583             kwargs['archive_url'] = archive_url
   2584         return self._call_and_wait(call_name, error_message, expected_response,
   2585                                    **kwargs)
   2586 
   2587 
   2588     @remote_devserver_call()
   2589     def stage_artifacts(self, target=None, build_id=None, branch=None,
   2590                         image=None, artifacts=None, files='', archive_url=None):
   2591         """Tell the devserver to download and stage |artifacts| from |image|.
   2592 
   2593          This is the main call point for staging any specific artifacts for a
   2594         given build. To see the list of artifacts one can stage see:
   2595 
   2596         ~src/platfrom/dev/artifact_info.py.
   2597 
   2598         This is maintained along with the actual devserver code.
   2599 
   2600         @param target: Target of the android build to stage, e.g.,
   2601                                shamu-userdebug.
   2602         @param build_id: Build id of the android build to stage.
   2603         @param branch: Branch of the android build to stage.
   2604         @param image: Name of a build to test, in the format of
   2605                       branch/target/build_id
   2606         @param artifacts: A list of artifacts.
   2607         @param files: A list of files to stage.
   2608         @param archive_url: Optional parameter that has the archive_url to stage
   2609                 this artifact from. Default is specified in autotest config +
   2610                 image.
   2611 
   2612         @raise DevServerException upon any return code that's not HTTP OK.
   2613         """
   2614         if image and not target and not build_id and not branch:
   2615             branch, target, build_id = utils.parse_launch_control_build(image)
   2616         if not target or not build_id or not branch:
   2617             raise DevServerException('Must specify all build info (target, '
   2618                                      'build_id and branch) to stage.')
   2619 
   2620         android_build_info = {'target': target,
   2621                               'build_id': build_id,
   2622                               'branch': branch}
   2623         if not artifacts and not files:
   2624             raise DevServerException('Must specify something to stage.')
   2625         if not all(android_build_info.values()):
   2626             raise DevServerException(
   2627                     'To stage an Android build, must specify target, build id '
   2628                     'and branch.')
   2629         build = ANDROID_BUILD_NAME_PATTERN % android_build_info
   2630         self._stage_artifacts(build, artifacts, files, archive_url,
   2631                               **android_build_info)
   2632 
   2633     def trigger_download(self, target, build_id, branch, artifacts=None,
   2634                          files='', os='android', synchronous=True):
   2635         """Tell the devserver to download and stage an Android build.
   2636 
   2637         Tells the devserver to fetch an Android build from the image storage
   2638         server named by _get_image_storage_server().
   2639 
   2640         If |synchronous| is True, waits for the entire download to finish
   2641         staging before returning. Otherwise only the artifacts necessary
   2642         to start installing images onto DUT's will be staged before returning.
   2643         A caller can then call finish_download to guarantee the rest of the
   2644         artifacts have finished staging.
   2645 
   2646         @param target: Target of the android build to stage, e.g.,
   2647                        shamu-userdebug.
   2648         @param build_id: Build id of the android build to stage.
   2649         @param branch: Branch of the android build to stage.
   2650         @param artifacts: A string of artifacts separated by comma. If None,
   2651                use the default artifacts for Android or Brillo build.
   2652         @param files: String of file seperated by commas.
   2653         @param os: OS artifacts to download (android/brillo).
   2654         @param synchronous: if True, waits until all components of the image are
   2655                staged before returning.
   2656 
   2657         @raise DevServerException upon any return code that's not HTTP OK.
   2658 
   2659         """
   2660         android_build_info = {'target': target,
   2661                               'build_id': build_id,
   2662                               'branch': branch}
   2663         build = ANDROID_BUILD_NAME_PATTERN % android_build_info
   2664         if not artifacts:
   2665             board = target.split('-')[0]
   2666             artifacts = (
   2667                 android_utils.AndroidArtifacts.get_artifacts_for_reimage(
   2668                         board, os))
   2669         self._trigger_download(build, artifacts, files=files,
   2670                                synchronous=synchronous, **android_build_info)
   2671 
   2672 
   2673     def finish_download(self, target, build_id, branch, os='android'):
   2674         """Tell the devserver to finish staging an Android build.
   2675 
   2676         If trigger_download is called with synchronous=False, it will return
   2677         before all artifacts have been staged. This method contacts the
   2678         devserver and blocks until all staging is completed and should be
   2679         called after a call to trigger_download.
   2680 
   2681         @param target: Target of the android build to stage, e.g.,
   2682                        shamu-userdebug.
   2683         @param build_id: Build id of the android build to stage.
   2684         @param branch: Branch of the android build to stage.
   2685         @param os: OS artifacts to download (android/brillo).
   2686 
   2687         @raise DevServerException upon any return code that's not HTTP OK.
   2688         """
   2689         android_build_info = {'target': target,
   2690                               'build_id': build_id,
   2691                               'branch': branch}
   2692         build = ANDROID_BUILD_NAME_PATTERN % android_build_info
   2693         board = target.split('-')[0]
   2694         artifacts = (
   2695                 android_utils.AndroidArtifacts.get_artifacts_for_reimage(
   2696                         board))
   2697         self._finish_download(build, artifacts, files='', **android_build_info)
   2698 
   2699 
   2700     def get_staged_file_url(self, filename, target, build_id, branch):
   2701         """Returns the url of a staged file for this image on the devserver.
   2702 
   2703         @param filename: Name of the file.
   2704         @param target: Target of the android build to stage, e.g.,
   2705                        shamu-userdebug.
   2706         @param build_id: Build id of the android build to stage.
   2707         @param branch: Branch of the android build to stage.
   2708 
   2709         @return: The url of a staged file for this image on the devserver.
   2710         """
   2711         android_build_info = {'target': target,
   2712                               'build_id': build_id,
   2713                               'branch': branch,
   2714                               'os_type': 'android'}
   2715         build = ANDROID_BUILD_NAME_PATTERN % android_build_info
   2716         return '/'.join([self._get_image_url(build), filename])
   2717 
   2718 
   2719     @remote_devserver_call()
   2720     def translate(self, build_name):
   2721         """Translate the build name if it's in LATEST format.
   2722 
   2723         If the build name is in the format [branch]/[target]/LATEST, return the
   2724         latest build in Launch Control otherwise return the build name as is.
   2725 
   2726         @param build_name: build_name to check.
   2727 
   2728         @return The actual build name to use.
   2729         """
   2730         branch, target, build_id = utils.parse_launch_control_build(build_name)
   2731         if build_id.upper() != 'LATEST':
   2732             return build_name
   2733         call = self.build_call('latestbuild', branch=branch, target=target,
   2734                                os_type='android')
   2735         translated_build_id = self.run_call(call)
   2736         translated_build = (ANDROID_BUILD_NAME_PATTERN %
   2737                             {'branch': branch,
   2738                              'target': target,
   2739                              'build_id': translated_build_id})
   2740         logging.debug('Translated relative build %s to %s', build_name,
   2741                       translated_build)
   2742         return translated_build
   2743 
   2744 
   2745 def _is_load_healthy(load):
   2746     """Check if devserver's load meets the minimum threshold.
   2747 
   2748     @param load: The devserver's load stats to check.
   2749 
   2750     @return: True if the load meets the minimum threshold. Return False
   2751              otherwise.
   2752 
   2753     """
   2754     # Threshold checks, including CPU load.
   2755     if load[DevServer.CPU_LOAD] > DevServer.MAX_CPU_LOAD:
   2756         logging.debug('CPU load of devserver %s is at %s%%, which is higher '
   2757                       'than the threshold of %s%%', load['devserver'],
   2758                       load[DevServer.CPU_LOAD], DevServer.MAX_CPU_LOAD)
   2759         return False
   2760     if load[DevServer.NETWORK_IO] > DevServer.MAX_NETWORK_IO:
   2761         logging.debug('Network IO of devserver %s is at %i Bps, which is '
   2762                       'higher than the threshold of %i bytes per second.',
   2763                       load['devserver'], load[DevServer.NETWORK_IO],
   2764                       DevServer.MAX_NETWORK_IO)
   2765         return False
   2766     return True
   2767 
   2768 
   2769 def _compare_load(devserver1, devserver2):
   2770     """Comparator function to compare load between two devservers.
   2771 
   2772     @param devserver1: A dictionary of devserver load stats to be compared.
   2773     @param devserver2: A dictionary of devserver load stats to be compared.
   2774 
   2775     @return: Negative value if the load of `devserver1` is less than the load
   2776              of `devserver2`. Return positive value otherwise.
   2777 
   2778     """
   2779     return int(devserver1[DevServer.DISK_IO] - devserver2[DevServer.DISK_IO])
   2780 
   2781 
   2782 def _get_subnet_for_host_ip(host_ip,
   2783                             restricted_subnets=utils.RESTRICTED_SUBNETS):
   2784     """Get the subnet for a given host IP.
   2785 
   2786     @param host_ip: the IP of a DUT.
   2787     @param restricted_subnets: A list of restriected subnets.
   2788 
   2789     @return: a (subnet_ip, mask_bits) tuple. If no matched subnet for the
   2790              host_ip, return (None, None).
   2791     """
   2792     for subnet_ip, mask_bits in restricted_subnets:
   2793         if utils.is_in_same_subnet(host_ip, subnet_ip, mask_bits):
   2794             return subnet_ip, mask_bits
   2795 
   2796     return None, None
   2797 
   2798 
   2799 def get_least_loaded_devserver(devserver_type=ImageServer, hostname=None):
   2800     """Get the devserver with the least load.
   2801 
   2802     Iterate through all devservers and get the one with least load.
   2803 
   2804     TODO(crbug.com/486278): Devserver with required build already staged should
   2805     take higher priority. This will need check_health call to be able to verify
   2806     existence of a given build/artifact. Also, in case all devservers are
   2807     overloaded, the logic here should fall back to the old behavior that randomly
   2808     selects a devserver based on the hash of the image name/url.
   2809 
   2810     @param devserver_type: Type of devserver to select from. Default is set to
   2811                            ImageServer.
   2812     @param hostname: Hostname of the dut that the devserver is used for. The
   2813             picked devserver needs to respect the location of the host if
   2814             `prefer_local_devserver` is set to True or `restricted_subnets` is
   2815             set.
   2816 
   2817     @return: Name of the devserver with the least load.
   2818 
   2819     """
   2820     logging.debug('Get the least loaded %r', devserver_type)
   2821     devservers, can_retry = devserver_type.get_available_devservers(
   2822             hostname)
   2823     # If no healthy devservers available and can_retry is False, return None.
   2824     # Otherwise, relax the constrain on hostname, allow all devservers to be
   2825     # available.
   2826     if not devserver_type.get_healthy_devserver('', devservers):
   2827         if not can_retry:
   2828             return None
   2829         else:
   2830             devservers, _ = devserver_type.get_available_devservers()
   2831 
   2832     # get_devserver_load call needs to be made in a new process to allow force
   2833     # timeout using signal.
   2834     output = multiprocessing.Queue()
   2835     processes = []
   2836     for devserver in devservers:
   2837         processes.append(multiprocessing.Process(
   2838                 target=devserver_type.get_devserver_load_wrapper,
   2839                 args=(devserver, TIMEOUT_GET_DEVSERVER_LOAD, output)))
   2840 
   2841     for p in processes:
   2842         p.start()
   2843     for p in processes:
   2844         # The timeout for the process commands aren't reliable.  Add
   2845         # some extra time to the timeout for potential overhead in the
   2846         # subprocesses.  crbug.com/913695
   2847         p.join(TIMEOUT_GET_DEVSERVER_LOAD + 10)
   2848     # Read queue before killing processes to avoid corrupting the queue.
   2849     loads = [output.get() for p in processes if not p.is_alive()]
   2850     for p in processes:
   2851         if p.is_alive():
   2852             p.terminate()
   2853     # Filter out any load failed to be retrieved or does not support load check.
   2854     loads = [load for load in loads if load and DevServer.CPU_LOAD in load and
   2855              DevServer.is_free_disk_ok(load) and
   2856              DevServer.is_apache_client_count_ok(load)]
   2857     if not loads:
   2858         logging.debug('Failed to retrieve load stats from any devserver. No '
   2859                       'load balancing can be applied.')
   2860         return None
   2861     loads = [load for load in loads if _is_load_healthy(load)]
   2862     if not loads:
   2863         logging.error('No devserver has the capacity to be selected.')
   2864         return None
   2865     loads = sorted(loads, cmp=_compare_load)
   2866     return loads[0]['devserver']
   2867 
   2868 
   2869 def resolve(build, hostname=None, ban_list=None):
   2870     """Resolve a devserver can be used for given build and hostname.
   2871 
   2872     @param build: Name of a build to stage on devserver, e.g.,
   2873                   ChromeOS build: daisy-release/R50-1234.0.0
   2874                   Launch Control build: git_mnc_release/shamu-eng
   2875     @param hostname: Hostname of a devserver for, default is None, which means
   2876             devserver is not restricted by the network location of the host.
   2877     @param ban_list: The blacklist of devservers shouldn't be chosen.
   2878 
   2879     @return: A DevServer instance that can be used to stage given build for the
   2880              given host.
   2881     """
   2882     if utils.is_launch_control_build(build):
   2883         return AndroidBuildServer.resolve(build, hostname)
   2884     else:
   2885         return ImageServer.resolve(build, hostname, ban_list=ban_list)
   2886