1 # Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 2 # Use of this source code is governed by a BSD-style license that can be 3 # found in the LICENSE file. 4 5 from distutils import version 6 import cStringIO 7 import HTMLParser 8 import httplib 9 import json 10 import logging 11 import multiprocessing 12 import os 13 import re 14 import socket 15 import time 16 import urllib2 17 import urlparse 18 19 from autotest_lib.client.bin import utils as bin_utils 20 from autotest_lib.client.common_lib import android_utils 21 from autotest_lib.client.common_lib import error 22 from autotest_lib.client.common_lib import global_config 23 from autotest_lib.client.common_lib import utils 24 from autotest_lib.client.common_lib.cros import retry 25 from autotest_lib.server import utils as server_utils 26 # TODO(cmasone): redo this class using requests module; http://crosbug.com/30107 27 28 try: 29 from chromite.lib import metrics 30 except ImportError: 31 metrics = utils.metrics_mock 32 33 34 CONFIG = global_config.global_config 35 # This file is generated at build time and specifies, per suite and per test, 36 # the DEPENDENCIES list specified in each control file. It's a dict of dicts: 37 # {'bvt': {'/path/to/autotest/control/site_tests/test1/control': ['dep1']} 38 # 'suite': {'/path/to/autotest/control/site_tests/test2/control': ['dep2']} 39 # 'power': {'/path/to/autotest/control/site_tests/test1/control': ['dep1'], 40 # '/path/to/autotest/control/site_tests/test3/control': ['dep3']} 41 # } 42 DEPENDENCIES_FILE = 'test_suites/dependency_info' 43 # Number of seconds for caller to poll devserver's is_staged call to check if 44 # artifacts are staged. 45 _ARTIFACT_STAGE_POLLING_INTERVAL = 5 46 # Artifacts that should be staged when client calls devserver RPC to stage an 47 # image. 
_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE = 'full_payload,test_suites,stateful'
# Comma-separated artifacts that should be staged when a client calls the
# devserver RPC to stage an image together with its autotest artifacts.
_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE_WITH_AUTOTEST = ('full_payload,test_suites,'
                                                   'control_files,stateful,'
                                                   'autotest_packages')
# Comma-separated artifacts that should be staged when a client calls the
# devserver RPC to stage an Android build.
_BRILLO_ARTIFACTS_TO_BE_STAGED_FOR_IMAGE = ('zip_images,vendor_partitions')
# When set, DevServer.is_free_disk_ok and DevServer.is_apache_client_count_ok
# skip their threshold checks and report the devserver as OK.
SKIP_DEVSERVER_HEALTH_CHECK = CONFIG.get_config_value(
        'CROS', 'skip_devserver_health_check', type=bool)
# Number of seconds before the call to get devserver load times out.
TIMEOUT_GET_DEVSERVER_LOAD = 2.0

# Android artifact path pattern on the devserver. Every backslash is removed
# from the configured value before use.
ANDROID_BUILD_NAME_PATTERN = CONFIG.get_config_value(
        'CROS', 'android_build_name_pattern', type=str).replace('\\', '')

# Return value from a devserver RPC indicating the call succeeded.
SUCCESS = 'Success'

# Timeout, in minutes, for a single devserver ssh call.
DEVSERVER_SSH_TIMEOUT_MINS = 1

# Error messages that mark an invalid devserver response / a devserver that
# is down.
ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE = 'Proxy Error'
ERR_MSG_FOR_DOWN_DEVSERVER = 'Service Unavailable'

# Substring of the exception text that remote_devserver_call treats as a
# timed-out devserver call.
ERR_MSG_FOR_TIMED_OUT_CALL = 'timeout'

# Timeout, in minutes, for waiting on devserver staging. This is also the
# default timeout_min of the remote_devserver_call decorator.
DEVSERVER_IS_STAGING_RETRY_MIN = 100

# Timeout, in minutes, for waiting until a DUT auto-update has finished.
DEVSERVER_IS_CROS_AU_FINISHED_TIMEOUT_MIN = 100

# Maximum number of times the devserver triggers a CrOS auto-update.
AU_RETRY_LIMIT = 2

# Number of seconds between polls of the devserver's get_au_status call when
# checking whether a CrOS auto-update has finished.
CROS_AU_POLLING_INTERVAL = 10

# Number of seconds to wait between retries of auto-update calls.
CROS_AU_RETRY_INTERVAL = 20

# The file name for auto-update logs.
96 CROS_AU_LOG_FILENAME = 'CrOS_update_%s_%s.log' 97 98 # Provision error patterns. 99 # People who see this should know that they shouldn't change these 100 # classification strings. These strings are used for monitoring provision 101 # failures. Any changes may mess up the stats. 102 _EXCEPTION_PATTERNS = [ 103 # Raised when devserver portfile does not exist on host. 104 (r".*Devserver portfile does not exist!.*$", 105 '(1) Devserver portfile does not exist on host'), 106 # Raised when devserver cannot copy packages to host. 107 (r".*Could not copy .* to device.*$", 108 '(2) Cannot copy packages to host'), 109 # Raised when devserver fails to run specific commands on host. 110 (r".*cwd=None, extra env=\{'LC_MESSAGES': 'C'\}.*$", 111 '(3) Fail to run specific command on host'), 112 # Raised when new build fails to boot on the host. 113 (r'.*RootfsUpdateError: Build .* failed to boot on.*$', 114 '(4) Build failed to boot on host'), 115 # Raised when the auto-update process is timed out. 116 (r'.*The CrOS auto-update process is timed out, ' 117 'thus will be terminated.*$', 118 '(5) Auto-update is timed out'), 119 # Raised when the host is not pingable. 120 (r".*DeviceNotPingableError.*$", 121 '(6) Host is not pingable during auto-update'), 122 # Raised when hosts have unexpected status after rootfs update. 123 (r'.*Update failed with unexpected update status: ' 124 'UPDATE_STATUS_IDLE.*$', 125 '(7) Host has unexpected status: UPDATE_STATUS_IDLE after rootfs ' 126 'update'), 127 # Raised when devserver returns non-json response to shard/drone. 
class DevServerExceptionClassifier(object):
    """Classifies a provision error string produced by auto_update.

    The error text is matched against the (regex, label) pairs in
    _EXCEPTION_PATTERNS; the label of the first matching pattern is the
    classification.
    """
    def __init__(self, err, keep_full_trace=True):
        """
        @param err: A single string representing one time provision
            error happened in auto_update().
        @param keep_full_trace: True to keep the whole track trace of error.
            False when just keep the last line.
        """
        if keep_full_trace:
            self._err = err
        else:
            # Traces usually end with a line terminator. Drop it first so
            # the "last line" is the final message rather than ''.
            self._err = err.rstrip('\r\n').split('\n')[-1]
        self._classification = None


    def _classify(self):
        """Return the label of the first pattern matching the error text.

        @return: A classification string from _EXCEPTION_PATTERNS, or
            '(0) Unknown exception' when no pattern matches.
        """
        for err_pattern, classification in _EXCEPTION_PATTERNS:
            # The patterns have the form '.*<text>.*$'. Without re.DOTALL
            # '.' stops at the first newline and '$' only matches at the
            # very end, so a multi-line trace could never be classified.
            if re.match(err_pattern, self._err, re.DOTALL):
                return classification

        return '(0) Unknown exception'


    @property
    def classification(self):
        """Classify the error

        The result is computed on first access and cached.

        @return: return a classified exception type (string) from
            _EXCEPTION_PATTERNS or '(0) Unknown exception'. Current patterns
            in _EXCEPTION_PATTERNS are very specific so that errors are not
            expected to match more than one pattern; if they do, the first
            pattern in the list wins.
        """
        if self._classification is None:
            self._classification = self._classify()
        return self._classification


    @property
    def summary(self):
        """Use one line to show the error message."""
        return ' '.join(self._err.splitlines())


    @property
    def classified_exception(self):
        """What kind of exception will be raised to higher.

        @return: return a special Exception when the raised error is an
            RootfsUpdateError. Otherwise, return general DevServerException.
        """
        # The classification of RootfsUpdateError in _EXCEPTION_PATTERNS starts
        # with "(4)"
        if self.classification.startswith('(4)'):
            return BadBuildException

        return DevServerException
def _gs_or_local_archive_url_args(archive_url):
    """Work out the devserver call arguments implied by |archive_url|.

    @param archive_url: The archive url to include in the devserver RPC.
                        This can be either a GS path or a local path.
    @return: A dict of arguments to include in the devserver call. It is
             empty when |archive_url| is empty or None.
    """
    call_args = {}
    if archive_url:
        if archive_url.startswith('gs://'):
            call_args['archive_url'] = archive_url
        else:
            # A local path: tell the devserver to move the files while
            # staging. Moving is the fastest way to stage local files, at
            # the cost of deleting them from the source. That is acceptable
            # because the files are available on the devserver once staged.
            call_args['local_path'] = archive_url
            call_args['delete_source'] = True
    return call_args
def get_hostname(url):
    """Extract the hostname component from a URL.

    A URL has the shape schema://hostname:port/path; only the hostname part
    is returned, without the port.

    @param url: a Url string
    @return: a hostname string, as reported by urlparse for |url|
    """
    parsed_url = urlparse.urlparse(url)
    return parsed_url.hostname
@staticmethod
def get_server_url(url):
    """Derive the devserver url from a repo url that carries build info.

    @param url: A job repo url.

    @return A devserver url made of the scheme and network location of
            |url|, e.g., http://127.0.0.10:8080. None when |url| has no
            network location.
    """
    parsed = urlparse.urlparse(url)
    if not parsed.netloc:
        return None
    return '%s://%s' % (parsed.scheme, parsed.netloc)
@classmethod
def get_devserver_load(cls, devserver,
                       timeout_min=DEVSERVER_SSH_TIMEOUT_MINS):
    """Query the devserver's check_health RPC and return its load.

    @param devserver: url of the devserver.
    @param timeout_min: How long to wait in minutes before deciding the
                        the devserver is not up (float). The same value,
                        converted to seconds, is used as the timeout of
                        the underlying call.

    @return: A dictionary of the devserver's load decoded from the JSON
             response, or None if the call failed or the response could
             not be decoded (the error is logged).

    """
    call = cls._build_call(devserver, 'check_health')

    # |devserver| is passed as a keyword argument so that
    # remote_devserver_call can name the devserver when it reports a
    # timed-out call.
    @remote_devserver_call(timeout_min=timeout_min)
    def get_load(devserver=devserver):
        """Inner method that makes the call."""
        return cls.run_call(call, timeout=timeout_min * 60)

    try:
        return json.loads(get_load(devserver=devserver))
    except Exception as e:
        # Best effort: any failure means "load unknown". Callers check for
        # a falsy result rather than handling exceptions.
        logging.error('Devserver call failed: "%s", timeout: %s seconds,'
                      ' Error: %s', call, timeout_min * 60, e)
        return None
@classmethod
def devserver_healthy(cls, devserver,
                      timeout_min=DEVSERVER_SSH_TIMEOUT_MINS):
    """Returns True if the |devserver| is healthy to stage build.

    The devserver load is fetched with get_devserver_load and then checked,
    in order, for: load being available, the Apache client count
    (is_apache_client_count_ok) and the free disk space (is_free_disk_ok).
    The first failing check decides the result.

    Whatever the outcome, the 'devserver_healthy' counter is incremented
    with the resolved hostname, the health verdict and a reason string
    (empty when healthy).

    @param devserver: url of the devserver.
    @param timeout_min: How long to wait in minutes before deciding the
                        the devserver is not up (float).

    @return: True if devserver is healthy. Return False otherwise.

    """
    c = metrics.Counter('chromeos/autotest/devserver/devserver_healthy')
    # Both values are reported from the finally clause below, so they must
    # be updated before each return.
    reason = ''
    healthy = False
    load = cls.get_devserver_load(devserver, timeout_min=timeout_min)
    try:
        if not load:
            # Failed to get the load of devserver.
            reason = '(1) Failed to get load.'
            return False

        apache_ok = cls.is_apache_client_count_ok(load)
        if not apache_ok:
            reason = '(2) Apache client count too high.'
            logging.error('Devserver check_health failed. Live Apache client '
                          'count is too high: %d.',
                          load[cls.APACHE_CLIENT_COUNT])
            return False

        disk_ok = cls.is_free_disk_ok(load)
        if not disk_ok:
            reason = '(3) Disk space too low.'
            logging.error('Devserver check_health failed. Free disk space is '
                          'low. Only %dGB is available.',
                          load[cls.FREE_DISK])
        # The disk check is the last one, so its result is the verdict.
        healthy = bool(disk_ok)
        return disk_ok
    finally:
        # Runs on every exit path above, including the early returns.
        c.increment(fields={'dev_server': cls(devserver).resolved_hostname,
                            'healthy': healthy,
                            'reason': reason})
        # Monitor how many AU processes the devserver is currently running.
        # The gauge is only written when the reported count is truthy, so a
        # count of 0 (or a missing key) leaves it untouched.
        if load is not None and load.get(DevServer.AU_PROCESS):
            c_au = metrics.Gauge(
                    'chromeos/autotest/devserver/devserver_au_count')
            c_au.set(
                load.get(DevServer.AU_PROCESS),
                fields={'dev_server': cls(devserver).resolved_hostname})
@classmethod
def run_call(cls, call, readline=False, timeout=None):
    """Perform a devserver call over HTTP and hand back the response.

    Exceptions may be raised as for urllib2.urlopen().

    @param call: a url string that calls a method to a devserver.
    @param readline: whether read http response line by line. Only
                     honored when |timeout| is None.
    @param timeout: The timeout seconds for this urlopen call. When given,
                    the whole response text is returned regardless of
                    |readline|.

    @return the results of this call: a list of right-stripped lines when
            reading line by line, otherwise the full response text.
    """
    if timeout is not None:
        timed_response = utils.urlopen_socket_timeout(call, timeout=timeout)
        return timed_response.read()

    response = urllib2.urlopen(call)
    if readline:
        return [line.rstrip() for line in response]
    return response.read()
@classmethod
def get_healthy_devserver(cls, build, devservers, ban_list=None):
    """Get a healthy devserver instance from the list of devservers.

    Candidates are tried in an order derived from hash(build), so that the
    same build keeps being offered the same devserver first. The caller's
    |devservers| list is left unmodified.

    @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514).
    @param devservers: The devserver list to be chosen out a healthy one.
    @param ban_list: The blacklist of devservers we don't want to choose.
                     Default is None.

    @return: A DevServer object of a healthy devserver. Return None if no
             healthy devserver is found.

    """
    logging.debug('Pick one healthy devserver from %r', devservers)
    # Work on a copy: candidates are popped as they are tried, and callers
    # such as resolve() still need the full list afterwards to report which
    # devservers were tried.
    candidates = list(devservers) if devservers else []
    while candidates:
        hash_index = hash(build) % len(candidates)
        devserver = candidates.pop(hash_index)
        logging.debug('Check health for %s', devserver)
        if ban_list and devserver in ban_list:
            continue

        if cls.devserver_healthy(devserver):
            logging.debug('Pick %s', devserver)
            return cls(devserver)

    return None
' 817 'Try to locate a devserver inside subnet ' 818 '%s:%d.', hostname, host_ip, subnet_ip, 819 mask_bits) 820 devservers = cls.get_devservers_in_same_subnet( 821 subnet_ip, mask_bits) 822 return devservers, False 823 824 # If prefer_local_devserver is set to True and the host is not in 825 # restricted subnet, pick a devserver in the same subnet if possible. 826 # Set can_retry to True so it can pick a different devserver if all 827 # devservers in the same subnet are down. 828 if prefer_local_devserver: 829 return (cls.get_devservers_in_same_subnet( 830 host_ip, DEFAULT_SUBNET_MASKBIT, True), True) 831 832 return cls.get_unrestricted_devservers(restricted_subnets), False 833 834 835 @classmethod 836 def resolve(cls, build, hostname=None, ban_list=None): 837 """"Resolves a build to a devserver instance. 838 839 @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514). 840 @param hostname: The hostname of dut that requests a devserver. It's 841 used to make sure a devserver in the same subnet is 842 preferred. 843 @param ban_list: The blacklist of devservers shouldn't be chosen. 844 845 @raise DevServerException: If no devserver is available. 846 """ 847 tried_devservers = set() 848 devservers, can_retry = cls.get_available_devservers(hostname) 849 if devservers: 850 tried_devservers |= set(devservers) 851 852 devserver = cls.get_healthy_devserver(build, devservers, 853 ban_list=ban_list) 854 855 if not devserver and can_retry: 856 # Find available devservers without dut location constrain. 
@remote_devserver_call()
def symbolicate_dump(self, minidump_path, build):
    """Ask the devserver to symbolicate the dump at minidump_path.

    Stage the debug symbols for |build| and, if that works, ask the
    devserver to symbolicate the dump at |minidump_path|.

    @param minidump_path: the on-disk path of the minidump.
    @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
                  whose debug symbols are needed for symbolication.
    @return The contents of the stack trace, or '' when the requests
            module is not importable.
    @raise DevServerException upon any return code that's not HTTP OK.
    """
    try:
        import requests
    except ImportError:
        logging.warning("Can't 'import requests' to connect to dev server.")
        return ''
    f = {'dev_server': self.resolved_hostname}
    c = metrics.Counter('chromeos/autotest/crashserver/symbolicate_dump')
    c.increment(fields=f)
    # Symbolicate minidump.
    m = 'chromeos/autotest/crashserver/symbolicate_dump_duration'
    with metrics.SecondsTimer(m, fields=f):
        call = self.build_call('symbolicate_dump',
                               archive_url=_get_image_storage_server() + build)
        # Close the minidump as soon as the upload is done instead of
        # leaving the handle to the garbage collector.
        with open(minidump_path, 'rb') as minidump:
            request = requests.post(call, files={'minidump': minidump})
        if request.status_code == requests.codes.OK:
            return request.text

    # Re-package the failure as an HTTPError so that the
    # remote_devserver_call decorator converts it to a DevServerException.
    error_fd = cStringIO.StringIO(request.text)
    raise urllib2.HTTPError(
            call, request.status_code, request.text, request.headers,
            error_fd)
@classmethod
def run_ssh_call(cls, call, readline=False, timeout=None):
    """Run a devserver RPC by invoking curl on the devserver over ssh.

    The hostname is extracted from |call| and the command
    `ssh <hostname> 'curl "<call>"'` is executed through utils.run.

    @param call: a url string that calls a method to a devserver.
    @param readline: whether to return the response as a list of lines.
    @param timeout: The timeout in seconds for the ssh call. A falsy value
                    selects DEVSERVER_SSH_TIMEOUT_MINS minutes.

    @return the stdout of the command: a string, or a list of
            right-stripped lines when |readline| is set.
    @raise error.CmdError: re-raised unchanged when the ssh command fails.
    @raise DevServerException: when the output mentions
                               DownloaderException.
    """
    server = get_hostname(call)
    command = 'ssh %s \'curl "%s"\'' % (server, utils.sh_escape(call))
    deadline = timeout or DEVSERVER_SSH_TIMEOUT_MINS*60
    try:
        outcome = utils.run(command, timeout=deadline)
    except error.CmdError as e:
        logging.debug('Error occurred with exit_code %d when executing the '
                      'ssh call: %s.', e.result_obj.exit_status,
                      e.result_obj.stderr)
        # Count the failure against this devserver before propagating it.
        metrics.Counter('chromeos/autotest/devserver/ssh_failure').increment(
                fields={'dev_server': server})
        raise

    output = outcome.stdout

    # curl exits successfully even when the devserver answered with an
    # error page, so look for the exception marker in the body itself.
    if 'DownloaderException' in output:
        raise DevServerException(_strip_http_message(output))

    if not readline:
        return output

    # Remove line terminators and trailing whitespace.
    return [text.rstrip() for text in output.splitlines()]
1068 @param is_in_restricted_subnet: whether the devserver is in subnet. 1069 @param readline: whether read http response line by line. 1070 @param timeout: The timeout seconds for urlopen call or ssh call. 1071 """ 1072 if (not ENABLE_SSH_CONNECTION_FOR_DEVSERVER or 1073 not is_in_restricted_subnet): 1074 response = super(ImageServerBase, cls).run_call( 1075 call, readline=readline, timeout=timeout) 1076 else: 1077 response = cls.run_ssh_call( 1078 call, readline=readline, timeout=timeout) 1079 # Retry if devserver service is temporarily down, e.g. in a 1080 # devserver push. 1081 if ERR_MSG_FOR_DOWN_DEVSERVER in response: 1082 return False 1083 1084 # Don't return response directly since it may be empty string, 1085 # which causes poll_for_condition to retry. 1086 return _EMPTY_SENTINEL_VALUE if not response else response 1087 1088 try: 1089 response = bin_utils.poll_for_condition( 1090 kickoff_call, 1091 exception=bin_utils.TimeoutError(), 1092 timeout=60, 1093 sleep_interval=5) 1094 return '' if response is _EMPTY_SENTINEL_VALUE else response 1095 except bin_utils.TimeoutError: 1096 return ERR_MSG_FOR_DOWN_DEVSERVER 1097 1098 1099 @classmethod 1100 def download_file(cls, remote_file, local_file, timeout=None): 1101 """Download file from devserver. 1102 1103 The format of remote_file should be: 1104 http://devserver_ip:8082/static/board/... 1105 1106 @param remote_file: The URL of the file on devserver that need to be 1107 downloaded. 1108 @param local_file: The path of the file saved to local. 1109 @param timeout: The timeout seconds for this call. 1110 """ 1111 response = cls.run_call(remote_file, timeout=timeout) 1112 with open(local_file, 'w') as out_log: 1113 out_log.write(response) 1114 1115 1116 def _poll_is_staged(self, **kwargs): 1117 """Polling devserver.is_staged until all artifacts are staged. 1118 1119 @param kwargs: keyword arguments to make is_staged devserver call. 1120 1121 @return: True if all artifacts are staged in devserver. 
1122 """ 1123 call = self.build_call('is_staged', **kwargs) 1124 1125 def all_staged(): 1126 """Call devserver.is_staged rpc to check if all files are staged. 1127 1128 @return: True if all artifacts are staged in devserver. False 1129 otherwise. 1130 @rasies DevServerException, the exception is a wrapper of all 1131 exceptions that were raised when devserver tried to download 1132 the artifacts. devserver raises an HTTPError or a CmdError 1133 when an exception was raised in the code. Such exception 1134 should be re-raised here to stop the caller from waiting. 1135 If the call to devserver failed for connection issue, a 1136 URLError exception is raised, and caller should retry the 1137 call to avoid such network flakiness. 1138 1139 """ 1140 try: 1141 result = self.run_call(call) 1142 logging.debug('whether artifact is staged: %r', result) 1143 return result == 'True' 1144 except urllib2.HTTPError as e: 1145 error_markup = e.read() 1146 raise DevServerException(_strip_http_message(error_markup)) 1147 except urllib2.URLError as e: 1148 # Could be connection issue, retry it. 1149 # For example: <urlopen error [Errno 111] Connection refused> 1150 logging.error('URLError happens in is_stage: %r', e) 1151 return False 1152 except error.CmdError as e: 1153 # Retry if SSH failed to connect to the devserver. 1154 logging.warning('CmdError happens in is_stage: %r, will retry', e) 1155 return False 1156 1157 bin_utils.poll_for_condition( 1158 all_staged, 1159 exception=bin_utils.TimeoutError(), 1160 timeout=DEVSERVER_IS_STAGING_RETRY_MIN * 60, 1161 sleep_interval=_ARTIFACT_STAGE_POLLING_INTERVAL) 1162 1163 return True 1164 1165 1166 def _call_and_wait(self, call_name, error_message, 1167 expected_response=SUCCESS, **kwargs): 1168 """Helper method to make a urlopen call, and wait for artifacts staged. 1169 1170 @param call_name: name of devserver rpc call. 1171 @param error_message: Error message to be thrown if response does not 1172 match expected_response. 
def _stage_artifacts(self, build, artifacts, files, archive_url, **kwargs):
    """Tell the devserver to download and stage |artifacts| from |image|
    specified by kwargs.

    This is the main call point for staging any specific artifacts for a
    given build. To see the list of artifacts one can stage see:

    ~src/platform/dev/artifact_info.py.

    This is maintained along with the actual devserver code.

    @param build: Name of the build; appended to the storage server url
            when |archive_url| is not given, and used in log messages.
    @param artifacts: A list of artifacts.
    @param files: A list of files to stage.
    @param archive_url: Optional parameter that has the archive_url to stage
            this artifact from. Default is specified in autotest config +
            image.
    @param kwargs: keyword arguments that specify the build information, to
            make stage devserver call.

    @raise DevServerException upon any return code that's not HTTP OK, or
            when staging times out.
    """
    if not archive_url:
        archive_url = _get_storage_server_for_artifacts(artifacts) + build

    # The RPC takes comma separated strings rather than lists.
    artifacts_arg = ','.join(artifacts) if artifacts else ''
    files_arg = ','.join(files) if files else ''

    requested = 'artifacts=%s files=%s ' % (artifacts_arg, files_arg)
    error_message = ("staging %s for %s failed;"
                     "HTTP OK not accompanied by 'Success'." %
                     (requested, build))

    staging_info = ('build=%s, artifacts=%s, files=%s, archive_url=%s' %
                    (build, artifacts, files, archive_url))
    logging.info('Staging artifacts on devserver %s: %s',
                 self.url(), staging_info)

    # Reported by the counter in the finally clause; it only becomes True
    # once the stage call has returned normally.
    success = False
    try:
        arguments = dict(archive_url=archive_url,
                         artifacts=artifacts_arg,
                         files=files_arg)
        arguments.update(kwargs)
        # TODO(akeshet): canonicalize artifacts_arg before using it as a
        # metric field (as it stands it is a not-very-well-controlled
        # string).
        timer_fields = {'artifacts': artifacts_arg,
                        'dev_server': self.resolved_hostname}
        with metrics.SecondsTimer(
                'chromeos/autotest/devserver/stage_artifact_duration',
                fields=timer_fields):
            self.call_and_wait(call_name='stage', error_message=error_message,
                               **arguments)
        logging.info('Finished staging artifacts: %s', staging_info)
        success = True
    except (bin_utils.TimeoutError, error.TimeoutException):
        logging.error('stage_artifacts timed out: %s', staging_info)
        raise DevServerException(
                'stage_artifacts timed out: %s' % staging_info)
    finally:
        counter_fields = {'success': success,
                          'artifacts': artifacts_arg,
                          'dev_server': self.resolved_hostname}
        metrics.Counter('chromeos/autotest/devserver/stage_artifact'
                        ).increment(fields=counter_fields)
def _finish_download(self, build, artifacts, files, **kwargs_build_info):
    """Tell the devserver to finish staging image specified in
    kwargs_build_info.

    If trigger_download is called with synchronous=False, it will return
    before all artifacts have been staged. This method issues another
    `stage` call through call_and_wait and should be called after a call
    to trigger_download.

    @param build: Name of the build; appended to the image storage server
                  url to form the archive_url.
    @param artifacts: Artifacts to pass to the stage call.
    @param files: Files to pass to the stage call.
    @param kwargs_build_info: Dictionary of build information.
            For CrOS, it is None as build is the CrOS image name.
            For Android, it is {'target': target,
                                'build_id': build_id,
                                'branch': branch}

    @raise DevServerException upon any return code that's not HTTP OK, or
            when the call times out.
    """
    stage_args = dict(
            archive_url=_get_image_storage_server() + build,
            artifacts=artifacts,
            files=files,
            error_message=("finish_download for %s failed;"
                           "HTTP OK not accompanied by 'Success'." % build))
    # Build information, when present, is forwarded verbatim and may
    # override the defaults computed above.
    stage_args.update(kwargs_build_info)

    try:
        self.call_and_wait(call_name='stage', **stage_args)
    except (bin_utils.TimeoutError, error.TimeoutException):
        logging.error('finish_download timed out for %s', build)
        raise DevServerException(
                'finish_download timed out for %s.' % build)
@remote_devserver_call()
def list_control_files(self, build, suite_name=''):
    """Ask the devserver to list all control files for |build|.

    @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
                  whose control files the caller wants listed. It is passed
                  through translate() first.
    @param suite_name: The name of the suite for which we require control
                       files.
    @return the `controlfiles` response read line by line, i.e. a list of
            control file paths (e.g. server/site_tests/autoupdate/control)
    @raise DevServerException upon any return code that's not HTTP OK.
    """
    query = dict(build=self.translate(build), suite_name=suite_name)
    listing_call = self.build_call('controlfiles', **query)
    return self.run_call(listing_call, readline=True)
@remote_devserver_call()
def list_suite_controls(self, build, suite_name=''):
    """Ask the devserver to list contents of all control files for |build|.

    @param build: The build (e.g. x86-mario-release/R18-1586.0.0-a1-b1514)
                  whose control files' contents the caller wants returned.
                  It is passed through translate() first.
    @param suite_name: The name of the suite for which we require control
                       files.
    @return the decoded JSON response, expected to be a dict of contents of
            all control files
            (e.g. {'path1': "#Copyright controls ***", ...,
                   'pathX': "#Copyright controls ***"})
    @raise DevServerException upon any return code that's not HTTP OK.
    """
    resolved_build = self.translate(build)
    controls_call = self.build_call('list_suite_controls',
                                    build=resolved_build,
                                    suite_name=suite_name)
    raw_reply = self.run_call(controls_call)
    # json.load wants a file-like object, so wrap the response text.
    reply_stream = cStringIO.StringIO(raw_reply)
    return json.load(reply_stream)
def wait_for_artifacts_staged(self, archive_url, artifacts='', files=''):
    """Poll devserver.is_staged until all artifacts are staged.

    Thin wrapper that forwards its arguments, by keyword, to
    _poll_is_staged.

    @param archive_url: Google Storage URL for the build.
    @param artifacts: Comma separated list of artifacts to download.
    @param files: Comma separated list of files to download.
    @return: whatever _poll_is_staged returns, i.e. True if all artifacts
             are staged in devserver.
    """
    return self._poll_is_staged(archive_url=archive_url,
                                artifacts=artifacts,
                                files=files)
@remote_devserver_call()
def stage_artifacts(self, image=None, artifacts=None, files='',
                    archive_url=None):
    """Tell the devserver to download and stage |artifacts| from |image|.

    This is the main call point for staging any specific artifacts for a
    given build. To see the list of artifacts one can stage see:

    ~src/platform/dev/artifact_info.py.

    This is maintained along with the actual devserver code.

    @param image: the image to fetch and stage. It is passed through
            translate() before staging.
    @param artifacts: A list of artifacts.
    @param files: A list of files to stage.
    @param archive_url: Optional parameter that has the archive_url to stage
            this artifact from. Default is specified in autotest config +
            image.

    @raise DevServerException if neither artifacts nor files are given, or
            upon any return code that's not HTTP OK.
    """
    # Reject a request that names nothing to stage before talking to the
    # devserver at all.
    if not (artifacts or files):
        raise DevServerException('Must specify something to stage.')
    resolved_image = self.translate(image)
    self._stage_artifacts(resolved_image, artifacts, files, archive_url)
def trigger_download(self, image, synchronous=True):
    """Tell the devserver to download and stage |image|.

    Tells the devserver to fetch |image| from the image storage server
    named by _get_image_storage_server().

    If |synchronous| is True, waits for the entire download to finish
    staging before returning. Otherwise only the artifacts necessary
    to start installing images onto DUT's will be staged before returning.
    A caller can then call finish_download to guarantee the rest of the
    artifacts have finished staging.

    @param image: the image to fetch and stage. It is passed through
                  translate() first.
    @param synchronous: if True, waits until all components of the image are
                        staged before returning.

    @raise DevServerException upon any return code that's not HTTP OK.

    """
    resolved_image = self.translate(image)
    # Only the artifacts in _ARTIFACTS_TO_BE_STAGED_FOR_IMAGE are requested
    # here, and no individual files.
    self._trigger_download(resolved_image,
                           _ARTIFACTS_TO_BE_STAGED_FOR_IMAGE,
                           files='',
                           synchronous=synchronous)
def get_staged_file_url(self, filename, image):
    """Returns the url of a staged file for this image on the devserver.

    @param filename: name of the staged file, appended to the image url.
    @param image: the image that was fetched.

    @return the image directory url from _get_image_url() joined to
            |filename| with a single '/'.
    """
    image_dir_url = self._get_image_url(image)
    return '/'.join((image_dir_url, filename))
def translate(self, build_name):
    """Translate the build name if it's in LATEST format.

    If the build name is in the format [builder]/LATEST, return the latest
    build in Google Storage otherwise return the build name as is.

    @param build_name: build_name to check. May be None or empty, e.g. when
                       a caller stages by archive_url only; such values are
                       returned unchanged.

    @return The actual build name to use.
    """
    # re.match raises TypeError on None, so names that cannot possibly be
    # in LATEST format are passed straight through.
    if not build_name:
        return build_name
    # The match is case-insensitive and anchored at the start of the name;
    # the first group is everything before the last '-' preceding /LATEST.
    match = re.match(r'([\w-]+)-(\w+)/LATEST', build_name, re.I)
    if not match:
        return build_name
    translated_build = self.get_latest_build_in_gs(match.groups()[0])
    logging.debug('Translated relative build %s to %s', build_name,
                  translated_build)
    return translated_build
def kill_au_process_for_host(self, host_name, pid):
    """Kill the triggered auto_update process if error happens.

    Usually this function is used to clear all potential left au processes
    of the given host name.

    If pid is specified, the devserver will further check the given pid to
    make sure the process is killed. This is used for the case that the au
    process has started in background, but then provision fails due to
    some unknown issues very fast. In this case, when 'kill_au_proc' is
    called, there's no corresponding background track log created for this
    ongoing au process, which prevents this RPC call from killing this au
    process.

    @param host_name: The DUT's hostname.
    @param pid: The ongoing au process's pid.

    @return: True if successfully kill the auto-update process for host,
             False if the underlying call raised DevServerException.
    """
    try:
        self._kill_au_process_for_host(host_name=host_name, pid=pid)
    except DevServerException:
        # Best effort: a failed kill is reported, not propagated.
        return False
    else:
        return True
@remote_devserver_call()
def _collect_au_log(self, log_dir, **kwargs):
    """Collect logs from devserver after cros-update process is finished.

    Collect the logs that record the whole cros-update process, and
    write them to the sysinfo path of a job.

    The example log file name that is stored is like:
        '1220-repair/sysinfo/CrOS_update_host_name_pid.log'

    @param log_dir: The directory to save the cros-update process log
                    retrieved from devserver. It is created if missing.
    @param kwargs: Arguments to make the collect_cros_au_log devserver
                   call. Must contain 'host_name' (the DUT's hostname) and
                   'pid' (the auto-update process id on devserver); both
                   are also used to name the files written locally.

    @raise DevServerException if the response is not valid JSON or the
            logs cannot be written to disk.
    """
    call = self.build_call('collect_cros_au_log', **kwargs)
    response = self.run_call(call)
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)
    host_name = kwargs['host_name']
    pid = kwargs['pid']
    write_file = self._get_au_log_filename(log_dir, host_name, pid)
    logging.debug('Saving auto-update logs into %s', write_file)

    au_logs = self._read_json_response_from_devserver(response)

    # One file per entry of 'host_logs', named <key>_<host_name>_<pid>.
    try:
        for k, v in au_logs['host_logs'].items():
            log_name = '%s_%s_%s' % (k, host_name, pid)
            log_path = os.path.join(log_dir, log_name)
            with open(log_path, 'w') as out_log:
                out_log.write(v)
    except IOError as e:
        raise DevServerException('Failed to write auto-update hostlogs: '
                                 '%s' % e)

    try:
        with open(write_file, 'w') as out_log:
            out_log.write(au_logs['cros_au_log'])
    except Exception as e:
        # Was a bare 'except:', which also converted KeyboardInterrupt and
        # SystemExit into DevServerException and hid the real cause.
        # Ordinary errors are still wrapped, now with the cause attached.
        raise DevServerException('Failed to write auto-update logs into '
                                 '%s: %s' % (write_file, e))
1937 1938 @return: a tuple indicates whether the RPC call cros_au succeeds and 1939 the auto-update process id running on devserver. 1940 """ 1941 host_name = kwargs['host_name'] 1942 call = self.build_call('cros_au', async=True, **kwargs) 1943 try: 1944 response = self.run_call(call) 1945 logging.info( 1946 'Received response from devserver for cros_au call: %r', 1947 response) 1948 except httplib.BadStatusLine as e: 1949 logging.error(e) 1950 raise DevServerException('Received Bad Status line, Devserver %s ' 1951 'might have gone down while handling ' 1952 'the call: %s' % (self.url(), call)) 1953 1954 return response 1955 1956 1957 def _check_for_auto_update_finished(self, pid, wait=True, **kwargs): 1958 """Polling devserver.get_au_status to get current auto-update status. 1959 1960 The current auto-update status is used to identify whether the update 1961 process is finished. 1962 1963 @param pid: The background process id for auto-update in devserver. 1964 @param kwargs: keyword arguments to make get_au_status devserver call. 1965 @param wait: Should the check wait for completion. 1966 1967 @return: True if auto-update is finished for a given dut. 1968 """ 1969 logging.debug('Check the progress for auto-update process %r', pid) 1970 kwargs['pid'] = pid 1971 call = self.build_call('get_au_status', **kwargs) 1972 1973 def all_finished(): 1974 """Call devserver.get_au_status rpc to check if auto-update 1975 is finished. 1976 1977 @return: True if auto-update is finished for a given dut. False 1978 otherwise. 1979 @rasies DevServerException, the exception is a wrapper of all 1980 exceptions that were raised when devserver tried to 1981 download the artifacts. devserver raises an HTTPError or 1982 a CmdError when an exception was raised in the code. Such 1983 exception should be re-raised here to stop the caller from 1984 waiting. 
If the call to devserver failed for connection 1985 issue, a URLError exception is raised, and caller should 1986 retry the call to avoid such network flakiness. 1987 1988 """ 1989 try: 1990 au_status = self.run_call(call) 1991 response = json.loads(au_status) 1992 # This is a temp fix to fit both dict and tuple returning 1993 # values. The dict check will be removed after a corresponding 1994 # devserver CL is deployed. 1995 if isinstance(response, dict): 1996 if response.get('detailed_error_msg'): 1997 raise DevServerException( 1998 response.get('detailed_error_msg')) 1999 2000 if response.get('finished'): 2001 logging.debug('CrOS auto-update is finished') 2002 return True 2003 else: 2004 logging.debug('Current CrOS auto-update status: %s', 2005 response.get('status')) 2006 return False 2007 2008 if not response[0]: 2009 logging.debug('Current CrOS auto-update status: %s', 2010 response[1]) 2011 return False 2012 else: 2013 logging.debug('CrOS auto-update is finished') 2014 return True 2015 except urllib2.HTTPError as e: 2016 error_markup = e.read() 2017 raise DevServerException(_strip_http_message(error_markup)) 2018 except urllib2.URLError as e: 2019 # Could be connection issue, retry it. 2020 # For example: <urlopen error [Errno 111] Connection refused> 2021 logging.warning('URLError (%r): Retrying connection to ' 2022 'devserver to check auto-update status.', e) 2023 return False 2024 except error.CmdError: 2025 # Retry if SSH failed to connect to the devserver. 2026 logging.warning('CmdError: Retrying SSH connection to check ' 2027 'auto-update status.') 2028 return False 2029 except socket.error as e: 2030 # Could be some temporary devserver connection issues. 
def check_for_auto_update_finished(self, response, wait=True, **kwargs):
    """Process the response of 'cros_au' and poll for auto-update status.

    The JSON-decoded response is indexed as a sequence: element 0 says
    whether the RPC succeeded and element 1 is the auto-update process id.

    @param response: The (JSON encoded) response from RPC 'cros_au'.
    @param wait: Passed through to _check_for_auto_update_finished; whether
                 the check should wait for completion.
    @param kwargs: keyword arguments to make get_au_status devserver call.

    @return: a tuple of three elements.
        finished: True if the operation has completed (also True when an
                  error was raised, since there is nothing left to wait
                  for).
        raised_error: None if everything works well or the raised error.
        pid: the auto-update process id on devserver.
    """
    pid = 0
    raised_error = None
    finished = False
    try:
        response = json.loads(response)
        if response[0]:
            pid = response[1]
            # If provision is kicked off asynchronously, pid will be -1.
            # If provision is not successfully kicked off, pid continues
            # to be 0.
            if pid > 0:
                logging.debug('start process %r for auto_update in '
                              'devserver', pid)
                finished = self._check_for_auto_update_finished(
                        pid, wait=wait, **kwargs)
    except Exception as e:
        logging.debug('Failed to trigger auto-update process on devserver')
        finished = True
        raised_error = e

    # Deliberately NOT inside a 'finally:' clause. Returning from 'finally'
    # silently discards in-flight exceptions that 'except Exception' does
    # not handle (KeyboardInterrupt, SystemExit), which made an interrupted
    # poll look like (False, None, pid), i.e. no error, to the caller.
    return finished, raised_error, pid
2089 2090 @param error_patterns_to_check: the error patterns to check 2091 @param error_msg: the error message which may include any error 2092 pattern. 2093 2094 @return A boolean variable, True if error_msg contains any error 2095 pattern in error_patterns_to_check, False otherwise. 2096 """ 2097 for err in error_patterns_to_check: 2098 if err in error_msg: 2099 return True 2100 2101 return False 2102 2103 2104 def _is_retryable(self, error_msg): 2105 """Detect whether we will retry auto-update based on error_msg. 2106 2107 @param error_msg: The given error message. 2108 2109 @return A boolean variable which indicates whether we will retry 2110 auto_update with another devserver based on the given error_msg. 2111 """ 2112 # For now we just hard-code the error message we think it's suspicious. 2113 # When we get more date about what's the json response when devserver 2114 # is overloaded, we can update this part. 2115 retryable_error_patterns = [ERR_MSG_FOR_INVALID_DEVSERVER_RESPONSE, 2116 'is not pingable'] 2117 return self._check_error_message(retryable_error_patterns, error_msg) 2118 2119 2120 def _should_use_original_payload(self, error_msg): 2121 devserver_error_patterns = ['DevserverCannotStartError'] 2122 return self._check_error_message(devserver_error_patterns, error_msg) 2123 2124 2125 def _parse_buildname_safely(self, build_name): 2126 """Parse a given buildname safely. 2127 2128 @param build_name: the build name to be parsed. 2129 2130 @return: a tuple (board, build_type, milestone) 2131 """ 2132 try: 2133 board, build_type, milestone, _ = server_utils.ParseBuildName( 2134 build_name) 2135 except server_utils.ParseBuildNameException: 2136 logging.warning('Unable to parse build name %s for metrics. 
def _emit_auto_update_metrics(self, board, build_type, dut_host_name,
                              build_name, attempt,
                              success, failure_reason, duration):
    """Report the outcome of one auto_update attempt to the metrics backend.

    Always records the build name and the duration; the failure reason is
    recorded only when the attempt did not succeed.

    @param board: metric field, the board this auto_update tries to update.
    @param build_type: metric field, the build type this auto_update tries
                       to update.
    @param dut_host_name: metric field, the DUT this auto_update tries to
                          update.
    @param build_name: auto update build being updated to.
    @param attempt: metric field, which attempt/retry this auto_update is.
    @param success: metric field, whether this auto_update succeeded.
    @param failure_reason: DevServerExceptionClassifier object to show
                           auto update failure reason, or None.
    @param duration: auto update duration time, in seconds.
    """
    # High cardinality, but sparse: each DUT is of a single board type and
    # likely build type, and affinity keeps each DUT attached to the same
    # dev_server.
    fields = dict(board=board,
                  build_type=build_type,
                  dut_host_name=dut_host_name,
                  dev_server=self.resolved_hostname,
                  attempt=attempt,
                  success=success)

    # String gauges need reset_after=True, otherwise the value keeps being
    # emitted until the server restarts.
    build_metric = metrics.String(
            PROVISION_PATH + '/auto_update_build_by_devserver_dut',
            reset_after=True)
    build_metric.set(build_name, fields=fields)

    if not success:
        reason_metric = metrics.String(
                PROVISION_PATH +
                '/auto_update_failure_reason_by_devserver_dut',
                reset_after=True)
        if failure_reason:
            reason = failure_reason.classification
        else:
            reason = ''
        reason_metric.set(reason, fields=fields)

    duration_metric = metrics.SecondsDistribution(
            PROVISION_PATH + '/auto_update_duration_by_devserver_dut')
    duration_metric.add(duration, fields=fields)
def _parse_buildname_from_gs_uri(self, uri):
    """Get parameters needed for AU metrics when build_name is not known.

    autoupdate_EndToEndTest is run with two Google Storage URIs from the
    gs://chromeos-releases bucket. URIs in this bucket do not have the
    build_name in the format samus-release/R60-0000.0.0.

    We can get the milestone and board by checking the instructions.json
    file contained in the bucket with the payloads.

    This is best effort: the values only feed metrics, so every expected
    failure is logged and replaced by empty strings instead of raised.

    @param uri: The partial uri we received from autoupdate_EndToEndTest.

    @return: a tuple (board, build_type, milestone); ('', '', '') when the
             values could not be determined.
    """
    try:
        # Get the instructions file that contains info about the build.
        gs_file = 'gs://chromeos-releases/' + uri + '/*instructions.json'
        files = bin_utils.gs_ls(gs_file)
        # Only the first match is ever consulted: the loop body returns
        # (or raises) during its first iteration.
        for f in files:
            gs_folder, _, instruction_file = f.rpartition('/')
            self.stage_artifacts(image=uri,
                                 files=[instruction_file],
                                 archive_url=gs_folder)
            json_file = self.get_staged_file_url(instruction_file, uri)
            response = urllib2.urlopen(json_file)
            try:
                data = json.load(response)
            finally:
                # The HTTP response used to be leaked.
                response.close()
            return data['board'], 'release', data['version']['milestone']
    except (ValueError, KeyError, TypeError, error.CmdError,
            urllib2.URLError) as e:
        # KeyError/TypeError cover an instructions file without the
        # expected 'board' / 'version.milestone' layout.
        logging.debug('Problem getting values for metrics: %s', e)
        logging.warning('Unable to parse build name %s from AU test for '
                        'metrics. Continuing anyway.', uri)

    return '', '', ''
2314 @param force_original: Whether to force stateful update with the 2315 original payload. 2316 @param clobber_stateful: If True do a clean install of stateful. 2317 @param quick_provision: Attempt to use quick provision path first. 2318 2319 @return A set (is_success, pid) in which: 2320 1. is_success indicates whether this auto_update succeeds. 2321 2. pid is the process id of the successful autoupdate run. 2322 2323 @raise DevServerException if auto_update fails and is not retryable. 2324 @raise RetryableProvisionException if it fails and is retryable. 2325 """ 2326 kwargs = {'host_name': host_name, 2327 'build_name': build_name, 2328 'force_update': force_update, 2329 'full_update': full_update, 2330 'clobber_stateful': clobber_stateful, 2331 'quick_provision': quick_provision} 2332 2333 is_aue2etest = payload_filename is not None 2334 2335 if is_aue2etest: 2336 kwargs['payload_filename'] = payload_filename 2337 2338 error_msg = 'CrOS auto-update failed for host %s: %s' 2339 error_msg_attempt = 'Exception raised on auto_update attempt #%s:\n%s' 2340 is_au_success = False 2341 au_log_dir = os.path.join(log_dir, 2342 AUTO_UPDATE_LOG_DIR) if log_dir else None 2343 error_list = [] 2344 retry_with_another_devserver = False 2345 duration_list = [] 2346 2347 if is_aue2etest: 2348 board, build_type, milestone = self._parse_buildname_from_gs_uri( 2349 build_name) 2350 else: 2351 board, build_type, milestone = self._parse_buildname_safely( 2352 build_name) 2353 2354 provision_start_time = time.time() 2355 for au_attempt in range(AU_RETRY_LIMIT): 2356 logging.debug('Start CrOS auto-update for host %s at %d time(s).', 2357 host_name, au_attempt + 1) 2358 au_start_time = time.time() 2359 failure_reason = None 2360 # No matter _trigger_auto_update succeeds or fails, the auto-update 2361 # track_status_file should be cleaned, and the auto-update execute 2362 # log should be collected to directory sysinfo. 
Also, the error 2363 # raised by _trigger_auto_update should be displayed. 2364 try: 2365 # Try update with stateful.tgz of old release version in the 2366 # last try of auto-update. 2367 if force_original and original_release_version: 2368 # Monitor this case in monarch 2369 original_build = '%s/%s' % (original_board, 2370 original_release_version) 2371 c = metrics.Counter( 2372 'chromeos/autotest/provision/' 2373 'cros_update_with_original_build') 2374 f = {'dev_server': self.resolved_hostname, 2375 'board': board, 2376 'build_type': build_type, 2377 'milestone': milestone, 2378 'original_build': original_build} 2379 c.increment(fields=f) 2380 2381 logging.debug('Try updating stateful partition of the ' 2382 'host with the same version of its current ' 2383 'rootfs partition: %s', original_build) 2384 response = self._trigger_auto_update( 2385 original_build=original_build, **kwargs) 2386 else: 2387 response = self._trigger_auto_update(**kwargs) 2388 except DevServerException as e: 2389 logging.debug(error_msg_attempt, au_attempt+1, str(e)) 2390 failure_reason = DevServerExceptionClassifier(str(e)) 2391 else: 2392 _, raised_error, pid = self.check_for_auto_update_finished( 2393 response, **kwargs) 2394 2395 # Error happens in _collect_au_log won't be raised. 2396 if au_log_dir: 2397 is_collect_success = self.collect_au_log( 2398 kwargs['host_name'], pid, au_log_dir) 2399 else: 2400 is_collect_success = True 2401 2402 # Error happens in _clean_track_log won't be raised. 2403 if pid >= 0: 2404 is_clean_success = self.clean_track_log( 2405 kwargs['host_name'], pid) 2406 else: 2407 is_clean_success = True 2408 2409 # If any error is raised previously, log it and retry 2410 # auto-update. Otherwise, claim a successful CrOS auto-update. 
2411 if (not raised_error and is_clean_success and 2412 is_collect_success): 2413 logging.debug('CrOS auto-update succeed for host %s', 2414 host_name) 2415 is_au_success = True 2416 break 2417 else: 2418 if not self.kill_au_process_for_host(kwargs['host_name'], 2419 pid): 2420 logging.debug('Failed to kill auto_update process %d', 2421 pid) 2422 if raised_error: 2423 error_str = str(raised_error) 2424 logging.debug(error_msg_attempt, au_attempt + 1, 2425 error_str) 2426 if au_log_dir: 2427 logging.debug('Please see error details in log %s', 2428 self._get_au_log_filename( 2429 au_log_dir, 2430 kwargs['host_name'], 2431 pid)) 2432 failure_reason = DevServerExceptionClassifier( 2433 error_str, keep_full_trace=False) 2434 if self._is_retryable(error_str): 2435 retry_with_another_devserver = True 2436 2437 if self._should_use_original_payload(error_str): 2438 force_original = True 2439 2440 finally: 2441 duration = int(time.time() - au_start_time) 2442 duration_list.append(duration) 2443 if failure_reason: 2444 error_list.append(failure_reason) 2445 self._emit_auto_update_metrics(board, build_type, host_name, 2446 build_name, au_attempt + 1, 2447 is_au_success, failure_reason, 2448 duration) 2449 if retry_with_another_devserver: 2450 break 2451 2452 if not is_au_success and au_attempt < AU_RETRY_LIMIT - 1: 2453 time.sleep(CROS_AU_RETRY_INTERVAL) 2454 # Use the IP of DUT if the hostname failed. 
class AndroidBuildServer(ImageServerBase):
    """Class for DevServer that handles RPCs related to Android builds.

    The calls to devserver to stage artifacts, including stage and download,
    are made in async mode. That is, when caller makes an RPC |stage| to
    request devserver to stage certain artifacts, devserver handles the call
    and starts staging artifacts in a new thread, and returns |Success|
    without waiting for staging to complete. When caller receives message
    |Success|, it polls devserver's is_staged call until all artifacts are
    staged.
    Such mechanism is designed to prevent devserver from running out of
    cherrypy threads, as staging artifacts might take a long time, and
    cherrypy starts with a fixed number of threads that handle devserver rpc.
    """

    def wait_for_artifacts_staged(self, target, build_id, branch,
                                  archive_url=None, artifacts='', files=''):
        """Polling devserver.is_staged until all artifacts are staged.

        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.
        @param archive_url: Google Storage URL for the build. Only sent to
                            the devserver when it is truthy.
        @param artifacts: Comma separated list of artifacts to download.
        @param files: Comma separated list of files to download.

        @return: True if all artifacts are staged in devserver.
        """
        kwargs = {'target': target,
                  'build_id': build_id,
                  'branch': branch,
                  'artifacts': artifacts,
                  'files': files,
                  'os_type': 'android'}
        if archive_url:
            kwargs['archive_url'] = archive_url
        return self._poll_is_staged(**kwargs)


    @remote_devserver_call()
    def call_and_wait(self, call_name, target, build_id, branch, archive_url,
                      artifacts, files, error_message,
                      expected_response=SUCCESS):
        """Helper method to make a urlopen call, and wait for artifacts staged.

        @param call_name: name of devserver rpc call.
        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.
        @param archive_url: Google Storage URL for the CrOS build. Only sent
                            to the devserver when it is truthy.
        @param artifacts: Comma separated list of artifacts to download.
        @param files: Comma separated list of files to download.
        @param expected_response: Expected response from rpc, default to
                                  |Success|. If it's set to None, do not
                                  compare the actual response. Any response
                                  is considered to be good.
        @param error_message: Error message to be thrown if response does
                              not match expected_response.

        @return: The response from rpc.
        @raise DevServerException upon any return code that's not
                expected_response.

        """
        kwargs = {'target': target,
                  'build_id': build_id,
                  'branch': branch,
                  'artifacts': artifacts,
                  'files': files,
                  'os_type': 'android'}
        if archive_url:
            kwargs['archive_url'] = archive_url
        return self._call_and_wait(call_name, error_message, expected_response,
                                   **kwargs)


    @remote_devserver_call()
    def stage_artifacts(self, target=None, build_id=None, branch=None,
                        image=None, artifacts=None, files='', archive_url=None):
        """Tell the devserver to download and stage |artifacts| from |image|.

        This is the main call point for staging any specific artifacts for a
        given build. To see the list of artifacts one can stage see:

        ~src/platform/dev/artifact_info.py.

        This is maintained along with the actual devserver code.

        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.
        @param image: Name of a build to test, in the format of
                      branch/target/build_id. Only consulted when none of
                      target, build_id and branch is given.
        @param artifacts: A list of artifacts.
        @param files: A list of files to stage.
        @param archive_url: Optional parameter that has the archive_url to stage
                this artifact from. Default is specified in autotest config +
                image.

        @raise DevServerException upon any return code that's not HTTP OK,
                when the build info is incomplete, or when there is nothing
                to stage.
        """
        if image and not target and not build_id and not branch:
            branch, target, build_id = utils.parse_launch_control_build(image)
        if not target or not build_id or not branch:
            raise DevServerException('Must specify all build info (target, '
                                     'build_id and branch) to stage.')

        android_build_info = {'target': target,
                              'build_id': build_id,
                              'branch': branch}
        if not artifacts and not files:
            raise DevServerException('Must specify something to stage.')
        # A second "all build info present" check used to follow here; it
        # could never fire because the guard above already rejects any
        # falsy target/build_id/branch.
        build = ANDROID_BUILD_NAME_PATTERN % android_build_info
        self._stage_artifacts(build, artifacts, files, archive_url,
                              **android_build_info)

    def get_pull_url(self, target, build_id, branch):
        """Get the url to pull files from the devserver.

        @param target: Target of the android build, e.g., shamu_userdebug
        @param build_id: Build id of the android build.
        @param branch: Branch of the android build.

        @return A url to pull files from the dev server given a specific
                android build.
        """
        return os.path.join(self.url(), 'static', branch, target, build_id)


    def trigger_download(self, target, build_id, branch, artifacts=None,
                         files='', os='android', synchronous=True):
        """Tell the devserver to download and stage an Android build.

        Tells the devserver to fetch an Android build from the image storage
        server named by _get_image_storage_server().

        If |synchronous| is True, waits for the entire download to finish
        staging before returning. Otherwise only the artifacts necessary
        to start installing images onto DUT's will be staged before returning.
        A caller can then call finish_download to guarantee the rest of the
        artifacts have finished staging.

        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.
        @param artifacts: A string of artifacts separated by comma. If None,
               use the default artifacts for Android or Brillo build.
        @param files: String of files separated by commas.
        @param os: OS artifacts to download (android/brillo). Note that
                   inside this method the name shadows the os module.
        @param synchronous: if True, waits until all components of the image are
               staged before returning.

        @raise DevServerException upon any return code that's not HTTP OK.

        """
        android_build_info = {'target': target,
                              'build_id': build_id,
                              'branch': branch}
        build = ANDROID_BUILD_NAME_PATTERN % android_build_info
        if not artifacts:
            # The board is the part of the target before the first '-'.
            board = target.split('-')[0]
            artifacts = (
                android_utils.AndroidArtifacts.get_artifacts_for_reimage(
                        board, os))
        self._trigger_download(build, artifacts, files=files,
                               synchronous=synchronous, **android_build_info)


    def finish_download(self, target, build_id, branch, os='android'):
        """Tell the devserver to finish staging an Android build.

        If trigger_download is called with synchronous=False, it will return
        before all artifacts have been staged. This method contacts the
        devserver and blocks until all staging is completed and should be
        called after a call to trigger_download.

        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.
        @param os: OS artifacts to download (android/brillo). Note that
                   inside this method the name shadows the os module.

        @raise DevServerException upon any return code that's not HTTP OK.
        """
        android_build_info = {'target': target,
                              'build_id': build_id,
                              'branch': branch}
        build = ANDROID_BUILD_NAME_PATTERN % android_build_info
        board = target.split('-')[0]
        # Forward |os| exactly like trigger_download does. It used to be
        # dropped, so the default artifact list was awaited regardless of
        # the |os| the caller asked for.
        artifacts = (
                android_utils.AndroidArtifacts.get_artifacts_for_reimage(
                        board, os))
        self._finish_download(build, artifacts, files='', **android_build_info)


    def get_staged_file_url(self, filename, target, build_id, branch):
        """Returns the url of a staged file for this image on the devserver.

        @param filename: Name of the file.
        @param target: Target of the android build to stage, e.g.,
                       shamu-userdebug.
        @param build_id: Build id of the android build to stage.
        @param branch: Branch of the android build to stage.

        @return: The url of a staged file for this image on the devserver.
        """
        android_build_info = {'target': target,
                              'build_id': build_id,
                              'branch': branch,
                              'os_type': 'android'}
        build = ANDROID_BUILD_NAME_PATTERN % android_build_info
        return '/'.join([self._get_image_url(build), filename])


    @remote_devserver_call()
    def translate(self, build_name):
        """Translate the build name if it's in LATEST format.

        If the build name is in the format [branch]/[target]/LATEST, return the
        latest build in Launch Control otherwise return the build name as is.
        The LATEST marker is matched case-insensitively.

        @param build_name: build_name to check.

        @return The actual build name to use.
        """
        branch, target, build_id = utils.parse_launch_control_build(build_name)
        if build_id.upper() != 'LATEST':
            return build_name
        call = self.build_call('latestbuild', branch=branch, target=target,
                               os_type='android')
        translated_build_id = self.run_call(call)
        translated_build = (ANDROID_BUILD_NAME_PATTERN %
                            {'branch': branch,
                             'target': target,
                             'build_id': translated_build_id})
        logging.debug('Translated relative build %s to %s', build_name,
                      translated_build)
        return translated_build
2767 2768 """ 2769 return int(devserver1[DevServer.DISK_IO] - devserver2[DevServer.DISK_IO]) 2770 2771 2772 def _get_subnet_for_host_ip(host_ip, 2773 restricted_subnets=utils.RESTRICTED_SUBNETS): 2774 """Get the subnet for a given host IP. 2775 2776 @param host_ip: the IP of a DUT. 2777 @param restricted_subnets: A list of restriected subnets. 2778 2779 @return: a (subnet_ip, mask_bits) tuple. If no matched subnet for the 2780 host_ip, return (None, None). 2781 """ 2782 for subnet_ip, mask_bits in restricted_subnets: 2783 if utils.is_in_same_subnet(host_ip, subnet_ip, mask_bits): 2784 return subnet_ip, mask_bits 2785 2786 return None, None 2787 2788 2789 def get_least_loaded_devserver(devserver_type=ImageServer, hostname=None): 2790 """Get the devserver with the least load. 2791 2792 Iterate through all devservers and get the one with least load. 2793 2794 TODO(crbug.com/486278): Devserver with required build already staged should 2795 take higher priority. This will need check_health call to be able to verify 2796 existence of a given build/artifact. Also, in case all devservers are 2797 overloaded, the logic here should fall back to the old behavior that randomly 2798 selects a devserver based on the hash of the image name/url. 2799 2800 @param devserver_type: Type of devserver to select from. Default is set to 2801 ImageServer. 2802 @param hostname: Hostname of the dut that the devserver is used for. The 2803 picked devserver needs to respect the location of the host if 2804 `prefer_local_devserver` is set to True or `restricted_subnets` is 2805 set. 2806 2807 @return: Name of the devserver with the least load. 2808 2809 """ 2810 logging.debug('Get the least loaded %r', devserver_type) 2811 devservers, can_retry = devserver_type.get_available_devservers( 2812 hostname) 2813 # If no healthy devservers available and can_retry is False, return None. 2814 # Otherwise, relax the constrain on hostname, allow all devservers to be 2815 # available. 
2816 if not devserver_type.get_healthy_devserver('', devservers): 2817 if not can_retry: 2818 return None 2819 else: 2820 devservers, _ = devserver_type.get_available_devservers() 2821 2822 # get_devserver_load call needs to be made in a new process to allow force 2823 # timeout using signal. 2824 output = multiprocessing.Queue() 2825 processes = [] 2826 for devserver in devservers: 2827 processes.append(multiprocessing.Process( 2828 target=devserver_type.get_devserver_load_wrapper, 2829 args=(devserver, TIMEOUT_GET_DEVSERVER_LOAD, output))) 2830 2831 for p in processes: 2832 p.start() 2833 for p in processes: 2834 p.join() 2835 loads = [output.get() for p in processes] 2836 # Filter out any load failed to be retrieved or does not support load check. 2837 loads = [load for load in loads if load and DevServer.CPU_LOAD in load and 2838 DevServer.is_free_disk_ok(load) and 2839 DevServer.is_apache_client_count_ok(load)] 2840 if not loads: 2841 logging.debug('Failed to retrieve load stats from any devserver. No ' 2842 'load balancing can be applied.') 2843 return None 2844 loads = [load for load in loads if _is_load_healthy(load)] 2845 if not loads: 2846 logging.error('No devserver has the capacity to be selected.') 2847 return None 2848 loads = sorted(loads, cmp=_compare_load) 2849 return loads[0]['devserver'] 2850 2851 2852 def resolve(build, hostname=None, ban_list=None): 2853 """Resolve a devserver can be used for given build and hostname. 2854 2855 @param build: Name of a build to stage on devserver, e.g., 2856 ChromeOS build: daisy-release/R50-1234.0.0 2857 Launch Control build: git_mnc_release/shamu-eng 2858 @param hostname: Hostname of a devserver for, default is None, which means 2859 devserver is not restricted by the network location of the host. 2860 @param ban_list: The blacklist of devservers shouldn't be chosen. 2861 2862 @return: A DevServer instance that can be used to stage given build for the 2863 given host. 
2864 """ 2865 if utils.is_launch_control_build(build): 2866 return AndroidBuildServer.resolve(build, hostname) 2867 else: 2868 return ImageServer.resolve(build, hostname, ban_list=ban_list) 2869