#!/usr/bin/env python
# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Cleanup orphaned containers.

If an autoserv process dies without being able to call its SIGTERM handler,
the container used to run the test will be orphaned. This adds overhead to the
drone. This script is used to clean up such containers.

This module also checks if the test job associated with a container has
finished. If so, kill the autoserv process for the test job and destroy the
container. To avoid a race condition, this only applies to jobs that finished
at least 1 hour ago.

"""

import argparse
import datetime
import logging
import os
import re
import signal

import common
from autotest_lib.client.common_lib import logging_config
from autotest_lib.client.common_lib import time_utils
from autotest_lib.client.common_lib import utils
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.site_utils import lxc


AFE = frontend_wrappers.RetryingAFE(timeout_min=0.1, delay_sec=10)
# The cutoff time to declare a test job is completed and container is orphaned.
# This is to avoid a race condition that scheduler aborts a job and autoserv
# is still in the process of destroying the container it used.
FINISHED_JOB_CUTOFF_TIME = datetime.datetime.now() - datetime.timedelta(hours=1)

# Pattern of the name of a container created for a test. The first numeric
# group is used as the job id and the third as the autoserv process id; the
# second group is not used by this script.
_TEST_CONTAINER_NAME_PATTERN = r'test_(\d+)_(\d+)_(\d+)'


def get_info(container_name):
    """Get job id and autoserv process id from container name.

    @param container_name: Name of the container.

    @return: A tuple of (job id, autoserv process id) for the given container
             name. The job id is an int and the process id is the string
             captured from the name. (None, None) is returned if the name
             does not match the test container naming pattern.

    """
    match = re.match(_TEST_CONTAINER_NAME_PATTERN, container_name)
    if not match:
        # Container is not created for test, e.g., the base container.
        return None, None
    job_id = int(match.group(1))
    pid = match.group(3)
    return job_id, pid


def is_container_orphaned(container):
    """Check if a container is orphaned.

    A container is orphaned if any of these conditions is True:
    1. The autoserv process that created the container is no longer running.
    2. The test job finished at least 1 hour ago.

    @param container: A Container object.

    @return: True if the container is orphaned.

    """
    logging.debug('Checking if container is orphaned: %s', container.name)
    job_id, pid = get_info(container.name)
    if not job_id:
        logging.debug('Container %s is not created for test.', container.name)
        return False

    if pid and not utils.pid_is_alive(pid):
        logging.debug('Process with PID %s is not alive, container %s is '
                      'orphaned.', pid, container.name)
        return True

    try:
        hqes = AFE.get_host_queue_entries(job_id=job_id)
    except Exception as e:
        # Be conservative: if the job state can not be determined, do not
        # treat the container as orphaned.
        logging.error('Failed to get hqe for job %s. Error: %s.', job_id, e)
        return False

    if not hqes:
        # The job has not run yet.
        return False
    for hqe in hqes:
        if hqe.active or not hqe.complete:
            logging.debug('Test job %s is not completed yet, container %s is '
                          'not orphaned.', job_id, container.name)
            return False
        # Check the finish time of this entry (not of the list of entries)
        # against the cutoff time.
        if (hqe.finished_on and
            (time_utils.time_string_to_datetime(hqe.finished_on) >
             FINISHED_JOB_CUTOFF_TIME)):
            logging.debug('Test job %s was completed less than an hour ago.',
                          job_id)
            return False

    logging.debug('Test job %s was completed, container %s is orphaned.',
                  job_id, container.name)
    return True


def cleanup(container, options):
    """Cleanup orphaned container.

    Kills the autoserv process named in the container name if it is still
    alive, then destroys the container. Nothing is done if options.execute is
    not set (dry run).

    @param container: A Container object to be cleaned up.
    @param options: Options to do cleanup.

    @return: True if cleanup is successful. False otherwise, including the
             dry run case.

    """
    if not options.execute:
        logging.info('dryrun: Cleanup container %s', container.name)
        return False

    try:
        _, pid = get_info(container.name)
        # Kill autoserv process
        if pid and utils.pid_is_alive(pid):
            logging.info('Stopping process %s...', pid)
            utils.nuke_pid(int(pid), (signal.SIGKILL,))

        # Destroy container
        logging.info('Destroying container %s...', container.name)
        container.destroy()
        return True
    except Exception as e:
        # Best effort: a failure on one container is logged and reported to
        # the caller through the return value.
        logging.error('Failed to cleanup container %s. Error: %s',
                      container.name, e)
        return False


def parse_options():
    """Parse command line inputs.

    @return: Options to run the script.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--verbose', action='store_true',
                        default=False,
                        help='Print out ALL entries.')
    parser.add_argument('-x', '--execute', action='store_true',
                        default=False,
                        help=('Execute the actions to kill autoserv processes '
                              'and destroy containers. Default is False to do '
                              'dry run'))
    # TODO(dshi): Consider to adopt the scheduler log model:
    # 1. Create one log per run.
    # 2. Create a symlink to the latest log.
    parser.add_argument('-l', '--logfile', type=str,
                        default=None,
                        help='Path to the log file to save logs.')
    return parser.parse_args()


def main(options):
    """Main script.

    Goes through all containers in the container bucket and cleans up the
    ones that are orphaned.

    @param options: Options to run the script.
    """
    config = logging_config.LoggingConfig()
    if options.logfile:
        config.add_file_handler(
                file_path=os.path.abspath(options.logfile),
                level=logging.DEBUG if options.verbose else logging.INFO)

    bucket = lxc.ContainerBucket()
    logging.info('')
    logging.info('Cleaning container bucket %s', bucket.container_path)
    success_count = 0
    failure_count = 0
    for container in bucket.get_all().values():
        if is_container_orphaned(container):
            if cleanup(container, options):
                success_count += 1
            else:
                # cleanup also returns False in dry run mode.
                failure_count += 1
    logging.info('Cleanup finished.')
    logging.info('Orphaned containers cleaned up: %d, not cleaned up (failed '
                 'or dry run): %d', success_count, failure_count)


if __name__ == '__main__':
    options = parse_options()
    main(options)