Home | History | Annotate | Download | only in site_utils
      1 #!/usr/bin/env python
      2 # Copyright 2015 The Chromium Authors. All rights reserved.
      3 # Use of this source code is governed by a BSD-style license that can be
      4 # found in the LICENSE file.
      5 
      6 """Cleanup orphaned containers.
      7 
      8 If an autoserv process dies without being able to call its SIGTERM handler, the
      9 container used to run the test will be orphaned. This adds overhead to the
     10 drone. This script is used to clean up such containers.
     11 
     12 This module also checks if the test job associated with a container has
     13 finished. If so, it kills the autoserv process for the test job and destroys
     14 the container. To avoid a race condition, this only applies to jobs that
     15 finished at least 1 hour ago.
     16 
     17 """
     18 
     19 import argparse
     20 import datetime
     21 import logging
     22 import os
     23 import re
     24 import signal
     25 
     26 import common
     27 from autotest_lib.client.common_lib import logging_config
     28 from autotest_lib.client.common_lib import time_utils
     29 from autotest_lib.client.common_lib import utils
     30 from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
     31 from autotest_lib.site_utils import lxc
     32 
     33 
# Module-level AFE client, used to look up the host queue entries of a job.
# NOTE(review): timeout_min/delay_sec presumably bound the total retry window
# and the delay between attempts -- confirm against RetryingAFE.
AFE = frontend_wrappers.RetryingAFE(timeout_min=0.1, delay_sec=10)
# The cutoff time to declare that a test job is completed and its container is
# orphaned. This avoids a race condition where the scheduler aborts a job while
# autoserv is still in the process of destroying the container it used.
# The value is computed once, when this module is loaded, so "1 hour ago" is
# relative to the time the script started.
FINISHED_JOB_CUTOFF_TIME = datetime.datetime.now() - datetime.timedelta(hours=1)
     39 
     40 def get_info(container_name):
     41     """Get job id and autoserv process id from container name.
     42 
     43     @param container: Name of the container.
     44 
     45     @return: job id and autoserv process id for the given container name.
     46 
     47     """
     48     match = re.match('test_(\d+)_(\d+)_(\d+)', container_name)
     49     if not match:
     50         # Container is not created for test, e.g., the base container.
     51         return None, None
     52     job_id = int(match.groups()[0])
     53     pid = match.groups()[2]
     54     return job_id, pid
     55 
     56 
     57 def is_container_orphaned(container):
     58     """Check if a container is orphaned.
     59 
     60     A container is orphaned if any of these condition is True:
     61     1. The autoserv process created the container is no longer running.
     62     2. The test job is finished at least 1 hour ago.
     63 
     64     @param container: A Container object.
     65 
     66     @return: True if the container is orphaned.
     67 
     68     """
     69     logging.debug('Checking if container is orphaned: %s', container.name)
     70     job_id, pid = get_info(container.name)
     71     if not job_id:
     72         logging.debug('Container %s is not created for test.', container.name)
     73         return False
     74 
     75     if pid and not utils.pid_is_alive(pid):
     76         logging.debug('Process with PID %s is not alive, container %s is '
     77                       'orphaned.', pid, container.name)
     78         return True
     79 
     80     try:
     81         hqes = AFE.get_host_queue_entries(job_id=job_id)
     82     except Exception as e:
     83         logging.error('Failed to get hqe for job %s. Error: %s.', job_id, e)
     84         return False
     85 
     86     if not hqes:
     87         # The job has not run yet.
     88         return False
     89     for hqe in hqes:
     90         if hqe.active or not hqe.complete:
     91             logging.debug('Test job %s is not completed yet, container %s is '
     92                           'not orphaned.', job_id, container.name)
     93             return False
     94         if (hqe.finished_on and
     95             (time_utils.time_string_to_datetime(hqes.finished_on) >
     96              FINISHED_JOB_CUTOFF_TIME)):
     97             logging.debug('Test job %s was completed less than an hour ago.',
     98                           job_id)
     99             return False
    100 
    101     logging.debug('Test job %s was completed, container %s is orphaned.',
    102                   job_id, container.name)
    103     return True
    104 
    105 
    106 def cleanup(container, options):
    107     """Cleanup orphaned container.
    108 
    109     @param container: A Container object to be cleaned up.
    110     @param options: Options to do cleanup.
    111 
    112     @return: True if cleanup is successful. False otherwise.
    113 
    114     """
    115     if not options.execute:
    116         logging.info('dryrun: Cleanup container %s', container.name)
    117         return False
    118 
    119     try:
    120         _, pid = get_info(container.name)
    121         # Kill autoserv process
    122         if pid and utils.pid_is_alive(pid):
    123             logging.info('Stopping process %s...', pid)
    124             utils.nuke_pid(int(pid), (signal.SIGKILL,))
    125 
    126         # Destroy container
    127         logging.info('Destroying container %s...', container.name)
    128         container.destroy()
    129         return True
    130     except Exception as e:
    131         logging.error('Failed to cleanup container %s. Error: %s',
    132                       container.name, e)
    133         return False
    134 
    135 
    136 def parse_options():
    137     """Parse command line inputs.
    138 
    139     @return: Options to run the script.
    140     """
    141     parser = argparse.ArgumentParser()
    142     parser.add_argument('-v', '--verbose', action='store_true',
    143                         default=False,
    144                         help='Print out ALL entries.')
    145     parser.add_argument('-x', '--execute', action='store_true',
    146                         default=False,
    147                         help=('Execute the actions to kill autoserv processes '
    148                               'and destroy containers. Default is False to do '
    149                               'dry run'))
    150     # TODO(dshi): Consider to adopt the scheduler log model:
    151     # 1. Create one log per run.
    152     # 2. Create a symlink to the latest log.
    153     parser.add_argument('-l', '--logfile', type=str,
    154                         default=None,
    155                         help='Path to the log file to save logs.')
    156     return parser.parse_args()
    157 
    158 
    159 def main(options):
    160     """Main script.
    161 
    162     @param options: Options to run the script.
    163     """
    164     config = logging_config.LoggingConfig()
    165     if options.logfile:
    166         config.add_file_handler(
    167                 file_path=os.path.abspath(options.logfile),
    168                 level=logging.DEBUG if options.verbose else logging.INFO)
    169 
    170     bucket = lxc.ContainerBucket()
    171     logging.info('')
    172     logging.info('Cleaning container bucket %s', bucket.container_path)
    173     success_count = 0
    174     failure_count = 0
    175     for container in bucket.get_all().values():
    176         if is_container_orphaned(container):
    177             if cleanup(container, options):
    178                 success_count += 1
    179             else:
    180                 failure_count += 1
    181     logging.info('Cleanup finished.')
    182 
    183 
if __name__ == '__main__':
    # Parse the command line and run the cleanup only when this file is
    # executed as a script, not when it is imported as a module.
    options = parse_options()
    main(options)
    187