Home | History | Annotate | Download | only in site_utils
      1 #!/usr/bin/env python
      2 # Copyright 2015 The Chromium Authors. All rights reserved.
      3 # Use of this source code is governed by a BSD-style license that can be
      4 # found in the LICENSE file.
      5 
      6 """Cleanup orphaned containers.
      7 
      8 If an autoserv process dies before its SIGTERM handler can run, the
      9 container used to run the test is orphaned. This adds overhead to the
     10 drone. This script is used to clean up such containers.
     11 
     12 This module also checks whether the test job associated with a container has
     13 finished. If so, it kills the autoserv process for the test job and destroys
     14 the container. To avoid a race condition, this only applies to jobs that
     15 finished at least 1 hour ago.
     16 
     17 """
     18 
     19 import argparse
     20 import datetime
     21 import logging
     22 import os
     23 import signal
     24 
     25 import common
     26 from autotest_lib.client.common_lib import logging_config
     27 from autotest_lib.client.common_lib import time_utils
     28 from autotest_lib.client.common_lib import utils
     29 from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
     30 from autotest_lib.site_utils import lxc
     31 
     32 
# AFE client shared by this module; is_container_orphaned uses it to look up
# the host queue entries of a container's job.
# NOTE(review): timeout_min/delay_sec presumably bound how long and how often
# a failed RPC is retried -- confirm against frontend_wrappers.RetryingAFE.
AFE = frontend_wrappers.RetryingAFE(timeout_min=0.1, delay_sec=10)
# The cutoff time to declare a test job is completed and container is orphaned.
# This is to avoid a race condition that scheduler aborts a job and autoserv
# is still in the process of destroying the container it used.
# Evaluated once, when the module is loaded, so the one-hour window is measured
# from script start rather than from each individual container check.
FINISHED_JOB_CUTOFF_TIME = datetime.datetime.now() - datetime.timedelta(hours=1)
     38 
     39 def is_container_orphaned(container):
     40     """Check if a container is orphaned.
     41 
     42     A container is orphaned if any of these condition is True:
     43     1. The autoserv process created the container is no longer running.
     44     2. The test job is finished at least 1 hour ago.
     45 
     46     @param container: A Container object.
     47 
     48     @return: True if the container is orphaned.
     49 
     50     """
     51     logging.debug('Checking if container is orphaned: %s', container.name)
     52     if container.id is None:
     53         logging.debug('Container %s is not created for test.', container.name)
     54         return False
     55 
     56     job_id = container.id.job_id
     57     pid = container.id.pid
     58 
     59     if pid and not utils.pid_is_alive(pid):
     60         logging.debug('Process with PID %s is not alive, container %s is '
     61                       'orphaned.', pid, container.name)
     62         return True
     63 
     64     try:
     65         hqes = AFE.get_host_queue_entries(job_id=job_id)
     66     except Exception as e:
     67         logging.error('Failed to get hqe for job %s. Error: %s.', job_id, e)
     68         return False
     69 
     70     if not hqes:
     71         # The job has not run yet.
     72         return False
     73     for hqe in hqes:
     74         if hqe.active or not hqe.complete:
     75             logging.debug('Test job %s is not completed yet, container %s is '
     76                           'not orphaned.', job_id, container.name)
     77             return False
     78         if (hqe.finished_on and
     79             (time_utils.time_string_to_datetime(hqe.finished_on) >
     80              FINISHED_JOB_CUTOFF_TIME)):
     81             logging.debug('Test job %s was completed less than an hour ago.',
     82                           job_id)
     83             return False
     84 
     85     logging.debug('Test job %s was completed, container %s is orphaned.',
     86                   job_id, container.name)
     87     return True
     88 
     89 
     90 def cleanup(container, options):
     91     """Cleanup orphaned container.
     92 
     93     @param container: A Container object to be cleaned up.
     94     @param options: Options to do cleanup.
     95 
     96     @return: True if cleanup is successful. False otherwise.
     97 
     98     """
     99     if not options.execute:
    100         logging.info('dryrun: Cleanup container %s', container.name)
    101         return False
    102 
    103     try:
    104         # cleanup is protected by is_container_orphaned.  At this point the
    105         # container may be assumed to have a valid ID.
    106         pid = container.id.pid
    107         # Kill autoserv process
    108         if pid and utils.pid_is_alive(pid):
    109             logging.info('Stopping process %s...', pid)
    110             utils.nuke_pid(int(pid), (signal.SIGKILL,))
    111 
    112         # Destroy container
    113         logging.info('Destroying container %s...', container.name)
    114         container.destroy()
    115         return True
    116     except Exception as e:
    117         logging.error('Failed to cleanup container %s. Error: %s',
    118                       container.name, e)
    119         return False
    120 
    121 
    122 def parse_options():
    123     """Parse command line inputs.
    124 
    125     @return: Options to run the script.
    126     """
    127     parser = argparse.ArgumentParser()
    128     parser.add_argument('-v', '--verbose', action='store_true',
    129                         default=False,
    130                         help='Print out ALL entries.')
    131     parser.add_argument('-x', '--execute', action='store_true',
    132                         default=False,
    133                         help=('Execute the actions to kill autoserv processes '
    134                               'and destroy containers. Default is False to do '
    135                               'dry run'))
    136     # TODO(dshi): Consider to adopt the scheduler log model:
    137     # 1. Create one log per run.
    138     # 2. Create a symlink to the latest log.
    139     parser.add_argument('-l', '--logfile', type=str,
    140                         default=None,
    141                         help='Path to the log file to save logs.')
    142     return parser.parse_args()
    143 
    144 
    145 def main(options):
    146     """Main script.
    147 
    148     @param options: Options to run the script.
    149     """
    150     config = logging_config.LoggingConfig()
    151     if options.logfile:
    152         config.add_file_handler(
    153                 file_path=os.path.abspath(options.logfile),
    154                 level=logging.DEBUG if options.verbose else logging.INFO)
    155 
    156     bucket = lxc.ContainerBucket()
    157     logging.info('')
    158     logging.info('Cleaning container bucket %s', bucket.container_path)
    159     success_count = 0
    160     failure_count = 0
    161     for container in bucket.get_all().values():
    162         if is_container_orphaned(container):
    163             if cleanup(container, options):
    164                 success_count += 1
    165             else:
    166                 failure_count += 1
    167     logging.info('Cleanup finished.')
    168 
    169 
    170 if __name__ == '__main__':
    171     options = parse_options()
    172     main(options)
    173