Home | History | Annotate | Download | only in scheduler
      1 #!/usr/bin/python -u
      2 import os, socket, sys, signal, time, subprocess, logging
      3 from optparse import OptionParser
      4 import common
      5 from autotest_lib.scheduler import babysitter_logging_config
      6 from autotest_lib.client.common_lib import error, global_config, utils
      7 from autotest_lib.client.common_lib import logging_manager
      8 from autotest_lib.scheduler import scheduler_logging_config
      9 from autotest_lib.scheduler import status_server
     10 from autotest_lib.scheduler import monitor_db
     11 
     12 PAUSE_LENGTH = 60
     13 STALL_TIMEOUT = 2*60*60
     14 
     15 parser = OptionParser()
     16 parser.add_option("-r", action="store_true", dest="recover",
     17                   help=("run recovery mode (implicit after any crash)"))
     18 parser.add_option("--background", dest="background", action="store_true",
     19                   default=False, help=("runs the scheduler monitor on "
     20                                        "background"))
     21 (options, args) = parser.parse_args()
     22 
     23 autodir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
     24 results_dir = os.path.join(autodir, 'results')
     25 monitor_db_path = os.path.join(autodir, 'scheduler/monitor_db.py')
     26 recover = (options.recover == True)
     27 
     28 if len(args) != 0:
     29     parser.print_help()
     30     sys.exit(1)
     31 
     32 
     33 def run_banner_output(cmd):
     34     """Returns ------ CMD ------\nCMD_OUTPUT in a string"""
     35     banner_output = '%s\n%%s\n\n' % cmd.center(60, '-')
     36     command_output = ''
     37     try:
     38         cmd_out = utils.run(cmd, ignore_status=True, timeout=30)
     39         command_output = cmd_out.stdout + cmd_out.stderr
     40     except error.CmdError:
     41         command_output = 'Timed out'
     42 
     43     return banner_output % command_output
     44 
     45 
     46 def kill_monitor():
     47     logging.info("Killing monitor_db")
     48     # try shutdown first
     49     utils.signal_program(monitor_db.PID_FILE_PREFIX, sig=signal.SIGINT)
     50     if utils.program_is_alive(monitor_db.PID_FILE_PREFIX): # was it killed?
     51         # give it some time to shutdown
     52         time.sleep(30)
     53         # kill it
     54         utils.signal_process(monitor_db.PID_FILE_PREFIX)
     55 
     56 
     57 def handle_sigterm(signum, frame):
     58     logging.info('Caught SIGTERM')
     59     kill_monitor()
     60     utils.delete_pid_file_if_exists(monitor_db.BABYSITTER_PID_FILE_PREFIX)
     61     sys.exit(1)
     62 
     63 signal.signal(signal.SIGTERM, handle_sigterm)
     64 
     65 
     66 SiteMonitorProc = utils.import_site_class(
     67     __file__, 'autotest_lib.scheduler.site_monitor_db_babysitter',
     68     'SiteMonitorProc', object)
     69 
     70 
     71 class MonitorProc(SiteMonitorProc):
     72     def __init__(self, do_recovery=False):
     73         args = [monitor_db_path]
     74         if do_recovery:
     75             args.append("--recover-hosts")
     76         args.append(results_dir)
     77 
     78         kill_monitor()
     79         environ = os.environ
     80         scheduler_config = scheduler_logging_config.SchedulerLoggingConfig
     81         log_name = scheduler_config.get_log_name()
     82         os.environ['AUTOTEST_SCHEDULER_LOG_NAME'] = log_name
     83         scheduler_log_dir = scheduler_config.get_server_log_dir()
     84         self.log_path = os.path.join(scheduler_log_dir, log_name)
     85 
     86         self.log_size = 0
     87         self.last_log_change = time.time()
     88 
     89         logging.info("STARTING monitor_db with log file %s" % self.log_path)
     90         self.args = args
     91 
     92         # Allow site specific code to run, set environment variables and
     93         # modify self.args if desired.
     94         super(MonitorProc, self).__init__()
     95 
     96 
     97     def start(self):
     98         devnull = open(os.devnull, 'w')
     99         self.proc = subprocess.Popen(self.args, stdout=devnull)
    100 
    101 
    102     def is_running(self):
    103         if self.proc.poll() is not None:
    104             logging.info("monitor_db DIED")
    105             return False
    106 
    107         old_size = self.log_size
    108         new_size = os.path.getsize(self.log_path)
    109         if old_size != new_size:
    110             logging.info("Log was touched")
    111             self.log_size = new_size
    112             self.last_log_change = time.time()
    113         elif self.last_log_change + STALL_TIMEOUT < time.time():
    114             logging.info("monitor_db STALLED")
    115             self.collect_stalled_info()
    116             return False
    117 
    118         return True
    119 
    120 
    121     def collect_stalled_info(self):
    122         INFO_TO_COLLECT = ['uptime',
    123                            'ps auxwww',
    124                            'iostat -k -x 2 4',
    125                           ]
    126         db_cmd = '/usr/bin/mysqladmin --verbose processlist -u%s -p%s'
    127         config = global_config.global_config
    128         try:
    129             user = config.get_config_value("BACKUP", "user")
    130             password = config.get_config_value("BACKUP", "password")
    131             db_cmd %= (user, password)
    132             INFO_TO_COLLECT.append(db_cmd)
    133         except global_config.ConfigError:
    134             pass
    135         stall_log_path = self.log_path + '.stall_info'
    136         log = open(stall_log_path, "w")
    137         for cmd in INFO_TO_COLLECT:
    138             log.write(run_banner_output(cmd))
    139 
    140         log.close()
    141 
    142 
    143 if os.getuid() == 0:
    144     logging.critical("Running as root, aborting!")
    145     sys.exit(1)
    146 
    147 if utils.program_is_alive(monitor_db.BABYSITTER_PID_FILE_PREFIX):
    148     logging.critical("Monitor_db_babysitter already running, aborting!")
    149     sys.exit(1)
    150 
    151 utils.write_pid(monitor_db.BABYSITTER_PID_FILE_PREFIX)
    152 
    153 if options.background:
    154     logging_manager.configure_logging(
    155            babysitter_logging_config.BabysitterLoggingConfig(use_console=False))
    156 
    157     # Double fork - see http://code.activestate.com/recipes/66012/
    158     try:
    159         pid = os.fork()
    160         if (pid > 0):
    161             sys.exit(0) # exit from first parent
    162     except OSError, e:
    163         sys.stderr.write("fork #1 failed: (%d) %s\n" % (e.errno, e.strerror))
    164         sys.exit(1)
    165 
    166     # Decouple from parent environment
    167     os.chdir("/")
    168     os.umask(0)
    169     os.setsid()
    170 
    171     # Second fork
    172     try:
    173         pid = os.fork()
    174         if (pid > 0):
    175             sys.exit(0) # exit from second parent
    176     except OSError, e:
    177         sys.stderr.write("fork #2 failed: (%d) %s\n" % (e.errno, e.strerror))
    178         sys.exit(1)
    179 else:
    180     logging_manager.configure_logging(
    181                             babysitter_logging_config.BabysitterLoggingConfig())
    182 
    183 
    184 while True:
    185     sock = socket.socket()
    186     try:
    187         # Try to bind to the same port as the status_server.
    188         sock.bind(('localhost', status_server._PORT))
    189     except socket.error, msg:
    190         # If binding failed, open the port.
    191         logging.error('Failed to open socket with error:%s. Closing socket.',
    192                       msg)
    193         release_port_cmd_list = ['fuser', '-k', '-n', 'tcp',
    194                                  '%d' % status_server._PORT]
    195         process = subprocess.Popen(release_port_cmd_list)
    196         process.wait()
    197     sock.close()
    198     proc = MonitorProc(do_recovery=recover)
    199     proc.start()
    200     time.sleep(PAUSE_LENGTH)
    201     while proc.is_running():
    202         logging.info("Tick")
    203         time.sleep(PAUSE_LENGTH)
    204     recover = False
    205