#!/usr/bin/python -u
"""Babysitter for the monitor_db scheduler process.

Starts monitor_db, polls it every PAUSE_LENGTH seconds, and starts a fresh
instance whenever the process exits or its log file has not changed size for
STALL_TIMEOUT seconds.
"""
import os
import socket
import sys
import signal
import time
import subprocess
import logging
from optparse import OptionParser
import common
from autotest_lib.scheduler import babysitter_logging_config
from autotest_lib.client.common_lib import error, global_config, utils
from autotest_lib.client.common_lib import logging_manager
from autotest_lib.scheduler import scheduler_logging_config
from autotest_lib.scheduler import status_server
from autotest_lib.scheduler import monitor_db

# Seconds to sleep between two liveness checks of monitor_db.
PAUSE_LENGTH = 60
# Seconds the monitor_db log may stay unchanged before it counts as stalled.
STALL_TIMEOUT = 2*60*60

parser = OptionParser()
parser.add_option("-r", action="store_true", dest="recover",
                  help=("run recovery mode (implicit after any crash)"))
parser.add_option("--background", dest="background", action="store_true",
                  default=False, help=("runs the scheduler monitor on "
                                       "background"))
(options, args) = parser.parse_args()

autodir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
results_dir = os.path.join(autodir, 'results')
monitor_db_path = os.path.join(autodir, 'scheduler/monitor_db.py')
# options.recover is None when -r was not given; normalise to a real bool.
recover = bool(options.recover)

if len(args) != 0:
    parser.print_help()
    sys.exit(1)


def run_banner_output(cmd):
    """Run cmd and return its output underneath a banner naming the command.

    The result has the form '------ CMD ------', a newline, the combined
    stdout and stderr of the command, and two trailing newlines.  If
    utils.run raises error.CmdError the output part is 'Timed out'.

    @param cmd: command line string to execute.
    @return the banner and command output as a single string.
    """
    command_output = ''
    try:
        cmd_out = utils.run(cmd, ignore_status=True, timeout=30)
        command_output = cmd_out.stdout + cmd_out.stderr
    except error.CmdError:
        command_output = 'Timed out'

    # Format both pieces in one step.  Feeding the banner back in as a
    # format string breaks as soon as cmd itself contains a '%'.
    return '%s\n%s\n\n' % (cmd.center(60, '-'), command_output)


def kill_monitor():
    """Stop the running monitor_db, first with SIGINT, then forcefully.

    Sends SIGINT to the program registered under monitor_db.PID_FILE_PREFIX;
    if it is still alive, waits 30 seconds and signals it again.
    """
    logging.info("Killing monitor_db")
    # try shutdown first
    utils.signal_program(monitor_db.PID_FILE_PREFIX, sig=signal.SIGINT)
    if utils.program_is_alive(monitor_db.PID_FILE_PREFIX): # was it killed?
        # give it some time to shutdown
        time.sleep(30)
        # kill it
        # NOTE(review): the first attempt uses utils.signal_program; confirm
        # that utils.signal_process exists and accepts a pid file prefix.
        utils.signal_process(monitor_db.PID_FILE_PREFIX)


def handle_sigterm(signum, frame):
    """SIGTERM handler: stop monitor_db, drop our pid file and exit(1).

    @param signum: signal number delivered (unused).
    @param frame: interrupted stack frame (unused).
    """
    logging.info('Caught SIGTERM')
    kill_monitor()
    utils.delete_pid_file_if_exists(monitor_db.BABYSITTER_PID_FILE_PREFIX)
    sys.exit(1)

signal.signal(signal.SIGTERM, handle_sigterm)


# Base class for MonitorProc; utils.import_site_class is given `object` as
# the fallback when no site-specific class is supplied.
SiteMonitorProc = utils.import_site_class(
    __file__, 'autotest_lib.scheduler.site_monitor_db_babysitter',
    'SiteMonitorProc', object)


class MonitorProc(SiteMonitorProc):
    """Handle on one monitor_db child process and the log file it writes."""

    def __init__(self, do_recovery=False):
        """Kill any existing monitor_db and prepare to launch a new one.

        @param do_recovery: if True, pass --recover-hosts to monitor_db.
        """
        args = [monitor_db_path]
        if do_recovery:
            args.append("--recover-hosts")
        args.append(results_dir)

        kill_monitor()
        scheduler_config = scheduler_logging_config.SchedulerLoggingConfig
        log_name = scheduler_config.get_log_name()
        # Exported through the environment so the child process sees it.
        os.environ['AUTOTEST_SCHEDULER_LOG_NAME'] = log_name
        scheduler_log_dir = scheduler_config.get_server_log_dir()
        self.log_path = os.path.join(scheduler_log_dir, log_name)

        # Last observed log size and the time it last changed; is_running()
        # uses these for stall detection.
        self.log_size = 0
        self.last_log_change = time.time()

        logging.info("STARTING monitor_db with log file %s", self.log_path)
        self.args = args

        # Allow site specific code to run, set environment variables and
        # modify self.args if desired.
        super(MonitorProc, self).__init__()


    def start(self):
        """Launch monitor_db with its stdout sent to os.devnull."""
        # The child keeps its own copy of the descriptor, so ours can be
        # closed right after the spawn instead of leaking one per restart.
        with open(os.devnull, 'w') as devnull:
            self.proc = subprocess.Popen(self.args, stdout=devnull)


    def is_running(self):
        """Report whether monitor_db is alive and still writing to its log.

        @return False if the process has exited, or if the log size has not
                changed for more than STALL_TIMEOUT seconds (stall info is
                collected first in that case); True otherwise.
        """
        if self.proc.poll() is not None:
            logging.info("monitor_db DIED")
            return False

        old_size = self.log_size
        try:
            new_size = os.path.getsize(self.log_path)
        except OSError as e:
            # A missing or unreadable log must not take the babysitter down;
            # count it as "no change" so the stall timeout still applies.
            logging.warning("Could not stat log file %s: %s",
                            self.log_path, e)
            new_size = old_size

        if old_size != new_size:
            logging.info("Log was touched")
            self.log_size = new_size
            self.last_log_change = time.time()
        elif self.last_log_change + STALL_TIMEOUT < time.time():
            logging.info("monitor_db STALLED")
            self.collect_stalled_info()
            return False

        return True


    def collect_stalled_info(self):
        """Write system diagnostics to '<log_path>.stall_info'.

        Runs uptime, ps and iostat, plus a mysqladmin processlist when the
        BACKUP user and password are present in the global config, and
        writes each command's banner and output to the stall info file.
        """
        info_to_collect = ['uptime',
                           'ps auxwww',
                           'iostat -k -x 2 4',
                          ]
        db_cmd = '/usr/bin/mysqladmin --verbose processlist -u%s -p%s'
        config = global_config.global_config
        try:
            user = config.get_config_value("BACKUP", "user")
            password = config.get_config_value("BACKUP", "password")
            # NOTE(review): the password becomes part of the command line and
            # of the banner written to the stall log below.
            db_cmd %= (user, password)
            info_to_collect.append(db_cmd)
        except global_config.ConfigError:
            # No BACKUP credentials configured: skip the database report.
            pass
        stall_log_path = self.log_path + '.stall_info'
        # 'with' closes the log even if one of the commands raises.
        with open(stall_log_path, "w") as log:
            for cmd in info_to_collect:
                log.write(run_banner_output(cmd))


if os.getuid() == 0:
    logging.critical("Running as root, aborting!")
    sys.exit(1)

if utils.program_is_alive(monitor_db.BABYSITTER_PID_FILE_PREFIX):
    logging.critical("Monitor_db_babysitter already running, aborting!")
    sys.exit(1)

# NOTE(review): the pid file is written before the double fork below; if
# utils.write_pid records the current process id, then in --background mode
# it names the parent that exits rather than the daemon -- confirm.
utils.write_pid(monitor_db.BABYSITTER_PID_FILE_PREFIX)

if options.background:
    logging_manager.configure_logging(
        babysitter_logging_config.BabysitterLoggingConfig(use_console=False))

    # Double fork - see http://code.activestate.com/recipes/66012/
    try:
        pid = os.fork()
        if pid > 0:
            sys.exit(0) # exit from first parent
    except OSError as e:
        sys.stderr.write("fork #1 failed: (%d) %s\n" % (e.errno, e.strerror))
        sys.exit(1)

    # Decouple from parent environment
    os.chdir("/")
    os.umask(0)
    os.setsid()

    # Second fork
    try:
        pid = os.fork()
        if pid > 0:
            sys.exit(0) # exit from second parent
    except OSError as e:
        sys.stderr.write("fork #2 failed: (%d) %s\n" % (e.errno, e.strerror))
        sys.exit(1)
else:
    logging_manager.configure_logging(
        babysitter_logging_config.BabysitterLoggingConfig())


# Main loop: each pass starts one monitor_db instance and watches it until
# it dies or stalls.
while True:
    sock = socket.socket()
    try:
        # Try to bind to the same port as the status_server.
        sock.bind(('localhost', status_server._PORT))
    except socket.error as msg:
        # Binding failed, so something still holds the port: have fuser kill
        # whatever is using it so the new monitor_db can take it.
        logging.error('Failed to open socket with error:%s. Closing socket.',
                      msg)
        release_port_cmd_list = ['fuser', '-k', '-n', 'tcp',
                                 '%d' % status_server._PORT]
        process = subprocess.Popen(release_port_cmd_list)
        process.wait()
    # Release our probe socket before monitor_db is started.
    sock.close()
    proc = MonitorProc(do_recovery=recover)
    proc.start()
    time.sleep(PAUSE_LENGTH)
    while proc.is_running():
        logging.info("Tick")
        time.sleep(PAUSE_LENGTH)
    # Only the first launch honours the -r command line flag.
    recover = False