#!/bin/bash

# Force a repair special task for any host that hasn't seen activity in
# the past day.
#
# Various scripts/cron jobs look for DUTs that aren't working.  To be
# conservative, those scripts assume that a DUT that hasn't run any jobs
# within a reasonable time interval isn't working, since some of the
# ways a DUT may be unavailable manifest as inactivity.
#
# In some cases, we'd like to be more certain as to a DUT's status.
# This script goes through the entire AFE hosts table, and identifies
# unlocked hosts that would otherwise be flagged as "not working due to
# lack of activity", and forces a repair task.
#
# We use a repair task (as opposed to verify) for various reasons:
#   + If a DUT is working, repair and verify perform the same checks,
#     and generally run in the same time.
#   + If a DUT is broken, a verify task will fail and invoke repair,
#     which will take longer than just repair alone.
#   + Repair tasks that pass update labels; without this, labels could
#     become out-of-date simply because a DUT is idle.
#
# Locked hosts are skipped because they can't run jobs and because we
# want them to show up as suspicious anyway.
#
# Usage: takes no arguments.  All tools are invoked by paths relative
# to the parent of the directory containing this script.


# Every tool below is referenced by a relative path, so a failed cd
# must abort the script rather than run the tools from the wrong place.
cd "$(dirname "$0")/.." || {
  echo "$0: cannot cd to the parent of the script directory" >&2
  exit 1
}

# Gather all the hosts under supervision of the lab techs.
# Basically, that's any host in any managed pool.

GET_HOSTS='
/pool:(suites|bvt|cq|continuous|cts|arc-presubmit|crosperf|performance)/ {
  print $1
}
'
# Word splitting is intentional here: awk emits one host name per line.
# NOTE(review): assumes host names never contain whitespace or glob
# characters -- confirm against `atest host list` output.
# shellcheck disable=SC2207
HOSTS=( $(cli/atest host list --unlocked | awk "$GET_HOSTS") )

# With no hosts there is nothing to diagnose; don't call dut_status.py
# with an empty host list.
if [[ ${#HOSTS[@]} -eq 0 ]]; then
  echo "$0: no unlocked hosts found in managed pools; nothing to do" >&2
  exit 0
fi


# Go through the gathered hosts, and use dut_status to find the
# ones with unknown state (anything without a positive "OK" or
# "NO" diagnosis).
#
# NOTE(review): "-d 19" is presumably the look-back duration passed to
# dut_status.py; confirm its units against the "past day" described in
# the header.

NEED_CHECK='
/OK/ || /NO/ { next }
/^chromeos/ { print $1 }
'
# shellcheck disable=SC2207
CHECK=( $(site_utils/dut_status.py -d 19 "${HOSTS[@]}" | awk "$NEED_CHECK") )

# Only schedule repairs when at least one host needs checking, so that
# repair_hosts is never invoked without arguments.
if [[ ${#CHECK[@]} -eq 0 ]]; then
  exit 0
fi

contrib/repair_hosts "${CHECK[@]}"