#!/bin/bash

# Force a repair special task for any host that hasn't seen activity in
# the past day.
#
# Various scripts/cron jobs look for DUTs that aren't working.  To be
# conservative, those scripts assume that a DUT that hasn't run any jobs
# within a reasonable time interval isn't working, since some of the
# ways a DUT may be unavailable manifest as inactivity.
#
# In some cases, we'd like to be more certain as to a DUT's status.
# This script goes through the entire AFE hosts table, and identifies
# unlocked hosts that would otherwise be flagged as "not working due to
# lack of activity", and forces a repair task.
#
# We use a repair task (as opposed to verify) for various reasons:
#   + If a DUT is working, repair and verify perform the same checks,
#     and generally run in the same time.
#   + If a DUT is broken, a verify task will fail and invoke repair,
#     which will take longer than just repair alone.
#   + Repair tasks that pass update labels; without this, labels could
#     become out-of-date simply because a DUT is idle.
#
# Locked hosts are skipped because they can't run jobs and because we
# want them to show up as suspicious anyway.


# Every tool below is invoked via a path relative to the parent of the
# directory holding this script.  Abort if we can't get there, rather
# than running against whatever the current directory happens to be.
cd "$(dirname "$0")/.." || {
  echo "$0: unable to cd to the parent of the script directory" >&2
  exit 1
}

# Gather all the hosts under supervision of the lab techs.
# Basically, that's any host in any managed pool.
#
# The awk program prints the first field of every `atest host list`
# line that mentions one of the managed pool labels.

GET_HOSTS='
  /pool:(suites|bvt|cq|continuous|cts|arc-presubmit|crosperf|performance)/ {
    print $1
  }
'
# Word splitting of the command output is intentional here: awk emits
# one whitespace-free host name per line.
HOSTS=( $(cli/atest host list --unlocked | awk "$GET_HOSTS") )

# An empty list most likely means the atest query failed.  Don't go on
# to invoke dut_status.py with no host arguments.
if [[ ${#HOSTS[@]} -eq 0 ]]; then
  echo "$0: no unlocked hosts found in any managed pool" >&2
  exit 1
fi


# Go through the gathered hosts, and use dut_status to find the
# ones with unknown state (anything without a positive "OK" or
# "NO" diagnosis).
#
# Lines containing OK or NO are dropped; of the remainder, only lines
# starting with "chromeos" contribute a host name, which also filters
# out any header or other non-host output.

NEED_CHECK='
  /OK/ || /NO/ { next }
  /^chromeos/ { print $1 }
'
# NOTE(review): "-d 19" bounds the dut_status search interval;
# presumably the unit is hours -- confirm against dut_status.py.
CHECK=( $(site_utils/dut_status.py -d 19 "${HOSTS[@]}" | awk "$NEED_CHECK") )

# Nothing to do if every host already has a definite diagnosis.  Avoid
# calling repair_hosts with an empty argument list.
if [[ ${#CHECK[@]} -eq 0 ]]; then
  echo "$0: no hosts need a forced repair" >&2
  exit 0
fi

contrib/repair_hosts "${CHECK[@]}"