#!/bin/sh
#
# Copyright 2015, Daniel Axtens, IBM Corporation
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# For every chip/core pair listed in the OPAL message log, write an error
# bit into the Core FIR with putscom and then wait until dmesg reports the
# expected number of "Harmless Hypervisor Maintenance interrupt" lines.
#
# Requires: getscom and putscom (in . or $PATH), ppc64_cpu, dmesg, and
# write access to /dev/kmsg.
# Exit status: 0 if every core processed its events, 1 on any failure.

# Locate the scom tools: prefer executables in the current directory,
# otherwise require BOTH of them to be on $PATH (a lone getscom would
# leave PUTSCOM empty and break the injection step later on).
if [ -x ./getscom ] && [ -x ./putscom ]; then
	GETSCOM=./getscom
	PUTSCOM=./putscom
elif command -v getscom > /dev/null && command -v putscom > /dev/null; then
	GETSCOM=$(command -v getscom)
	PUTSCOM=$(command -v putscom)
else
	cat <<EOF
Can't find getscom/putscom in . or \$PATH.
See https://github.com/open-power/skiboot.
The tool is in external/xscom-utils
EOF
	exit 1
fi

# We will get 8 HMI events per injection
# todo: deal with things being offline
expected_hmis=8

# Print the number of HMI-handled lines currently present in dmesg.
COUNT_HMIS() {
	dmesg | grep -c 'Harmless Hypervisor Maintenance interrupt'
}

# massively expand snooze delay, allowing injection on all cores
ppc64_cpu --smt-snooze-delay=1000000000

# Restore the snooze delay on every exit path. The signal trap turns
# HUP/INT/TERM into a normal exit so that the EXIT trap runs and the script
# stops instead of carrying on with the delay already restored.
trap 'ppc64_cpu --smt-snooze-delay=100' EXIT
trap 'exit 1' HUP INT TERM

# for each chip+core combination
# todo - less fragile parsing
#
# Each matched line has the form "OCC: Chip <chip> Core <core>", so read
# splits it directly into fields; the label fields are discarded.
#
# Note: the loop is the last stage of a pipeline, so "exit 1" inside it
# leaves only the loop's subshell. Because the pipeline is the final
# command of the script, that status still becomes the script's status.
grep -E -o 'OCC: Chip [0-9a-f]+ Core [0-9a-f]' < /sys/firmware/opal/msglog |
while read -r _occ_label _chip_label chip _core_label core; do
	fir="0x1${core}013100"

	# verify that Core FIR is zero as expected
	# NOTE(review): this is a string comparison against a bare "0";
	# confirm that matches the output format of the getscom in use.
	if [ "$("$GETSCOM" -c "0x${chip}" "$fir")" != 0 ]; then
		echo "FIR was not zero before injection for chip $chip, core $core. Aborting!"
		echo "Result of $GETSCOM -c 0x${chip} $fir:"
		"$GETSCOM" -c "0x${chip}" "$fir"
		echo "If you get a -5 error, the core may be in idle state. Try stress-ng."
		echo "Otherwise, try $PUTSCOM -c 0x${chip} $fir 0"
		exit 1
	fi

	# keep track of the number of HMIs handled
	old_hmis=$(COUNT_HMIS)
	target_hmis=$((old_hmis + expected_hmis))

	# do injection, adding a marker to dmesg for clarity
	echo "Injecting HMI on core $core, chip $chip" | tee /dev/kmsg

	# inject a RegFile recoverable error
	if ! "$PUTSCOM" -c "0x${chip}" "$fir" 2000000000000000 > /dev/null; then
		echo "Error injecting. Aborting!"
		exit 1
	fi

	# now we want to wait for all the HMIs to be processed
	# we expect one per thread on the core
	# Poll up to 12 times, 5 seconds apart (1 minute in total).
	i=0
	new_hmis=$(COUNT_HMIS)
	while [ "$new_hmis" -lt "$target_hmis" ] && [ "$i" -lt 12 ]; do
		echo "Seen $((new_hmis - old_hmis)) HMI(s) out of $expected_hmis expected, sleeping"
		sleep 5
		i=$((i + 1))
		new_hmis=$(COUNT_HMIS)
	done

	# Decide on the event count rather than on the poll counter: events
	# that show up on the very last poll must still count as success.
	if [ "$new_hmis" -lt "$target_hmis" ]; then
		echo "Haven't seen expected $expected_hmis recoveries after 1 min. Aborting."
		exit 1
	fi

	echo "Processed $expected_hmis events; presumed success. Check dmesg."
	echo ""
done