#!/bin/bash

# Control script for a program that tests shared memory access speeds.

# Copyright (C) 2003-2006 IBM
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
# 02111-1307, USA.

# Signal handler: force-kill (SIGKILL) every direct child of this shell,
# as reported by pgrep, then leave with a success status.  Errors from
# kill (for example, when there are no children) are discarded.
reap_children_and_quit() {
	kill -9 $(pgrep -P $$) 2> /dev/null
	exit 0
}

# Run the handler on hangup, interrupt and terminate.
trap reap_children_and_quit HUP INT TERM

# Expected location of the ramsnake binary, relative to POUNDER_SRCDIR.
PROGRAM="$POUNDER_SRCDIR/ramsnake/snake.exe"

# PROGRAM always contains the fixed "/ramsnake/snake.exe" suffix, so an
# emptiness test can never fire.  Check that the file really exists and is
# executable instead.
if [ ! -x "$PROGRAM" ]; then
	echo "Cannot find ramsnake; did you run Install?"
	# 255 is the status bash reports for "exit -1", spelled portably.
	exit 255
fi

# Work out how many nodes each ramsnake process should be given.
#
# MEM_AVAIL is MemFree + Cached from /proc/meminfo (reported in kB), i.e.
# memory not currently used by programs or buffers.  Half of that, in
# bytes, is split evenly between NR_CPUS processes (one ramsnake is started
# per CPU) and converted to a node count by dividing by the pointer size.
#
# NOTE(review): the previous comment claimed the _total_ memory used comes
# to 1/4; the arithmetic below yields 1/2 if a node is one pointer wide --
# confirm against ramsnake's node layout.
# NOTE(review): LONG_BIT is used as the pointer width; that holds when
# "long" and pointers have the same size -- confirm for the target ABI.

# NR_CPUS is normally supplied by the environment.  Fall back to the number
# of online CPUs when it is missing, and refuse anything that is not a
# positive integer so the division below cannot fail.
: "${NR_CPUS:=$(getconf _NPROCESSORS_ONLN 2> /dev/null)}"
if ! [[ "$NR_CPUS" =~ ^[1-9][0-9]*$ ]]; then
	echo "NR_CPUS must be a positive integer (got '$NR_CPUS')"
	exit 255
fi

# Sum both fields in a single pass; "^Cached:" does not match SwapCached.
MEM_AVAIL=$(awk '/^(MemFree|Cached):/ { kb += $2 } END { printf "%.0f\n", kb }' /proc/meminfo)
if ! [[ "$MEM_AVAIL" =~ ^[1-9][0-9]*$ ]]; then
	echo "Cannot determine available memory from /proc/meminfo"
	exit 255
fi

ptr_bytes=$(( $(getconf LONG_BIT) / 8 ))
NUM_NODES=$(( (MEM_AVAIL * 1024 / (2 * NR_CPUS)) / ptr_bytes ))

# Launch NR_CPUS background copies of ramsnake, each wrapped in the
# pounder timed_loop helper with an argument of 300 (presumably a run time
# in seconds -- confirm against timed_loop).  Every copy is told the CPU
# count (-p) and the per-process node count (-n).
spawned=0
while (( spawned < $NR_CPUS ))
do
	"$POUNDER_HOME/timed_loop" 300 "$PROGRAM" -p $NR_CPUS -n $NUM_NODES &
	spawned=$(( spawned + 1 ))
done

# This script's own stdout, re-opened by path so that it can be scanned
# for error messages.
# NOTE(review): this only gives meaningful counts when stdout is a regular
# file such as a test log -- confirm how the harness redirects output.
LOGFILE=/proc/$$/fd/1
OLD_ERRORS=0

# Poll every 5 seconds for as long as child processes remain.  The
# threshold is 1 rather than 0, presumably because the command-substitution
# subshell running the pipeline is itself a child of this shell and is
# counted by pgrep -- confirm before changing it.
while [ "$(pgrep -P $$ | wc -l)" -gt 1 ]; do
	# Count log lines that look like a crash report (case-insensitive).
	NEW_ERRORS=$(grep -Eic "(segmentation|fault|error|illegal)" "$LOGFILE")
	# grep prints nothing if the log cannot be read; treat that as zero.
	NEW_ERRORS=${NEW_ERRORS:-0}
	ERRORS=$(( NEW_ERRORS - OLD_ERRORS ))

	if [ "$ERRORS" -gt 0 ]; then
		echo "Program crash detected.  Aborting."
		# NOTE(review): the remaining children are left running here;
		# confirm whether they should be killed before exiting.
		exit 1
	fi

	# Remember the running total (not the delta) as the next baseline.
	OLD_ERRORS=$NEW_ERRORS
	sleep 5
done

# Failures will probably hang the system or segfault snake, so reaching
# this point without a detected error is reported as success.
exit 0