/******************************************************************************
*
* Copyright © International Business Machines Corp., 2007, 2008
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
* the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* NAME
* matrix_mult.c
*
* DESCRIPTION
* Compare running sequential matrix multiplication routines
* to running them in parallel to judge multiprocessor
* performance
*
* USAGE:
* Use run_auto.sh script in current directory to build and run test.
*
* AUTHOR
* Darren Hart <dvhltc@us.ibm.com>
*
* HISTORY
* 2007-Mar-09: Initial version by Darren Hart <dvhltc@us.ibm.com>
* 2008-Feb-26: Closely emulate jvm Dinakar Guniguntala <dino@in.ibm.com>
*
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <librttest.h>
#include <libstats.h>
/* Upper bound for the CPU ids probed by set_affinity() */
#define MAX_CPUS 8192
/* Priority handed to set_priority() and create_fifo_thread() */
#define PRIO 43
/* Matrices are MATRIX_SIZE x MATRIX_SIZE doubles */
#define MATRIX_SIZE 100
#define DEF_OPS 8		/* the higher the number, the more CPU intensive */
				/* (and therefore SMP performance goes up) */
#define PASS_CRITERIA 0.75	/* Avg concurrent time * pass criteria < avg seq time - */
				/* for every addition of a cpu */
/* Default number of timed iterations (overridden with -i) */
#define ITERATIONS 128
/* Size passed to stats_container_init() for the histogram containers */
#define HIST_BUCKETS 100
/* Thread state values; not referenced by the code visible in this file */
#define THREAD_WAIT 1
#define THREAD_WORK 2
#define THREAD_DONE 3
/*
 * Parenthesized so the expansion stays a single operand when used inside a
 * larger expression (e.g. x / THREAD_SLEEP).
 */
#define THREAD_SLEEP (1 * NS_PER_US)
/* Multiplications per timed iteration (set with -l) */
static int ops = DEF_OPS;
/* Number of online CPUs, filled in by main() */
static int numcpus;
/* Pass threshold: pass_criteria * numcpus, computed in main() */
static float criteria;
/* One entry per worker, holding the value returned by create_fifo_thread() */
static int *tids;
/* Last CPU id tried by set_affinity(); -1 means "start again from CPU 0" */
static int online_cpu_id = -1;
/* Total number of timed iterations (set with -i, rounded up in main()) */
static int iterations = ITERATIONS;
/* iterations / numcpus: share of the iterations run by each worker */
static int iterations_percpu;
/* Sequential and concurrent samples; curdat points at the one being filled */
stats_container_t sdat, cdat, *curdat;
/* Histograms built from sdat and cdat */
stats_container_t shist, chist;
/* Releases the workers and the main thread together (initialized in main_thread()) */
static pthread_barrier_t mult_start;
/*
 * Serializes the CPU id hand-out in set_affinity(). Nothing in this file
 * calls pthread_mutex_init() on it, so it has to be statically initialized.
 */
static pthread_mutex_t mutex_cpu = PTHREAD_MUTEX_INITIALIZER;
/*
 * usage - print the help text.
 *
 * Calls rt_help() first (presumably the options common to all librttest
 * tests - confirm in librttest), then lists the options handled by
 * parse_args() in this file.
 */
void usage(void)
{
	rt_help();
	/* Adjacent literals are concatenated; the output is emitted in one call */
	printf("matrix_mult specific options:\n"
	       " -l# #: number of multiplications per iteration (load)\n"
	       " -i# #: number of iterations\n");
}
/*
 * parse_args - option callback handed to rt_init() by main().
 *
 * c: option character
 * v: option argument, converted with atoi() for -i and -l
 *
 * -i sets the iteration count, -l sets the number of multiplications per
 * iteration, -h prints the usage text and exits with status 0.
 *
 * Returns 1 when the option was handled here, 0 otherwise.
 */
int parse_args(int c, char *v)
{
	if (c == 'i') {
		iterations = atoi(v);
		return 1;
	}

	if (c == 'l') {
		ops = atoi(v);
		return 1;
	}

	if (c == 'h') {
		usage();
		exit(0);
	}

	/* Not one of ours */
	return 0;
}
/*
 * matrix_init - fill both input matrices with deterministic values.
 *
 * A[r][c] receives r * c and B[r][c] receives (r * c) % 10, for every
 * row and column in 0..MATRIX_SIZE-1.
 */
void matrix_init(double A[MATRIX_SIZE][MATRIX_SIZE],
		 double B[MATRIX_SIZE][MATRIX_SIZE])
{
	int row;

	for (row = 0; row < MATRIX_SIZE; row++) {
		double *a_row = A[row];
		double *b_row = B[row];
		int col;

		for (col = 0; col < MATRIX_SIZE; col++) {
			const int product = row * col;

			a_row[col] = (double)product;
			b_row[col] = (double)(product % 10);
		}
	}
}
/*
 * matrix_mult - run one matrix multiplication as CPU load.
 *
 * m_size: dimension of the square matrices. matrix_init() is declared for
 *         MATRIX_SIZE x MATRIX_SIZE arrays, so m_size must equal MATRIX_SIZE.
 *
 * The three matrices live on the stack and the result C is only written,
 * never read back, inside this function.
 */
void matrix_mult(int m_size)
{
	double A[m_size][m_size];
	double B[m_size][m_size];
	double C[m_size][m_size];
	int i, j, k;

	matrix_init(A, B);
	for (i = 0; i < m_size; i++) {
		/*
		 * Rows of A are visited from the last one to the first. The
		 * "- 1" keeps i_m within 0..m_size-1; without it the first
		 * pass (i == 0) would read row m_size, one past the array.
		 */
		int i_m = m_size - i - 1;

		for (j = 0; j < m_size; j++) {
			double sum = A[i_m][j] * B[j][i];

			for (k = 0; k < m_size; k++)
				sum += A[i_m][k] * B[k][j];
			C[i][j] = sum;
		}
	}
}
void matrix_mult_record(int m_size, int index)
{
nsec_t start, end, delta;
int i;
start = rt_gettime();
for (i = 0; i < ops; i++)
matrix_mult(MATRIX_SIZE);
end = rt_gettime();
delta = (long)((end - start) / NS_PER_US);
curdat->records[index].x = index;
curdat->records[index].y = delta;
}
/*
 * set_affinity - bind the caller to the next CPU that accepts it.
 *
 * Under mutex_cpu, advances the shared online_cpu_id and tries
 * sched_setaffinity() with a mask holding only that CPU, until a call
 * succeeds or online_cpu_id has reached MAX_CPUS.
 *
 * Returns the CPU id the caller was bound to, or -1 if no attempt succeeded.
 */
int set_affinity(void)
{
	cpu_set_t mask;
	int cpuid = -1;

	pthread_mutex_lock(&mutex_cpu);
	for (;;) {
		online_cpu_id++;
		CPU_ZERO(&mask);
		CPU_SET(online_cpu_id, &mask);

		if (sched_setaffinity(0, sizeof(mask), &mask) == 0) {
			/* Copy the shared counter while the lock is still held */
			cpuid = online_cpu_id;
			break;
		}

		if (online_cpu_id >= MAX_CPUS)
			break;
	}
	pthread_mutex_unlock(&mutex_cpu);

	return cpuid;
}
/*
 * concurrent_thread - body of each worker created by main_thread().
 *
 * thread: cast to struct thread *; only its id field is read here
 *         (presumably supplied by librttest's thread wrapper - confirm).
 *
 * The worker binds itself to a CPU, waits on the mult_start barrier and
 * then records iterations_percpu samples. Exits the process with status 1
 * if no CPU could be assigned. Always returns NULL.
 */
void *concurrent_thread(void *thread)
{
	struct thread *self = (struct thread *)thread;
	int thread_id = (intptr_t) self->id;
	int cpuid = set_affinity();
	int slot;
	int remaining;

	if (cpuid == -1) {
		fprintf(stderr, "Thread %d: Can't set affinity.\n", thread_id);
		exit(1);
	}

	/* Each worker writes its own contiguous range of record slots */
	slot = iterations_percpu * thread_id;

	pthread_barrier_wait(&mult_start);
	for (remaining = iterations_percpu; remaining > 0; remaining--)
		matrix_mult_record(MATRIX_SIZE, slot++);

	return NULL;
}
/*
 * main_thread - time the workload sequentially, then concurrently, and
 * report how the two compare.
 *
 * Sequential phase: the caller binds itself to a CPU and records
 * `iterations` samples into sdat.
 * Concurrent phase: numcpus workers are created; each records
 * iterations_percpu samples into cdat after the mult_start barrier.
 *
 * For both phases the average is the total wall time divided by
 * `iterations`, not the mean of the recorded samples. The result is PASS
 * when savg > cavg * criteria. Statistics and the verdict are printed to
 * stdout; the function exits the process with status 1 on setup failures.
 */
void main_thread(void)
{
	int ret, i, j;
	nsec_t start, end;
	long smin = 0, smax = 0, cmin = 0, cmax = 0, delta = 0;
	float savg, cavg;
	int cpuid;

	if (stats_container_init(&sdat, iterations) ||
	    stats_container_init(&shist, HIST_BUCKETS) ||
	    stats_container_init(&cdat, iterations) ||
	    stats_container_init(&chist, HIST_BUCKETS)
	    ) {
		fprintf(stderr, "Cannot init stats container\n");
		exit(1);
	}

	/* calloc zeroes every element, not just the first numcpus bytes */
	tids = calloc(numcpus, sizeof(*tids));
	if (!tids) {
		perror("calloc");
		exit(1);
	}

	cpuid = set_affinity();
	if (cpuid == -1) {
		fprintf(stderr, "Main thread: Can't set affinity.\n");
		exit(1);
	}

	/* run matrix mult operation sequentially */
	curdat = &sdat;
	/* NOTE(review): presumably marks all `iterations` records as used - confirm in libstats */
	curdat->index = iterations - 1;
	printf("\nRunning sequential operations\n");
	start = rt_gettime();
	for (i = 0; i < iterations; i++)
		matrix_mult_record(MATRIX_SIZE, i);
	end = rt_gettime();
	delta = (long)((end - start) / NS_PER_US);

	/*
	 * don't use the stats record, use the total time recorded.
	 * Divide in floating point so the fractional part is kept.
	 */
	savg = (float)delta / iterations;
	smin = stats_min(&sdat);
	smax = stats_max(&sdat);

	printf("Min: %ld us\n", smin);
	printf("Max: %ld us\n", smax);
	printf("Avg: %.4f us\n", savg);
	printf("StdDev: %.4f us\n", stats_stddev(&sdat));

	if (stats_hist(&shist, &sdat) ||
	    stats_container_save("sequential",
				 "Matrix Multiplication Sequential Execution Runtime Scatter Plot",
				 "Iteration", "Runtime (us)", &sdat, "points")
	    || stats_container_save("sequential_hist",
				    "Matrix Multiplication Sequential Execution Runtime Histogram",
				    "Runtime (us)", "Samples", &shist, "steps")
	    ) {
		fprintf(stderr,
			"Warning: could not save sequential mults stats\n");
	}

	/* numcpus workers plus this thread meet at the barrier */
	pthread_barrier_init(&mult_start, NULL, numcpus + 1);
	set_priority(PRIO);
	curdat = &cdat;
	curdat->index = iterations - 1;
	online_cpu_id = -1;	/* Redispatch cpus */

	/* Create numcpus concurrent threads */
	for (j = 0; j < numcpus; j++) {
		tids[j] = create_fifo_thread(concurrent_thread, NULL, PRIO);
		if (tids[j] == -1) {
			printf
			    ("Thread creation failed (max threads exceeded?)\n");
			exit(1);
		}
	}

	/* run matrix mult operation concurrently */
	printf("\nRunning concurrent operations\n");
	pthread_barrier_wait(&mult_start);
	start = rt_gettime();
	join_threads();
	end = rt_gettime();
	delta = (long)((end - start) / NS_PER_US);

	/* don't use the stats record, use the total time recorded */
	cavg = (float)delta / iterations;
	cmin = stats_min(&cdat);
	cmax = stats_max(&cdat);

	printf("Min: %ld us\n", cmin);
	printf("Max: %ld us\n", cmax);
	printf("Avg: %.4f us\n", cavg);
	printf("StdDev: %.4f us\n", stats_stddev(&cdat));

	/* Histogram axes match the sequential histogram: runtime vs. samples */
	if (stats_hist(&chist, &cdat) ||
	    stats_container_save("concurrent",
				 "Matrix Multiplication Concurrent Execution Runtime Scatter Plot",
				 "Iteration", "Runtime (us)", &cdat, "points")
	    || stats_container_save("concurrent_hist",
				    "Matrix Multiplication Concurrent Execution Runtime Histogram",
				    "Runtime (us)", "Samples", &chist,
				    "steps")
	    ) {
		fprintf(stderr,
			"Warning: could not save concurrent mults stats\n");
	}

	printf("\nConcurrent Multipliers:\n");
	printf("Min: %.4f\n", (float)smin / cmin);
	printf("Max: %.4f\n", (float)smax / cmax);
	printf("Avg: %.4f\n", (float)savg / cavg);

	/* ret == 0 means PASS */
	ret = 1;
	if (savg > (cavg * criteria))
		ret = 0;
	printf
	    ("\nCriteria: %.2f * average concurrent time < average sequential time\n",
	     criteria);
	printf("Result: %s\n", ret ? "FAIL" : "PASS");

	return;
}
/*
 * main - set up the test parameters, print the banner and run main_thread().
 *
 * Options are handed to rt_init() together with parse_args(). The iteration
 * count is rounded up to a multiple of the online CPU count so that it
 * divides evenly among the worker threads.
 *
 * Exits with status 1 if the CPU count cannot be determined or the
 * iteration count is not positive; otherwise returns 0.
 */
int main(int argc, char *argv[])
{
	int new_iterations;

	setup();

	pass_criteria = PASS_CRITERIA;
	rt_init("l:i:h", parse_args, argc, argv);

	numcpus = sysconf(_SC_NPROCESSORS_ONLN);
	/*
	 * sysconf() returns -1 on error. numcpus is used as a divisor and as
	 * an allocation size below, so anything under 1 is unusable.
	 */
	if (numcpus < 1) {
		fprintf(stderr, "Cannot determine the number of online CPUs\n");
		exit(1);
	}

	/* the minimum avg concurrent multiplier to pass */
	criteria = pass_criteria * numcpus;

	if (iterations <= 0) {
		fprintf(stderr, "iterations must be greater than zero\n");
		exit(1);
	}

	printf("\n---------------------------------------\n");
	printf("Matrix Multiplication (SMP Performance)\n");
	printf("---------------------------------------\n\n");

	/* Line below rounds up iterations to a multiple of numcpus.
	 * Without this, having iterations not a multiple of numcpus causes
	 * stats to segfault (overflow stats array).
	 */
	new_iterations = (int)((iterations + numcpus - 1) / numcpus) * numcpus;
	if (new_iterations != iterations)
		printf
		    ("Rounding up iterations value to nearest multiple of total online CPUs\n");

	iterations = new_iterations;
	iterations_percpu = iterations / numcpus;

	printf("Running %d iterations\n", iterations);
	printf("Matrix Dimensions: %dx%d\n", MATRIX_SIZE, MATRIX_SIZE);
	printf("Calculations per iteration: %d\n", ops);
	/* numcpus is an int, so it is printed with %d */
	printf("Number of CPUs: %d\n", numcpus);

	set_priority(PRIO);
	main_thread();

	return 0;
}