/****************************************************************************** * * Copyright © International Business Machines Corp., 2007, 2008 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * * NAME * matrix_mult.c * * DESCRIPTION * Compare running sequential matrix multiplication routines * to running them in parallel to judge mutliprocessor * performance * * USAGE: * Use run_auto.sh script in current directory to build and run test. * * AUTHOR * Darren Hart <dvhltc@us.ibm.com> * * HISTORY * 2007-Mar-09: Initial version by Darren Hart <dvhltc@us.ibm.com> * 2008-Feb-26: Closely emulate jvm Dinakar Guniguntala <dino@in.ibm.com> * *****************************************************************************/ #include <stdio.h> #include <stdlib.h> #include <math.h> #include <librttest.h> #include <libstats.h> #define MAX_CPUS 8192 #define PRIO 43 #define MATRIX_SIZE 100 #define DEF_OPS 8 /* the higher the number, the more CPU intensive */ /* (and therefore SMP performance goes up) */ #define PASS_CRITERIA 0.75 /* Avg concurrent time * pass criteria < avg seq time - */ /* for every addition of a cpu */ #define ITERATIONS 128 #define HIST_BUCKETS 100 #define THREAD_WAIT 1 #define THREAD_WORK 2 #define THREAD_DONE 3 #define THREAD_SLEEP 1 * NS_PER_US static int ops = DEF_OPS; static int numcpus; static float criteria; static int *tids; static int online_cpu_id = -1; static int iterations = ITERATIONS; static int iterations_percpu; stats_container_t sdat, cdat, *curdat; stats_container_t shist, chist; static pthread_barrier_t mult_start; static pthread_mutex_t mutex_cpu; void usage(void) { rt_help(); printf("matrix_mult specific options:\n"); printf (" -l# #: number of multiplications per iteration (load)\n"); printf(" -i# #: number of iterations\n"); } int parse_args(int c, char *v) { int handled = 1; switch (c) { case 'i': iterations = atoi(v); break; case 'l': ops = atoi(v); break; case 'h': usage(); exit(0); default: handled = 0; break; } return handled; } void matrix_init(double A[MATRIX_SIZE][MATRIX_SIZE], double B[MATRIX_SIZE][MATRIX_SIZE]) { int i, j; for (i = 0; i < MATRIX_SIZE; i++) { for (j = 0; j < MATRIX_SIZE; j++) { A[i][j] = (double)(i * j); B[i][j] = (double)((i * j) % 10); } } } void matrix_mult(int m_size) { double A[m_size][m_size]; double B[m_size][m_size]; double C[m_size][m_size]; int i, j, k; matrix_init(A, B); for (i = 0; i < m_size; i++) { int i_m = m_size - i; for (j = 0; j < m_size; j++) { double sum = A[i_m][j] * B[j][i]; for (k = 0; k < m_size; k++) sum += A[i_m][k] * B[k][j]; C[i][j] = sum; } } } void matrix_mult_record(int m_size, int index) { nsec_t start, end, delta; int i; start = rt_gettime(); for (i = 0; i < ops; i++) matrix_mult(MATRIX_SIZE); end = rt_gettime(); delta = (long)((end - start) / NS_PER_US); curdat->records[index].x = index; curdat->records[index].y = delta; } int set_affinity(void) { cpu_set_t mask; int cpuid; pthread_mutex_lock(&mutex_cpu); do { ++online_cpu_id; CPU_ZERO(&mask); CPU_SET(online_cpu_id, &mask); if (!sched_setaffinity(0, sizeof(mask), &mask)) { cpuid = online_cpu_id; /* Save this value before unlocking mutex */ pthread_mutex_unlock(&mutex_cpu); return cpuid; } } while (online_cpu_id < MAX_CPUS); pthread_mutex_unlock(&mutex_cpu); return -1; } void *concurrent_thread(void *thread) { struct thread *t = (struct thread *)thread; int thread_id = (intptr_t) t->id; int cpuid; int i; int index; cpuid = set_affinity(); if (cpuid == -1) { fprintf(stderr, "Thread %d: Can't set affinity.\n", thread_id); exit(1); } index = iterations_percpu * thread_id; /* To avoid stats overlapping */ pthread_barrier_wait(&mult_start); for (i = 0; i < iterations_percpu; i++) matrix_mult_record(MATRIX_SIZE, index++); return NULL; } void main_thread(void) { int ret, i, j; nsec_t start, end; long smin = 0, smax = 0, cmin = 0, cmax = 0, delta = 0; float savg, cavg; int cpuid; if (stats_container_init(&sdat, iterations) || stats_container_init(&shist, HIST_BUCKETS) || stats_container_init(&cdat, iterations) || stats_container_init(&chist, HIST_BUCKETS) ) { fprintf(stderr, "Cannot init stats container\n"); exit(1); } tids = malloc(sizeof(int) * numcpus); if (!tids) { perror("malloc"); exit(1); } memset(tids, 0, numcpus); cpuid = set_affinity(); if (cpuid == -1) { fprintf(stderr, "Main thread: Can't set affinity.\n"); exit(1); } /* run matrix mult operation sequentially */ curdat = &sdat; curdat->index = iterations - 1; printf("\nRunning sequential operations\n"); start = rt_gettime(); for (i = 0; i < iterations; i++) matrix_mult_record(MATRIX_SIZE, i); end = rt_gettime(); delta = (long)((end - start) / NS_PER_US); savg = delta / iterations; /* don't use the stats record, use the total time recorded */ smin = stats_min(&sdat); smax = stats_max(&sdat); printf("Min: %ld us\n", smin); printf("Max: %ld us\n", smax); printf("Avg: %.4f us\n", savg); printf("StdDev: %.4f us\n", stats_stddev(&sdat)); if (stats_hist(&shist, &sdat) || stats_container_save("sequential", "Matrix Multiplication Sequential Execution Runtime Scatter Plot", "Iteration", "Runtime (us)", &sdat, "points") || stats_container_save("sequential_hist", "Matrix Multiplicatoin Sequential Execution Runtime Histogram", "Runtime (us)", "Samples", &shist, "steps") ) { fprintf(stderr, "Warning: could not save sequential mults stats\n"); } pthread_barrier_init(&mult_start, NULL, numcpus + 1); set_priority(PRIO); curdat = &cdat; curdat->index = iterations - 1; online_cpu_id = -1; /* Redispatch cpus */ /* Create numcpus-1 concurrent threads */ for (j = 0; j < numcpus; j++) { tids[j] = create_fifo_thread(concurrent_thread, NULL, PRIO); if (tids[j] == -1) { printf ("Thread creation failed (max threads exceeded?)\n"); exit(1); } } /* run matrix mult operation concurrently */ printf("\nRunning concurrent operations\n"); pthread_barrier_wait(&mult_start); start = rt_gettime(); join_threads(); end = rt_gettime(); delta = (long)((end - start) / NS_PER_US); cavg = delta / iterations; /* don't use the stats record, use the total time recorded */ cmin = stats_min(&cdat); cmax = stats_max(&cdat); printf("Min: %ld us\n", cmin); printf("Max: %ld us\n", cmax); printf("Avg: %.4f us\n", cavg); printf("StdDev: %.4f us\n", stats_stddev(&cdat)); if (stats_hist(&chist, &cdat) || stats_container_save("concurrent", "Matrix Multiplication Concurrent Execution Runtime Scatter Plot", "Iteration", "Runtime (us)", &cdat, "points") || stats_container_save("concurrent_hist", "Matrix Multiplication Concurrent Execution Runtime Histogram", "Iteration", "Runtime (us)", &chist, "steps") ) { fprintf(stderr, "Warning: could not save concurrent mults stats\n"); } printf("\nConcurrent Multipliers:\n"); printf("Min: %.4f\n", (float)smin / cmin); printf("Max: %.4f\n", (float)smax / cmax); printf("Avg: %.4f\n", (float)savg / cavg); ret = 1; if (savg > (cavg * criteria)) ret = 0; printf ("\nCriteria: %.2f * average concurrent time < average sequential time\n", criteria); printf("Result: %s\n", ret ? "FAIL" : "PASS"); return; } int main(int argc, char *argv[]) { setup(); pass_criteria = PASS_CRITERIA; rt_init("l:i:h", parse_args, argc, argv); numcpus = sysconf(_SC_NPROCESSORS_ONLN); /* the minimum avg concurrent multiplier to pass */ criteria = pass_criteria * numcpus; int new_iterations; if (iterations <= 0) { fprintf(stderr, "iterations must be greater than zero\n"); exit(1); } printf("\n---------------------------------------\n"); printf("Matrix Multiplication (SMP Performance)\n"); printf("---------------------------------------\n\n"); /* Line below rounds up iterations to a multiple of numcpus. * Without this, having iterations not a mutiple of numcpus causes * stats to segfault (overflow stats array). */ new_iterations = (int)((iterations + numcpus - 1) / numcpus) * numcpus; if (new_iterations != iterations) printf ("Rounding up iterations value to nearest multiple of total online CPUs\n"); iterations = new_iterations; iterations_percpu = iterations / numcpus; printf("Running %d iterations\n", iterations); printf("Matrix Dimensions: %dx%d\n", MATRIX_SIZE, MATRIX_SIZE); printf("Calculations per iteration: %d\n", ops); printf("Number of CPUs: %u\n", numcpus); set_priority(PRIO); main_thread(); return 0; }