/*
* Copyright 2016 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <inttypes.h>
#include <float.h>
#include <stdbool.h>
//
//
//
#include <cuda_runtime.h>
//
//
//
#include "common/cuda/assert_cuda.h"
#include "common/macros.h"
//
//
//
#include "hs/cuda/sm_35/u32/hs_cuda.h"
#include "hs/cuda/sm_35/u64/hs_cuda.h"
//
// PFNs to select between different key widths
//
typedef void (*hs_cuda_info_pfn)(uint32_t * const key_words,
uint32_t * const val_words,
uint32_t * const slab_height,
uint32_t * const slab_width_log2);
typedef void (*hs_cuda_pad_pfn)(uint32_t const count,
uint32_t * const count_padded_in,
uint32_t * const count_padded_out);
typedef void (*hs_cuda_sort_pfn)(void * const vin,
void * const vout,
uint32_t const count,
uint32_t const count_padded_in,
uint32_t const count_padded_out,
bool const linearize,
cudaStream_t stream0,
cudaStream_t stream1,
cudaStream_t stream2);
//
// The quality of the RNG doesn't matter. The same number of
// instructions will be run no matter what the key distribution looks
// like. So here is something small and fast.
//
static
uint32_t
hs_rand_u32()
{
static uint32_t seed = 0xDEADBEEF;
// Numerical Recipes
seed = seed * 1664525 + 1013904223;
return seed;
}
//
//
//
static
void
hs_fill_rand(uint32_t * vin_h, uint32_t const count, uint32_t const words)
{
#if 1
for (uint32_t ii=0; ii<count*words; ii++)
vin_h[ii] = hs_rand_u32();
#elif 0 // in-order
memset(vin_h,0,count*words*sizeof(uint32_t));
for (uint32_t ii=0; ii<count; ii++)
vin_h[ii*words] = ii;
#else // reverse order
memset(vin_h,0,count*words*sizeof(uint32_t));
for (uint32_t ii=0; ii<count; ii++)
vin_h[ii*words] = count - 1 - ii;
#endif
}
//
//
//
char const * hs_cpu_sort_u32(uint32_t * a, uint32_t const count, double * const cpu_ns);
char const * hs_cpu_sort_u64(uint64_t * a, uint32_t const count, double * const cpu_ns);
//
//
//
static
char const *
hs_cpu_sort(void * sorted_h,
uint32_t const hs_words,
uint32_t const count,
double * const cpu_ns)
{
if (hs_words == 1)
return hs_cpu_sort_u32(sorted_h,count,cpu_ns);
else
return hs_cpu_sort_u64(sorted_h,count,cpu_ns);
}
static
bool
hs_verify_linear(uint32_t const hs_words,
void * sorted_h,
void * vout_h,
uint32_t const count)
{
return memcmp(sorted_h, vout_h, sizeof(uint32_t) * hs_words * count) == 0;
}
static
void
hs_transpose_slabs_u32(uint32_t const hs_words,
uint32_t const hs_width,
uint32_t const hs_height,
uint32_t * vout_h,
uint32_t const count)
{
uint32_t const slab_keys = hs_width * hs_height;
size_t const slab_size = sizeof(uint32_t) * hs_words * slab_keys;
uint32_t * const slab = ALLOCA_MACRO(slab_size);
uint32_t slab_count = count / slab_keys;
while (slab_count-- > 0)
{
memcpy(slab,vout_h,slab_size);
for (uint32_t row=0; row<hs_height; row++)
for (uint32_t col=0; col<hs_width; col++)
vout_h[col * hs_height + row] = slab[row * hs_width + col];
vout_h += slab_keys;
}
}
static
void
hs_transpose_slabs_u64(uint32_t const hs_words,
uint32_t const hs_width,
uint32_t const hs_height,
uint64_t * vout_h,
uint32_t const count)
{
uint32_t const slab_keys = hs_width * hs_height;
size_t const slab_size = sizeof(uint32_t) * hs_words * slab_keys;
uint64_t * const slab = ALLOCA_MACRO(slab_size);
uint32_t slab_count = count / slab_keys;
while (slab_count-- > 0)
{
memcpy(slab,vout_h,slab_size);
for (uint32_t row=0; row<hs_height; row++)
for (uint32_t col=0; col<hs_width; col++)
vout_h[col * hs_height + row] = slab[row * hs_width + col];
vout_h += slab_keys;
}
}
static
void
hs_transpose_slabs(uint32_t const hs_words,
uint32_t const hs_width,
uint32_t const hs_height,
void * vout_h,
uint32_t const count)
{
if (hs_words == 1)
hs_transpose_slabs_u32(hs_words,hs_width,hs_height,vout_h,count);
else
hs_transpose_slabs_u64(hs_words,hs_width,hs_height,vout_h,count);
}
//
//
//
static
void
hs_debug_u32(uint32_t const hs_width,
uint32_t const hs_height,
uint32_t const * vout_h,
uint32_t const count)
{
uint32_t const slab_keys = hs_width * hs_height;
uint32_t const slabs = (count + slab_keys - 1) / slab_keys;
for (uint32_t ss=0; ss<slabs; ss++) {
fprintf(stderr,"%u\n",ss);
for (uint32_t cc=0; cc<hs_height; cc++) {
for (uint32_t rr=0; rr<hs_width; rr++)
fprintf(stderr,"%8" PRIX32 " ",*vout_h++);
fprintf(stderr,"\n");
}
}
}
static
void
hs_debug_u64(uint32_t const hs_width,
uint32_t const hs_height,
uint64_t const * vout_h,
uint32_t const count)
{
uint32_t const slab_keys = hs_width * hs_height;
uint32_t const slabs = (count + slab_keys - 1) / slab_keys;
for (uint32_t ss=0; ss<slabs; ss++) {
fprintf(stderr,"%u\n",ss);
for (uint32_t cc=0; cc<hs_height; cc++) {
for (uint32_t rr=0; rr<hs_width; rr++)
fprintf(stderr,"%16" PRIX64 " ",*vout_h++);
fprintf(stderr,"\n");
}
}
}
//
//
//
static
void
hs_bench(hs_cuda_pad_pfn hs_pad,
hs_cuda_sort_pfn hs_sort,
cudaStream_t stream0,
cudaStream_t stream1,
cudaStream_t stream2,
struct cudaDeviceProp const * props,
int const driver_version,
uint32_t const hs_words,
uint32_t const hs_height,
uint32_t const hs_width,
uint32_t const count_lo,
uint32_t const count_hi,
uint32_t const count_step,
uint32_t const loops,
uint32_t const warmup,
bool const linearize,
bool const verify)
{
//
// return if nothing to do
//
if (count_hi <= 1)
return;
//
// size for the largest array
//
uint32_t count_hi_padded_in, count_hi_padded_out;
hs_pad(count_hi,&count_hi_padded_in,&count_hi_padded_out);
size_t const key_size = sizeof(uint32_t) * hs_words;
size_t const size_hi_in = count_hi_padded_in * key_size;
size_t const size_hi_out = count_hi_padded_out * key_size;
//
// allocate device extents
//
void * random_d;
void * vin_d;
void * vout_d;
cuda(Malloc(&random_d,size_hi_in));
cuda(Malloc(&vin_d, size_hi_in));
cuda(Malloc(&vout_d, size_hi_out));
//
// initialize device random extent
//
void * random_h = malloc(size_hi_in);
// fill with random numbers
hs_fill_rand(random_h,count_hi,hs_words);
// copy to device
cuda(Memcpy(random_d,random_h,size_hi_in,cudaMemcpyHostToDevice));
free(random_h);
//
// allocate host result extent
//
void * sorted_h = malloc(size_hi_in);
void * vout_h = malloc(size_hi_in);
//
// LABELS
//
fprintf(stdout,
"Device, "
"Driver, "
"Type, "
"Slab/Linear, "
"Verified?, "
"Keys, "
"Keys Padded In, "
"Keys Padded Out, "
"CPU Algorithm, "
"CPU Msecs, "
"CPU Mkeys/s, "
"Trials, "
"Avg. Msecs, "
"Min Msecs, "
"Max Msecs, "
"Avg. Mkeys/s, "
"Max. Mkeys/s\n");
//
// BENCHMARK
//
cudaEvent_t start, end;
cuda(EventCreate(&start));
cuda(EventCreate(&end));
for (uint32_t count=count_lo; count<=count_hi; count+=count_step)
{
// compute padding before sorting
uint32_t count_padded_in, count_padded_out;
hs_pad(count,&count_padded_in,&count_padded_out);
cuda(Memcpy(vin_d,random_d,count*key_size,cudaMemcpyDeviceToDevice));
float elapsed_ms_min = FLT_MAX;
float elapsed_ms_max = 0.0f;
float elapsed_ms_sum = 0.0f;
for (uint32_t ii=0; ii<warmup+loops; ii++)
{
if (ii == warmup)
{
elapsed_ms_min = FLT_MAX;
elapsed_ms_max = 0.0f;
elapsed_ms_sum = 0.0f;
}
//
// sort vin/vout
//
cuda(EventRecord(start,stream0));
cuda(StreamWaitEvent(stream1,start,0));
cuda(StreamWaitEvent(stream2,start,0));
hs_sort(vin_d,
vout_d,
count,
count_padded_in,
count_padded_out,
linearize,
stream0,
stream1,
stream2);
cuda(EventRecord(end,stream0));
cuda(EventSynchronize(end));
float elapsed;
cuda(EventElapsedTime(&elapsed,start,end));
elapsed_ms_min = MIN_MACRO(elapsed_ms_min,elapsed);
elapsed_ms_max = MAX_MACRO(elapsed_ms_max,elapsed);
elapsed_ms_sum += elapsed;
}
//
// verify
//
char const * cpu_algo = NULL;
double cpu_ns = 1.0;
bool verified = false;
if (verify)
{
//
// copy back the results
//
size_t const size_padded_in = count_padded_in * key_size;
cuda(Memcpy(sorted_h,vin_d, size_padded_in,cudaMemcpyDeviceToHost));
cuda(Memcpy(vout_h, vout_d,size_padded_in,cudaMemcpyDeviceToHost));
//
// sort the input with another algorithm
//
cpu_algo = hs_cpu_sort(sorted_h,hs_words,count_padded_in,&cpu_ns);
// transpose the cpu sorted slabs before comparison
if (!linearize) {
hs_transpose_slabs(hs_words,hs_width,hs_height,vout_h,count_padded_in);
}
verified = hs_verify_linear(hs_words,sorted_h,vout_h,count_padded_in);
#ifndef NDEBUG
if (!verified)
{
if (hs_words == 1) {
hs_debug_u32(hs_width,hs_height,vout_h, count);
hs_debug_u32(hs_width,hs_height,sorted_h,count);
} else { // ulong
hs_debug_u64(hs_width,hs_height,vout_h, count);
hs_debug_u64(hs_width,hs_height,sorted_h,count);
}
}
#endif
}
//
// REPORT
//
fprintf(stdout,"%s, %u, %s, %s, %s, %8u, %8u, %8u, CPU, %s, %9.2f, %6.2f, GPU, %9u, %7.3f, %7.3f, %7.3f, %6.2f, %6.2f\n",
props->name,
driver_version,
(hs_words == 1) ? "uint32_t" : "uint64_t",
linearize ? "linear" : "slab",
verify ? (verified ? " OK " : "*FAIL*") : "UNVERIFIED",
count,
count_padded_in,
count_padded_out,
// CPU
verify ? cpu_algo : "UNVERIFIED",
verify ? (cpu_ns / 1000000.0) : 0.0, // milliseconds
verify ? (1000.0 * count / cpu_ns) : 0.0, // mkeys / sec
// GPU
loops,
elapsed_ms_sum / loops, // avg msecs
elapsed_ms_min, // min msecs
elapsed_ms_max, // max msecs
(double)(count * loops) / (1000.0 * elapsed_ms_sum), // mkeys / sec - avg
(double) count / (1000.0 * elapsed_ms_min)); // mkeys / sec - max
// quit early if not verified
if (verify && !verified)
break;
}
//
// dispose
//
cuda(EventDestroy(start));
cuda(EventDestroy(end));
free(sorted_h);
free(vout_h);
cuda(Free(random_d));
cuda(Free(vin_d));
cuda(Free(vout_d));
}
//
//
//
int
main(int argc, char const * argv[])
{
//
// which CUDA device?
//
const int32_t device = (argc == 1) ? 0 : atoi(argv[1]);
struct cudaDeviceProp props;
cuda(GetDeviceProperties(&props,device));
cuda(SetDeviceFlags(cudaDeviceScheduleBlockingSync));
cuda(SetDevice(device));
int driver_version;
cuda(DriverGetVersion(&driver_version));
#ifndef NDEBUG
fprintf(stdout,"%s (%2d) : %u\n",
props.name,
props.multiProcessorCount,
driver_version);
#endif
//
// create some streams
//
cudaStream_t stream0,stream1,stream2;
cuda(StreamCreate(&stream0));
cuda(StreamCreate(&stream1));
cuda(StreamCreate(&stream2));
//
//
//
#ifdef NDEBUG
#define HS_BENCH_LOOPS 100
#define HS_BENCH_WARMUP 100
#else
#define HS_BENCH_LOOPS 1
#define HS_BENCH_WARMUP 0
#endif
//
// are we sorting 32-bit or 64-bit keys?
//
uint32_t const key_size = (argc <= 2) ? 2 : strtoul(argv[2],NULL,0);
hs_cuda_info_pfn hs_info;
hs_cuda_pad_pfn hs_pad;
hs_cuda_sort_pfn hs_sort;
if (key_size == 1)
{
hs_info = hs_cuda_info_u32;
hs_pad = hs_cuda_pad_u32;
hs_sort = hs_cuda_sort_u32;
}
else
{
hs_info = hs_cuda_info_u64;
hs_pad = hs_cuda_pad_u64;
hs_sort = hs_cuda_sort_u64;
}
//
// get some configuration info
//
uint32_t key_words, val_words, slab_height, slab_width_log2;
hs_info(&key_words,&val_words,&slab_height,&slab_width_log2);
//
// sort sizes and loops
//
uint32_t const kpb = slab_height << slab_width_log2;
uint32_t const count_lo = (argc <= 3) ? kpb : strtoul(argv[3],NULL,0);
uint32_t const count_hi = (argc <= 4) ? count_lo : strtoul(argv[4],NULL,0);
uint32_t const count_step = (argc <= 5) ? count_lo : strtoul(argv[5],NULL,0);
uint32_t const loops = (argc <= 6) ? HS_BENCH_LOOPS : strtoul(argv[6],NULL,0);
uint32_t const warmup = (argc <= 7) ? HS_BENCH_WARMUP : strtoul(argv[7],NULL,0);
bool const linearize = (argc <= 8) ? true : strtoul(argv[8],NULL,0);
bool const verify = (argc <= 9) ? true : strtoul(argv[9],NULL,0);
//
// benchmark
//
hs_bench(hs_pad,
hs_sort,
stream0,
stream1,
stream2,
&props,
driver_version,
key_words + val_words,
slab_height,
1 << slab_width_log2,
count_lo,
count_hi,
count_step,
loops,
warmup,
linearize,
verify);
//
// cleanup
//
cuda(StreamDestroy(stream0));
cuda(StreamDestroy(stream1));
cuda(StreamDestroy(stream2));
cuda(DeviceReset());
return EXIT_SUCCESS;
}