/* * Copyright 2016 Google Inc. * * Use of this source code is governed by a BSD-style license that can * be found in the LICENSE file. * */ // // // #include <stdlib.h> #include <stdio.h> #include <string.h> #include <inttypes.h> // // squelch OpenCL 1.2 deprecation warning // #ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS #define CL_USE_DEPRECATED_OPENCL_1_2_APIS #endif #include "common/macros.h" #include "common/cl/assert_cl.h" #include "common/cl/find_cl.h" // // // #include "hs_cl.h" // // FIXME -- LIMITED TO INTEL / GEN8+ FOR NOW // #include "intel/gen8/u32/hs_target.h" #include "intel/gen8/u64/hs_target.h" // #include "intel/gen9lp/u32/hs_target.h" // #include "intel/gen9lp/u64/hs_target.h" // // The quality of the RNG doesn't matter. The same number of // instructions will be run no matter what the key distribution looks // like. So here is something small and fast. // static uint32_t hs_rand_u32() { static uint32_t seed = 0xDEADBEEF; // Numerical Recipes seed = seed * 1664525 + 1013904223; return seed; } // // // static void hs_fill_rand(uint32_t * vin_h, uint32_t const count, uint32_t const words) { #if 1 for (uint32_t ii=0; ii<count*words; ii++) vin_h[ii] = hs_rand_u32(); #elif 0 // in-order memset(vin_h,0,count*words*sizeof(uint32_t)); for (uint32_t ii=0; ii<count; ii++) vin_h[ii*words] = ii; #else // reverse order memset(vin_h,0,count*words*sizeof(uint32_t)); for (uint32_t ii=0; ii<count; ii++) vin_h[ii*words] = count - 1 - ii; #endif } // // // char const * hs_cpu_sort_u32(uint32_t * a, uint32_t const count, double * const cpu_ns); char const * hs_cpu_sort_u64(uint64_t * a, uint32_t const count, double * const cpu_ns); // // // static char const * hs_cpu_sort(void * sorted_h, uint32_t const hs_words, uint32_t const count, double * const cpu_ns) { if (hs_words == 1) return hs_cpu_sort_u32(sorted_h,count,cpu_ns); else return hs_cpu_sort_u64(sorted_h,count,cpu_ns); } static void hs_transpose_slabs_u32(uint32_t const hs_words, uint32_t const hs_width, uint32_t const hs_height, uint32_t * vout_h, uint32_t const count) { uint32_t const slab_keys = hs_width * hs_height; size_t const slab_size = sizeof(uint32_t) * hs_words * slab_keys; uint32_t * const slab = ALLOCA_MACRO(slab_size); uint32_t slab_count = count / slab_keys; while (slab_count-- > 0) { memcpy(slab,vout_h,slab_size); for (uint32_t row=0; row<hs_height; row++) for (uint32_t col=0; col<hs_width; col++) vout_h[col * hs_height + row] = slab[row * hs_width + col]; vout_h += slab_keys; } } static void hs_transpose_slabs_u64(uint32_t const hs_words, uint32_t const hs_width, uint32_t const hs_height, uint64_t * vout_h, uint32_t const count) { uint32_t const slab_keys = hs_width * hs_height; size_t const slab_size = sizeof(uint32_t) * hs_words * slab_keys; uint64_t * const slab = ALLOCA_MACRO(slab_size); uint32_t slab_count = count / slab_keys; while (slab_count-- > 0) { memcpy(slab,vout_h,slab_size); for (uint32_t row=0; row<hs_height; row++) for (uint32_t col=0; col<hs_width; col++) vout_h[col * hs_height + row] = slab[row * hs_width + col]; vout_h += slab_keys; } } static void hs_transpose_slabs(uint32_t const hs_words, uint32_t const hs_width, uint32_t const hs_height, void * vout_h, uint32_t const count) { if (hs_words == 1) hs_transpose_slabs_u32(hs_words,hs_width,hs_height,vout_h,count); else hs_transpose_slabs_u64(hs_words,hs_width,hs_height,vout_h,count); } // // // static void hs_debug_u32(uint32_t const hs_width, uint32_t const hs_height, uint32_t const * vout_h, uint32_t const count) { uint32_t const slab_keys = hs_width * hs_height; uint32_t const slabs = (count + slab_keys - 1) / slab_keys; for (uint32_t ss=0; ss<slabs; ss++) { fprintf(stderr,"%u\n",ss); for (uint32_t cc=0; cc<hs_height; cc++) { for (uint32_t rr=0; rr<hs_width; rr++) fprintf(stderr,"%8" PRIX32 " ",*vout_h++); fprintf(stderr,"\n"); } } } static void hs_debug_u64(uint32_t const hs_width, uint32_t const hs_height, uint64_t const * vout_h, uint32_t const count) { uint32_t const slab_keys = hs_width * hs_height; uint32_t const slabs = (count + slab_keys - 1) / slab_keys; for (uint32_t ss=0; ss<slabs; ss++) { fprintf(stderr,"%u\n",ss); for (uint32_t cc=0; cc<hs_height; cc++) { for (uint32_t rr=0; rr<hs_width; rr++) fprintf(stderr,"%16" PRIX64 " ",*vout_h++); fprintf(stderr,"\n"); } } } // // Used for benchmarking on out-of-order queues. Attaching an event // to a kernel on an OOQ with profiling enabled will result in a // synchronization point and block concurrent execution of kernels. // // The workaround that enables measuring the entire runtime of the // sort is to launch a dummy kernel with an event, a barrier without // an event, then the call to hs_sort(), followed by a final dummy // kernel with an event. // // The end time of the first dummy and start time of the second dummy // will provide a conservative estimate of the total execution time of // the hs_sort() routine. // // Note that once kernels are enqueued they are scheduled with only // microseconds between them so this should only be a small number of // microseconds longer than the true hs_sort() execution time. // #define HS_DUMMY_KERNEL_PROGRAM "kernel void hs_dummy_kernel() { ; }" static cl_kernel hs_dummy_kernel; static void hs_dummy_kernel_create(cl_context context, cl_device_id device_id) { cl_int err; char const * strings[] = { HS_DUMMY_KERNEL_PROGRAM }; size_t const strings_sizeof[] = { sizeof(HS_DUMMY_KERNEL_PROGRAM) }; cl_program program = clCreateProgramWithSource(context, 1, strings, strings_sizeof, &err); cl_ok(err); cl(BuildProgram(program, 1, &device_id, NULL, NULL, NULL)); hs_dummy_kernel = clCreateKernel(program,"hs_dummy_kernel",&err); cl_ok(err); cl(ReleaseProgram(program)); } static void hs_dummy_kernel_release() { cl(ReleaseKernel(hs_dummy_kernel)); } static void hs_dummy_kernel_enqueue(cl_command_queue cq, uint32_t wait_list_size, cl_event const * wait_list, cl_event * event) { size_t const global_work_size = 1; cl(EnqueueNDRangeKernel(cq, hs_dummy_kernel, 1, NULL, &global_work_size, NULL, wait_list_size, wait_list, event)); } // // // static void hs_bench(cl_context context, cl_command_queue cq, cl_command_queue cq_profile, char const * const device_name, char const * const driver_version, uint32_t const hs_words, uint32_t const hs_width, uint32_t const hs_height, struct hs_cl const * const hs, uint32_t const count_lo, uint32_t const count_hi, uint32_t const count_step, uint32_t const loops, uint32_t const warmup, bool const linearize) { // // return if nothing to do // if (count_hi <= 1) return; // // size the arrays // uint32_t count_hi_padded_in, count_hi_padded_out; hs_cl_pad(hs,count_hi,&count_hi_padded_in,&count_hi_padded_out); // // SIZE // size_t const key_size = sizeof(uint32_t) * hs_words; size_t const size_hi_in = count_hi_padded_in * key_size; size_t const size_hi_out = count_hi_padded_out * key_size; // // ALLOCATE // cl_int cl_err; void * sorted_h = malloc(size_hi_in); cl_mem random = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size_hi_in, NULL,&cl_err); cl_ok(cl_err); cl_mem vin = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, size_hi_in, NULL,&cl_err); cl_ok(cl_err); cl_mem vout = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, size_hi_out, NULL,&cl_err); cl_ok(cl_err); // // BLOCKING MAP AND INIT KEYS // { void * random_h = clEnqueueMapBuffer(cq, random, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0,size_hi_in, 0,NULL,NULL, &cl_err); cl_ok(cl_err); // fill with random numbers hs_fill_rand(random_h,count_hi,hs_words); // // UNMAP // cl(EnqueueUnmapMemObject(cq,random,random_h,0,NULL,NULL)); } // // BENCHMARK // for (uint32_t count=count_lo; count<=count_hi; count+=count_step) { // compute padding before sorting uint32_t count_padded_in, count_padded_out; hs_cl_pad(hs,count,&count_padded_in,&count_padded_out); cl_ulong elapsed_ns_min = UINT64_MAX; cl_ulong elapsed_ns_max = 0; cl_ulong elapsed_ns_sum = 0; cl(EnqueueCopyBuffer(cq,random,vin,0,0,count * key_size,0,NULL,NULL)); cl(Finish(cq)); for (uint32_t ii=0; ii<warmup+loops; ii++) { if (ii == warmup) { elapsed_ns_min = UINT64_MAX; elapsed_ns_max = 0; elapsed_ns_sum = 0; } #if 0 // // optionally, initialize vin on every loop -- no need // cl(EnqueueCopyBuffer(cq,random,vin,0,0,count * key_size,0,NULL,NULL)); cl(Finish(cq)); #endif // // sort vin // cl_event start, complete, end; hs_dummy_kernel_enqueue(cq_profile,0,NULL,&start); // note hs_sort enqueues a final barrier hs_cl_sort(hs, cq, 1,&start,&complete, vin,vout, count, count_padded_in, count_padded_out, linearize); hs_dummy_kernel_enqueue(cq_profile,1,&complete,&end); cl(Finish(cq_profile)); // // measure duration // cl_ulong t_start=0, t_end=0; // start cl(GetEventProfilingInfo(start, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &t_start, NULL)); // end cl(GetEventProfilingInfo(end, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &t_end, NULL)); cl_ulong const t = t_end - t_start; elapsed_ns_min = MIN_MACRO(elapsed_ns_min,t); elapsed_ns_max = MAX_MACRO(elapsed_ns_max,t); elapsed_ns_sum += t; cl(ReleaseEvent(start)); cl(ReleaseEvent(complete)); cl(ReleaseEvent(end)); } // // COPY KEYS BACK FOR VERIFICATION // size_t const size_padded_in = count_padded_in * key_size; void * vin_h = clEnqueueMapBuffer(cq, vin, CL_FALSE, CL_MAP_READ, 0,size_padded_in, 0,NULL,NULL, &cl_err); cl_ok(cl_err); void * vout_h = clEnqueueMapBuffer(cq, vout, CL_FALSE, CL_MAP_READ, 0,size_padded_in, 0,NULL,NULL, &cl_err); cl_ok(cl_err); cl(Finish(cq)); // // SORT THE UNTOUCHED RANDOM INPUT // memcpy(sorted_h,vin_h,size_padded_in); double cpu_ns; char const * const algo = hs_cpu_sort(sorted_h,hs_words,count_padded_in,&cpu_ns); // // EXPLICITLY TRANSPOSE THE CPU SORTED SLABS IF NOT LINEARIZING // if (!linearize) hs_transpose_slabs(hs_words,hs_width,hs_height,vout_h,count_padded_in); // // VERIFY // bool const verified = memcmp(sorted_h,vout_h,size_padded_in) == 0; #ifndef NDEBUG if (!verified) { if (hs_words == 1) hs_debug_u32(hs_width,hs_height,vout_h,count); else // ulong hs_debug_u64(hs_width,hs_height,vout_h,count); } #endif cl(EnqueueUnmapMemObject(cq,vin, vin_h, 0,NULL,NULL)); cl(EnqueueUnmapMemObject(cq,vout,vout_h,0,NULL,NULL)); cl(Finish(cq)); // // REPORT // fprintf(stdout,"%s, %s, %s, %s, %s, %8u, %8u, %8u, CPU, %s, %9.2f, %6.2f, GPU, %9u, %7.3f, %7.3f, %7.3f, %6.2f, %6.2f\n", device_name, driver_version, (hs_words == 1) ? "uint" : "ulong", linearize ? "linear" : "slab", verified ? " OK " : "*FAIL*", count, count_padded_in, count_padded_out, // CPU algo, cpu_ns / 1000000.0, // milliseconds 1000.0 * count / cpu_ns, // mkeys / sec // GPU loops, elapsed_ns_sum / 1000000.0 / loops, // avg msecs elapsed_ns_min / 1000000.0, // min msecs elapsed_ns_max / 1000000.0, // max msecs 1000.0 * count * loops / elapsed_ns_sum, // mkeys / sec - avg 1000.0 * count / elapsed_ns_min); // mkeys / sec - max // quit early if not verified if (!verified) break; } // // dispose // cl(ReleaseMemObject(vout)); cl(ReleaseMemObject(vin)); cl(ReleaseMemObject(random)); free(sorted_h); } // // // int main(int argc, char const * argv[]) { char const * const target_platform_substring = "Intel"; char const * const target_device_substring = "Graphics"; // // find platform and device ids // cl_platform_id platform_id; cl_device_id device_id; #define HS_DEVICE_NAME_SIZE 64 char device_name[HS_DEVICE_NAME_SIZE]; size_t device_name_size; cl(FindIdsByName(target_platform_substring, target_device_substring, &platform_id, &device_id, HS_DEVICE_NAME_SIZE, device_name, &device_name_size, true)); // // create context // cl_context_properties context_properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id, 0 }; cl_int cl_err; cl_context context = clCreateContext(context_properties, 1, &device_id, NULL, NULL, &cl_err); cl_ok(cl_err); // // create command queue // #if 0 // OPENCL 2.0 cl_queue_properties props[] = { CL_QUEUE_PROPERTIES, (cl_queue_properties)CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, #ifndef NDEBUG (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE, #endif 0 }; cl_queue_properties props_profile[] = { CL_QUEUE_PROPERTIES, (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE, 0 }; cl_command_queue cq = clCreateCommandQueueWithProperties(context, device_id, props, &cl_err); cl_ok(cl_err); cl_command_queue cq_profile = clCreateCommandQueueWithProperties(context, device_id, props_profile, &cl_err); cl_ok(cl_err); #else // OPENCL 1.2 cl_command_queue cq = clCreateCommandQueue(context, device_id, #ifndef NDEBUG CL_QUEUE_PROFILING_ENABLE | #endif CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &cl_err); cl_ok(cl_err); cl_command_queue cq_profile = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &cl_err); cl_ok(cl_err); #endif // // Intel GEN workaround -- create dummy kernel for semi-accurate // profiling on an out-of-order queue. // hs_dummy_kernel_create(context,device_id); // // select the target // uint32_t const key_val_words = (argc == 1) ? 2 : strtoul(argv[1],NULL,0); struct hs_cl_target const * hs_target; if (key_val_words == 1) hs_target = &hs_intel_gen8_u32; else hs_target = &hs_intel_gen8_u64; // // create kernels // fprintf(stdout,"Creating... "); struct hs_cl * const hs = hs_cl_create(hs_target,context,device_id); fprintf(stdout,"done.\n"); // // // #ifdef NDEBUG #define HS_BENCH_LOOPS 100 #define HS_BENCH_WARMUP 100 #else #define HS_BENCH_LOOPS 1 #define HS_BENCH_WARMUP 0 #endif // // sort sizes and loops // uint32_t const kpb = hs_target->config.slab.height << hs_target->config.slab.width_log2; uint32_t const count_lo = (argc <= 2) ? kpb : strtoul(argv[2],NULL,0); uint32_t const count_hi = (argc <= 3) ? count_lo : strtoul(argv[3],NULL,0); uint32_t const count_step = (argc <= 4) ? count_lo : strtoul(argv[4],NULL,0); uint32_t const loops = (argc <= 5) ? HS_BENCH_LOOPS : strtoul(argv[5],NULL,0); uint32_t const warmup = (argc <= 6) ? HS_BENCH_WARMUP : strtoul(argv[6],NULL,0); bool const linearize = (argc <= 7) ? true : strtoul(argv[7],NULL,0); // // labels // fprintf(stdout, "Device, " "Driver, " "Type, " "Slab/Linear, " "Verified?, " "Keys, " "Keys Padded In, " "Keys Padded Out, " "CPU Algorithm, " "CPU Msecs, " "CPU Mkeys/s, " "Trials, " "Avg. Msecs, " "Min Msecs, " "Max Msecs, " "Avg. Mkeys/s, " "Max. Mkeys/s\n"); // // we want to track driver versions // size_t driver_version_size; cl(GetDeviceInfo(device_id, CL_DRIVER_VERSION, 0, NULL, &driver_version_size)); char * const driver_version = ALLOCA_MACRO(driver_version_size); cl(GetDeviceInfo(device_id, CL_DRIVER_VERSION, driver_version_size, driver_version, NULL)); // // benchmark // hs_bench(context, cq,cq_profile, device_name, driver_version, hs_target->config.words.key + hs_target->config.words.val, 1 << hs_target->config.slab.width_log2, hs_target->config.slab.height, hs, count_lo, count_hi, count_step, loops, warmup, linearize); // // release everything // hs_cl_release(hs); hs_dummy_kernel_release(); cl(ReleaseCommandQueue(cq)); cl(ReleaseCommandQueue(cq_profile)); cl(ReleaseContext(context)); return 0; }