/*
* Copyright 2016 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <inttypes.h>
//
//
#include "common/macros.h"
#include "common/vk/assert_vk.h"
#include "common/vk/host_alloc.h"
#include "common/vk/cache_vk.h"
//
//
//
#include "hs_vk.h"
//
// Compile-time images of HotSort targets
//
#include "hs/vk/intel/gen8/u32/hs_target.h"
#include "hs/vk/intel/gen8/u64/hs_target.h"
#include "hs/vk/nvidia/sm_35/u32/hs_target.h"
#include "hs/vk/nvidia/sm_35/u64/hs_target.h"
#include "hs/vk/amd/gcn/u32/hs_target.h"
#include "hs/vk/amd/gcn/u64/hs_target.h"
//
//
//
//
// CPU reference sorts -- declared here, defined outside this file.
//
//   a      : array of 'count' keys (32-bit or 64-bit); main() compares
//            this buffer against the GPU output after the call, so the
//            routine presumably sorts it in place -- confirm against
//            the definition
//   count  : number of keys in 'a'
//   cpu_ns : out -- main() reports this value divided by 1e6 as
//            "CPU Msecs", so it is presumably elapsed nanoseconds
//
// The returned string is printed by main() in the "Algorithm" column.
//
char const * hs_cpu_sort_u32(uint32_t * a, uint32_t const count, double * const cpu_ns);
char const * hs_cpu_sort_u64(uint64_t * a, uint32_t const count, double * const cpu_ns);
//
//
//
//
// Dispatch to the CPU reference sort that matches the key width.
//
//   sorted_h : host array of keys to hand to the reference sort
//   hs_words : 32-bit words per key -- 1 selects the u32 sort, any
//              other value selects the u64 sort
//   count    : number of keys in sorted_h
//   cpu_ns   : forwarded untouched to the selected sort
//
// Returns whatever string the selected sort returns.
//
static
char const *
hs_cpu_sort(void         * sorted_h,
            uint32_t const hs_words,
            uint32_t const count,
            double * const cpu_ns)
{
  // wide keys first; single-word keys fall through
  if (hs_words != 1)
    {
      return hs_cpu_sort_u64(sorted_h,count,cpu_ns);
    }

  return hs_cpu_sort_u32(sorted_h,count,cpu_ns);
}
//
// Transpose, in place, every whole slab of 32-bit keys in vout_h.
//
// A slab is hs_width * hs_height keys.  Each slab is first copied to a
// scratch buffer and then written back so that the element at
// (row,col) of the row-major scratch copy lands at index
// (col * hs_height + row) of the slab.
//
//   hs_words  : 32-bit words per key -- only used to size the scratch
//               copy (the caller in this file passes 1)
//   hs_width  : slab width in keys
//   hs_height : slab height in keys
//   vout_h    : keys to transpose
//   count     : number of keys; a trailing partial slab is left as is
//
static
void
hs_transpose_slabs_u32(uint32_t const hs_words,
                       uint32_t const hs_width,
                       uint32_t const hs_height,
                       uint32_t *     vout_h,
                       uint32_t const count)
{
  uint32_t   const keys_per_slab  = hs_width * hs_height;
  size_t     const bytes_per_slab = sizeof(uint32_t) * hs_words * keys_per_slab;
  uint32_t * const scratch        = ALLOCA_MACRO(bytes_per_slab);
  uint32_t   const whole_slabs    = count / keys_per_slab;

  for (uint32_t ss=0; ss<whole_slabs; ss++)
    {
      // snapshot the slab before overwriting it
      memcpy(scratch,vout_h,bytes_per_slab);

      // walk the snapshot linearly (row-major) and scatter column-major
      uint32_t const * src = scratch;

      for (uint32_t row=0; row<hs_height; row++)
        {
          for (uint32_t col=0; col<hs_width; col++)
            {
              vout_h[col * hs_height + row] = *src++;
            }
        }

      vout_h += keys_per_slab;
    }
}
//
// Transpose, in place, every whole slab of 64-bit keys in vout_h.
//
// A slab is hs_width * hs_height keys.  Each slab is first copied to a
// scratch buffer and then written back so that the element at
// (row,col) of the row-major scratch copy lands at index
// (col * hs_height + row) of the slab.
//
//   hs_words  : 32-bit words per key -- only used to size the scratch
//               copy, so it must be 2 for the copy to cover a full
//               slab of 64-bit keys
//   hs_width  : slab width in keys
//   hs_height : slab height in keys
//   vout_h    : keys to transpose
//   count     : number of keys; a trailing partial slab is left as is
//
static
void
hs_transpose_slabs_u64(uint32_t const hs_words,
                       uint32_t const hs_width,
                       uint32_t const hs_height,
                       uint64_t *     vout_h,
                       uint32_t const count)
{
  uint32_t   const keys_per_slab  = hs_width * hs_height;
  size_t     const bytes_per_slab = sizeof(uint32_t) * hs_words * keys_per_slab;
  uint64_t * const scratch        = ALLOCA_MACRO(bytes_per_slab);
  uint32_t   const whole_slabs    = count / keys_per_slab;

  for (uint32_t ss=0; ss<whole_slabs; ss++)
    {
      // snapshot the slab before overwriting it
      memcpy(scratch,vout_h,bytes_per_slab);

      // walk the snapshot linearly (row-major) and scatter column-major
      uint64_t const * src = scratch;

      for (uint32_t row=0; row<hs_height; row++)
        {
          for (uint32_t col=0; col<hs_width; col++)
            {
              vout_h[col * hs_height + row] = *src++;
            }
        }

      vout_h += keys_per_slab;
    }
}
//
// Transpose every whole slab in vout_h in place, choosing the 32-bit
// or 64-bit implementation from the key width.
//
//   hs_words  : 32-bit words per key -- 1 selects the u32 routine, any
//               other value selects the u64 routine
//   hs_width  : slab width in keys
//   hs_height : slab height in keys
//   vout_h    : keys to transpose
//   count     : number of keys
//
static
void
hs_transpose_slabs(uint32_t const hs_words,
                   uint32_t const hs_width,
                   uint32_t const hs_height,
                   void *         vout_h,
                   uint32_t const count)
{
  if (hs_words != 1)
    {
      hs_transpose_slabs_u64(hs_words,hs_width,hs_height,vout_h,count);
      return;
    }

  hs_transpose_slabs_u32(hs_words,hs_width,hs_height,vout_h,count);
}
//
//
//
#ifndef NDEBUG
//
// Debug-report callback installed by main() in builds without NDEBUG.
//
// When 'flags' is exactly one of the warning, performance-warning,
// error or debug bits, a line of the form "<FLAG NAME> <layer prefix>
// <message>" is written to stderr.  Any other value (including the
// information bit and combinations of bits) prints nothing.
//
// Always returns VK_FALSE.
//
static
VkBool32
VKAPI_PTR
vk_debug_report_cb(VkDebugReportFlagsEXT      flags,
                   VkDebugReportObjectTypeEXT objectType,
                   uint64_t                   object,
                   size_t                     location,
                   int32_t                    messageCode,
                   const char*                pLayerPrefix,
                   const char*                pMessage,
                   void*                      pUserData)
{
  // stays NULL unless 'flags' is one of the reported values
  char const * label = NULL;

#define VK_FLAG_CASE_TO_STRING(c)               \
  case c:                                       \
    label = #c;                                 \
    break

  switch (flags)
    {
      // VK_FLAG_CASE_TO_STRING(VK_DEBUG_REPORT_INFORMATION_BIT_EXT);
      VK_FLAG_CASE_TO_STRING(VK_DEBUG_REPORT_WARNING_BIT_EXT);
      VK_FLAG_CASE_TO_STRING(VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT);
      VK_FLAG_CASE_TO_STRING(VK_DEBUG_REPORT_ERROR_BIT_EXT);
      VK_FLAG_CASE_TO_STRING(VK_DEBUG_REPORT_DEBUG_BIT_EXT);

    default:
      break;
    }

  if (label != NULL)
    {
      fprintf(stderr,"%s %s %s\n",label,pLayerPrefix,pMessage);
    }

  return VK_FALSE;
}
#endif
//
//
//
//
// Return the next value of a 32-bit linear congruential generator.
//
// The state is a function-local static that starts at 0xDEADBEEF, so
// the sequence is identical on every run.  The state is not protected
// against concurrent callers.
//
static
uint32_t
hs_rand_u32()
{
  // multiplier and increment from Numerical Recipes
  uint32_t const lcg_mul = 1664525;
  uint32_t const lcg_inc = 1013904223;

  static uint32_t state = 0xDEADBEEF;

  // unsigned arithmetic wraps modulo 2^32
  state = state * lcg_mul + lcg_inc;

  return state;
}
//
//
//
//
// Fill vin_h with count * words 32-bit values drawn from
// hs_rand_u32().
//
//   vin_h : destination, at least count * words uint32_t elements
//   count : number of keys
//   words : 32-bit words per key
//
// The disabled branches are alternative fill patterns (ascending and
// descending values in the first word of each key, zero elsewhere).
//
static
void
hs_fill_rand(uint32_t * vin_h, uint32_t const count, uint32_t const words)
{
#if 1
  uint32_t const total = count * words;

  for (uint32_t idx=0; idx<total; idx++)
    {
      vin_h[idx] = hs_rand_u32();
    }
#elif 0 // in-order
  memset(vin_h,0,count*words*sizeof(uint32_t));

  for (uint32_t key=0; key<count; key++)
    {
      vin_h[key*words] = key;
    }
#else   // reverse order
  memset(vin_h,0,count*words*sizeof(uint32_t));

  for (uint32_t key=0; key<count; key++)
    {
      vin_h[key*words] = count - 1 - key;
    }
#endif
}
//
//
//
//
// Dump 32-bit keys to stderr, one slab at a time.
//
// For each slab the slab index is printed on its own line, followed by
// hs_height lines of hs_width keys each, printed in uppercase hex in
// the order they appear in memory.
//
//   hs_width  : keys per printed line
//   hs_height : printed lines per slab
//   vout_h    : keys to print
//   count     : number of keys -- rounded UP to a whole number of
//               slabs, so vout_h must be readable to that length
//
static
void
hs_debug_u32(uint32_t const   hs_width,
             uint32_t const   hs_height,
             uint32_t const * vout_h,
             uint32_t const   count)
{
  uint32_t const keys_per_slab = hs_width * hs_height;
  uint32_t const slab_total    = (count + keys_per_slab - 1) / keys_per_slab;

  uint32_t const * key = vout_h;

  for (uint32_t slab=0; slab<slab_total; slab++)
    {
      fprintf(stderr,"%u\n",slab);

      for (uint32_t line=0; line<hs_height; line++)
        {
          for (uint32_t pos=0; pos<hs_width; pos++)
            {
              fprintf(stderr,"%8" PRIX32 " ",*key);
              key += 1;
            }

          fprintf(stderr,"\n");
        }
    }
}
//
// Dump 64-bit keys to stderr, one slab at a time.
//
// For each slab the slab index is printed on its own line, followed by
// hs_height lines of hs_width keys each, printed in uppercase hex in
// the order they appear in memory.
//
//   hs_width  : keys per printed line
//   hs_height : printed lines per slab
//   vout_h    : keys to print
//   count     : number of keys -- rounded UP to a whole number of
//               slabs, so vout_h must be readable to that length
//
static
void
hs_debug_u64(uint32_t const   hs_width,
             uint32_t const   hs_height,
             uint64_t const * vout_h,
             uint32_t const   count)
{
  uint32_t const keys_per_slab = hs_width * hs_height;
  uint32_t const slab_total    = (count + keys_per_slab - 1) / keys_per_slab;

  uint64_t const * key = vout_h;

  for (uint32_t slab=0; slab<slab_total; slab++)
    {
      fprintf(stderr,"%u\n",slab);

      for (uint32_t line=0; line<hs_height; line++)
        {
          for (uint32_t pos=0; pos<hs_width; pos++)
            {
              fprintf(stderr,"%16" PRIX64 " ",*key);
              key += 1;
            }

          fprintf(stderr,"\n");
        }
    }
}
//
//
//
//
// Decide whether a physical device is the one requested and, if so,
// select the compile-time HotSort target for it.
//
//   phy_device_props : properties of the candidate device
//   hs_target        : out -- written only when the function returns
//                      true
//   vendor_id        : requested PCI vendor id
//   device_id        : requested PCI device id
//   key_val_words    : 1 selects the u32 target, anything else the u64
//                      target
//
// Returns true when both ids match and the vendor is one of NVIDIA
// (0x10DE), Intel (0x8086) or AMD (0x1002); false otherwise.
//
bool
is_matching_device(VkPhysicalDeviceProperties const * const phy_device_props,
                   struct hs_vk_target const * *      const hs_target,
                   uint32_t                           const vendor_id,
                   uint32_t                           const device_id,
                   uint32_t                           const key_val_words)
{
  bool const vendor_ok = (phy_device_props->vendorID == vendor_id);
  bool const device_ok = (phy_device_props->deviceID == device_id);

  if (!(vendor_ok && device_ok))
    {
      return false;
    }

  switch (phy_device_props->vendorID)
    {
    case 0x10DE:
      //
      // FIXME -- for now, the kernels in this app are targeting
      // sm_35+ devices.  You could add some rigorous rejection by
      // device id here...
      //
      if (key_val_words == 1)
        {
          *hs_target = &hs_nvidia_sm35_u32;
        }
      else
        {
          *hs_target = &hs_nvidia_sm35_u64;
        }
      return true;

    case 0x8086:
      //
      // FIXME -- for now, the kernels in this app are targeting GEN8+
      // devices -- this does *not* include variants of GEN9LP+
      // "Apollo Lake" because that device has a different
      // architectural "shape" than GEN8 GTx.  You could add some
      // rigorous rejection by device id here...
      //
      if (key_val_words == 1)
        {
          *hs_target = &hs_intel_gen8_u32;
        }
      else
        {
          *hs_target = &hs_intel_gen8_u64;
        }
      return true;

    case 0x1002:
      //
      // AMD GCN
      //
      if (key_val_words == 1)
        {
          *hs_target = &hs_amd_gcn_u32;
        }
      else
        {
          *hs_target = &hs_amd_gcn_u64;
        }
      return true;

    default:
      // ids matched but no target is compiled in for this vendor
      return false;
    }
}
//
//
//
//
// Find the first memory type that is both permitted for a resource and
// has all of the required property flags.
//
//   phy_device_mem_props : memory properties of the physical device
//   compatible_mem_types : bitmask of permitted memory type indices
//                          (bit i set => type i is acceptable), e.g.
//                          VkMemoryRequirements::memoryTypeBits
//   required_mem_props   : property flags that must all be present
//   abort                : if true, print a message and exit the
//                          process when no type qualifies
//
// Returns the index of the first qualifying memory type, or UINT32_MAX
// if none qualifies and 'abort' is false.
//
uint32_t
vk_find_mem_type_idx(VkPhysicalDeviceMemoryProperties const * phy_device_mem_props,
                     uint32_t                         const   compatible_mem_types,
                     VkMemoryPropertyFlags            const   required_mem_props,
                     bool                             const   abort)
{
  //
  // FIXME -- jump between indices in the memoryTypeBits mask
  //
  uint32_t const count = phy_device_mem_props->memoryTypeCount;

  for (uint32_t index=0; index<count; index++)
    {
      //
      // acceptable memory type for this resource?
      //
      // The mask bit is built from an unsigned 32-bit one: with a
      // plain int, "1<<31" is undefined behavior and index 31 is a
      // legal memory type index.
      //
      if ((compatible_mem_types & (UINT32_C(1) << index)) == 0)
        continue;

      // otherwise, find first match...
      VkMemoryPropertyFlags const common_props =
        phy_device_mem_props->memoryTypes[index].propertyFlags & required_mem_props;

      if (common_props == required_mem_props)
        return index;
    }

  if (abort)
    {
      fprintf(stderr,"Memory type not found: %X\n",required_mem_props);
      exit(EXIT_FAILURE);
    }

  return UINT32_MAX;
}
//
//
//
//
// Default trial counts used by main() when the command line does not
// supply them:
//
//   HS_BENCH_LOOPS  : default number of timed submissions per key count
//   HS_BENCH_WARMUP : default number of submissions run before the
//                     timed ones, whose timings are discarded
//
// Builds with NDEBUG defined use 100 of each; other builds run a
// single timed trial with no warmup.
//
#ifdef NDEBUG
#define HS_BENCH_LOOPS 100
#define HS_BENCH_WARMUP 100
#else
#define HS_BENCH_LOOPS 1
#define HS_BENCH_WARMUP 0
#endif
//
//
//
//
// HotSort Vulkan benchmark.
//
// All arguments are positional and optional:
//
//   argv[1]  : vendor id (hex)            -- default: none, no device matches
//   argv[2]  : device id (hex)            -- default: none, no device matches
//   argv[3]  : 32-bit words per key, 1|2  -- default: 1
//   argv[4]  : lowest key count           -- default: one slab of keys
//   argv[5]  : highest key count          -- default: lowest key count
//   argv[6]  : key count step (non-zero)  -- default: lowest key count
//   argv[7]  : timed trials               -- default: HS_BENCH_LOOPS
//   argv[8]  : warmup trials              -- default: HS_BENCH_WARMUP
//   argv[9]  : linearize output? (0|1)    -- default: 1
//   argv[10] : verify against CPU? (0|1)  -- default: 1
//
// Every enumerated device is listed on stdout with a '*' next to a
// match.  For each key count in the range a CSV row is then printed
// with the CPU reference timing (when verifying) and the min/max/avg
// GPU timing taken from timestamp queries.
//
// Returns EXIT_FAILURE if no matching device is found, EXIT_SUCCESS
// otherwise.  Argument errors terminate the process via exit().
//
int
main(int argc, char const * argv[])
{
  //
  // select the target by vendor and device id
  //
  uint32_t const vendor_id     = (argc <= 1) ? UINT32_MAX : strtoul(argv[1],NULL,16);
  uint32_t const device_id     = (argc <= 2) ? UINT32_MAX : strtoul(argv[2],NULL,16);
  uint32_t const key_val_words = (argc <= 3) ? 1          : strtoul(argv[3],NULL,0);

  if ((key_val_words != 1) && (key_val_words != 2))
    {
      fprintf(stderr,"Key/Val Words must be 1 or 2\n");
      exit(EXIT_FAILURE);
    }

  //
  // create a Vulkan instance
  //
  VkApplicationInfo const app_info = {
    .sType              = VK_STRUCTURE_TYPE_APPLICATION_INFO,
    .pNext              = NULL,
    .pApplicationName   = "Google HotSort Bench",
    .applicationVersion = 0,
    .pEngineName        = "Google HotSort Gen",
    .engineVersion      = 0,
    .apiVersion         = VK_API_VERSION_1_1
  };

  char const * const instance_enabled_layers[] = {
    "VK_LAYER_LUNARG_standard_validation"
  };

  char const * const instance_enabled_extensions[] = {
    VK_EXT_DEBUG_REPORT_EXTENSION_NAME
  };

  // layers and extensions are only enabled in builds without NDEBUG
  uint32_t const instance_enabled_layer_count =
#ifndef NDEBUG
    ARRAY_LENGTH_MACRO(instance_enabled_layers)
#else
    0
#endif
    ;

  uint32_t const instance_enabled_extension_count =
#ifndef NDEBUG
    ARRAY_LENGTH_MACRO(instance_enabled_extensions)
#else
    0
#endif
    ;

  VkInstanceCreateInfo const instance_info = {
    .sType                   = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
    .pNext                   = NULL,
    .flags                   = 0,
    .pApplicationInfo        = &app_info,
    .enabledLayerCount       = instance_enabled_layer_count,
    .ppEnabledLayerNames     = instance_enabled_layers,
    .enabledExtensionCount   = instance_enabled_extension_count,
    .ppEnabledExtensionNames = instance_enabled_extensions
  };

  VkInstance instance;

  vk(CreateInstance(&instance_info,NULL,&instance));

  //
  // install the debug report callback
  //
#ifndef NDEBUG
  PFN_vkCreateDebugReportCallbackEXT vkCreateDebugReportCallbackEXT =
    (PFN_vkCreateDebugReportCallbackEXT)
    vkGetInstanceProcAddr(instance,"vkCreateDebugReportCallbackEXT");

  PFN_vkDestroyDebugReportCallbackEXT vkDestroyDebugReportCallbackEXT =
    (PFN_vkDestroyDebugReportCallbackEXT)
    vkGetInstanceProcAddr(instance,"vkDestroyDebugReportCallbackEXT");

  struct VkDebugReportCallbackCreateInfoEXT const drcci = {
    .sType       = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT,
    .pNext       = NULL,
    .flags       = UINT32_MAX, // enable everything for now
    .pfnCallback = vk_debug_report_cb,
    .pUserData   = NULL
  };

  VkDebugReportCallbackEXT drc;

  vk(CreateDebugReportCallbackEXT(instance,
                                  &drcci,
                                  NULL,
                                  &drc));
#endif

  //
  // acquire all physical devices and select a match
  //
  uint32_t phy_device_count;

  vk(EnumeratePhysicalDevices(instance,
                              &phy_device_count,
                              NULL));

  VkPhysicalDevice * phy_devices = vk_host_alloc(NULL,phy_device_count * sizeof(*phy_devices));

  vk(EnumeratePhysicalDevices(instance,
                              &phy_device_count,
                              phy_devices));

  VkPhysicalDevice            phy_device = VK_NULL_HANDLE;
  VkPhysicalDeviceProperties  phy_device_props;
  struct hs_vk_target const * hs_target;

  // every device is listed; if several match, the last one is used
  for (uint32_t ii=0; ii<phy_device_count; ii++)
    {
      VkPhysicalDeviceProperties tmp;

      vkGetPhysicalDeviceProperties(phy_devices[ii],&tmp);

      bool const is_match = is_matching_device(&tmp,
                                               &hs_target,
                                               vendor_id,
                                               device_id,
                                               key_val_words);

      fprintf(stdout,"%c %4X : %4X : %s\n",
              is_match ? '*' : ' ',
              tmp.vendorID,
              tmp.deviceID,
              tmp.deviceName);

      if (is_match)
        {
          phy_device = phy_devices[ii];
          memcpy(&phy_device_props,&tmp,sizeof(tmp));
        }
    }

  // the selected handle was copied out, so the array can be released
  // before the early return below
  vk_host_free(NULL,phy_devices);

  if (phy_device == VK_NULL_HANDLE)
    {
      fprintf(stderr,"Device %4X:%4X not found.\n",
              vendor_id & 0xFFFF,
              device_id & 0xFFFF);

      return EXIT_FAILURE;
    }

  //
  // Get rest of command line
  //
  uint32_t const slab_size  = hs_target->config.slab.height << hs_target->config.slab.width_log2;

  uint32_t const count_lo   = (argc <=  4) ? slab_size       : strtoul(argv[ 4],NULL,0);
  uint32_t const count_hi   = (argc <=  5) ? count_lo        : strtoul(argv[ 5],NULL,0);
  uint32_t const count_step = (argc <=  6) ? count_lo        : strtoul(argv[ 6],NULL,0);
  uint32_t const loops      = (argc <=  7) ? HS_BENCH_LOOPS  : strtoul(argv[ 7],NULL,0);
  uint32_t const warmup     = (argc <=  8) ? HS_BENCH_WARMUP : strtoul(argv[ 8],NULL,0);
  bool     const linearize  = (argc <=  9) ? true            : strtoul(argv[ 9],NULL,0) != 0;
  bool     const verify     = (argc <= 10) ? true            : strtoul(argv[10],NULL,0) != 0;

  // a zero step would make the key count loop below spin forever
  if (count_step == 0)
    {
      fprintf(stderr,"Key count step must be non-zero\n");
      exit(EXIT_FAILURE);
    }

  //
  // get the physical device's memory props
  //
  VkPhysicalDeviceMemoryProperties phy_device_mem_props;

  vkGetPhysicalDeviceMemoryProperties(phy_device,&phy_device_mem_props);

  //
  // get queue properties
  //
  VkQueueFamilyProperties queue_fam_props[2];
  uint32_t                queue_fam_count = ARRAY_LENGTH_MACRO(queue_fam_props);

  vkGetPhysicalDeviceQueueFamilyProperties(phy_device,&queue_fam_count,queue_fam_props);

  //
  // create device
  //
  float const queue_priorities[] = { 1.0f };

  VkDeviceQueueCreateInfo const queue_info = {
    .sType            = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
    .pNext            = NULL,
    .flags            = 0,
    .queueFamilyIndex = 0,
    .queueCount       = 1,
    .pQueuePriorities = queue_priorities
  };

  //
  // clumsily enable AMD GCN shader info extension
  //
  char const * const device_enabled_extensions[] = {
#if defined( HS_VK_VERBOSE_STATISTICS_AMD ) || defined( HS_VK_VERBOSE_DISASSEMBLY_AMD )
    VK_AMD_SHADER_INFO_EXTENSION_NAME
#endif
  };

  uint32_t device_enabled_extension_count = 0;

#if defined( HS_VK_VERBOSE_STATISTICS_AMD ) || defined( HS_VK_VERBOSE_DISASSEMBLY_AMD )
  if (phy_device_props.vendorID == 0x1002)
    device_enabled_extension_count = 1;
#endif

  //
  // 64-bit keys need the shaderInt64 feature
  //
  VkPhysicalDeviceFeatures device_features = { false };

  if (key_val_words == 2)
    {
      device_features.shaderInt64 = true;
    }

  VkDeviceCreateInfo const device_info = {
    .sType                   = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
    .pNext                   = NULL,
    .flags                   = 0,
    .queueCreateInfoCount    = 1,
    .pQueueCreateInfos       = &queue_info,
    .enabledLayerCount       = 0,
    .ppEnabledLayerNames     = NULL,
    .enabledExtensionCount   = device_enabled_extension_count,
    .ppEnabledExtensionNames = device_enabled_extensions,
    .pEnabledFeatures        = &device_features
  };

  VkDevice device;

  vk(CreateDevice(phy_device,&device_info,NULL,&device));

  //
  // get a queue
  //
  VkQueue queue;

  vkGetDeviceQueue(device,0,0,&queue);

  //
  // get the pipeline cache
  //
  VkPipelineCache pipeline_cache;

  vk_pipeline_cache_create(device,NULL,".vk_cache",&pipeline_cache);

  //
  // create a descriptor set pool
  //
  VkDescriptorPoolSize const dps[] = {
    {
      .type            = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
      .descriptorCount = 2
    }
  };

  VkDescriptorPoolCreateInfo const dpci = {
    .sType         = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
    .pNext         = NULL,
    .flags         = 0, // VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
    .maxSets       = 1,
    .poolSizeCount = ARRAY_LENGTH_MACRO(dps),
    .pPoolSizes    = dps
  };

  VkDescriptorPool desc_pool;

  vk(CreateDescriptorPool(device,
                          &dpci,
                          NULL, // allocator
                          &desc_pool));

  //
  // create HotSort device instance
  //
  struct hs_vk * hs = hs_vk_create(hs_target,
                                   device,
                                   NULL,
                                   pipeline_cache);

  //
  // create a HotSort descriptor set for this thread
  //
  VkDescriptorSet hs_ds = hs_vk_ds_alloc(hs,desc_pool);

  //
  // create a command pool for this thread
  //
  VkCommandPoolCreateInfo const cmd_pool_info = {
    .sType            = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
    .pNext            = NULL,
    .flags            = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
    .queueFamilyIndex = 0,
  };

  VkCommandPool cmd_pool;

  vk(CreateCommandPool(device,
                       &cmd_pool_info,
                       NULL,
                       &cmd_pool));

  //
  // create a query pool for benchmarking
  //
  static VkQueryPoolCreateInfo const query_pool_info = {
    .sType              = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
    .pNext              = NULL,
    .flags              = 0,
    .queryType          = VK_QUERY_TYPE_TIMESTAMP,
    .queryCount         = 4,
    .pipelineStatistics = 0
  };

  VkQueryPool query_pool;

  vk(CreateQueryPool(device,
                     &query_pool_info,
                     NULL,
                     &query_pool));

  //
  // create two big buffers -- buffer_out_count is always the largest
  //
  uint32_t buffer_in_count, buffer_out_count;

  hs_vk_pad(hs,count_hi,&buffer_in_count,&buffer_out_count);

  size_t const buffer_out_size = buffer_out_count * key_val_words * sizeof(uint32_t);

  VkBufferCreateInfo bci = {
    .sType                 = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
    .pNext                 = NULL,
    .flags                 = 0,
    .size                  = buffer_out_size,
    .usage                 = 0,
    .sharingMode           = VK_SHARING_MODE_EXCLUSIVE,
    .queueFamilyIndexCount = 0,
    .pQueueFamilyIndices   = NULL
  };

  VkBuffer vin, vout, sorted, rand;

  // vin and sorted are only ever written by transfers
  bci.usage =
    VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
    VK_BUFFER_USAGE_TRANSFER_DST_BIT;

  vk(CreateBuffer(device,
                  &bci,
                  NULL,
                  &vin));

  vk(CreateBuffer(device,
                  &bci,
                  NULL,
                  &sorted));

  // vout is both a copy source and a copy destination
  bci.usage =
    VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
    VK_BUFFER_USAGE_TRANSFER_SRC_BIT   |
    VK_BUFFER_USAGE_TRANSFER_DST_BIT;

  vk(CreateBuffer(device,
                  &bci,
                  NULL,
                  &vout));

  // rand is only ever a copy source
  bci.usage =
    VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
    VK_BUFFER_USAGE_TRANSFER_SRC_BIT;

  vk(CreateBuffer(device,
                  &bci,
                  NULL,
                  &rand));

  //
  // get memory requirements for each of the buffers
  //
  VkMemoryRequirements mr_vin, mr_vout, mr_sorted, mr_rand;

  vkGetBufferMemoryRequirements(device,vin,   &mr_vin);
  vkGetBufferMemoryRequirements(device,vout,  &mr_vout);
  vkGetBufferMemoryRequirements(device,sorted,&mr_sorted);
  vkGetBufferMemoryRequirements(device,rand,  &mr_rand);

  //
  // allocate memory for the buffers
  //
  // for simplicity, all buffers are the same size
  //
  // NOTE(review): vout is allocated from vin's requirements and rand
  // from sorted's even though their usage flags differ -- confirm the
  // reported memory type bits agree on the targeted drivers.
  //
  VkMemoryAllocateInfo const mai_vin_vout = {
    .sType           = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
    .pNext           = NULL,
    .allocationSize  = mr_vin.size,
    .memoryTypeIndex = vk_find_mem_type_idx(&phy_device_mem_props,
                                            mr_vin.memoryTypeBits,
                                            VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
                                            true)
  };

  VkMemoryAllocateInfo const mai_sorted_rand = {
    .sType           = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
    .pNext           = NULL,
    .allocationSize  = mr_sorted.size,
    .memoryTypeIndex = vk_find_mem_type_idx(&phy_device_mem_props,
                                            mr_sorted.memoryTypeBits,
                                            VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
                                            VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
                                            true)
  };

  VkDeviceMemory mem_vin, mem_vout, mem_sorted, mem_rand;

  vk(AllocateMemory(device,
                    &mai_vin_vout,
                    NULL,
                    &mem_vin));

  vk(AllocateMemory(device,
                    &mai_vin_vout,
                    NULL,
                    &mem_vout));

  vk(AllocateMemory(device,
                    &mai_sorted_rand,
                    NULL,
                    &mem_sorted));

  vk(AllocateMemory(device,
                    &mai_sorted_rand,
                    NULL,
                    &mem_rand));

  //
  // bind backing memory to the virtual allocations
  //
  vk(BindBufferMemory(device,vin,   mem_vin,   0));
  vk(BindBufferMemory(device,vout,  mem_vout,  0));
  vk(BindBufferMemory(device,sorted,mem_sorted,0));
  vk(BindBufferMemory(device,rand,  mem_rand,  0));

  //
  // map and fill the rand buffer with random values
  //
  void * rand_h   = vk_host_alloc(NULL,buffer_out_size);
  void * sorted_h = vk_host_alloc(NULL,buffer_out_size);

  hs_fill_rand(rand_h,buffer_out_count,key_val_words);

  void * rand_map;

  vk(MapMemory(device,mem_rand,0,VK_WHOLE_SIZE,0,&rand_map));

  memcpy(rand_map,rand_h,buffer_out_size);

  vkUnmapMemory(device,mem_rand);

  //
  // create a single command buffer for this thread
  //
  VkCommandBufferAllocateInfo const cmd_buffer_info = {
    .sType              = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
    .pNext              = NULL,
    .commandPool        = cmd_pool,
    .level              = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
    .commandBufferCount = 1
  };

  VkCommandBuffer cb;

  vk(AllocateCommandBuffers(device,
                            &cmd_buffer_info,
                            &cb));

  //
  // begin/submit infos shared by every submission below
  //
  static VkCommandBufferBeginInfo const cb_begin_info = {
    .sType            = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
    .pNext            = NULL,
    .flags            = 0, // VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
    .pInheritanceInfo = NULL
  };

  struct VkSubmitInfo const submit_info = {
    .sType                = VK_STRUCTURE_TYPE_SUBMIT_INFO,
    .pNext                = NULL,
    .waitSemaphoreCount   = 0,
    .pWaitSemaphores      = NULL,
    .pWaitDstStageMask    = NULL,
    .commandBufferCount   = 1,
    .pCommandBuffers      = &cb,
    .signalSemaphoreCount = 0,
    .pSignalSemaphores    = NULL
  };

  //
  // labels
  //
  fprintf(stdout,
          "Device, "
          "Driver, "
          "Type, "
          "Slab/Linear, "
          "Verified?, "
          "Keys, "
          "Keys Padded In, "
          "Keys Padded Out, "
          "CPU, "
          "Algorithm, "
          "CPU Msecs, "
          "CPU Mkeys/s, "
          "GPU, "
          "Trials, "
          "Avg. Msecs, "
          "Min Msecs, "
          "Max Msecs, "
          "Avg. Mkeys/s, "
          "Max. Mkeys/s\n");

  //
  // test a range
  //
  for (uint32_t count=count_lo; count<=count_hi; count+=count_step)
    {
      //
      // size the vin and vout arrays
      //
      uint32_t count_padded_in, count_padded_out;

      hs_vk_pad(hs,count,&count_padded_in,&count_padded_out);

      //
      // initialize vin with 'count' random keys
      //
      vk(BeginCommandBuffer(cb,&cb_begin_info));

      VkBufferCopy const copy_rand = {
        .srcOffset = 0,
        .dstOffset = 0,
        .size      = count * key_val_words * sizeof(uint32_t)
      };

      vkCmdCopyBuffer(cb,
                      rand,
                      vin,
                      1,
                      &copy_rand);

      vk(EndCommandBuffer(cb));

      vk(QueueSubmit(queue,
                     1,
                     &submit_info,
                     VK_NULL_HANDLE)); // FIXME -- put a fence here

      // wait for queue to drain
      vk(QueueWaitIdle(queue));
      vk(ResetCommandBuffer(cb,0));

      //
      // build the sorting command buffer
      //
      vk(BeginCommandBuffer(cb,&cb_begin_info));

      //
      // starting timestamp
      //
      vkCmdWriteTimestamp(cb,VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,query_pool,0);

      //
      // bind the vin/vout buffers early
      //
      hs_vk_ds_bind(hs,hs_ds,cb,vin,vout);

      //
      // append sorting commands
      //
      hs_vk_sort(hs,
                 cb,
                 vin,0,0,
                 vout,0,0,
                 count,
                 count_padded_in,
                 count_padded_out,
                 linearize);

      //
      // end timestamp
      //
      vkCmdWriteTimestamp(cb,VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,query_pool,1);

      //
      // end the command buffer
      //
      vk(EndCommandBuffer(cb));

      //
      // measure the min/max/avg execution time
      //
      uint64_t elapsed_ns_min = UINT64_MAX;
      uint64_t elapsed_ns_max = 0;
      uint64_t elapsed_ns_sum = 0;

      for (uint32_t ii=0; ii<warmup+loops; ii++)
        {
          // discard everything measured during the warmup trials
          if (ii == warmup)
            {
              elapsed_ns_min = UINT64_MAX;
              elapsed_ns_max = 0;
              elapsed_ns_sum = 0;
            }

          vk(QueueSubmit(queue,
                         1,
                         &submit_info,
                         VK_NULL_HANDLE)); // FIXME -- put a fence here

          // wait for queue to drain
          vk(QueueWaitIdle(queue));

          // get results
          uint64_t timestamps[2];

          vk(GetQueryPoolResults(device,query_pool,
                                 0,ARRAY_LENGTH_MACRO(timestamps),
                                 sizeof(timestamps),
                                 timestamps,
                                 sizeof(timestamps[0]),
                                 VK_QUERY_RESULT_64_BIT |
                                 VK_QUERY_RESULT_WAIT_BIT));

          // raw timestamp ticks -- scaled by timestampPeriod when reported
          uint64_t const t = timestamps[1] - timestamps[0];

          elapsed_ns_min  = MIN_MACRO(elapsed_ns_min,t);
          elapsed_ns_max  = MAX_MACRO(elapsed_ns_max,t);
          elapsed_ns_sum += t;
        }

      vk(ResetCommandBuffer(cb,0));

      //
      // copy the results back and, optionally, verify them
      //
      char const * cpu_algo = NULL;
      double       cpu_ns   = 0.0;
      bool         verified = false;

      if (verify)
        {
          size_t const size_padded_in = count_padded_in * key_val_words * sizeof(uint32_t);

          vk(BeginCommandBuffer(cb,&cb_begin_info));

          VkBufferCopy const copy_vout = {
            .srcOffset = 0,
            .dstOffset = 0,
            .size      = size_padded_in
          };

          vkCmdCopyBuffer(cb,
                          vout,
                          sorted,
                          1,
                          &copy_vout);

          vk(EndCommandBuffer(cb));

          vk(QueueSubmit(queue,
                         1,
                         &submit_info,
                         VK_NULL_HANDLE)); // FIXME -- put a fence here

          // wait for queue to drain
          vk(QueueWaitIdle(queue));
          vk(ResetCommandBuffer(cb,0));

          size_t const size_sorted_h = count * key_val_words * sizeof(uint32_t);

          // copy and sort random data -- the padding is filled with
          // all-ones bytes (maximum key values)
          memcpy(sorted_h,rand_h,size_sorted_h);
          memset((uint8_t*)sorted_h + size_sorted_h,-1,size_padded_in-size_sorted_h);

          cpu_algo = hs_cpu_sort(sorted_h,key_val_words,count_padded_in,&cpu_ns);

          void * sorted_map;

          vk(MapMemory(device,mem_sorted,0,VK_WHOLE_SIZE,0,&sorted_map));

          // slab-ordered output is transposed on the host before the
          // comparison with the CPU result
          if (!linearize) {
            hs_transpose_slabs(key_val_words,
                               1u<<hs_target->config.slab.width_log2,
                               hs_target->config.slab.height,
                               sorted_map,
                               count_padded_in);
          }

          // verify
          verified = memcmp(sorted_h,sorted_map,size_padded_in) == 0;

#ifndef NDEBUG
          if (!verified)
            {
              if (key_val_words == 1)
                {
                  hs_debug_u32(1u<<hs_target->config.slab.width_log2,
                               hs_target->config.slab.height,
                               sorted_h,
                               count);

                  hs_debug_u32(1u<<hs_target->config.slab.width_log2,
                               hs_target->config.slab.height,
                               sorted_map,
                               count);
                }
              else // ulong
                {
                  hs_debug_u64(1u<<hs_target->config.slab.width_log2,
                               hs_target->config.slab.height,
                               sorted_h,
                               count);

                  hs_debug_u64(1u<<hs_target->config.slab.width_log2,
                               hs_target->config.slab.height,
                               sorted_map,
                               count);
                }
            }
#endif

          vkUnmapMemory(device,mem_sorted);
        }

      //
      // REPORT
      //
      float const timestamp_period = phy_device_props.limits.timestampPeriod;

      fprintf(stdout,"%s, %u.%u.%u.%u, %s, %s, %s, %8u, %8u, %8u, CPU, %s, %9.2f, %6.2f, GPU, %9u, %7.3f, %7.3f, %7.3f, %6.2f, %6.2f\n",
              phy_device_props.deviceName,
              (phy_device_props.driverVersion>>24)&0xFF,
              (phy_device_props.driverVersion>>16)&0xFF,
              (phy_device_props.driverVersion>> 8)&0xFF,
              (phy_device_props.driverVersion    )&0xFF,
              (key_val_words == 1) ? "uint" : "ulong",
              linearize ? "linear" : "slab",
              verify ? (verified ? "  OK  " : "*FAIL*") : "UNVERIFIED",
              count,
              count_padded_in,
              count_padded_out,
              // CPU
              verify ? cpu_algo : "UNVERIFIED",
              verify ? (cpu_ns / 1000000.0)      : 0.0,                     // milliseconds
              verify ? (1000.0 * count / cpu_ns) : 0.0,                     // mkeys / sec
              // GPU
              loops,
              timestamp_period * elapsed_ns_sum / 1e6 / loops,              // avg msecs
              timestamp_period * elapsed_ns_min / 1e6,                      // min msecs
              timestamp_period * elapsed_ns_max / 1e6,                      // max msecs
              1000.0 * count * loops / (timestamp_period * elapsed_ns_sum), // mkeys / sec - avg
              1000.0 * count         / (timestamp_period * elapsed_ns_min)); // mkeys / sec - max
    }

  // reset the descriptor pool
  vk(ResetDescriptorPool(device,desc_pool,0));

  //
  // cleanup
  //

  // release shared HotSort state
  hs_vk_release(hs);

  // destroy the vin/vout buffers (before device memory)
  vkDestroyBuffer(device,vin,   NULL);
  vkDestroyBuffer(device,vout,  NULL);
  vkDestroyBuffer(device,sorted,NULL);
  vkDestroyBuffer(device,rand,  NULL);

  // free device memory
  vkFreeMemory(device,mem_vin,   NULL);
  vkFreeMemory(device,mem_vout,  NULL);
  vkFreeMemory(device,mem_sorted,NULL);
  vkFreeMemory(device,mem_rand,  NULL);

  // free host memory
  vk_host_free(NULL,rand_h);
  vk_host_free(NULL,sorted_h);

  // destroy the descriptor pool
  vkDestroyDescriptorPool(device,desc_pool,NULL);

  // destroy remaining...
  vkDestroyQueryPool(device,query_pool,NULL);
  vkFreeCommandBuffers(device,cmd_pool,1,&cb);
  vkDestroyCommandPool(device,cmd_pool,NULL);

  vk_pipeline_cache_destroy(device,NULL,".vk_cache",pipeline_cache);

  vkDestroyDevice(device,NULL);

#ifndef NDEBUG
  vkDestroyDebugReportCallbackEXT(instance,drc,NULL);
#endif

  vkDestroyInstance(instance,NULL);

  return EXIT_SUCCESS;
}
//
//
//