/* * Copyright 2016 Google Inc. * * Use of this source code is governed by a BSD-style license that can * be found in the LICENSE file. * */ #include <stdlib.h> #include <string.h> #include <inttypes.h> #include "common/util.h" #include "common/macros.h" #include "common/vk/assert_vk.h" #include "hs_vk.h" #include "hs_vk_target.h" // // We want concurrent kernel execution to occur in a few places. // // The summary is: // // 1) If necessary, some max valued keys are written to the end of // the vin/vout buffers. // // 2) Blocks of slabs of keys are sorted. // // 3) If necesary, the blocks of slabs are merged until complete. // // 4) If requested, the slabs will be converted from slab ordering // to linear ordering. // // Below is the general "happens-before" relationship between HotSort // compute kernels. // // Note the diagram assumes vin and vout are different buffers. If // they're not, then the first merge doesn't include the pad_vout // event in the wait list. // // +----------+ +---------+ // | pad_vout | | pad_vin | // +----+-----+ +----+----+ // | | // | WAITFOR(pad_vin) // | | // | +-----v-----+ // | | | // | +----v----+ +----v----+ // | | bs_full | | bs_frac | // | +----+----+ +----+----+ // | | | // | +-----v-----+ // | | // | +------NO------JUST ONE BLOCK? // | / | // |/ YES // + | // | v // | END_WITH_EVENTS(bs_full,bs_frac) // | // | // WAITFOR(pad_vout,bs_full,bs_frac) >>> first iteration of loop <<< // | // | // +-----------<------------+ // | | // +-----v-----+ | // | | | // +----v----+ +----v----+ | // | fm_full | | fm_frac | | // +----+----+ +----+----+ | // | | ^ // +-----v-----+ | // | | // WAITFOR(fm_full,fm_frac) | // | | // v | // +--v--+ WAITFOR(bc) // | hm | | // +-----+ | // | | // WAITFOR(hm) | // | ^ // +--v--+ | // | bc | | // +-----+ | // | | // v | // MERGING COMPLETE?-------NO------+ // | // YES // | // v // END_WITH_EVENTS(bc) // struct hs_vk { VkAllocationCallbacks const * allocator; VkDevice device; struct { struct { VkDescriptorSetLayout vout_vin; } layout; } desc_set; struct { struct { VkPipelineLayout vout_vin; } layout; } pipeline; struct hs_vk_target_config config; uint32_t key_val_size; uint32_t slab_keys; uint32_t bs_slabs_log2_ru; uint32_t bc_slabs_log2_max; struct { uint32_t count; VkPipeline * bs; VkPipeline * bc; VkPipeline * fm[3]; VkPipeline * hm[3]; VkPipeline * transpose; VkPipeline all[]; } pipelines; }; // // // struct hs_state { VkCommandBuffer cb; // If sorting in-place, then vout == vin VkBuffer vout; VkBuffer vin; // bx_ru is number of rounded up warps in vin uint32_t bx_ru; }; // // // static void hs_barrier_compute_w_to_compute_r(struct hs_state * const state) { static VkMemoryBarrier const shader_w_to_r = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, .pNext = NULL, .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, .dstAccessMask = VK_ACCESS_SHADER_READ_BIT }; vkCmdPipelineBarrier(state->cb, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &shader_w_to_r, 0, NULL, 0, NULL); } // // // static void hs_barrier_to_compute_r(struct hs_state * const state, VkPipelineStageFlags const src_stage, VkAccessFlagBits const src_access) { if (src_stage == 0) return; VkMemoryBarrier const compute_r = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, .pNext = NULL, .srcAccessMask = src_access, .dstAccessMask = VK_ACCESS_SHADER_READ_BIT }; vkCmdPipelineBarrier(state->cb, src_stage, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &compute_r, 0, NULL, 0, NULL); } // // // static void hs_barrier_to_transfer_fill(struct hs_state * const state, VkPipelineStageFlags const src_stage, VkAccessFlagBits const src_access) { if (src_stage == 0) return; VkMemoryBarrier const fill_w = { .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, .pNext = NULL, .srcAccessMask = src_access, .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT }; vkCmdPipelineBarrier(state->cb, src_stage, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 1, &fill_w, 0, NULL, 0, NULL); } // // // static void hs_transpose(struct hs_vk const * const hs, struct hs_state * const state) { hs_barrier_compute_w_to_compute_r(state); vkCmdBindPipeline(state->cb, VK_PIPELINE_BIND_POINT_COMPUTE, hs->pipelines.transpose[0]); vkCmdDispatch(state->cb,state->bx_ru,1,1); } // // // static void hs_bc(struct hs_vk const * const hs, struct hs_state * const state, uint32_t const down_slabs, uint32_t const clean_slabs_log2) { hs_barrier_compute_w_to_compute_r(state); // block clean the minimal number of down_slabs_log2 spans uint32_t const frac_ru = (1u << clean_slabs_log2) - 1; uint32_t const full_bc = (down_slabs + frac_ru) >> clean_slabs_log2; vkCmdBindPipeline(state->cb, VK_PIPELINE_BIND_POINT_COMPUTE, hs->pipelines.bc[clean_slabs_log2]); vkCmdDispatch(state->cb,full_bc,1,1); } // // // static uint32_t hs_hm(struct hs_vk const * const hs, struct hs_state * const state, uint32_t const down_slabs, uint32_t const clean_slabs_log2) { hs_barrier_compute_w_to_compute_r(state); // how many scaled half-merge spans are there? uint32_t const frac_ru = (1 << clean_slabs_log2) - 1; uint32_t const spans = (down_slabs + frac_ru) >> clean_slabs_log2; // for now, just clamp to the max uint32_t const log2_rem = clean_slabs_log2 - hs->bc_slabs_log2_max; uint32_t const scale_log2 = MIN_MACRO(hs->config.merge.hm.scale_max,log2_rem); uint32_t const log2_out = log2_rem - scale_log2; // size the grid uint32_t const slab_span = hs->config.slab.height << log2_out; vkCmdBindPipeline(state->cb, VK_PIPELINE_BIND_POINT_COMPUTE, hs->pipelines.hm[scale_log2][0]); vkCmdDispatch(state->cb,slab_span,spans,1); return log2_out; } // // FIXME -- some of this logic can be skipped if BS is a power-of-two // static uint32_t hs_fm(struct hs_vk const * const hs, struct hs_state * const state, uint32_t * const down_slabs, uint32_t const up_scale_log2) { // // FIXME OPTIMIZATION: in previous HotSort launchers it's sometimes // a performance win to bias toward launching the smaller flip merge // kernel in order to get more warps in flight (increased // occupancy). This is useful when merging small numbers of slabs. // // Note that HS_FM_SCALE_MIN will always be 0 or 1. // // So, for now, just clamp to the max until there is a reason to // restore the fancier and probably low-impact approach. // uint32_t const scale_log2 = MIN_MACRO(hs->config.merge.fm.scale_max,up_scale_log2); uint32_t const clean_log2 = up_scale_log2 - scale_log2; // number of slabs in a full-sized scaled flip-merge span uint32_t const full_span_slabs = hs->config.block.slabs << up_scale_log2; // how many full-sized scaled flip-merge spans are there? uint32_t full_fm = state->bx_ru / full_span_slabs; uint32_t frac_fm = 0; // initialize down_slabs *down_slabs = full_fm * full_span_slabs; // how many half-size scaled + fractional scaled spans are there? uint32_t const span_rem = state->bx_ru - *down_slabs; uint32_t const half_span_slabs = full_span_slabs >> 1; // if we have over a half-span then fractionally merge it if (span_rem > half_span_slabs) { // the remaining slabs will be cleaned *down_slabs += span_rem; uint32_t const frac_rem = span_rem - half_span_slabs; uint32_t const frac_rem_pow2 = pow2_ru_u32(frac_rem); if (frac_rem_pow2 >= half_span_slabs) { // bump it up to a full span full_fm += 1; } else { // otherwise, add fractional frac_fm = MAX_MACRO(1,frac_rem_pow2 >> clean_log2); } } // // Size the grid // // The simplifying choices below limit the maximum keys that can be // sorted with this grid scheme to around ~2B. // // .x : slab height << clean_log2 -- this is the slab span // .y : [1...65535] -- this is the slab index // .z : ( this could also be used to further expand .y ) // // Note that OpenCL declares a grid in terms of global threads and // not grids and blocks // // // size the grid // uint32_t const slab_span = hs->config.slab.height << clean_log2; if (full_fm > 0) { uint32_t const full_idx = hs->bs_slabs_log2_ru - 1 + scale_log2; vkCmdBindPipeline(state->cb, VK_PIPELINE_BIND_POINT_COMPUTE, hs->pipelines.fm[scale_log2][full_idx]); vkCmdDispatch(state->cb,slab_span,full_fm,1); } if (frac_fm > 0) { vkCmdBindPipeline(state->cb, VK_PIPELINE_BIND_POINT_COMPUTE, hs->pipelines.fm[scale_log2][msb_idx_u32(frac_fm)]); vkCmdDispatchBase(state->cb, 0,full_fm,0, slab_span,1,1); } return clean_log2; } // // // static void hs_bs(struct hs_vk const * const hs, struct hs_state * const state, uint32_t const count_padded_in) { uint32_t const slabs_in = count_padded_in / hs->slab_keys; uint32_t const full_bs = slabs_in / hs->config.block.slabs; uint32_t const frac_bs = slabs_in - full_bs * hs->config.block.slabs; if (full_bs > 0) { vkCmdBindPipeline(state->cb, VK_PIPELINE_BIND_POINT_COMPUTE, hs->pipelines.bs[hs->bs_slabs_log2_ru]); vkCmdDispatch(state->cb,full_bs,1,1); } if (frac_bs > 0) { uint32_t const frac_idx = msb_idx_u32(frac_bs); uint32_t const full_to_frac_log2 = hs->bs_slabs_log2_ru - frac_idx; vkCmdBindPipeline(state->cb, VK_PIPELINE_BIND_POINT_COMPUTE, hs->pipelines.bs[msb_idx_u32(frac_bs)]); vkCmdDispatchBase(state->cb, full_bs<<full_to_frac_log2,0,0, 1,1,1); } } // // // static void hs_keyset_pre_fm(struct hs_vk const * const hs, struct hs_state * const state, uint32_t const count_lo, uint32_t const count_hi) { uint32_t const vout_span = count_hi - count_lo; vkCmdFillBuffer(state->cb, state->vout, count_lo * hs->key_val_size, vout_span * hs->key_val_size, UINT32_MAX); } // // // static void hs_keyset_pre_bs(struct hs_vk const * const hs, struct hs_state * const state, uint32_t const count, uint32_t const count_hi) { uint32_t const vin_span = count_hi - count; vkCmdFillBuffer(state->cb, state->vin, count * hs->key_val_size, vin_span * hs->key_val_size, UINT32_MAX); } // // // void hs_vk_ds_bind(struct hs_vk const * const hs, VkDescriptorSet hs_ds, VkCommandBuffer cb, VkBuffer vin, VkBuffer vout) { // // initialize the HotSort descriptor set // VkDescriptorBufferInfo const dbi[] = { { .buffer = vout == VK_NULL_HANDLE ? vin : vout, .offset = 0, .range = VK_WHOLE_SIZE }, { .buffer = vin, .offset = 0, .range = VK_WHOLE_SIZE } }; VkWriteDescriptorSet const wds[] = { { .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, .pNext = NULL, .dstSet = hs_ds, .dstBinding = 0, .dstArrayElement = 0, .descriptorCount = 2, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, .pImageInfo = NULL, .pBufferInfo = dbi, .pTexelBufferView = NULL } }; vkUpdateDescriptorSets(hs->device, ARRAY_LENGTH_MACRO(wds), wds, 0, NULL); // // All HotSort kernels can use the same descriptor set: // // { // HS_KEY_TYPE vout[]; // HS_KEY_TYPE vin[]; // } // // Note that only the bs() kernels read from vin(). // vkCmdBindDescriptorSets(cb, VK_PIPELINE_BIND_POINT_COMPUTE, hs->pipeline.layout.vout_vin, 0, 1, &hs_ds, 0, NULL); } // // // void hs_vk_sort(struct hs_vk const * const hs, VkCommandBuffer cb, VkBuffer vin, VkPipelineStageFlags const vin_src_stage, VkAccessFlagBits const vin_src_access, VkBuffer vout, VkPipelineStageFlags const vout_src_stage, VkAccessFlagBits const vout_src_access, uint32_t const count, uint32_t const count_padded_in, uint32_t const count_padded_out, bool const linearize) { // is this sort in place? bool const is_in_place = (vout == VK_NULL_HANDLE); // // create some common state // struct hs_state state = { .cb = cb, .vin = vin, .vout = is_in_place ? vin : vout, .bx_ru = (count + hs->slab_keys - 1) / hs->slab_keys }; // initialize vin uint32_t const count_hi = is_in_place ? count_padded_out : count_padded_in; bool const is_pre_sort_reqd = count_hi > count; bool const is_pre_merge_reqd = !is_in_place && (count_padded_out > count_padded_in); // // pre-sort keyset needs to happen before bs() // pre-merge keyset needs to happen before fm() // VkPipelineStageFlags bs_src_stage = 0; VkAccessFlagBits bs_src_access = 0; // initialize any trailing keys in vin before sorting if (is_pre_sort_reqd) { hs_barrier_to_transfer_fill(&state,vin_src_stage,vin_src_access); hs_keyset_pre_bs(hs,&state,count,count_hi); bs_src_stage |= VK_PIPELINE_STAGE_TRANSFER_BIT; bs_src_access |= VK_ACCESS_TRANSFER_WRITE_BIT; } else { bs_src_stage = vin_src_stage; bs_src_access = vin_src_access; } hs_barrier_to_compute_r(&state,bs_src_stage,bs_src_access); // sort blocks of slabs... after hs_keyset_pre_sort() hs_bs(hs,&state,count_padded_in); VkPipelineStageFlags fm_src_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; VkAccessFlagBits fm_src_access = VK_ACCESS_SHADER_READ_BIT; // initialize any trailing keys in vout before merging if (is_pre_merge_reqd) { hs_barrier_to_transfer_fill(&state,vout_src_stage,vout_src_access); hs_keyset_pre_fm(hs,&state,count_padded_in,count_padded_out); fm_src_stage |= VK_PIPELINE_STAGE_TRANSFER_BIT; fm_src_access |= VK_ACCESS_TRANSFER_WRITE_BIT; } else { fm_src_stage |= vout_src_stage; fm_src_access |= vout_src_access; } // // if this was a single bs block then there is no merging // if (state.bx_ru > hs->config.block.slabs) { hs_barrier_to_compute_r(&state,fm_src_stage,fm_src_access); // // otherwise, merge sorted spans of slabs until done // int32_t up_scale_log2 = 1; while (true) { uint32_t down_slabs; // flip merge slabs -- return span of slabs that must be cleaned uint32_t clean_slabs_log2 = hs_fm(hs,&state, &down_slabs, up_scale_log2); // if span is gt largest slab block cleaner then half merge while (clean_slabs_log2 > hs->bc_slabs_log2_max) { clean_slabs_log2 = hs_hm(hs,&state, down_slabs, clean_slabs_log2); } // launch clean slab grid -- is it the final launch? hs_bc(hs,&state,down_slabs,clean_slabs_log2); // was this the final block clean? if (((uint32_t)hs->config.block.slabs << up_scale_log2) >= state.bx_ru) break; // otherwise, merge twice as many slabs up_scale_log2 += 1; // drop a barrier hs_barrier_compute_w_to_compute_r(&state); } } // slabs or linear? if (linearize) hs_transpose(hs,&state); } // // // #ifdef HS_VK_VERBOSE_STATISTICS_AMD #include <stdio.h> static void hs_vk_verbose_statistics_amd(VkDevice device, struct hs_vk const * const hs) { PFN_vkGetShaderInfoAMD vkGetShaderInfoAMD = (PFN_vkGetShaderInfoAMD) vkGetDeviceProcAddr(device,"vkGetShaderInfoAMD"); if (vkGetShaderInfoAMD == NULL) return; fprintf(stdout, " PHY PHY AVAIL AVAIL\n" "VGPRs SGPRs LDS_MAX LDS/WG SPILL VGPRs SGPRs VGPRs SGPRs WORKGROUP_SIZE\n"); for (uint32_t ii=0; ii<hs->pipelines.count; ii++) { VkShaderStatisticsInfoAMD ssi_amd; size_t ssi_amd_size = sizeof(ssi_amd); if (vkGetShaderInfoAMD(hs->device, hs->pipelines.all[ii], VK_SHADER_STAGE_COMPUTE_BIT, VK_SHADER_INFO_TYPE_STATISTICS_AMD, &ssi_amd_size, &ssi_amd) == VK_SUCCESS) { fprintf(stdout, "%5" PRIu32 " " "%5" PRIu32 " " "%5" PRIu32 " " "%6zu " "%6zu " "%5" PRIu32 " " "%5" PRIu32 " " "%5" PRIu32 " " "%5" PRIu32 " " "( %6" PRIu32 ", " "%6" PRIu32 ", " "%6" PRIu32 " )\n", ssi_amd.resourceUsage.numUsedVgprs, ssi_amd.resourceUsage.numUsedSgprs, ssi_amd.resourceUsage.ldsSizePerLocalWorkGroup, ssi_amd.resourceUsage.ldsUsageSizeInBytes, // size_t ssi_amd.resourceUsage.scratchMemUsageInBytes, // size_t ssi_amd.numPhysicalVgprs, ssi_amd.numPhysicalSgprs, ssi_amd.numAvailableVgprs, ssi_amd.numAvailableSgprs, ssi_amd.computeWorkGroupSize[0], ssi_amd.computeWorkGroupSize[1], ssi_amd.computeWorkGroupSize[2]); } } } #endif // // // #ifdef HS_VK_VERBOSE_DISASSEMBLY_AMD #include <stdio.h> static void hs_vk_verbose_disassembly_amd(VkDevice device, struct hs_vk const * const hs) { PFN_vkGetShaderInfoAMD vkGetShaderInfoAMD = (PFN_vkGetShaderInfoAMD) vkGetDeviceProcAddr(device,"vkGetShaderInfoAMD"); if (vkGetShaderInfoAMD == NULL) return; for (uint32_t ii=0; ii<hs->pipelines.count; ii++) { size_t disassembly_amd_size; if (vkGetShaderInfoAMD(hs->device, hs->pipelines.all[ii], VK_SHADER_STAGE_COMPUTE_BIT, VK_SHADER_INFO_TYPE_DISASSEMBLY_AMD, &disassembly_amd_size, NULL) == VK_SUCCESS) { void * disassembly_amd = malloc(disassembly_amd_size); if (vkGetShaderInfoAMD(hs->device, hs->pipelines.all[ii], VK_SHADER_STAGE_COMPUTE_BIT, VK_SHADER_INFO_TYPE_DISASSEMBLY_AMD, &disassembly_amd_size, disassembly_amd) == VK_SUCCESS) { fprintf(stdout,"%s",(char*)disassembly_amd); } free(disassembly_amd); } } } #endif // // // struct hs_vk * hs_vk_create(struct hs_vk_target const * const target, VkDevice device, VkAllocationCallbacks const * allocator, VkPipelineCache pipeline_cache) { // // we reference these values a lot // uint32_t const bs_slabs_log2_ru = msb_idx_u32(pow2_ru_u32(target->config.block.slabs)); uint32_t const bc_slabs_log2_max = msb_idx_u32(pow2_rd_u32(target->config.block.slabs)); // // how many kernels will be created? // uint32_t const count_bs = bs_slabs_log2_ru + 1; uint32_t const count_bc = bc_slabs_log2_max + 1; uint32_t count_fm[3] = { 0 }; uint32_t count_hm[3] = { 0 }; // guaranteed to be in range [0,2] for (uint32_t scale = target->config.merge.fm.scale_min; scale <= target->config.merge.fm.scale_max; scale++) { uint32_t fm_left = (target->config.block.slabs / 2) << scale; count_fm[scale] = msb_idx_u32(pow2_ru_u32(fm_left)) + 1; } // guaranteed to be in range [0,2] for (uint32_t scale = target->config.merge.hm.scale_min; scale <= target->config.merge.hm.scale_max; scale++) { count_hm[scale] = 1; } uint32_t const count_bc_fm_hm_transpose = + count_bc + count_fm[0] + count_fm[1] + count_fm[2] + count_hm[0] + count_hm[1] + count_hm[2] + 1; // transpose uint32_t const count_all = count_bs + count_bc_fm_hm_transpose; // // allocate hs_vk // struct hs_vk * hs; if (allocator == NULL) { hs = malloc(sizeof(*hs) + sizeof(VkPipeline*) * count_all); } else { hs = allocator->pfnAllocation(NULL, sizeof(*hs) + sizeof(VkPipeline*) * count_all, 0, VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); } // save device & allocator hs->device = device; hs->allocator = allocator; // // create one descriptor set layout // static VkDescriptorSetLayoutBinding const dslb_vout_vin[] = { { .binding = 0, // vout .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, .descriptorCount = 1, .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, .pImmutableSamplers = NULL }, { .binding = 1, // vin .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, .descriptorCount = 1, .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, .pImmutableSamplers = NULL } }; static VkDescriptorSetLayoutCreateInfo const dscli = { .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, .pNext = NULL, .flags = 0, .bindingCount = 2, // 0:vout[], 1:vin[] .pBindings = dslb_vout_vin }; vk(CreateDescriptorSetLayout(device, &dscli, allocator, &hs->desc_set.layout.vout_vin)); // // create one pipeline layout // VkPipelineLayoutCreateInfo plci = { .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, .pNext = NULL, .flags = 0, .setLayoutCount = 1, .pSetLayouts = &hs->desc_set.layout.vout_vin, .pushConstantRangeCount = 0, .pPushConstantRanges = NULL }; vk(CreatePipelineLayout(device, &plci, allocator, &hs->pipeline.layout.vout_vin)); // // copy the config from the target -- we need these values later // memcpy(&hs->config,&target->config,sizeof(hs->config)); // save some frequently used calculated values hs->key_val_size = (target->config.words.key + target->config.words.val) * 4; hs->slab_keys = target->config.slab.height << target->config.slab.width_log2; hs->bs_slabs_log2_ru = bs_slabs_log2_ru; hs->bc_slabs_log2_max = bc_slabs_log2_max; // save kernel count hs->pipelines.count = count_all; // // create all the compute pipelines by reusing this info // VkComputePipelineCreateInfo cpci = { .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, .pNext = NULL, .flags = VK_PIPELINE_CREATE_DISPATCH_BASE, // | VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT, .stage = { .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, .pNext = NULL, .flags = 0, .stage = VK_SHADER_STAGE_COMPUTE_BIT, .module = VK_NULL_HANDLE, .pName = "main", .pSpecializationInfo = NULL }, .layout = hs->pipeline.layout.vout_vin, .basePipelineHandle = VK_NULL_HANDLE, .basePipelineIndex = 0 }; // // Create a shader module, use it to create a pipeline... and // dispose of the shader module. // // The BS compute shaders have the same layout // The non-BS compute shaders have the same layout // VkShaderModuleCreateInfo smci = { .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, .pNext = NULL, .flags = 0, .codeSize = 0, .pCode = (uint32_t const *)target->modules // FIXME -- unfortunate typecast }; // // bs kernels have layout: (vout,vin) // remaining have layout: (vout) // for (uint32_t ii=0; ii<count_all; ii++) { // convert bytes to words uint32_t const * const module = smci.pCode + smci.codeSize / sizeof(*module); smci.codeSize = NTOHL_MACRO(module[0]); smci.pCode = module + 1; vk(CreateShaderModule(device, &smci, allocator, &cpci.stage.module)); vk(CreateComputePipelines(device, pipeline_cache, 1, &cpci, allocator, hs->pipelines.all+ii)); vkDestroyShaderModule(device, cpci.stage.module, allocator); } // // initialize pointers to pipeline handles // VkPipeline * pipeline_next = hs->pipelines.all; // BS hs->pipelines.bs = pipeline_next; pipeline_next += count_bs; // BC hs->pipelines.bc = pipeline_next; pipeline_next += count_bc; // FM[0] hs->pipelines.fm[0] = count_fm[0] ? pipeline_next : NULL; pipeline_next += count_fm[0]; // FM[1] hs->pipelines.fm[1] = count_fm[1] ? pipeline_next : NULL; pipeline_next += count_fm[1]; // FM[2] hs->pipelines.fm[2] = count_fm[2] ? pipeline_next : NULL; pipeline_next += count_fm[2]; // HM[0] hs->pipelines.hm[0] = count_hm[0] ? pipeline_next : NULL; pipeline_next += count_hm[0]; // HM[1] hs->pipelines.hm[1] = count_hm[1] ? pipeline_next : NULL; pipeline_next += count_hm[1]; // HM[2] hs->pipelines.hm[2] = count_hm[2] ? pipeline_next : NULL; pipeline_next += count_hm[2]; // TRANSPOSE hs->pipelines.transpose = pipeline_next; pipeline_next += 1; // // optionally dump pipeline stats // #ifdef HS_VK_VERBOSE_STATISTICS_AMD hs_vk_verbose_statistics_amd(device,hs); #endif #ifdef HS_VK_VERBOSE_DISASSEMBLY_AMD hs_vk_verbose_disassembly_amd(device,hs); #endif // // // return hs; } // // // void hs_vk_release(struct hs_vk * const hs) { vkDestroyDescriptorSetLayout(hs->device, hs->desc_set.layout.vout_vin, hs->allocator); vkDestroyPipelineLayout(hs->device, hs->pipeline.layout.vout_vin, hs->allocator); for (uint32_t ii=0; ii<hs->pipelines.count; ii++) { vkDestroyPipeline(hs->device, hs->pipelines.all[ii], hs->allocator); } if (hs->allocator == NULL) { free(hs); } else { hs->allocator->pfnFree(NULL,hs); } } // // Allocate a per-thread descriptor set for the vin and vout // VkBuffers. Note that HotSort uses only one descriptor set. // VkDescriptorSet hs_vk_ds_alloc(struct hs_vk const * const hs, VkDescriptorPool desc_pool) { VkDescriptorSetAllocateInfo const ds_alloc_info = { .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, .pNext = NULL, .descriptorPool = desc_pool, .descriptorSetCount = 1, .pSetLayouts = &hs->desc_set.layout.vout_vin }; VkDescriptorSet hs_ds; vk(AllocateDescriptorSets(hs->device, &ds_alloc_info, &hs_ds)); return hs_ds; } // // // void hs_vk_pad(struct hs_vk const * const hs, uint32_t const count, uint32_t * const count_padded_in, uint32_t * const count_padded_out) { // // round up the count to slabs // uint32_t const slabs_ru = (count + hs->slab_keys - 1) / hs->slab_keys; uint32_t const blocks = slabs_ru / hs->config.block.slabs; uint32_t const block_slabs = blocks * hs->config.block.slabs; uint32_t const slabs_ru_rem = slabs_ru - block_slabs; uint32_t const slabs_ru_rem_ru = MIN_MACRO(pow2_ru_u32(slabs_ru_rem),hs->config.block.slabs); *count_padded_in = (block_slabs + slabs_ru_rem_ru) * hs->slab_keys; *count_padded_out = *count_padded_in; // // will merging be required? // if (slabs_ru > hs->config.block.slabs) { // more than one block uint32_t const blocks_lo = pow2_rd_u32(blocks); uint32_t const block_slabs_lo = blocks_lo * hs->config.block.slabs; uint32_t const block_slabs_rem = slabs_ru - block_slabs_lo; if (block_slabs_rem > 0) { uint32_t const block_slabs_rem_ru = pow2_ru_u32(block_slabs_rem); uint32_t const block_slabs_hi = MAX_MACRO(block_slabs_rem_ru, blocks_lo << (1 - hs->config.merge.fm.scale_min)); uint32_t const block_slabs_padded_out = MIN_MACRO(block_slabs_lo+block_slabs_hi, block_slabs_lo*2); // clamp non-pow2 blocks *count_padded_out = block_slabs_padded_out * hs->slab_keys; } } } // // //