/*
* Copyright 2016 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
//
//
//
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
//
//
//
#include "common/cl/assert_cl.h"
#include "common/macros.h"
#include "common/util.h"
//
//
//
#include "hs_cl.h"
//
//
//
//
// A HotSort instance: a copy of the target's configuration, a few
// values derived from it, and every kernel created by hs_cl_create().
//
struct hs_cl
{
// copied verbatim from the target in hs_cl_create()
struct hs_cl_target_config config;
// bytes per key-val: (words.key + words.val) * 4
uint32_t key_val_size;
// keys per slab: slab.height << slab.width_log2
uint32_t slab_keys;
// msb_idx_u32(pow2_ru_u32(block.slabs))
uint32_t bs_slabs_log2_ru;
// msb_idx_u32(pow2_rd_u32(block.slabs))
uint32_t bc_slabs_log2_max;
struct {
// number of kernels stored in all[]
uint32_t count;
// each pointer below addresses a run of kernels inside all[];
// fm[] and hm[] are indexed by scale and are only assigned by
// hs_cl_create() for scales that have at least one kernel
cl_kernel * bs;
cl_kernel * bc;
cl_kernel * fm[3];
cl_kernel * hm[3];
cl_kernel * transpose;
// storage for every kernel -- sized when the struct is allocated
cl_kernel all[];
} kernels;
};
//
//
//
//
// Per-sort state that hs_cl_sort() builds on its stack and threads
// through every enqueue helper.
//
struct hs_state
{
#ifndef NDEBUG
// running sum of profiled event durations -- debug builds only
cl_ulong t_total; // 0
#endif
// the queue every command of this sort is enqueued on
cl_command_queue cq;
// key buffers
cl_mem vin;
cl_mem vout; // can be vin
// enforces ordering on out-of-order queue
cl_event wait_list[3]; // worst case
// number of valid entries in wait_list[]
uint32_t wait_list_size;
// number of slabs spanned by the keys, rounded up -- hs_cl_sort()
// initializes it to (count + slab_keys - 1) / slab_keys
uint32_t bx_ru;
};
//
//
//
//
// Release every event currently held in the state's wait list and
// mark the list as empty.
//
static
void
hs_state_wait_list_release(struct hs_state * const state)
{
  uint32_t idx = 0;

  while (idx < state->wait_list_size)
    {
      cl(ReleaseEvent(state->wait_list[idx]));
      idx += 1;
    }

  state->wait_list_size = 0;
}
//
// Append the events in `wait_list` to the end of the state's wait list.
//
//   state          : state whose wait list grows
//   wait_list_size : number of events to append
//   wait_list      : events to append -- indexed from zero
//
// The events are copied, not retained, so the state takes over the
// caller's references; hs_state_wait_list_release() drops them later.
//
// NOTE(review): there is no capacity check -- the caller is expected
// to keep the total within the size of state->wait_list[].
//
static
void
hs_state_wait_list_update(struct hs_state * const state,
                          uint32_t const wait_list_size,
                          cl_event const * const wait_list)
{
  uint32_t const old_size = state->wait_list_size;

  // the destination starts at old_size but the source starts at zero
  for (uint32_t ii=0; ii<wait_list_size; ii++)
    state->wait_list[old_size + ii] = wait_list[ii];

  state->wait_list_size = old_size + wait_list_size;
}
//
//
//
//
// Debug-only event profiling. When NDEBUG is defined both macros
// expand to nothing; otherwise they forward to
// hs_state_wait_list_profile().
//
#ifdef NDEBUG
#define HS_STATE_WAIT_LIST_PROFILE(state)
#define HS_STATE_WAIT_LIST_PROFILE_EX(state,wait_list_size,wait_list)
#else
#include <stdio.h>
// profile the events currently held in the state's own wait list
#define HS_STATE_WAIT_LIST_PROFILE(state) \
hs_state_wait_list_profile(state, \
state->wait_list_size, \
state->wait_list)
// profile an explicitly supplied list of events
#define HS_STATE_WAIT_LIST_PROFILE_EX(state,wait_list_size,wait_list) \
hs_state_wait_list_profile(state, \
wait_list_size, \
wait_list)
//
// Wait for the state's queue to finish, then print one line per event
// to stdout: execution status, command type, start time, end time,
// elapsed time and the running total kept in state->t_total.
//
// The timestamps are only queried -- and t_total only advanced -- when
// the queue reports CL_QUEUE_PROFILING_ENABLE; otherwise start and end
// are printed as zero.
//
static
void
hs_state_wait_list_profile(struct hs_state * const state,
uint32_t const wait_list_size,
cl_event const * const wait_list)
{
// block until everything enqueued so far has completed
cl(Finish(state->cq));
// find out whether the queue was created with profiling enabled
cl_command_queue_properties props;
cl(GetCommandQueueInfo(state->cq,
CL_QUEUE_PROPERTIES,
sizeof(props),
&props,
NULL));
for (uint32_t ii=0; ii<wait_list_size; ii++)
{
cl_event event = wait_list[ii];
//
// profiling
//
cl_ulong t_start=0, t_end=0;
if (props & CL_QUEUE_PROFILING_ENABLE)
{
// start
cl(GetEventProfilingInfo(event,
CL_PROFILING_COMMAND_START,
sizeof(cl_ulong),
&t_start,
NULL));
// end
cl(GetEventProfilingInfo(event,
CL_PROFILING_COMMAND_END,
sizeof(cl_ulong),
&t_end,
NULL));
// accumulate this event's elapsed time
state->t_total += t_end - t_start;
}
//
// status
//
cl_int status;
cl_command_type type;
cl_get_event_info(event,&status,&type);
fprintf(stdout,"%-13s, %-28s, %20"PRIu64", %20"PRIu64", %20"PRIu64", %20"PRIu64"\n",
cl_get_event_command_status_string(status),
cl_get_event_command_type_string(type),
t_start,t_end,t_end-t_start,state->t_total);
}
}
#endif
//
//
//
//
// Debug-only launch tracing. When NDEBUG is defined HS_TRACE expands
// to nothing; otherwise it forwards its three arguments (kernel,
// dimension count, global work size) to hs_trace(). The parameter
// names of the two definitions differ but their arity matches.
//
#ifdef NDEBUG
#define HS_TRACE(k,g,l)
#else
// size in bytes of the buffer that receives the kernel function name
#define HS_KERNEL_NAME_MAX 20
//
// Print the kernel's function name and its global work size to stderr.
// Dimensions at or beyond `dim` are printed as zero. Does nothing when
// `kernel` is NULL.
//
static
void
hs_trace(cl_kernel kernel,
uint32_t const dim,
size_t const * const global_work_size)
{
if (kernel == NULL)
return;
char name[HS_KERNEL_NAME_MAX];
// the query is limited to HS_KERNEL_NAME_MAX bytes
cl(GetKernelInfo(kernel,CL_KERNEL_FUNCTION_NAME,HS_KERNEL_NAME_MAX,name,NULL));
fprintf(stderr,"%-19s ( %6zu, %6zu, %6zu )\n",
name,
global_work_size[0],
dim < 2 ? 0 : global_work_size[1],
dim < 3 ? 0 : global_work_size[2]);
}
#define HS_TRACE(k,d,g) hs_trace(k,d,g)
#endif
//
//
//
//
// Enqueue the transpose kernel on vout.
//
// The launch waits on the state's current wait list; afterwards the
// list is replaced with the single event of this launch.
//
static
void
hs_transpose(struct hs_cl const * const hs,
             struct hs_state * const state)
{
  cl_kernel kernel = hs->kernels.transpose[0];

  // one work-item per slab thread across all slabs
  size_t const global_size[1] = { state->bx_ru << hs->config.slab.threads_log2 };

  HS_TRACE(kernel,1,global_size);

  cl(SetKernelArg(kernel,0,sizeof(state->vout),&state->vout));

  //
  // The transpose kernel operates on a single slab. For now no local
  // work size is passed, so the driver chooses the workgroup size.
  //
  cl_event done;

  cl(EnqueueNDRangeKernel(state->cq,
                          kernel,
                          1,
                          NULL,
                          global_size,
                          NULL,
                          state->wait_list_size,
                          state->wait_list,
                          &done));

  // the old events have been consumed -- swap in the new one
  hs_state_wait_list_release(state);
  hs_state_wait_list_update(state,1,&done);

  HS_STATE_WAIT_LIST_PROFILE(state);
}
//
//
//
//
// Enqueue one half-merge launch on vout.
//
//   scale_log2   : selects the kernel -- hm[scale_log2][0]
//   spans        : size of the grid's y dimension
//   span_threads : size of the grid's x dimension
//
// The launch waits on the state's current wait list; afterwards the
// list is replaced with the single event of this launch.
//
static
void
hs_hm_enqueue(struct hs_cl const * const hs,
              struct hs_state * const state,
              uint32_t const scale_log2,
              uint32_t const spans,
              uint32_t const span_threads)
{
  cl_kernel kernel = hs->kernels.hm[scale_log2][0];

  //
  // Note that some platforms might need to use .z on large grids
  //
  size_t const grid[3] = { span_threads, spans, 1 };

  HS_TRACE(kernel,3,grid);

  cl(SetKernelArg(kernel,0,sizeof(state->vout),&state->vout));

  cl_event done;

  cl(EnqueueNDRangeKernel(state->cq,
                          kernel,
                          3,
                          NULL,
                          grid,
                          NULL,
                          state->wait_list_size,
                          state->wait_list,
                          &done));

  // the old events have been consumed -- swap in the new one
  hs_state_wait_list_release(state);
  hs_state_wait_list_update(state,1,&done);

  HS_STATE_WAIT_LIST_PROFILE(state);
}
//
//
//
//
// Size and launch one half-merge pass.
//
//   down_slabs       : number of slabs being cleaned
//   clean_slabs_log2 : log2 of the span, in slabs, still to be cleaned
//
// Returns log2_out, i.e. (clean_slabs_log2 - bc_slabs_log2_max) minus
// the scale that was applied by this pass.
//
static
uint32_t
hs_hm(struct hs_cl const * const hs,
      struct hs_state * const state,
      uint32_t const down_slabs,
      uint32_t const clean_slabs_log2)
{
  // how many scaled half-merge spans are there?
  //
  // unsigned literal: keeps the shift well-defined up to bit 31 and
  // matches the mask computed in hs_bc()
  uint32_t const frac_ru = (1u << clean_slabs_log2) - 1;
  uint32_t const spans = (down_slabs + frac_ru) >> clean_slabs_log2;

  // for now, just clamp to the max
  uint32_t const log2_rem = clean_slabs_log2 - hs->bc_slabs_log2_max;
  uint32_t const scale_log2 = MIN_MACRO(hs->config.merge.hm.scale_max,log2_rem);
  uint32_t const log2_out = log2_rem - scale_log2;

  // size the grid
  uint32_t const span_threads = hs->slab_keys << log2_out;

  // launch the hm kernel
  hs_hm_enqueue(hs,
                state,
                scale_log2,
                spans,
                span_threads);

  return log2_out;
}
//
//
//
//
// Enqueue one block-clean launch on vout.
//
//   full             : number of slabs to cover
//   clean_slabs_log2 : selects the kernel -- bc[clean_slabs_log2]
//
// The launch waits on the state's current wait list; afterwards the
// list is replaced with the single event of this launch.
//
static
void
hs_bc_enqueue(struct hs_cl const * const hs,
              struct hs_state * const state,
              uint32_t const full,
              uint32_t const clean_slabs_log2)
{
  cl_kernel kernel = hs->kernels.bc[clean_slabs_log2];

  // one work-item per slab thread
  size_t const global_size[1] = { full << hs->config.slab.threads_log2 };

  HS_TRACE(kernel,1,global_size);

  cl(SetKernelArg(kernel,0,sizeof(state->vout),&state->vout));

  cl_event done;

  cl(EnqueueNDRangeKernel(state->cq,
                          kernel,
                          1,
                          NULL,
                          global_size,
                          NULL,
                          state->wait_list_size,
                          state->wait_list,
                          &done));

  // the old events have been consumed -- swap in the new one
  hs_state_wait_list_release(state);
  hs_state_wait_list_update(state,1,&done);

  HS_STATE_WAIT_LIST_PROFILE(state);
}
//
//
//
//
// Round down_slabs up to a whole number of (1 << clean_slabs_log2)
// slab spans and launch the block cleaner over that many slabs.
//
static
void
hs_bc(struct hs_cl const * const hs,
      struct hs_state * const state,
      uint32_t const down_slabs,
      uint32_t const clean_slabs_log2)
{
  // low bits that must be zero in a whole number of spans
  uint32_t const span_mask = (1u << clean_slabs_log2) - 1;

  // block clean the minimal number of spans that covers down_slabs
  uint32_t const slabs_to_clean = (down_slabs + span_mask) & ~span_mask;

  // we better be capable of cleaning at least two warps !!!
  hs_bc_enqueue(hs,state,slabs_to_clean,clean_slabs_log2);
}
//
// FIXME -- some of this logic can be skipped if BS is a power-of-two
//
//
// Enqueue up to two flip-merge launches on vout.
//
//   scale_log2   : selects the kernel family -- fm[scale_log2][...]
//   fm_full      : number of full spans; zero skips the full launch
//   fm_frac      : fractional span selector; zero skips that launch
//   span_threads : size of the grid's x dimension
//
// Both launches wait on the state's current wait list; afterwards the
// list is replaced with the events of whichever launches were made.
//
static
void
hs_fm_enqueue(struct hs_cl const * const hs,
              struct hs_state * const state,
              uint32_t const scale_log2,
              uint32_t const fm_full,
              uint32_t const fm_frac,
              uint32_t const span_threads)
{
  //
  // Note that some platforms might need to use .z on large grids
  //
  cl_event events_out[2];
  uint32_t events_out_count = 0;

  // full spans -- one grid row per span
  if (fm_full > 0)
    {
      cl_kernel kernel_full = hs->kernels.fm[scale_log2][hs->bs_slabs_log2_ru-1+scale_log2];
      size_t const grid_full[3] = { span_threads, fm_full, 1 };

      HS_TRACE(kernel_full,3,grid_full);

      cl(SetKernelArg(kernel_full,0,sizeof(state->vout),&state->vout));

      cl(EnqueueNDRangeKernel(state->cq,
                              kernel_full,
                              3,
                              NULL,
                              grid_full,
                              NULL,
                              state->wait_list_size,
                              state->wait_list,
                              &events_out[events_out_count++]));
    }

  // fractional span -- a single grid row placed after the full rows
  if (fm_frac > 0)
    {
      cl_kernel kernel_frac = hs->kernels.fm[scale_log2][msb_idx_u32(fm_frac)];
      size_t const grid_frac[3] = { span_threads, 1, 1 };
      size_t const origin_frac[3] = { 0, fm_full, 0 };

      HS_TRACE(kernel_frac,3,grid_frac);

      cl(SetKernelArg(kernel_frac,0,sizeof(state->vout),&state->vout));

      cl(EnqueueNDRangeKernel(state->cq,
                              kernel_frac,
                              3,
                              origin_frac,
                              grid_frac,
                              NULL,
                              state->wait_list_size,
                              state->wait_list,
                              &events_out[events_out_count++]));
    }

  // the old events have been consumed -- swap in the new ones
  hs_state_wait_list_release(state);
  hs_state_wait_list_update(state,events_out_count,events_out);

  HS_STATE_WAIT_LIST_PROFILE(state);
}
//
//
//
//
// Compute the flip-merge launch geometry for one merge level and
// enqueue it through hs_fm_enqueue().
//
//   down_slabs    : out -- number of slabs that the caller must clean
//                   after this flip merge
//   up_scale_log2 : log2 scale of this merge level; a full span holds
//                   (block.slabs << up_scale_log2) slabs
//
// Returns clean_log2, which is up_scale_log2 minus the kernel scale
// that was chosen.
//
static
uint32_t
hs_fm(struct hs_cl const * const hs,
struct hs_state * const state,
uint32_t * const down_slabs,
uint32_t const up_scale_log2)
{
//
// FIXME OPTIMIZATION: in previous HotSort launchers it's sometimes
// a performance win to bias toward launching the smaller flip merge
// kernel in order to get more warps in flight (increased
// occupancy). This is useful when merging small numbers of slabs.
//
// Note that HS_FM_SCALE_MIN will always be 0 or 1.
//
// So, for now, just clamp to the max until there is a reason to
// restore the fancier and probably low-impact approach.
//
uint32_t const scale_log2 = MIN_MACRO(hs->config.merge.fm.scale_max,up_scale_log2);
uint32_t const clean_log2 = up_scale_log2 - scale_log2;
// number of slabs in a full-sized scaled flip-merge span
uint32_t const full_span_slabs = hs->config.block.slabs << up_scale_log2;
// how many full-sized scaled flip-merge spans are there?
uint32_t fm_full = state->bx_ru / full_span_slabs;
uint32_t fm_frac = 0;
// initialize down_slabs
*down_slabs = fm_full * full_span_slabs;
// how many half-size scaled + fractional scaled spans are there?
uint32_t const span_rem = state->bx_ru - *down_slabs;
uint32_t const half_span_slabs = full_span_slabs >> 1;
// if we have over a half-span then fractionally merge it
if (span_rem > half_span_slabs)
{
// the remaining slabs will be cleaned
*down_slabs += span_rem;
// slabs beyond the first half of the span, rounded with pow2_ru_u32()
uint32_t const frac_rem = span_rem - half_span_slabs;
uint32_t const frac_rem_pow2 = pow2_ru_u32(frac_rem);
if (frac_rem_pow2 >= half_span_slabs)
{
// bump it up to a full span
fm_full += 1;
}
else
{
// otherwise, add fractional
// scaled down by clean_log2 but never less than one
fm_frac = MAX_MACRO(1,frac_rem_pow2 >> clean_log2);
}
}
// size the grid
uint32_t const span_threads = hs->slab_keys << clean_log2;
//
// launch the fm kernel
//
hs_fm_enqueue(hs,
state,
scale_log2,
fm_full,
fm_frac,
span_threads);
return clean_log2;
}
//
//
//
//
// Enqueue the block-sort launches that read vin and write vout.
//
//   full           : number of slabs handled by the full-block kernel;
//                    zero skips that launch
//   frac           : number of slabs handled by the fractional kernel;
//                    zero skips that launch
//   wait_list_size : number of events in `wait_list`
//   wait_list      : events both launches wait on -- this is the
//                    caller-supplied list, not the state's list
//
// The resulting events are APPENDED to the state's wait list. The
// state's list is not consumed by these launches and may already hold
// the event appended by hs_keyset_pre_merge(); releasing it here would
// let the first flip merge start before the tail of vout was filled.
// With both launches made this reaches the three-entry worst case of
// state->wait_list[].
//
static
void
hs_bs_enqueue(struct hs_cl const * const hs,
              struct hs_state * const state,
              uint32_t const full,
              uint32_t const frac,
              uint32_t const wait_list_size,
              cl_event * wait_list)
{
  uint32_t wait_list_out_size = 0;
  cl_event wait_list_out[2];

  if (full > 0)
    {
      size_t const size_full[1] = { full << hs->config.slab.threads_log2 };
      cl_kernel kernel_full = hs->kernels.bs[hs->bs_slabs_log2_ru];

      HS_TRACE(kernel_full,1,size_full);

      cl(SetKernelArg(kernel_full,0,sizeof(state->vin), &state->vin));
      cl(SetKernelArg(kernel_full,1,sizeof(state->vout),&state->vout));

      cl(EnqueueNDRangeKernel(state->cq,
                              kernel_full,
                              1,
                              NULL,
                              size_full,
                              NULL,
                              wait_list_size,
                              wait_list,
                              wait_list_out+wait_list_out_size++));
    }

  if (frac > 0)
    {
      // the fractional launch starts where the full launch ended
      size_t const offset_frac[1] = { full << hs->config.slab.threads_log2 };
      size_t const size_frac [1] = { frac << hs->config.slab.threads_log2 };
      cl_kernel kernel_frac = hs->kernels.bs[msb_idx_u32(frac)];

      HS_TRACE(kernel_frac,1,size_frac);

      cl(SetKernelArg(kernel_frac,0,sizeof(state->vin), &state->vin));
      cl(SetKernelArg(kernel_frac,1,sizeof(state->vout),&state->vout));

      cl(EnqueueNDRangeKernel(state->cq,
                              kernel_frac,
                              1,
                              offset_frac,
                              size_frac,
                              NULL,
                              wait_list_size,
                              wait_list,
                              wait_list_out+wait_list_out_size++));
    }

  // append -- don't replace -- so earlier events are still waited on
  for (uint32_t ii=0; ii<wait_list_out_size; ii++)
    state->wait_list[state->wait_list_size++] = wait_list_out[ii];

  // only profile the events created here
  HS_STATE_WAIT_LIST_PROFILE_EX(state,wait_list_out_size,wait_list_out);
}
//
//
//
//
// Split the padded input into whole blocks of slabs plus a remainder
// and hand both to hs_bs_enqueue().
//
static
void
hs_bs(struct hs_cl const * const hs,
      struct hs_state * const state,
      uint32_t const count_padded_in,
      uint32_t const wait_list_size,
      cl_event * wait_list)
{
  // slabs in the padded input
  uint32_t const slabs_in = count_padded_in / hs->slab_keys;

  // slabs left over after the last whole block
  uint32_t const frac = slabs_in % hs->config.block.slabs;

  // slabs that belong to whole blocks
  uint32_t const full = slabs_in - frac;

  hs_bs_enqueue(hs,state,
                full,frac,
                wait_list_size,wait_list);
}
//
//
//
//
// Fill the tail of vin -- keys [count,count_hi) -- with all-ones bits
// so the padding keys hold the maximum value.
//
//   count          : index of the first key to fill
//   count_hi       : one past the last key to fill
//   wait_list_size : number of events in `wait_list`
//   wait_list      : events the fill waits on
//   event          : out -- receives the event of the fill
//
// The fill's offset and size are multiples of the four-byte pattern
// because hs_cl_create() sets key_val_size to a multiple of four.
//
static
void
hs_keyset_pre_sort(struct hs_cl const * const hs,
                   struct hs_state * const state,
                   uint32_t const count,
                   uint32_t const count_hi,
                   uint32_t const wait_list_size,
                   cl_event * wait_list,
                   cl_event * event)
{
  uint32_t const pattern = UINT32_MAX;

  // widen before multiplying so byte ranges past 4 GB don't wrap
  size_t const fill_offset = (size_t)count * hs->key_val_size;
  size_t const fill_size = (size_t)(count_hi - count) * hs->key_val_size;

  cl(EnqueueFillBuffer(state->cq,
                       state->vin,
                       &pattern,
                       sizeof(pattern),
                       fill_offset,
                       fill_size,
                       wait_list_size,
                       wait_list,
                       event));

  HS_STATE_WAIT_LIST_PROFILE_EX(state,1,event);
}
//
//
//
//
// Fill the tail of vout -- keys [count_lo,count_hi) -- with all-ones
// bits so the padding keys hold the maximum value.
//
//   count_lo       : index of the first key to fill
//   count_hi       : one past the last key to fill
//   wait_list_size : number of events in `wait_list`
//   wait_list      : events the fill waits on
//
// The event of the fill is appended to the state's wait list.
//
// The fill's offset and size are multiples of the four-byte pattern
// because hs_cl_create() sets key_val_size to a multiple of four.
//
static
void
hs_keyset_pre_merge(struct hs_cl const * const hs,
                    struct hs_state * const state,
                    uint32_t const count_lo,
                    uint32_t const count_hi,
                    uint32_t const wait_list_size,
                    cl_event * wait_list)
{
  uint32_t const pattern = UINT32_MAX;

  // widen before multiplying so byte ranges past 4 GB don't wrap
  size_t const fill_offset = (size_t)count_lo * hs->key_val_size;
  size_t const fill_size = (size_t)(count_hi - count_lo) * hs->key_val_size;

  // appends event to the state's wait list
  cl(EnqueueFillBuffer(state->cq,
                       state->vout,
                       &pattern,
                       sizeof(pattern),
                       fill_offset,
                       fill_size,
                       wait_list_size,
                       wait_list,
                       state->wait_list+state->wait_list_size++));

  HS_STATE_WAIT_LIST_PROFILE(state);
}
//
// We want concurrent kernel execution to occur in a few places.
//
// The summary is:
//
// 1) If necessary, some max valued keys are written to the end of
// the vin/vout buffers.
//
// 2) Blocks of slabs of keys are sorted.
//
// 3) If necessary, the blocks of slabs are merged until complete.
//
// 4) If requested, the slabs will be converted from slab ordering
// to linear ordering.
//
// Below is the general "happens-before" relationship between HotSort
// compute kernels.
//
// Note the diagram assumes vin and vout are different buffers. If
// they're not, then the first merge doesn't include the pad_vout
// event in the wait list.
//
// +----------+ +---------+
// | pad_vout | | pad_vin |
// +----+-----+ +----+----+
// | |
// | WAITFOR(pad_vin)
// | |
// | +-----v-----+
// | | |
// | +----v----+ +----v----+
// | | bs_full | | bs_frac |
// | +----+----+ +----+----+
// | | |
// | +-----v-----+
// | |
// | +------NO------JUST ONE BLOCK?
// | / |
// |/ YES
// + |
// | v
// | END_WITH_EVENTS(bs_full,bs_frac)
// |
// |
// WAITFOR(pad_vout,bs_full,bs_frac) >>> first iteration of loop <<<
// |
// |
// +-----------<------------+
// | |
// +-----v-----+ |
// | | |
// +----v----+ +----v----+ |
// | fm_full | | fm_frac | |
// +----+----+ +----+----+ |
// | | ^
// +-----v-----+ |
// | |
// WAITFOR(fm_full,fm_frac) |
// | |
// v |
// +--v--+ WAITFOR(bc)
// | hm | |
// +-----+ |
// | |
// WAITFOR(hm) |
// | ^
// +--v--+ |
// | bc | |
// +-----+ |
// | |
// v |
// MERGING COMPLETE?-------NO------+
// |
// YES
// |
// v
// END_WITH_EVENTS(bc)
//
//
// Enqueue every command needed to sort `count` keys.
//
//   hs               : HotSort instance
//   cq               : queue that receives every command
//   wait_list_size   : number of events in `wait_list`
//   wait_list        : events the first commands wait on
//   event            : if not NULL, receives the first event left in
//                      the internal wait list after the last launch;
//                      if NULL, that event is released instead
//   vin              : buffer holding the input keys
//   vout             : output buffer -- NULL sorts in place in vin
//   count            : number of keys
//   count_padded_in  : padded key count used to size the block sort
//   count_padded_out : padded key count of the merge output
//   linearize        : if true, the transpose kernel is enqueued last
//
// NOTE(review): the padded counts are presumably the values produced
// by hs_cl_pad() for the same `count` -- confirm against callers.
//
void
hs_cl_sort(struct hs_cl const * const hs,
cl_command_queue cq,
uint32_t const wait_list_size,
cl_event * wait_list,
cl_event * event,
cl_mem vin,
cl_mem vout,
uint32_t const count,
uint32_t const count_padded_in,
uint32_t const count_padded_out,
bool const linearize)
{
// is this sort in place?
bool const is_in_place = (vout == NULL);
// cq, buffers, wait list and slab count
struct hs_state state = {
#ifndef NDEBUG
.t_total = 0,
#endif
.cq = cq,
.vin = vin,
.vout = is_in_place ? vin : vout,
.wait_list_size = 0,
.bx_ru = (count + hs->slab_keys - 1) / hs->slab_keys
};
// initialize vin
// when sorting in place vin is also the merge buffer, so it is
// padded out to count_padded_out instead of count_padded_in
uint32_t const count_hi = is_in_place ? count_padded_out : count_padded_in;
bool const is_pre_sort_keyset_reqd = count_hi > count;
cl_event event_keyset_pre_sort[1];
// initialize any trailing keys in vin before sorting
if (is_pre_sort_keyset_reqd)
{
hs_keyset_pre_sort(hs,&state,
count,count_hi,
wait_list_size,wait_list,
event_keyset_pre_sort);
}
// initialize any trailing keys in vout before merging
if (!is_in_place && (count_padded_out > count_padded_in))
{
hs_keyset_pre_merge(hs,&state,
count_padded_in,count_padded_out,
wait_list_size,wait_list);
}
//
// sort blocks of slabs
//
// the block sort waits on the vin fill if there was one -- the fill
// itself waited on the caller's list -- and otherwise on the
// caller's list directly
hs_bs(hs,&state,
count_padded_in,
is_pre_sort_keyset_reqd ? 1 : wait_list_size,
is_pre_sort_keyset_reqd ? event_keyset_pre_sort : wait_list);
// release the event
if (is_pre_sort_keyset_reqd)
cl(ReleaseEvent(event_keyset_pre_sort[0]));
//
// we're done if this was a single bs block...
//
// otherwise, merge sorted spans of slabs until done
//
if (state.bx_ru > hs->config.block.slabs)
{
// each pass doubles the number of slabs merged per span
int32_t up_scale_log2 = 1;
while (true)
{
uint32_t down_slabs;
// flip merge slabs -- return span of slabs that must be cleaned
uint32_t clean_slabs_log2 = hs_fm(hs,&state,
&down_slabs,
up_scale_log2);
// if span is gt largest slab block cleaner then half merge
while (clean_slabs_log2 > hs->bc_slabs_log2_max)
{
clean_slabs_log2 = hs_hm(hs,&state,
down_slabs,
clean_slabs_log2);
}
// launch clean slab grid -- is it the final launch?
hs_bc(hs,&state,down_slabs,clean_slabs_log2);
// was this the final block clean?
if (((uint32_t)hs->config.block.slabs << up_scale_log2) >= state.bx_ru)
break;
// otherwise, merge twice as many slabs
up_scale_log2 += 1;
}
}
// slabs or linear?
if (linearize) {
hs_transpose(hs,&state);
}
// does the caller want the final event?
// only the first entry of the wait list is handed over or released
if (event != NULL) {
*event = state.wait_list[0];
} else {
cl(ReleaseEvent(state.wait_list[0]));
}
}
//
// all grids will be computed as a function of the minimum number of slabs
//
//
// Compute the padded key counts for a sort of `count` keys.
//
//   count            : number of keys
//   count_padded_in  : out -- padded key count for the block sort
//   count_padded_out : out -- padded key count for the merge output;
//                      equal to *count_padded_in unless more than one
//                      block of slabs is needed
//
void
hs_cl_pad(struct hs_cl const * const hs,
          uint32_t const count,
          uint32_t * const count_padded_in,
          uint32_t * const count_padded_out)
{
  //
  // round up the count to slabs
  //
  uint32_t const slabs_ru = (count + hs->slab_keys - 1) / hs->slab_keys;

  // whole blocks of slabs and the slabs they contain
  uint32_t const blocks = slabs_ru / hs->config.block.slabs;
  uint32_t const whole_slabs = blocks * hs->config.block.slabs;

  // leftover slabs are rounded with pow2_ru_u32() and clamped to a block
  uint32_t const tail_slabs = slabs_ru - whole_slabs;
  uint32_t const tail_pow2 = pow2_ru_u32(tail_slabs);
  uint32_t const tail_padded = MIN_MACRO(tail_pow2,hs->config.block.slabs);

  uint32_t const padded_in = (whole_slabs + tail_padded) * hs->slab_keys;

  *count_padded_in = padded_in;
  *count_padded_out = padded_in;

  //
  // will merging be required? -- not if everything fits in one block
  //
  if (slabs_ru <= hs->config.block.slabs)
    return;

  // more than one block
  uint32_t const blocks_lo = pow2_rd_u32(blocks);
  uint32_t const lo_slabs = blocks_lo * hs->config.block.slabs;
  uint32_t const rem_slabs = slabs_ru - lo_slabs;

  // nothing beyond the low span -- the input padding is enough
  if (rem_slabs == 0)
    return;

  uint32_t const rem_pow2 = pow2_ru_u32(rem_slabs);
  uint32_t const hi_floor = blocks_lo << (1 - hs->config.merge.fm.scale_min);
  uint32_t const hi_slabs = MAX_MACRO(rem_pow2,hi_floor);

  // clamp non-pow2 blocks
  uint32_t const padded_out_slabs = MIN_MACRO(lo_slabs+hi_slabs,lo_slabs*2);

  *count_padded_out = padded_out_slabs * hs->slab_keys;
}
//
//
//
//
// Create the kernel called `name` from `program`, store it in
// `*kernel` and check the returned error code with cl_ok().
//
static
void
hs_create_kernel(cl_program program,
                 cl_kernel * const kernel,
                 char const * const name)
{
  cl_int status;

  // store first, then check
  *kernel = clCreateKernel(program,name,&status);

  cl_ok(status);
}
//
// Create `count` kernels whose names differ only in one character.
//
//   kernels            : out -- receives `count` consecutive kernels
//   name_template      : writable kernel name; the character before
//                        the terminating NUL is overwritten in place
//   name_template_size : sizeof the template, NUL included
//   count              : number of kernels to create
//
// The overwritten character starts at '0' and is incremented once per
// kernel, so the names are only decimal digits while count <= 10.
//
static
void
hs_create_kernels(cl_program program,
                  cl_kernel * kernels,
                  char name_template[],
                  size_t const name_template_size,
                  uint32_t const count)
{
  // the placeholder is the last character before the NUL
  char * const slot = name_template + (name_template_size - 2);
  char const stop = '0'+(char)count;
  char digit = '0';

  while (digit < stop)
    {
      cl_int status;

      *slot = digit;

      *kernels = clCreateKernel(program,name_template,&status);
      kernels += 1;

      cl_ok(status);

      digit += 1;
    }
}
//
//
//
//
// Build the target's OpenCL program for `device_id` and create a
// HotSort instance holding every kernel of that program.
//
//   target    : configuration plus the program image; byte 0 of the
//               image is zero for a binary and non-zero for source,
//               bytes 1-4 hold the size and the payload starts at
//               byte 5
//   context   : context the program is created in
//   device_id : device the program is built for
//
// Returns a heap-allocated instance that must be destroyed with
// hs_cl_release(), or NULL if the allocation fails. OpenCL errors are
// handled by the cl() and cl_ok() checks.
//
struct hs_cl *
hs_cl_create(struct hs_cl_target const * const target,
             cl_context context,
             cl_device_id device_id)
{
  //
  // immediately try to build the OpenCL program
  //
  bool const is_binary = (target->program[0] == 0);
  uint32_t const program_size = NPBTOHL_MACRO(target->program+1);

  cl_program program;

  if (is_binary) // program is a binary
    {
      cl_int status, err;

      size_t const bins_sizeof[] = { program_size };
      unsigned char const * bins[] = { target->program+5 };

      program = clCreateProgramWithBinary(context,
                                          1,
                                          &device_id,
                                          bins_sizeof,
                                          bins,
                                          &status,
                                          &err);
      cl_ok(err);

      fprintf(stdout,"Building binary... ");
      fflush(stdout);

      cl(BuildProgram(program,
                      1,
                      &device_id,
                      NULL,
                      NULL,
                      NULL));
    }
  else // program is source code
    {
      cl_int err;

      size_t const strings_sizeof[] = { program_size };
      char const * strings[] = { (char*)target->program+5 };

      program = clCreateProgramWithSource(context,
                                          1,
                                          strings,
                                          strings_sizeof,
                                          &err);
      cl_ok(err);

      char const * const options =
        "-cl-std=CL1.2 -cl-fast-relaxed-math "
        "-cl-no-signed-zeros -cl-mad-enable "
        "-cl-denorms-are-zero "
        "-cl-kernel-arg-info";

      fprintf(stdout,"Building source... ");
      fflush(stdout);

      cl(BuildProgram(program,
                      1,
                      &device_id,
                      options,
                      NULL,
                      NULL));
    }

  //
  // we reference these values a lot
  //
  uint32_t const bs_slabs_log2_ru = msb_idx_u32(pow2_ru_u32(target->config.block.slabs));
  uint32_t const bc_slabs_log2_max = msb_idx_u32(pow2_rd_u32(target->config.block.slabs));

  //
  // how many kernels will be created?
  //
  uint32_t const count_bs = bs_slabs_log2_ru + 1;
  uint32_t const count_bc = bc_slabs_log2_max + 1;
  uint32_t count_fm[3] = { 0 };
  uint32_t count_hm[3] = { 0 };

  // guaranteed to be in range [0,2]
  for (uint32_t scale = target->config.merge.fm.scale_min;
       scale <= target->config.merge.fm.scale_max;
       scale++)
    {
      uint32_t fm_left = (target->config.block.slabs / 2) << scale;

      count_fm[scale] = msb_idx_u32(pow2_ru_u32(fm_left)) + 1;
    }

  // guaranteed to be in range [0,2]
  for (uint32_t scale = target->config.merge.hm.scale_min;
       scale <= target->config.merge.hm.scale_max;
       scale++)
    {
      count_hm[scale] = 1;
    }

  // the leading 1 is the transpose kernel
  uint32_t const count_all =
    1
    + count_bs
    + count_bc
    + count_fm[0] + count_fm[1] + count_fm[2]
    + count_hm[0] + count_hm[1] + count_hm[2];

  //
  // allocate hs_cl -- the kernels are stored in the trailing array
  //
  struct hs_cl * hs = malloc(sizeof(*hs) + sizeof(cl_kernel) * count_all);

  if (hs == NULL)
    {
      // nothing else has been created yet -- just drop the program
      cl(ReleaseProgram(program));

      return NULL;
    }

  memcpy(&hs->config,&target->config,sizeof(hs->config));

  // save some frequently used calculated values
  hs->key_val_size = (target->config.words.key + target->config.words.val) * 4;
  hs->slab_keys = target->config.slab.height << target->config.slab.width_log2;
  hs->bs_slabs_log2_ru = bs_slabs_log2_ru;
  hs->bc_slabs_log2_max = bc_slabs_log2_max;

  // save kernel count
  hs->kernels.count = count_all;

  // scales without kernels keep a NULL pointer instead of garbage
  for (uint32_t scale = 0; scale < 3; scale++)
    {
      hs->kernels.fm[scale] = NULL;
      hs->kernels.hm[scale] = NULL;
    }

  //
  // create all the kernels and release the program
  //
  cl_kernel * kernel_next = hs->kernels.all;

  //
  // BS
  //
  {
    hs->kernels.bs = kernel_next;

    char bs_name[] = { "hs_kernel_bs_X" };

    hs_create_kernels(program,
                      kernel_next,
                      bs_name,sizeof(bs_name),
                      count_bs);

    kernel_next += count_bs;
  }

  //
  // BC
  //
  {
    hs->kernels.bc = kernel_next;

    char bc_name[] = { "hs_kernel_bc_X" };

    hs_create_kernels(program,
                      kernel_next,
                      bc_name,sizeof(bc_name),
                      count_bc);

    kernel_next += count_bc;
  }

  //
  // FM
  //
  if (count_fm[0] > 0)
    {
      hs->kernels.fm[0] = kernel_next;

      char fm_0_name[] = { "hs_kernel_fm_0_X" };

      hs_create_kernels(program,
                        kernel_next,
                        fm_0_name,sizeof(fm_0_name),
                        count_fm[0]);

      kernel_next += count_fm[0];
    }

  if (count_fm[1] > 0)
    {
      hs->kernels.fm[1] = kernel_next;

      char fm_1_name[] = { "hs_kernel_fm_1_X" };

      hs_create_kernels(program,
                        kernel_next,
                        fm_1_name,sizeof(fm_1_name),
                        count_fm[1]);

      kernel_next += count_fm[1];
    }

  if (count_fm[2] > 0)
    {
      hs->kernels.fm[2] = kernel_next;

      char fm_2_name[] = { "hs_kernel_fm_2_X" };

      hs_create_kernels(program,
                        kernel_next,
                        fm_2_name,sizeof(fm_2_name),
                        count_fm[2]);

      kernel_next += count_fm[2];
    }

  //
  // HM
  //
  if (count_hm[0] > 0)
    {
      hs->kernels.hm[0] = kernel_next;

      hs_create_kernel(program,
                       kernel_next,
                       "hs_kernel_hm_0");

      kernel_next += count_hm[0];
    }

  if (count_hm[1] > 0)
    {
      hs->kernels.hm[1] = kernel_next;

      hs_create_kernel(program,
                       kernel_next,
                       "hs_kernel_hm_1");

      kernel_next += count_hm[1];
    }

  if (count_hm[2] > 0)
    {
      hs->kernels.hm[2] = kernel_next;

      hs_create_kernel(program,
                       kernel_next,
                       "hs_kernel_hm_2");

      kernel_next += count_hm[2];
    }

  //
  // TRANSPOSE
  //
  {
    hs->kernels.transpose = kernel_next;

    hs_create_kernel(program,
                     kernel_next,
                     "hs_kernel_transpose");
  }

  //
  // Drop our reference to the program: the kernels have been created
  // and nothing below uses the handle again, so keeping it would leak.
  //
  cl(ReleaseProgram(program));

  return hs;
}
//
//
//
//
// Release every kernel owned by the instance and free the instance.
//
void
hs_cl_release(struct hs_cl * const hs)
{
  uint32_t idx = 0;

  while (idx < hs->kernels.count)
    {
      cl(ReleaseKernel(hs->kernels.all[idx]));
      idx += 1;
    }

  free(hs);
}
//
//
//