/*
* Copyright 2016 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
#include <stdio.h>
#include <stdlib.h>
//
//
//
#include "gen.h"
#include "transpose.h"
#include "common/util.h"
#include "common/macros.h"
//
//
//
//
// Context handed to the transpose blend/remap callbacks through their
// opaque void* argument.
//
struct hsg_transpose_state
{
// stream that receives the generated HS_TRANSPOSE_* macro lines
FILE * header;
// generator configuration; the remap callback reads warp.lanes_log2 from it
struct hsg_config const * config;
};
//
// Map a transpose stage index to a single lowercase letter used as a
// register-name prefix: stage 0 yields 'r', each following stage yields
// the next letter, wrapping from 'z' back to 'a'.
//
static
char
hsg_transpose_reg_prefix(uint32_t const cols_log2)
{
  static char const alphabet[] = "abcdefghijklmnopqrstuvwxyz";

  // position of 'r' in the alphabet
  uint32_t const base = 'r' - 'a';

  // unsigned arithmetic: wraps modulo 2^32 before the reduction modulo 26
  uint32_t const slot = (base + cols_log2) % 26;

  return alphabet[slot];
}
//
// Blend callback: writes one HS_TRANSPOSE_BLEND(...) macro line to the
// header stream found in the hsg_transpose_state passed via `blend`.
//
// The two letters are the register prefixes of the previous stage
// (cols_log2-1) and of this stage (cols_log2).
//
static
void
hsg_transpose_blend(uint32_t const cols_log2,
                    uint32_t const row_ll, // lower-left
                    uint32_t const row_ur, // upper-right
                    void *         blend)
{
  struct hsg_transpose_state * const ts = blend;

  char const prefix_prev = hsg_transpose_reg_prefix(cols_log2-1);
  char const prefix_curr = hsg_transpose_reg_prefix(cols_log2);

  // rows are emitted 1-based: register names start at '1' for now
  uint32_t const reg_ll = row_ll + 1;
  uint32_t const reg_ur = row_ur + 1;

  fprintf(ts->header,
          " HS_TRANSPOSE_BLEND( %c, %c, %2u, %3u, %3u ) \\\n",
          prefix_prev,
          prefix_curr,
          cols_log2,
          reg_ll,
          reg_ur);
}
//
// Remap callback: writes one HS_TRANSPOSE_REMAP(...) macro line to the
// header stream found in the hsg_transpose_state passed via `remap`.
//
// The letter is the register prefix for stage config->warp.lanes_log2.
//
static
void
hsg_transpose_remap(uint32_t const row_from,
                    uint32_t const row_to,
                    void *         remap)
{
  struct hsg_transpose_state * const ts = remap;

  char const prefix = hsg_transpose_reg_prefix(ts->config->warp.lanes_log2);

  // rows are emitted 1-based: register names start at '1' for now
  uint32_t const reg_from = row_from + 1;
  uint32_t const reg_to   = row_to   + 1;

  fprintf(ts->header,
          " HS_TRANSPOSE_REMAP( %c, %3u, %3u ) \\\n",
          prefix,
          reg_from,
          reg_to);
}
//
//
//
//
// Write the license banner, followed by one blank line, to `file`.
//
static
void
hsg_copyright(FILE * file)
{
  static char const banner[] =
    "// \n"
    "// Copyright 2016 Google Inc. \n"
    "// \n"
    "// Use of this source code is governed by a BSD-style \n"
    "// license that can be found in the LICENSE file. \n"
    "// \n"
    "\n";

  // the banner contains no conversion specifiers, so emit it verbatim
  fputs(banner,file);
}
//
// Write the #include lines for "hs_config.h" and "hs_cl_macros.h",
// followed by a comment separator, to `file`.
//
static
void
hsg_macros(FILE * file)
{
  static char const includes[] =
    "// target-specific config \n"
    "#include \"hs_config.h\" \n"
    " \n"
    "// arch/target-specific macros\n"
    "#include \"hs_cl_macros.h\" \n"
    " \n"
    "// \n"
    "// \n"
    "// \n";

  // the text contains no conversion specifiers, so emit it verbatim
  fputs(includes,file);
}
//
//
//
//
// Output streams owned by the OpenCL target. hsg_target_opencl()
// allocates this on HSG_OP_TYPE_TARGET_BEGIN and frees it on
// HSG_OP_TYPE_TARGET_END.
//
struct hsg_target_state
{
// generated configuration header, opened as "hs_config.h"
FILE * header;
// generated kernel source, opened as "hs_kernels.cl"
FILE * source;
};
//
//
//
//
// Open `path` for binary writing. On failure, report the reason on
// stderr and terminate the generator instead of returning NULL.
//
static
FILE *
hsg_target_opencl_fopen(char const * const path)
{
  FILE * const file = fopen(path,"wb");

  if (file == NULL)
    {
      perror(path);
      exit(EXIT_FAILURE);
    }

  return file;
}

//
// OpenCL target: translate a single generator op into text.
//
//   target : holds the per-target state (output streams) and an
//            optional extra `define` string
//   config : generator configuration (warp, thread, type and merge
//            parameters) written into the header
//   merge  : array of merge descriptors; several ops index it with
//            ops->a, and merge[0] supplies the HS_BS_SLABS values
//   ops    : the op to translate -- only ops->type and its operand
//            fields are read
//   depth  : not referenced by this target
//
// HSG_OP_TYPE_TARGET_BEGIN allocates the state, creates "hs_config.h"
// and "hs_kernels.cl" in the current directory and writes the complete
// header body.  HSG_OP_TYPE_TARGET_END finishes both files, closes
// them and releases the state.  Every other op appends one or more
// lines to the kernel source.
//
// Any allocation, open or close failure, and any unrecognized op type,
// prints a message to stderr and exits with EXIT_FAILURE.
//
void
hsg_target_opencl(struct hsg_target       * const target,
                  struct hsg_config const * const config,
                  struct hsg_merge  const * const merge,
                  struct hsg_op     const * const ops,
                  uint32_t                  const depth)
{
  switch (ops->type)
    {
    case HSG_OP_TYPE_END:
      fprintf(target->state->source,
              "}\n");
      break;

    case HSG_OP_TYPE_BEGIN:
      fprintf(target->state->source,
              "{\n");
      break;

    case HSG_OP_TYPE_ELSE:
      fprintf(target->state->source,
              "else\n");
      break;

    case HSG_OP_TYPE_TARGET_BEGIN:
      {
        // allocate state -- every later op dereferences it
        target->state = malloc(sizeof(*target->state));

        if (target->state == NULL)
          {
            fprintf(stderr,"error: out of memory allocating OpenCL target state\n");
            exit(EXIT_FAILURE);
          }

        // allocate files -- the helper exits if either cannot be created
        target->state->header = hsg_target_opencl_fopen("hs_config.h");
        target->state->source = hsg_target_opencl_fopen("hs_kernels.cl");

        // initialize header
        uint32_t const bc_max = msb_idx_u32(pow2_rd_u32(merge->warps));

        hsg_copyright(target->state->header);

        fprintf(target->state->header,
                "#ifndef HS_CL_ONCE \n"
                "#define HS_CL_ONCE \n"
                " \n"
                "#define HS_SLAB_THREADS_LOG2 %u \n"
                "#define HS_SLAB_THREADS (1 << HS_SLAB_THREADS_LOG2) \n"
                "#define HS_SLAB_WIDTH_LOG2 %u \n"
                "#define HS_SLAB_WIDTH (1 << HS_SLAB_WIDTH_LOG2) \n"
                "#define HS_SLAB_HEIGHT %u \n"
                "#define HS_SLAB_KEYS (HS_SLAB_WIDTH * HS_SLAB_HEIGHT)\n"
                "#define HS_REG_LAST(c) c##%u \n"
                "#define HS_KEY_WORDS %u \n"
                "#define HS_VAL_WORDS 0 \n"
                "#define HS_BS_SLABS %u \n"
                "#define HS_BS_SLABS_LOG2_RU %u \n"
                "#define HS_BC_SLABS_LOG2_MAX %u \n"
                "#define HS_FM_BLOCK_HEIGHT %u \n"
                "#define HS_FM_SCALE_MIN %u \n"
                "#define HS_FM_SCALE_MAX %u \n"
                "#define HS_HM_BLOCK_HEIGHT %u \n"
                "#define HS_HM_SCALE_MIN %u \n"
                "#define HS_HM_SCALE_MAX %u \n"
                "#define HS_EMPTY \n"
                " \n",
                config->warp.lanes_log2, // FIXME - may be different on a SIMD target
                config->warp.lanes_log2,
                config->thread.regs,
                config->thread.regs,
                config->type.words,
                merge->warps,
                msb_idx_u32(pow2_ru_u32(merge->warps)),
                bc_max,
                config->merge.flip.warps,
                config->merge.flip.lo,
                config->merge.flip.hi,
                config->merge.half.warps,
                config->merge.half.lo,
                config->merge.half.hi);

        // optional extra define supplied by the target
        if (target->define != NULL)
          {
            fprintf(target->state->header,"#define %s\n\n",target->define);
          }

        // one HS_SLAB_ROW( 1-based, 0-based ) entry per register
        fprintf(target->state->header,
                "#define HS_SLAB_ROWS() \\\n");

        for (uint32_t ii=1; ii<=config->thread.regs; ii++)
          {
            fprintf(target->state->header,
                    " HS_SLAB_ROW( %3u, %3u ) \\\n",ii,ii-1);
          }

        fprintf(target->state->header,
                " HS_EMPTY\n"
                " \n");

        // transpose macro: stage list followed by blend/remap lines
        fprintf(target->state->header,
                "#define HS_TRANSPOSE_SLAB() \\\n");

        for (uint32_t ii=1; ii<=config->warp.lanes_log2; ii++)
          {
            fprintf(target->state->header,
                    " HS_TRANSPOSE_STAGE( %u ) \\\n",ii);
          }

        struct hsg_transpose_state state[1] =
          {
            { .header = target->state->header,
              .config = config
            }
          };

        hsg_transpose(config->warp.lanes_log2,
                      config->thread.regs,
                      hsg_transpose_blend,state,
                      hsg_transpose_remap,state);

        fprintf(target->state->header,
                " HS_EMPTY\n"
                " \n");

        // start the kernel source
        hsg_copyright(target->state->source);
        hsg_macros(target->state->source);
      }
      break;

    case HSG_OP_TYPE_TARGET_END:
      {
        // decorate the files
        fprintf(target->state->header,
                "#endif \n"
                " \n"
                "// \n"
                "// \n"
                "// \n"
                " \n");

        fprintf(target->state->source,
                " \n"
                "// \n"
                "// \n"
                "// \n"
                " \n");

        // close files -- fclose() flushes, so a failure here means the
        // generated output may be truncated
        int const header_err = fclose(target->state->header);
        int const source_err = fclose(target->state->source);

        // free state and drop the dangling pointer
        free(target->state);
        target->state = NULL;

        if ((header_err != 0) || (source_err != 0))
          {
            fprintf(stderr,"error: failed to close hs_config.h / hs_kernels.cl\n");
            exit(EXIT_FAILURE);
          }
      }
      break;

    case HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO:
      {
        fprintf(target->state->source,
                "\nHS_TRANSPOSE_KERNEL_PROTO()\n");
      }
      break;

    case HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE:
      {
        fprintf(target->state->source,
                "HS_SLAB_GLOBAL_PREAMBLE();\n");
      }
      break;

    case HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY:
      {
        fprintf(target->state->source,
                "HS_TRANSPOSE_SLAB()\n");
      }
      break;

    case HSG_OP_TYPE_BS_KERNEL_PROTO:
      {
        struct hsg_merge const * const m = merge + ops->a;

        uint32_t const bs  = pow2_ru_u32(m->warps);
        uint32_t const msb = msb_idx_u32(bs);

        fprintf(target->state->source,
                "\nHS_BS_KERNEL_PROTO(%u,%u)\n",
                m->warps,msb);
      }
      break;

    case HSG_OP_TYPE_BS_KERNEL_PREAMBLE:
      {
        struct hsg_merge const * const m = merge + ops->a;

        // the local memory declaration is only emitted for multi-warp blocks
        if (m->warps > 1)
          {
            fprintf(target->state->source,
                    "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n",
                    m->warps * config->warp.lanes,
                    m->rows_bs);
          }

        fprintf(target->state->source,
                "HS_SLAB_GLOBAL_PREAMBLE();\n");
      }
      break;

    case HSG_OP_TYPE_BC_KERNEL_PROTO:
      {
        struct hsg_merge const * const m = merge + ops->a;

        uint32_t const msb = msb_idx_u32(m->warps);

        fprintf(target->state->source,
                "\nHS_BC_KERNEL_PROTO(%u,%u)\n",
                m->warps,msb);
      }
      break;

    case HSG_OP_TYPE_BC_KERNEL_PREAMBLE:
      {
        struct hsg_merge const * const m = merge + ops->a;

        // the local memory declaration is only emitted for multi-warp blocks
        if (m->warps > 1)
          {
            fprintf(target->state->source,
                    "HS_BLOCK_LOCAL_MEM_DECL(%u,%u);\n\n",
                    m->warps * config->warp.lanes,
                    m->rows_bc);
          }

        fprintf(target->state->source,
                "HS_SLAB_GLOBAL_PREAMBLE();\n");
      }
      break;

    case HSG_OP_TYPE_FM_KERNEL_PROTO:
      fprintf(target->state->source,
              "\nHS_FM_KERNEL_PROTO(%u,%u)\n",
              ops->a,ops->b);
      break;

    case HSG_OP_TYPE_FM_KERNEL_PREAMBLE:
      fprintf(target->state->source,
              "HS_FM_PREAMBLE(%u);\n",
              ops->a);
      break;

    case HSG_OP_TYPE_HM_KERNEL_PROTO:
      {
        fprintf(target->state->source,
                "\nHS_HM_KERNEL_PROTO(%u)\n",
                ops->a);
      }
      break;

    case HSG_OP_TYPE_HM_KERNEL_PREAMBLE:
      fprintf(target->state->source,
              "HS_HM_PREAMBLE(%u);\n",
              ops->a);
      break;

    case HSG_OP_TYPE_BX_REG_GLOBAL_LOAD:
      {
        // ops->v selects the buffer name
        static char const * const vstr[] = { "vin", "vout" };

        fprintf(target->state->source,
                "HS_KEY_TYPE r%-3u = HS_SLAB_GLOBAL_LOAD(%s,%u);\n",
                ops->n,vstr[ops->v],ops->n-1);
      }
      break;

    case HSG_OP_TYPE_BX_REG_GLOBAL_STORE:
      fprintf(target->state->source,
              "HS_SLAB_GLOBAL_STORE(%u,r%u);\n",
              ops->n-1,ops->n);
      break;

    case HSG_OP_TYPE_HM_REG_GLOBAL_LOAD:
      fprintf(target->state->source,
              "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n",
              ops->a,ops->b);
      break;

    case HSG_OP_TYPE_HM_REG_GLOBAL_STORE:
      fprintf(target->state->source,
              "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n",
              ops->b,ops->a);
      break;

    case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT:
      fprintf(target->state->source,
              "HS_KEY_TYPE r%-3u = HS_XM_GLOBAL_LOAD_L(%u);\n",
              ops->a,ops->b);
      break;

    case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT:
      fprintf(target->state->source,
              "HS_XM_GLOBAL_STORE_L(%-3u,r%u);\n",
              ops->b,ops->a);
      break;

    case HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT:
      fprintf(target->state->source,
              "HS_KEY_TYPE r%-3u = HS_FM_GLOBAL_LOAD_R(%u);\n",
              ops->b,ops->a);
      break;

    case HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT:
      fprintf(target->state->source,
              "HS_FM_GLOBAL_STORE_R(%-3u,r%u);\n",
              ops->a,ops->b);
      break;

    case HSG_OP_TYPE_FM_MERGE_RIGHT_PRED:
      {
        // opens an if / else-if / else chain keyed on fm_frac
        if (ops->a <= ops->b)
          {
            fprintf(target->state->source,
                    "if (HS_FM_IS_NOT_LAST_SPAN() || (fm_frac == 0))\n");
          }
        else if (ops->b > 1)
          {
            fprintf(target->state->source,
                    "else if (fm_frac == %u)\n",
                    ops->b);
          }
        else
          {
            fprintf(target->state->source,
                    "else\n");
          }
      }
      break;

    case HSG_OP_TYPE_SLAB_FLIP:
      fprintf(target->state->source,
              "HS_SLAB_FLIP_PREAMBLE(%u);\n",
              ops->n-1);
      break;

    case HSG_OP_TYPE_SLAB_HALF:
      fprintf(target->state->source,
              "HS_SLAB_HALF_PREAMBLE(%u);\n",
              ops->n / 2);
      break;

    case HSG_OP_TYPE_CMP_FLIP:
      fprintf(target->state->source,
              "HS_CMP_FLIP(%-3u,r%-3u,r%-3u);\n",ops->a,ops->b,ops->c);
      break;

    case HSG_OP_TYPE_CMP_HALF:
      fprintf(target->state->source,
              "HS_CMP_HALF(%-3u,r%-3u);\n",ops->a,ops->b);
      break;

    case HSG_OP_TYPE_CMP_XCHG:
      // ops->c == UINT32_MAX selects the plain rN names, otherwise
      // the rC_N form is used
      if (ops->c == UINT32_MAX)
        {
          fprintf(target->state->source,
                  "HS_CMP_XCHG(r%-3u,r%-3u);\n",
                  ops->a,ops->b);
        }
      else
        {
          fprintf(target->state->source,
                  "HS_CMP_XCHG(r%u_%u,r%u_%u);\n",
                  ops->c,ops->a,ops->c,ops->b);
        }
      break;

    case HSG_OP_TYPE_BS_REG_SHARED_STORE_V:
      fprintf(target->state->source,
              "HS_BX_LOCAL_V(%-3u * HS_SLAB_THREADS * %-3u) = r%u;\n",
              merge[ops->a].warps,ops->c,ops->b);
      break;

    case HSG_OP_TYPE_BS_REG_SHARED_LOAD_V:
      fprintf(target->state->source,
              "r%-3u = HS_BX_LOCAL_V(%-3u * HS_SLAB_THREADS * %-3u);\n",
              ops->b,merge[ops->a].warps,ops->c);
      break;

    case HSG_OP_TYPE_BC_REG_SHARED_LOAD_V:
      fprintf(target->state->source,
              "HS_KEY_TYPE r%-3u = HS_BX_LOCAL_V(%-3u * HS_SLAB_THREADS * %-3u);\n",
              ops->b,ops->a,ops->c);
      break;

    case HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT:
      fprintf(target->state->source,
              "HS_SLAB_LOCAL_L(%5u) = r%u_%u;\n",
              ops->b * config->warp.lanes,
              ops->c,
              ops->a);
      break;

    case HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT:
      fprintf(target->state->source,
              "HS_SLAB_LOCAL_R(%5u) = r%u_%u;\n",
              ops->b * config->warp.lanes,
              ops->c,
              ops->a);
      break;

    case HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT:
      fprintf(target->state->source,
              "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_L(%u);\n",
              ops->c,
              ops->a,
              ops->b * config->warp.lanes);
      break;

    case HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT:
      fprintf(target->state->source,
              "HS_KEY_TYPE r%u_%-3u = HS_SLAB_LOCAL_R(%u);\n",
              ops->c,
              ops->a,
              ops->b * config->warp.lanes);
      break;

    case HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT:
      fprintf(target->state->source,
              "HS_KEY_TYPE r%u_%-3u = HS_BC_GLOBAL_LOAD_L(%u);\n",
              ops->c,
              ops->a,
              ops->b);
      break;

    case HSG_OP_TYPE_BLOCK_SYNC:
      fprintf(target->state->source,
              "HS_BLOCK_BARRIER();\n");
      //
      // FIXME - Named barriers to allow coordinating warps to proceed?
      //
      break;

    case HSG_OP_TYPE_BS_FRAC_PRED:
      {
        if (ops->m == 0)
          {
            fprintf(target->state->source,
                    "if (warp_idx < bs_full)\n");
          }
        else
          {
            fprintf(target->state->source,
                    "else if (bs_frac == %u)\n",
                    ops->w);
          }
      }
      break;

    case HSG_OP_TYPE_BS_MERGE_H_PREAMBLE:
      {
        struct hsg_merge const * const m = merge + ops->a;

        fprintf(target->state->source,
                "HS_BS_MERGE_H_PREAMBLE(%u);\n",
                m->warps);
      }
      break;

    case HSG_OP_TYPE_BC_MERGE_H_PREAMBLE:
      {
        struct hsg_merge const * const m = merge + ops->a;

        fprintf(target->state->source,
                "HS_BC_MERGE_H_PREAMBLE(%u);\n",
                m->warps);
      }
      break;

    case HSG_OP_TYPE_BX_MERGE_H_PRED:
      fprintf(target->state->source,
              "if (HS_SUBGROUP_ID() < %u)\n",
              ops->a);
      break;

    case HSG_OP_TYPE_BS_ACTIVE_PRED:
      {
        struct hsg_merge const * const m = merge + ops->a;

        // up to 32 warps: one 32-bit mask word; otherwise both words
        // are printed as a single 64-bit literal, high word first
        if (m->warps <= 32)
          {
            fprintf(target->state->source,
                    "if (((1u << HS_SUBGROUP_ID()) & 0x%08X) != 0)\n",
                    m->levels[ops->b].active.b32a2[0]);
          }
        else
          {
            fprintf(target->state->source,
                    "if (((1UL << HS_SUBGROUP_ID()) & 0x%08X%08XL) != 0L)\n",
                    m->levels[ops->b].active.b32a2[1],
                    m->levels[ops->b].active.b32a2[0]);
          }
      }
      break;

    default:
      fprintf(stderr,"type not found: %s\n",hsg_op_type_string[ops->type]);
      exit(EXIT_FAILURE);
      break;
    }
}
//
//
//