/*
* Copyright 2016 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can
* be found in the LICENSE file.
*
*/
#pragma once
//
// TODO:
//
// Add Key-Val sorting support -- easy.
//
#include <stdio.h>
#include <stdint.h>
//
// All code generation is driven by the specified architectural
// details and host platform API.
//
// In general, the warps-per-block and keys-per-thread are the
// critical knobs for tuning performance.
//
//
// Code-generation configuration, grouped into merge, block, warp,
// thread and type sub-structures.  Every field is a uint32_t.
//
// NOTE(review): this header only declares the fields.  The per-field
// notes below are inferred from the field names and must be confirmed
// against the generator that consumes this structure.
//
struct hsg_config
{
// merge settings: one {warps, lo, hi} triple each for "flip" and
// "half", plus a log2 upper bound
struct {
struct {
uint32_t warps;
uint32_t lo;
uint32_t hi;
} flip;
struct {
uint32_t warps;
uint32_t lo;
uint32_t hi;
} half;
uint32_t max_log2;
} merge;
// block settings: warp-count limits (min / max / mod) and four
// "smem" values -- presumably shared-memory sizing; TODO confirm
struct {
uint32_t warps_min;
uint32_t warps_max;
uint32_t warps_mod;
uint32_t smem_min;
uint32_t smem_quantum;
uint32_t smem_bs;
uint32_t smem_bc;
} block;
// warp settings -- presumably lanes == (1 << lanes_log2), but nothing
// in this header enforces that; TODO confirm
struct {
uint32_t lanes;
uint32_t lanes_log2;
uint32_t skpw_bs;
} warp;
// per-thread settings -- presumably register counts; TODO confirm
struct {
uint32_t regs;
uint32_t xtra;
} thread;
// key type settings -- presumably the key size in words; TODO confirm
struct {
uint32_t words;
} type;
};
//
// HotSort can merge non-power-of-two blocks of warps
//
//
// Per-level merge description.  Each array member holds exactly two
// entries; struct hsg_merge stores one hsg_level per merge level.
//
// NOTE(review): the meaning of the two slots in each array is not
// visible in this header -- confirm against the code that fills them.
//
struct hsg_level
{
uint32_t count; // networks >= 2
uint32_t diffs [2];
uint32_t diff_masks [2];
uint32_t evenodds [2];
uint32_t evenodd_masks[2];
uint32_t networks [2];
// 'active' is one 64-bit value that can also be accessed as two
// 32-bit words.  Both views share the same storage, so which element
// of b32a2 holds the low half of b64 depends on host endianness.
union {
uint64_t b64;
uint32_t b32a2[2];
} active;
};
//
//
//
//
// Compile-time bounds that size the fixed arrays in struct hsg_merge:
// MERGE_LEVELS_MAX_LOG2 entries for 'levels' and MERGE_LEVELS_MAX_SIZE
// entries for 'offsets' and 'networks'.
//
#define MERGE_LEVELS_MAX_LOG2 7 // merge up to 128 warps
#define MERGE_LEVELS_MAX_SIZE (1 << MERGE_LEVELS_MAX_LOG2) // 1 << 7 == 128
//
// This is computed
//
//
// Merge state with fixed-capacity storage: 'offsets' and 'networks'
// hold MERGE_LEVELS_MAX_SIZE entries each and 'levels' holds
// MERGE_LEVELS_MAX_LOG2 entries.
//
// NOTE(review): how many entries are actually in use, and what the
// scalar fields mean, is not visible in this header -- confirm against
// the code that computes this structure.
//
struct hsg_merge
{
uint32_t offsets [MERGE_LEVELS_MAX_SIZE];
uint32_t networks[MERGE_LEVELS_MAX_SIZE];
struct hsg_level levels[MERGE_LEVELS_MAX_LOG2];
uint32_t index;
uint32_t warps;
uint32_t rows_bs;
uint32_t rows_bc;
uint32_t skpw_bc;
};
//
//
//
//
// DISABLED: everything between '#if 0' and '#endif' is compiled out,
// so HSG_FILE_NAME_SIZE, struct hsg_file and hsg_file_type do not exist
// for includers of this header.  Within this header, this block is the
// only user of FILE from <stdio.h>.
//
#if 0
#define HSG_FILE_NAME_SIZE 80
// an open FILE plus a prefix string and a fixed-size name buffer
struct hsg_file
{
FILE * file;
char const * prefix;
char name[HSG_FILE_NAME_SIZE];
};
//
// kinds of file; HSG_FILE_TYPE_COUNT is last and so equals the number
// of kinds listed before it
//
typedef enum hsg_file_type {
HSG_FILE_TYPE_HEADER,
HSG_FILE_TYPE_SOURCE,
HSG_FILE_TYPE_COUNT
} hsg_file_type;
#endif
//
//
//
//
// X-macro list of every op type.
//
// HSG_OP_EXPAND_ALL() expands to one HSG_OP_EXPAND_X(name) invocation
// per op, in the order listed.  The includer must define
// HSG_OP_EXPAND_X before expanding the list; this header does so below
// to build 'enum hsg_op_type', which means the order of the entries
// fixes the enumerator values.  HSG_OP_TYPE_EXIT is first and
// HSG_OP_TYPE_COUNT is last.
//
// Do not put '//' comments inside the macro body: every line is joined
// to the next by its trailing backslash before comments are removed,
// so a '//' comment would swallow the rest of the list.
//
#define HSG_OP_EXPAND_ALL() \
HSG_OP_EXPAND_X(HSG_OP_TYPE_EXIT) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_END) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_BEGIN) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_ELSE) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_TARGET_BEGIN) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_TARGET_END) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_KERNEL_PROTO) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_KERNEL_PREAMBLE) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_BC_KERNEL_PROTO) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_BC_KERNEL_PREAMBLE) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_KERNEL_PROTO) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_KERNEL_PREAMBLE) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_KERNEL_PROTO) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_KERNEL_PREAMBLE) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_BX_REG_GLOBAL_LOAD) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_BX_REG_GLOBAL_STORE) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_MERGE_RIGHT_PRED) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_REG_GLOBAL_LOAD) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_REG_GLOBAL_STORE) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_SLAB_FLIP) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_SLAB_HALF) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_CMP_FLIP) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_CMP_HALF) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_CMP_XCHG) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_REG_SHARED_STORE_V) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_REG_SHARED_LOAD_V) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_BC_REG_SHARED_LOAD_V) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_BLOCK_SYNC) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_FRAC_PRED) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_MERGE_H_PREAMBLE) \
HSG_OP_EXPAND_X(HSG_OP_TYPE_BC_MERGE_H_PREAMBLE) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_BX_MERGE_H_PRED) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_ACTIVE_PRED) \
\
HSG_OP_EXPAND_X(HSG_OP_TYPE_COUNT)
//
//
//
//
// Build 'enum hsg_op_type' from the op list: HSG_OP_EXPAND_X(t) is
// (re)defined to emit "t ," so HSG_OP_EXPAND_ALL() expands to a
// comma-separated enumerator list in list order.  HSG_OP_TYPE_EXIT is
// therefore 0 and HSG_OP_TYPE_COUNT equals the number of op types
// listed before it.
//
// The '#undef' discards any earlier definition of HSG_OP_EXPAND_X.
// The definition made here is NOT undefined afterwards in this header,
// so includers that want a different expansion must '#undef' it first.
//
#undef HSG_OP_EXPAND_X
#define HSG_OP_EXPAND_X(t) t ,
typedef enum hsg_op_type {
HSG_OP_EXPAND_ALL()
} hsg_op_type;
//
//
//
//
// One op: a type tag followed by up to three int32_t operands.
//
// The operands live in an anonymous union of three anonymous structs
// (a C11 feature), so the members are accessed directly as op.a, op.n,
// op.m, etc.  The structs overlay each other: {a, n, m} name the first
// operand slot, {b, v, w} name the second, and c names the third.
//
// NOTE(review): which set of names applies to which op type is not
// visible in this header -- confirm against the code that builds and
// consumes ops.
//
struct hsg_op
{
hsg_op_type type;
union {
// three-operand view
struct {
int32_t a;
int32_t b;
int32_t c;
};
// two-operand view: n aliases a, v aliases b
struct {
int32_t n;
int32_t v;
};
// two-operand view: m aliases a, w aliases b
struct {
int32_t m;
int32_t w;
};
};
};
//
//
//
//
// Table of constant strings, declared here with unspecified length and
// defined in another translation unit.
//
// NOTE(review): presumably indexed by hsg_op_type and generated by
// expanding HSG_OP_EXPAND_ALL() with a stringifying HSG_OP_EXPAND_X --
// confirm against the definition.
//
extern char const * const hsg_op_type_string[];
//
//
//
//
// Handle passed to every target function: a string plus a pointer to
// target-specific state.  'struct hsg_target_state' is only named here,
// never defined, so it is an opaque type from this header's point of
// view.
//
// NOTE(review): the purpose of 'define' (presumably a preprocessor
// symbol or include-guard name used in generated output) and the
// ownership of 'state' are not visible here -- confirm against the
// target implementations.
//
struct hsg_target
{
char const * define;
struct hsg_target_state * state;
};
//
// All targets share this prototype
//
//
// Pointer-to-function type for a target: takes a mutable target handle,
// read-only config, merge and op pointers, and a uint32_t depth, and
// returns nothing.  The hsg_target_* functions declared in this header
// all have this signature.
//
// NOTE(review): whether 'ops' points to a single op or the start of an
// op sequence, and what 'depth' counts, is not visible here -- confirm
// against the target implementations.
//
typedef
void
(*hsg_target_pfn)(struct hsg_target * const target,
struct hsg_config const * const config,
struct hsg_merge const * const merge,
struct hsg_op const * const ops,
uint32_t const depth);
//
//
//
//
// "debug" target, defined in another translation unit.  Its signature
// matches hsg_target_pfn.
//
// NOTE(review): behavior is not visible in this header -- see the
// definition.
//
extern
void
hsg_target_debug(struct hsg_target * const target,
struct hsg_config const * const config,
struct hsg_merge const * const merge,
struct hsg_op const * const ops,
uint32_t const depth);
//
// "cuda" target, defined in another translation unit.  Its signature
// matches hsg_target_pfn.
//
// NOTE(review): behavior is not visible in this header -- see the
// definition.
//
extern
void
hsg_target_cuda(struct hsg_target * const target,
struct hsg_config const * const config,
struct hsg_merge const * const merge,
struct hsg_op const * const ops,
uint32_t const depth);
//
// "opencl" target, defined in another translation unit.  Its signature
// matches hsg_target_pfn.
//
// NOTE(review): behavior is not visible in this header -- see the
// definition.
//
extern
void
hsg_target_opencl(struct hsg_target * const target,
struct hsg_config const * const config,
struct hsg_merge const * const merge,
struct hsg_op const * const ops,
uint32_t const depth);
//
// "glsl" target, defined in another translation unit.  Its signature
// matches hsg_target_pfn.
//
// NOTE(review): behavior is not visible in this header -- see the
// definition.
//
extern
void
hsg_target_glsl(struct hsg_target * const target,
struct hsg_config const * const config,
struct hsg_merge const * const merge,
struct hsg_op const * const ops,
uint32_t const depth);
//
//
//