/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

#pragma once

//
// TODO:
//
//   Add Key-Val sorting support -- easy.
//

#include <stdio.h>
#include <stdint.h>

//
// All code generation is driven by the specified architectural
// details and host platform API.
//
// In general, the warps-per-block and keys-per-thread are the
// critical knobs for tuning performance.
//
// NOTE(review): the exact meaning and units of the individual fields
// are not established by this header -- confirm against the generator
// before relying on the member names alone.
//

struct hsg_config
{
  // Merge kernel parameters: one {warps, lo, hi} triple for the "flip"
  // variant and one for the "half" variant, plus a log2 upper bound.
  struct {
    struct {
      uint32_t warps;
      uint32_t lo;
      uint32_t hi;
    } flip;

    struct {
      uint32_t warps;
      uint32_t lo;
      uint32_t hi;
    } half;

    uint32_t   max_log2;
  } merge;

  // Block-level parameters: warp count limits and shared memory sizing.
  struct {
    uint32_t   warps_min;
    uint32_t   warps_max;
    uint32_t   warps_mod;

    uint32_t   smem_min;
    uint32_t   smem_quantum;

    uint32_t   smem_bs;
    uint32_t   smem_bc;
  } block;

  // Warp-level parameters.
  struct {
    uint32_t   lanes;
    uint32_t   lanes_log2; // presumably log2(lanes) -- TODO confirm
    uint32_t   skpw_bs;
  } warp;

  // Thread-level parameters.
  struct {
    uint32_t   regs;
    uint32_t   xtra;
  } thread;

  // Key type parameters.
  struct {
    uint32_t   words;
  } type;
};

//
// HotSort can merge non-power-of-two blocks of warps
//
// Every array below has exactly two entries and `active` exposes the
// same 64 bits either as one uint64_t or as a pair of uint32_t.
//

struct hsg_level
{
  uint32_t    count; // networks >= 2

  uint32_t    diffs        [2];
  uint32_t    diff_masks   [2];
  uint32_t    evenodds     [2];
  uint32_t    evenodd_masks[2];
  uint32_t    networks     [2];

  union {
    uint64_t  b64;
    uint32_t  b32a2[2];
  } active;
};

//
// Upper bounds used to size the tables in struct hsg_merge.
//

#define MERGE_LEVELS_MAX_LOG2  7                            // merge up to 128 warps
#define MERGE_LEVELS_MAX_SIZE  (1 << MERGE_LEVELS_MAX_LOG2)

//
// This is computed
//
// `offsets` and `networks` hold MERGE_LEVELS_MAX_SIZE entries each and
// `levels` holds MERGE_LEVELS_MAX_LOG2 entries.
//

struct hsg_merge
{
  uint32_t         offsets [MERGE_LEVELS_MAX_SIZE];
  uint32_t         networks[MERGE_LEVELS_MAX_SIZE];

  struct hsg_level levels[MERGE_LEVELS_MAX_LOG2];

  uint32_t         index;
  uint32_t         warps;

  uint32_t         rows_bs;
  uint32_t         rows_bc;

  uint32_t         skpw_bc;
};

//
// Disabled: output file bookkeeping. Kept under `#if 0` exactly as in
// the original header.
//

#if 0

#define HSG_FILE_NAME_SIZE  80

struct hsg_file
{
  FILE       * file;
  char const * prefix;
  char         name[HSG_FILE_NAME_SIZE];
};

//
//
//

typedef enum hsg_file_type
  {
    HSG_FILE_TYPE_HEADER,
    HSG_FILE_TYPE_SOURCE,

    HSG_FILE_TYPE_COUNT
  } hsg_file_type;

#endif

//
// X-macro listing every op type in enumeration order.
//
// Define HSG_OP_EXPAND_X(t) before expanding HSG_OP_EXPAND_ALL(); it is
// applied once per op type. HSG_OP_TYPE_COUNT must remain the final
// entry so that, in the enum generated below, its value equals the
// number of op types that precede it.
//
// Do not place `//` comments inside the macro body: every line but the
// last must end with the line-continuation backslash.
//

#define HSG_OP_EXPAND_ALL()                                     \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_EXIT)                             \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_END)                              \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BEGIN)                            \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_ELSE)                             \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_TARGET_BEGIN)                     \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_TARGET_END)                       \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO)           \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE)        \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY)            \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_KERNEL_PROTO)                  \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_KERNEL_PREAMBLE)               \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BC_KERNEL_PROTO)                  \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BC_KERNEL_PREAMBLE)               \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_KERNEL_PROTO)                  \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_KERNEL_PREAMBLE)               \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_KERNEL_PROTO)                  \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_KERNEL_PREAMBLE)               \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BX_REG_GLOBAL_LOAD)               \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BX_REG_GLOBAL_STORE)              \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT)          \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT)         \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT)         \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT)        \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_MERGE_RIGHT_PRED)              \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_REG_GLOBAL_LOAD)               \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_REG_GLOBAL_STORE)              \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_SLAB_FLIP)                        \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_SLAB_HALF)                        \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_CMP_FLIP)                         \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_CMP_HALF)                         \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_CMP_XCHG)                         \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_REG_SHARED_STORE_V)            \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_REG_SHARED_LOAD_V)             \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BC_REG_SHARED_LOAD_V)             \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT)         \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT)        \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT)          \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT)         \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT)          \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BLOCK_SYNC)                       \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_FRAC_PRED)                     \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_MERGE_H_PREAMBLE)              \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BC_MERGE_H_PREAMBLE)              \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BX_MERGE_H_PRED)                  \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_ACTIVE_PRED)                   \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_COUNT)

//
// Expand the list into enumerators: each entry becomes `NAME ,`, so
// HSG_OP_TYPE_EXIT is 0 and values increase in list order.
//
// HSG_OP_EXPAND_X is left defined after this point; any later expansion
// that needs a different mapping must #undef it first.
//

#undef  HSG_OP_EXPAND_X
#define HSG_OP_EXPAND_X(t)  t ,

typedef enum hsg_op_type
  {
    HSG_OP_EXPAND_ALL()
  } hsg_op_type;

//
// A single generated op: a type tag plus up to three int32_t operands.
//
// The three anonymous structs share storage through the anonymous
// union, so `n` and `m` alias `a`, and `v` and `w` alias `b`. Anonymous
// structs/unions require C11 or an equivalent compiler extension.
//

struct hsg_op
{
  hsg_op_type  type;

  union {

    struct {
      int32_t  a;
      int32_t  b;
      int32_t  c;
    };

    struct {
      int32_t  n;
      int32_t  v;
    };

    struct {
      int32_t  m;
      int32_t  w;
    };

  };
};

//
// Table of op type strings, defined in another translation unit.
//
// NOTE(review): presumably indexed by hsg_op_type and generated from
// HSG_OP_EXPAND_ALL() -- confirm against the definition.
//

extern char const * const hsg_op_type_string[];

//
// Per-target context handed to every target function. `state` points
// to a type that is only forward-declared here.
//

struct hsg_target
{
  char const              * define;
  struct hsg_target_state * state;
};

//
// All targets share this prototype
//

typedef void (*hsg_target_pfn)(struct hsg_target       * const target,
                               struct hsg_config const * const config,
                               struct hsg_merge  const * const merge,
                               struct hsg_op     const * const ops,
                               uint32_t                  const depth);

//
// Target entry points. Each declaration matches hsg_target_pfn and is
// defined in another translation unit.
//

extern void hsg_target_debug(struct hsg_target       * const target,
                             struct hsg_config const * const config,
                             struct hsg_merge  const * const merge,
                             struct hsg_op     const * const ops,
                             uint32_t                  const depth);

extern void hsg_target_cuda(struct hsg_target       * const target,
                            struct hsg_config const * const config,
                            struct hsg_merge  const * const merge,
                            struct hsg_op     const * const ops,
                            uint32_t                  const depth);

extern void hsg_target_opencl(struct hsg_target       * const target,
                              struct hsg_config const * const config,
                              struct hsg_merge  const * const merge,
                              struct hsg_op     const * const ops,
                              uint32_t                  const depth);

extern void hsg_target_glsl(struct hsg_target       * const target,
                            struct hsg_config const * const config,
                            struct hsg_merge  const * const merge,
                            struct hsg_op     const * const ops,
                            uint32_t                  const depth);

//
//
//