/*
* Copyright 2018 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*
*/
//
//
//
#include "transpose.h"
#include "common/macros.h"
//
// Rows must be an even number. This is enforced elsewhere.
//
// The transpose requires (cols_log2 * rows/2) row-pair blends.
//
void
hsg_transpose(uint32_t const cols_log2,
uint32_t const rows,
void (*pfn_blend)(uint32_t const cols_log2,
uint32_t const row_ll, // lower-left
uint32_t const row_ur, // upper-right
void * blend),
void * blend,
void (*pfn_remap)(uint32_t const row_from,
uint32_t const row_to,
void * remap),
void * remap)
{
// get mapping array
uint32_t * map_curr = ALLOCA_MACRO(rows * sizeof(*map_curr));
uint32_t * map_next = ALLOCA_MACRO(rows * sizeof(*map_next));
// init the mapping array
for (uint32_t ii=0; ii<rows; ii++)
map_curr[ii] = ii;
// successively transpose rows using blends
for (uint32_t cc=1; cc<=cols_log2; cc++)
{
uint32_t const mask = BITS_TO_MASK(cc);
for (uint32_t ii=0; ii<rows; ii++)
{
uint32_t const left = map_curr[ii];
uint32_t const stay = left & ~mask;
if (left != stay) // will be swapped away
{
for (uint32_t jj=0; jj<rows; jj++)
{
if (map_curr[jj] == stay)
{
map_next[jj] = stay;
map_next[ii] = stay + (rows << (cc-1));
pfn_blend(cc,ii,jj,blend); // log2,left,right,payload
break;
}
}
}
}
uint32_t * tmp = map_curr;
map_curr = map_next;
map_next = tmp;
}
// write out the remapping
for (uint32_t ii=0; ii<rows; ii++)
pfn_remap(ii,map_curr[ii] >> cols_log2,remap);
}
//
// test it!
//
#ifdef HS_TRANSPOSE_DEBUG
#include <stdio.h>
static uint32_t cols; // implicit on SIMD/GPU
static
void
hsg_debug_blend(uint32_t const cols_log2,
uint32_t const row_ll, // lower-left
uint32_t const row_ur, // upper-right
uint32_t * b)
{
fprintf(stdout,"BLEND( %u, %3u, %3u )\n",cols_log2,row_ll,row_ur);
uint32_t * const ll = ALLOCA_MACRO(cols * sizeof(*b));
uint32_t * const ur = ALLOCA_MACRO(cols * sizeof(*b));
memcpy(ll,b+row_ll*cols,cols * sizeof(*b));
memcpy(ur,b+row_ur*cols,cols * sizeof(*b));
for (uint32_t ii=0; ii<cols; ii++)
b[row_ll*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii] : ur[ii^(1<<cols_log2-1)];
for (uint32_t ii=0; ii<cols; ii++)
b[row_ur*cols+ii] = ((ii >> cols_log2-1) & 1) ? ll[ii^(1<<cols_log2-1)] : ur[ii];
}
static
void
hsg_debug_remap(uint32_t const row_from,
uint32_t const row_to,
uint32_t * const r)
{
fprintf(stdout,"REMAP( %3u, %3u )\n",row_from,row_to);
r[row_to] = row_from;
}
static
void
hsg_debug_print(uint32_t const rows,
uint32_t const * const m,
uint32_t const * const r)
{
for (uint32_t rr=0; rr<rows; rr++) {
for (uint32_t cc=0; cc<cols; cc++)
fprintf(stdout,"%4u ",m[r[rr]*cols + cc]);
fprintf(stdout,"\n");
}
}
int
main(int argc, char * argv[])
{
uint32_t const cols_log2 = (argc <= 1) ? 3 : strtoul(argv[1],NULL,0);
uint32_t const rows = (argc <= 2) ? 6 : strtoul(argv[2],NULL,0);
if (rows & 1)
return;
cols = 1 << cols_log2;
uint32_t * const b = ALLOCA_MACRO(cols * rows * sizeof(*b));
uint32_t * const r = ALLOCA_MACRO( rows * sizeof(*r));
for (uint32_t rr=0; rr<rows; rr++) {
r[rr] = rr;
for (uint32_t cc=0; cc<cols; cc++)
b[rr*cols+cc] = cc*rows+rr;
}
hsg_debug_print(rows,b,r);
hsg_transpose(cols_log2,rows,
hsg_debug_blend,b,
hsg_debug_remap,r);
hsg_debug_print(rows,b,r);
return EXIT_SUCCESS;
}
#endif
//
//
//