/*
 * Copyright (C) 2008 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ---- includes ----------------------------------------------------------- */

#include "b_TensorEm/CompactMat.h"
#include "b_TensorEm/Functions.h"
#include "b_BasicEm/Math.h"
#include "b_BasicEm/Functions.h"
#include "b_BasicEm/Memory.h"

/* ------------------------------------------------------------------------- */

/* ========================================================================= */
/*                                                                           */
/* ---- \ghd{ auxiliary functions } ---------------------------------------- */
/*                                                                           */
/* ========================================================================= */

/* ------------------------------------------------------------------------- */

/** Returns the dot product of inVecA with one compressed matrix row.
 *
 *  The row is read from ptrA->cpsArrE at word offset ptrA->wordsPerRowE * rowA.
 *  It starts with a five-word header (input offset, element count, factor
 *  mantissa, factor exponent, row norm bits) followed by the row values packed
 *  with ptrA->bitsPerValueE bits per value.
 *
 *  The result is a packed floating point expression:
 *		upper 16 bit: signed value (mantissa)
 *		lower 16 bit: signed exponent
 *
 *  @param cpA          context pointer; not referenced in this function body
 *  @param ptrA         compact matrix providing the packed rows
 *  @param inVecA       input vector; elements [offs, offs + size) are read,
 *                      where offs and size come from the row header
 *  @param inNormBitsA  bit count associated with the input vector; it is only
 *                      used together with the row's norm bits to decide how
 *                      many bits each partial product must be shifted down
 *  @param rowA         index of the row to process
 *  @return packed mantissa/exponent pair as described above
 */
int32 bts_CompactMat_fltDotPrdRow( struct bbs_Context* cpA, 
								   struct bts_CompactMat* ptrA, 
							       const int16* inVecA,
							       uint32 inNormBitsA,
							       uint32 rowA )
{
	const int16* rowPtrL = ptrA->cpsArrE.arrPtrE + ptrA->wordsPerRowE * rowA;

	/* extract row-header info */
	uint32 offsL = *rowPtrL++;        /* index of first input element used by this row */
	uint32 sizeL = *rowPtrL++;        /* number of packed values in this row */
	int32 factorManL = *rowPtrL++;    /* mantissa of the row scale factor */
	int32 factorExpL = *rowPtrL++;    /* exponent of the row scale factor */
	uint32 rowNormBitsL = *rowPtrL++; /* norm bits of the row, combined with inNormBitsA below */

	/* consider possible overflow */
	/* number of bits by which inNormBitsA + rowNormBitsL exceeds 31; 0 if it does not */
	uint16 overflowBitsL = ( inNormBitsA + rowNormBitsL >= 31 ) ? inNormBitsA + rowNormBitsL - 31 : 0;

	const int16* inPtrL = inVecA + offsL;

	count_t iL;
	int32 sumL = 0;

	if( overflowBitsL == 0 ) /* raw dot product fits in int32 */
	{
		switch( ptrA->bitsPerValueE )
		{
			case 16:
			{
				/* one value per word: plain multiply-accumulate */
				for( iL = sizeL; iL > 0; iL-- ) sumL += ( ( int32 )*rowPtrL++ * ( int32 )*inPtrL++ );
			}
			break;

			#ifndef HW_TMS320C5x /* platforms that don't have int8 must use the 'default' implementation */

			case 8:
			{
				/* two values per word, low byte first; main loop is unrolled to 8 values (4 words) */
				const uint16* dpL = ( uint16* )rowPtrL;
				for( iL = sizeL; iL >= 8; iL -= 8 )
				{
					sumL += ( ( int8 )  dpL[ 0 ]         * ( int32 )inPtrL[ 0 ] );
					sumL += ( ( int8 )( dpL[ 0 ] >>  8 ) * ( int32 )inPtrL[ 1 ] );
					sumL += ( ( int8 )  dpL[ 1 ]         * ( int32 )inPtrL[ 2 ] );
					sumL += ( ( int8 )( dpL[ 1 ] >>  8 ) * ( int32 )inPtrL[ 3 ] );
					sumL += ( ( int8 )  dpL[ 2 ]         * ( int32 )inPtrL[ 4 ] );
					sumL += ( ( int8 )( dpL[ 2 ] >>  8 ) * ( int32 )inPtrL[ 5 ] );
					sumL += ( ( int8 )  dpL[ 3 ]         * ( int32 )inPtrL[ 6 ] );
					sumL += ( ( int8 )( dpL[ 3 ] >>  8 ) * ( int32 )inPtrL[ 7 ] );
					dpL += 4;
					inPtrL += 8;
				}
				/* remaining full words (2 values each) */
				for( ; iL >= 2; iL -= 2 )
				{
					sumL += ( ( int8 )  *dpL         * ( int32 )inPtrL[ 0 ] );
					sumL += ( ( int8 )( *dpL >>  8 ) * ( int32 )inPtrL[ 1 ] );
					dpL++;
					inPtrL += 2;
				}
				/* odd element count: last value sits in the low byte of the final word */
				if( iL > 0 )
				{
					sumL += ( ( int8 )*dpL++ * ( int32 )inPtrL[ 0 ] );
				}
			}
			break;

			case 6:
			{
				/* 8 values per 3 words. Each 6-bit value is moved into the upper
				 * 6 bits of an int8 (mask 0xFC) so that the cast sign-extends it;
				 * the products are therefore scaled by 4, which the final >> 2 removes.
				 */
				const uint16* dpL = ( uint16* )rowPtrL;
				for( iL = sizeL; iL >= 8; iL -= 8 )
				{
					int32 lSumL = 0;
					lSumL += ( ( int8 )     ( dpL[ 0 ] <<  2 )                                  * ( int32 )inPtrL[ 0 ] );
					lSumL += ( ( int8 ) (   ( dpL[ 0 ] >>  4 )                       & 0x00FC ) * ( int32 )inPtrL[ 1 ] );
					lSumL += ( ( int8 ) ( ( ( dpL[ 0 ] >> 10 ) | ( dpL[ 1 ] << 6 ) ) & 0x00FC ) * ( int32 )inPtrL[ 2 ] );
					lSumL += ( ( int8 ) (   ( dpL[ 1 ]       )                       & 0x00FC ) * ( int32 )inPtrL[ 3 ] );
					lSumL += ( ( int8 ) (   ( dpL[ 1 ] >>  6 )                       & 0x00FC ) * ( int32 )inPtrL[ 4 ] );
					lSumL += ( ( int8 ) ( ( ( dpL[ 1 ] >> 12 ) | ( dpL[ 2 ] << 4 ) ) & 0x00FC ) * ( int32 )inPtrL[ 5 ] );
					lSumL += ( ( int8 ) (   ( dpL[ 2 ] >>  2 )                       & 0x00FC ) * ( int32 )inPtrL[ 6 ] );
					lSumL += ( ( int8 ) (   ( dpL[ 2 ] >>  8 )                       & 0x00FC ) * ( int32 )inPtrL[ 7 ] );
					sumL += ( lSumL >> 2 );
					dpL += 3;
					inPtrL += 8;
				}

				/* tail: up to 7 remaining values, same unpacking as above */
				{
					int32 lSumL = 0;
					if( iL > 0 ) lSumL += ( ( int8 )     ( dpL[ 0 ] <<  2 )                                  * ( int32 )inPtrL[ 0 ] );
					if( iL > 1 ) lSumL += ( ( int8 ) (   ( dpL[ 0 ] >>  4 )                       & 0x00FC ) * ( int32 )inPtrL[ 1 ] );
					if( iL > 2 ) lSumL += ( ( int8 ) ( ( ( dpL[ 0 ] >> 10 ) | ( dpL[ 1 ] << 6 ) ) & 0x00FC ) * ( int32 )inPtrL[ 2 ] );
					if( iL > 3 ) lSumL += ( ( int8 ) (   ( dpL[ 1 ]       )                       & 0x00FC ) * ( int32 )inPtrL[ 3 ] );
					if( iL > 4 ) lSumL += ( ( int8 ) (   ( dpL[ 1 ] >>  6 )                       & 0x00FC ) * ( int32 )inPtrL[ 4 ] );
					if( iL > 5 ) lSumL += ( ( int8 ) ( ( ( dpL[ 1 ] >> 12 ) | ( dpL[ 2 ] << 4 ) ) & 0x00FC ) * ( int32 )inPtrL[ 5 ] );
					if( iL > 6 ) lSumL += ( ( int8 ) (   ( dpL[ 2 ] >>  2 )                       & 0x00FC ) * ( int32 )inPtrL[ 6 ] );
					sumL += ( lSumL >> 2 );
				}
			}
			break;

			case 5: 
			{
				/* 16 values per 5 words. Each 5-bit value is moved into the upper
				 * 5 bits of an int8 (mask 0xF8); the resulting scale of 8 is removed
				 * by the final >> 3.
				 */
				const uint16* dpL = ( uint16* )rowPtrL;
				for( iL = sizeL; iL >= 16; iL -= 16 )
				{
					int32 lSumL = 0;
					lSumL += ( ( int8 )     ( dpL[ 0 ] <<  3 )                                  * ( int32 )inPtrL[  0 ] );
					lSumL += ( ( int8 ) (   ( dpL[ 0 ] >>  2 )                       & 0x00F8 ) * ( int32 )inPtrL[  1 ] );
					lSumL += ( ( int8 ) (   ( dpL[ 0 ] >>  7 )                       & 0x00F8 ) * ( int32 )inPtrL[  2 ] );
					lSumL += ( ( int8 ) ( ( ( dpL[ 0 ] >> 12 ) | ( dpL[ 1 ] << 4 ) ) & 0x00F8 ) * ( int32 )inPtrL[  3 ] );
					lSumL += ( ( int8 ) (   ( dpL[ 1 ] >>  1 )                       & 0x00F8 ) * ( int32 )inPtrL[  4 ] );
					lSumL += ( ( int8 ) (   ( dpL[ 1 ] >>  6 )                       & 0x00F8 ) * ( int32 )inPtrL[  5 ] );
					lSumL += ( ( int8 ) ( ( ( dpL[ 1 ] >> 11 ) | ( dpL[ 2 ] << 5 ) ) & 0x00F8 ) * ( int32 )inPtrL[  6 ] );
					lSumL += ( ( int8 ) (   ( dpL[ 2 ]       )                       & 0x00F8 ) * ( int32 )inPtrL[  7 ] );
					lSumL += ( ( int8 ) (   ( dpL[ 2 ] >>  5 )                       & 0x00F8 ) * ( int32 )inPtrL[  8 ] );
					lSumL += ( ( int8 ) ( ( ( dpL[ 2 ] >> 10 ) | ( dpL[ 3 ] << 6 ) ) & 0x00F8 ) * ( int32 )inPtrL[  9 ] );
					lSumL += ( ( int8 ) (   ( dpL[ 3 ] <<  1 )                       & 0x00F8 ) * ( int32 )inPtrL[ 10 ] );
					lSumL += ( ( int8 ) (   ( dpL[ 3 ] >>  4 )                       & 0x00F8 ) * ( int32 )inPtrL[ 11 ] );
					lSumL += ( ( int8 ) ( ( ( dpL[ 3 ] >>  9 ) | ( dpL[ 4 ] << 7 ) ) & 0x00F8 ) * ( int32 )inPtrL[ 12 ] );
					lSumL += ( ( int8 ) (   ( dpL[ 4 ] <<  2 )                       & 0x00F8 ) * ( int32 )inPtrL[ 13 ] );
					lSumL += ( ( int8 ) (   ( dpL[ 4 ] >>  3 )                       & 0x00F8 ) * ( int32 )inPtrL[ 14 ] );
					lSumL += ( ( int8 ) (   ( dpL[ 4 ] >>  8 )                       & 0x00F8 ) * ( int32 )inPtrL[ 15 ] );
					sumL += ( lSumL >> 3 );
					dpL += 5;
					inPtrL += 16;
				}

				/* tail: up to 15 remaining values, same unpacking as above */
				{
					int32 lSumL = 0;
					if( iL >  0 ) lSumL += ( ( int8 )     ( dpL[ 0 ] <<  3 )                                  * ( int32 )inPtrL[  0 ] );
					if( iL >  1 ) lSumL += ( ( int8 ) (   ( dpL[ 0 ] >>  2 )                       & 0x00F8 ) * ( int32 )inPtrL[  1 ] );
					if( iL >  2 ) lSumL += ( ( int8 ) (   ( dpL[ 0 ] >>  7 )                       & 0x00F8 ) * ( int32 )inPtrL[  2 ] );
					if( iL >  3 ) lSumL += ( ( int8 ) ( ( ( dpL[ 0 ] >> 12 ) | ( dpL[ 1 ] << 4 ) ) & 0x00F8 ) * ( int32 )inPtrL[  3 ] );
					if( iL >  4 ) lSumL += ( ( int8 ) (   ( dpL[ 1 ] >>  1 )                       & 0x00F8 ) * ( int32 )inPtrL[  4 ] );
					if( iL >  5 ) lSumL += ( ( int8 ) (   ( dpL[ 1 ] >>  6 )                       & 0x00F8 ) * ( int32 )inPtrL[  5 ] );
					if( iL >  6 ) lSumL += ( ( int8 ) ( ( ( dpL[ 1 ] >> 11 ) | ( dpL[ 2 ] << 5 ) ) & 0x00F8 ) * ( int32 )inPtrL[  6 ] );
					if( iL >  7 ) lSumL += ( ( int8 ) (   ( dpL[ 2 ]       )                       & 0x00F8 ) * ( int32 )inPtrL[  7 ] );
					if( iL >  8 ) lSumL += ( ( int8 ) (   ( dpL[ 2 ] >>  5 )                       & 0x00F8 ) * ( int32 )inPtrL[  8 ] );
					if( iL >  9 ) lSumL += ( ( int8 ) ( ( ( dpL[ 2 ] >> 10 ) | ( dpL[ 3 ] << 6 ) ) & 0x00F8 ) * ( int32 )inPtrL[  9 ] );
					if( iL > 10 ) lSumL += ( ( int8 ) (   ( dpL[ 3 ] <<  1 )                       & 0x00F8 ) * ( int32 )inPtrL[ 10 ] );
					if( iL > 11 ) lSumL += ( ( int8 ) (   ( dpL[ 3 ] >>  4 )                       & 0x00F8 ) * ( int32 )inPtrL[ 11 ] );
					if( iL > 12 ) lSumL += ( ( int8 ) ( ( ( dpL[ 3 ] >>  9 ) | ( dpL[ 4 ] << 7 ) ) & 0x00F8 ) * ( int32 )inPtrL[ 12 ] );
					if( iL > 13 ) lSumL += ( ( int8 ) (   ( dpL[ 4 ] <<  2 )                       & 0x00F8 ) * ( int32 )inPtrL[ 13 ] );
					if( iL > 14 ) lSumL += ( ( int8 ) (   ( dpL[ 4 ] >>  3 )                       & 0x00F8 ) * ( int32 )inPtrL[ 14 ] );
					sumL += ( lSumL >> 3 );
				}
			}
			break;

			case 4: 
			{
				/* 4 values per word. Each nibble is moved into the upper 4 bits of
				 * an int8 (mask 0xF0); the resulting scale of 16 is removed by >> 4.
				 */
				for( iL = sizeL; iL >= 4; iL -= 4 )
				{
					uint16 v1L = *rowPtrL++;
					int32 lSumL = 0;
					lSumL += ( ( int8 )( ( v1L << 4 )        ) * ( int32 )inPtrL[ 0 ] );
					lSumL += ( ( int8 )( ( v1L      ) & 0xF0 ) * ( int32 )inPtrL[ 1 ] );
					lSumL += ( ( int8 )( ( v1L >> 4 ) & 0xF0 ) * ( int32 )inPtrL[ 2 ] );
					lSumL += ( ( int8 )( ( v1L >> 8 ) & 0xF0 ) * ( int32 )inPtrL[ 3 ] );
					inPtrL += 4;
					sumL += ( lSumL >> 4 );
				}
				/* tail: up to 3 remaining values.
				 * NOTE(review): the next word is read unconditionally, even when no
				 * values remain; the value is then unused because all guards fail.
				 */
				{
					uint16 v1L = *rowPtrL++;
					int32 lSumL = 0;
					if( iL-- > 0 ) lSumL += ( ( int8 )( ( v1L << 4 )        ) * ( int32 )inPtrL[ 0 ] );
					if( iL-- > 0 ) lSumL += ( ( int8 )( ( v1L      ) & 0xF0 ) * ( int32 )inPtrL[ 1 ] );
					if( iL-- > 0 ) lSumL += ( ( int8 )( ( v1L >> 4 ) & 0xF0 ) * ( int32 )inPtrL[ 2 ] );
					sumL += ( lSumL >> 4 );
				}
			}
			break;

			#endif /*ifndef HW_TMS320C5x*/

			/* The default case can process all bit sizes including those that are explicitly encoded above
			 * Use the default for all bit sizes when the platform cannot handle the int8 data type (e.g. HW_TMS320C5x)
			 */
			default:
			{
				/* bfL is a 32-bit window onto the packed bit stream; the most
				 * recently loaded word occupies its upper 16 bits.
				 */
				uint32 bfL = ( ( uint32 )*rowPtrL++ ) << 16;
				uint32 bitsL = ptrA->bitsPerValueE;
				/* adjL: left-alignment shift that puts a bitsL-wide value at the top of 16 bits */
				uint16 adjL = 16 - bitsL;
				/* mkL selects the upper bitsL bits of a 16-bit word */
				uint32 mkL = ( ( 1 << bitsL ) - 1 ) << adjL;
				/* srL: right shift that aligns the current value in bfL with mkL */
				uint32 srL = bitsL;
				for( iL = 0; iL < sizeL; iL++ )
				{
					/* current value extends past the window: pull in the next word */
					if( srL > 16 )
					{
						bfL = ( ( ( uint32 )*rowPtrL++ ) << 16 ) | ( bfL >> 16 );
						srL -= 16;
					}
					/* the int16 cast sign-extends the left-aligned value; >> adjL undoes the alignment scale */
					sumL += ( ( int16 )( ( bfL >> srL ) & mkL ) * ( int32 )inPtrL[ iL ] ) >> adjL;
					srL += bitsL;
				}
			}
		}
	}
	else /* raw dot product does not fit in int32 */
	{
		/* In this branch every partial sum is rounded (roundL is half of the
		 * dropped range) and shifted right by overflowBitsL before it is added
		 * to sumL. Only 16, 8 and 4 bit have dedicated code here; all other
		 * bit sizes use the default case.
		 * NOTE(review): unlike the branch above, the int8 based cases here are
		 * not excluded by HW_TMS320C5x.
		 */
		int32 roundL = 1 << ( overflowBitsL - 1 );
		switch( ptrA->bitsPerValueE )
		{
			case 16:
			{
				for( iL = sizeL; iL > 0; iL-- ) sumL += ( ( ( int32 )*rowPtrL++ * ( int32 )*inPtrL++ ) + roundL ) >> overflowBitsL;
			}
			break;

			case 8: 
			{
				/* two values per word, low byte first; products are reduced pairwise */
				for( iL = sizeL; iL >= 2; iL -= 2 )
				{
					uint16 v1L = *rowPtrL++;
					int32 lSumL =   ( ( int8 )  v1L         * ( int32 )inPtrL[ 0 ] )
						          + ( ( int8 )( v1L >>  8 ) * ( int32 )inPtrL[ 1 ] );
					sumL += ( lSumL + roundL ) >> overflowBitsL;
					inPtrL += 2;
				}
				/* odd element count: last value sits in the low byte of the final word */
				if( iL > 0 )
				{
					sumL += ( ( ( int8 )*rowPtrL++ * ( int32 )inPtrL[ 0 ] ) + roundL ) >> overflowBitsL;
				}
			}
			break;

			case 4: 
			{
				/* 4 values per word; nibbles are left-aligned in an int8, hence the >> 4 */
				for( iL = sizeL; iL >= 4; iL -= 4 )
				{
					uint16 v1L = *rowPtrL++;
					int32 lSumL = 0;
					lSumL += ( ( int8 )( ( v1L << 4 )        ) * ( int32 )inPtrL[ 0 ] );
					lSumL += ( ( int8 )( ( v1L      ) & 0xF0 ) * ( int32 )inPtrL[ 1 ] );
					lSumL += ( ( int8 )( ( v1L >> 4 ) & 0xF0 ) * ( int32 )inPtrL[ 2 ] );
					lSumL += ( ( int8 )( ( v1L >> 8 ) & 0xF0 ) * ( int32 )inPtrL[ 3 ] );
					inPtrL += 4;
					sumL += ( ( lSumL >> 4 ) + roundL ) >> overflowBitsL;
				}
				/* tail: up to 3 remaining values; the word is read even if none remain */
				{
					uint16 v1L = *rowPtrL++;
					int32 lSumL = 0;
					if( iL-- > 0 ) lSumL += ( ( int8 )( ( v1L << 4 )        ) * ( int32 )inPtrL[ 0 ] );
					if( iL-- > 0 ) lSumL += ( ( int8 )( ( v1L      ) & 0xF0 ) * ( int32 )inPtrL[ 1 ] );
					if( iL-- > 0 ) lSumL += ( ( int8 )( ( v1L >> 4 ) & 0xF0 ) * ( int32 )inPtrL[ 2 ] );
					sumL += ( ( lSumL >> 4 ) + roundL ) >> overflowBitsL;
				}
			}
			break;

			default:
			{
				/* generic bit-stream extraction, see the default case of the
				 * non-overflow branch for the meaning of bfL, adjL, mkL and srL
				 */
				uint32 bfL = ( ( uint32 )*rowPtrL++ ) << 16;
				uint32 bitsL = ptrA->bitsPerValueE;
				uint16 adjL = 16 - bitsL;
				uint32 mkL = ( ( 1 << bitsL ) - 1 ) << adjL;
				uint32 srL = bitsL;
				/* rounding offset and shift are combined with the alignment scale
				 * so that a single shift per product suffices
				 */
				int32 lRoundL = roundL << adjL;
				int32 lAdjL = overflowBitsL + adjL;
				for( iL = 0; iL < sizeL; iL++ )
				{
					if( srL > 16 )
					{
						bfL = ( ( ( uint32 )*rowPtrL++ ) << 16 ) | ( bfL >> 16 );
						srL -= 16;
					}
					sumL += ( ( int16 )( ( bfL >> srL ) & mkL ) * ( int32 )inPtrL[ iL ] + lRoundL ) >> lAdjL;
					srL += bitsL;
				}
			}
		}
	}

	/* compute result */
	{
		int32 resultManL;
		int32 resultExpL;
		int32 resultLogL;
		/* presumably yields sumL * factorManL as mantissa/exponent pair - confirm against bbs_mulS32 */
		bbs_mulS32( sumL, factorManL, &resultManL, &resultExpL );
		/* add the exponent of the row factor and the bits dropped during accumulation */
		resultExpL += factorExpL + overflowBitsL;
		/* normalize: shift the mantissa up by ( 30 - log ) and compensate in the exponent
		 * (assumes bbs_intLog2 returns the index of the highest set bit - confirm)
		 */
		resultLogL = bbs_intLog2( resultManL > 0 ? resultManL : -resultManL );
		if( resultLogL < 30 )
		{
			resultManL <<= 30 - resultLogL;
			resultExpL  -= 30 - resultLogL;
		}

		/* reduce the mantissa by 16 bits with rounding; exponent grows accordingly */
		resultManL = ( ( resultManL >> 15 ) + 1 ) >> 1;
		resultExpL = resultExpL + 16;

		/* pack: mantissa in the upper, exponent in the lower 16 bits */
		return ( ( resultManL & 0x0000FFFF ) << 16 ) | ( resultExpL & 0x0000FFFF );
	}
}

/* ------------------------------------------------------------------------- */

/* ========================================================================= */
/*                                                                           */
/* ---- \ghd{ constructor / destructor } ----------------------------------- */
/*                                                                           */
/* ========================================================================= */

/* ------------------------------------------------------------------------- */

/** Puts a compact matrix object into its empty initial state.
 *
 *  All scalar members are set to zero and both embedded int16 arrays
 *  (cpsArrE, expArrE) are initialized through bbs_Int16Arr_init.
 *
 *  @param cpA   context pointer, forwarded to the array initializers
 *  @param ptrA  object to initialize
 */
void bts_CompactMat_init( struct bbs_Context* cpA,
					      struct bts_CompactMat* ptrA )
{
	/* geometry */
	ptrA->heightE = 0;
	ptrA->widthE = 0;

	/* row encoding parameters */
	ptrA->maxRowBitsE = 0;
	ptrA->wordsPerRowE = 0;
	ptrA->bitsPerValueE = 0;

	/* embedded arrays */
	bbs_Int16Arr_init( cpA, &ptrA->cpsArrE );
	bbs_Int16Arr_init( cpA, &ptrA->expArrE );
}

/* ------------------------------------------------------------------------- */

/** Resets a compact matrix object and releases its embedded arrays.
 *
 *  All scalar members are set to zero; cpsArrE and expArrE are passed to
 *  bbs_Int16Arr_exit.
 *
 *  @param cpA   context pointer, forwarded to the array destructors
 *  @param ptrA  object to tear down
 */
void bts_CompactMat_exit( struct bbs_Context* cpA,
					    struct bts_CompactMat* ptrA )
{
	/* geometry */
	ptrA->heightE = 0;
	ptrA->widthE = 0;

	/* row encoding parameters */
	ptrA->maxRowBitsE = 0;
	ptrA->wordsPerRowE = 0;
	ptrA->bitsPerValueE = 0;

	/* embedded arrays */
	bbs_Int16Arr_exit( cpA, &ptrA->cpsArrE );
	bbs_Int16Arr_exit( cpA, &ptrA->expArrE );
}
/* ------------------------------------------------------------------------- */

/* ========================================================================= */
/*                                                                           */
/* ---- \ghd{ operators } -------------------------------------------------- */
/*                                                                           */
/* ========================================================================= */

/* ------------------------------------------------------------------------- */

/* ========================================================================= */
/*                                                                           */
/* ---- \ghd{ query functions } -------------------------------------------- */
/*                                                                           */
/* ========================================================================= */

/* ------------------------------------------------------------------------- */

/* ========================================================================= */
/*                                                                           */
/* ---- \ghd{ modify functions } ------------------------------------------- */
/*                                                                           */
/* ========================================================================= */

/* ------------------------------------------------------------------------- */
	
/** Allocates a zero-filled compact matrix.
 *
 *  Does nothing if the context already carries an error. Reports an error
 *  and returns if bitsA lies outside [2, 16].
 *
 *  The row stride (wordsPerRowE) is 6 words (header + 1) plus the whole
 *  number of 16-bit words covered by maxRowSizeA * bitsA bits, rounded up
 *  to an even word count.
 *
 *  @param cpA          context pointer (error state, forwarded to array calls)
 *  @param ptrA         object to set up
 *  @param widthA       matrix width
 *  @param heightA      matrix height (number of rows)
 *  @param bitsA        bits per packed value, 2..16
 *  @param maxRowSizeA  maximum number of values stored per row
 *  @param mspA         memory segment passed to the array allocations
 */
void bts_CompactMat_create( struct bbs_Context* cpA,
						    struct bts_CompactMat* ptrA, 
						    uint32 widthA,
						    uint32 heightA,
						    uint32 bitsA,
							uint32 maxRowSizeA,
				            struct bbs_MemSeg* mspA )
{
	if( bbs_Context_error( cpA ) ) return;

	if( !( bitsA >= 2 && bitsA <= 16 ) )
	{
		bbs_ERROR0( "bts_CompactMat_create:\nbitsA must be between 2 and 16" );
		return;
	}

	/* plain members */
	ptrA->maxRowBitsE = 0;
	ptrA->bitsPerValueE = bitsA;
	ptrA->heightE = heightA;
	ptrA->widthE = widthA;

	/* row stride in 16-bit words: header + 1, plus payload; kept even */
	ptrA->wordsPerRowE = 6 + ( ( maxRowSizeA * bitsA ) / ( 8 * sizeof( short ) ) );
	if( ptrA->wordsPerRowE & 1 )
	{
		ptrA->wordsPerRowE++;
	}

	/* packed row storage */
	bbs_Int16Arr_create( cpA, &ptrA->cpsArrE, heightA * ptrA->wordsPerRowE, mspA );
	bbs_Int16Arr_fill( cpA, &ptrA->cpsArrE, 0 );

	/* one exponent slot per row */
	bbs_Int16Arr_create( cpA, &ptrA->expArrE, ptrA->heightE, mspA );
	bbs_Int16Arr_fill( cpA, &ptrA->expArrE, 0 );
}

/* ------------------------------------------------------------------------- */
	
/** Copies a compact matrix from srcPtrA into ptrA.
 *
 *  The scalar members and the packed row array (cpsArrE) are copied.
 *  The exponent array (expArrE) is not copied; it is only sized to the
 *  new height via bbs_Int16Arr_size.
 *
 *  @param cpA      context pointer, forwarded to the array calls
 *  @param ptrA     destination object
 *  @param srcPtrA  source object
 */
void bts_CompactMat_copy( struct bbs_Context* cpA,
					      struct bts_CompactMat* ptrA, 
						  const struct bts_CompactMat* srcPtrA )
{
	/* geometry */
	ptrA->heightE = srcPtrA->heightE;
	ptrA->widthE = srcPtrA->widthE;

	/* row encoding parameters */
	ptrA->maxRowBitsE = srcPtrA->maxRowBitsE;
	ptrA->wordsPerRowE = srcPtrA->wordsPerRowE;
	ptrA->bitsPerValueE = srcPtrA->bitsPerValueE;

	/* packed rows are duplicated, the exponent array is only resized */
	bbs_Int16Arr_copy( cpA, &ptrA->cpsArrE, &srcPtrA->cpsArrE );
	bbs_Int16Arr_size( cpA, &ptrA->expArrE, ptrA->heightE );
}

/* ------------------------------------------------------------------------- */
	
/* ========================================================================= */
/*                                                                           */
/* ---- \ghd{ I/O } -------------------------------------------------------- */
/*                                                                           */
/* ========================================================================= */

/* ------------------------------------------------------------------------- */
	
/** Returns the size of the serialized object as the sum of its parts:
 *  a size field, a version field, the five scalar members and the packed
 *  row array (cpsArrE). The exponent array (expArrE) is not included.
 *
 *  The unit is that of bbs_SIZEOF16 / bbs_Int16Arr_memSize
 *  (presumably 16-bit words - confirm against their definitions).
 *
 *  @param cpA   context pointer, forwarded to bbs_Int16Arr_memSize
 *  @param ptrA  object to measure
 *  @return serialized size
 */
uint32 bts_CompactMat_memSize( struct bbs_Context* cpA,
							 const struct bts_CompactMat *ptrA )
{
	uint32 totalL = 0;

	/* leading fields: memory size, then version */
	totalL += bbs_SIZEOF16( uint32 );
	totalL += bbs_SIZEOF16( uint32 );

	/* scalar members */
	totalL += bbs_SIZEOF16( ptrA->widthE );
	totalL += bbs_SIZEOF16( ptrA->heightE );
	totalL += bbs_SIZEOF16( ptrA->bitsPerValueE );
	totalL += bbs_SIZEOF16( ptrA->wordsPerRowE );
	totalL += bbs_SIZEOF16( ptrA->maxRowBitsE );

	/* packed row data */
	totalL += bbs_Int16Arr_memSize( cpA, &ptrA->cpsArrE );

	return totalL;
}

/* ------------------------------------------------------------------------- */
	
/** Serializes the object to memPtrA.
 *
 *  Layout, in order: total size, version (bts_COMPACT_MAT_VERSION), widthE,
 *  heightE, bitsPerValueE, wordsPerRowE, maxRowBitsE, cpsArrE.
 *  The exponent array (expArrE) is not written.
 *
 *  @param cpA      context pointer, forwarded to the size and array calls
 *  @param ptrA     object to serialize
 *  @param memPtrA  destination buffer; the caller must provide enough room
 *  @return value of bts_CompactMat_memSize for this object
 */
uint32 bts_CompactMat_memWrite( struct bbs_Context* cpA,
							  const struct bts_CompactMat* ptrA, 
							  uint16* memPtrA )
{
	uint32 totalL = bts_CompactMat_memSize( cpA, ptrA );
	uint16* dstL = memPtrA; /* write cursor, advanced by each writer's return value */

	/* leading fields */
	dstL += bbs_memWrite32( &totalL, dstL );
	dstL += bbs_memWriteUInt32( bts_COMPACT_MAT_VERSION, dstL );

	/* scalar members */
	dstL += bbs_memWrite32( &ptrA->widthE, dstL );
	dstL += bbs_memWrite32( &ptrA->heightE, dstL );
	dstL += bbs_memWrite32( &ptrA->bitsPerValueE, dstL );
	dstL += bbs_memWrite32( &ptrA->wordsPerRowE, dstL );
	dstL += bbs_memWrite32( &ptrA->maxRowBitsE, dstL );

	/* packed row data */
	dstL += bbs_Int16Arr_memWrite( cpA, &ptrA->cpsArrE, dstL );

	return totalL;
}

/* ------------------------------------------------------------------------- */
	
/** Deserializes the object from memPtrA.
 *
 *  Returns 0 without reading if the context already carries an error.
 *  Otherwise reads, in order: stored size, version (checked against
 *  bts_COMPACT_MAT_VERSION by bbs_memReadVersion32), widthE, heightE,
 *  bitsPerValueE, wordsPerRowE, maxRowBitsE and cpsArrE. If the stored size
 *  differs from bts_CompactMat_memSize of the object just read, a
 *  bbs_ERR_CORRUPT_DATA error is reported. The exponent array (expArrE) is
 *  not part of the stream; it is created with heightE elements and zeroed.
 *
 *  @param cpA      context pointer (error state, forwarded to helpers)
 *  @param ptrA     object to fill
 *  @param memPtrA  source buffer
 *  @param mspA     memory segment passed to the array allocations
 *  @return size value stored in the stream, or 0 on a pre-existing error
 */
uint32 bts_CompactMat_memRead( struct bbs_Context* cpA,
							 struct bts_CompactMat* ptrA, 
							 const uint16* memPtrA,
				             struct bbs_MemSeg* mspA )
{
	uint32 versionL;
	uint32 storedSizeL;
	const uint16* srcL = memPtrA; /* read cursor, advanced by each reader's return value */

	if( bbs_Context_error( cpA ) ) return 0;

	/* leading fields */
	srcL += bbs_memRead32( &storedSizeL, srcL );
	srcL += bbs_memReadVersion32( cpA, &versionL, bts_COMPACT_MAT_VERSION, srcL );

	/* scalar members */
	srcL += bbs_memRead32( &ptrA->widthE, srcL );
	srcL += bbs_memRead32( &ptrA->heightE, srcL );
	srcL += bbs_memRead32( &ptrA->bitsPerValueE, srcL );
	srcL += bbs_memRead32( &ptrA->wordsPerRowE, srcL );
	srcL += bbs_memRead32( &ptrA->maxRowBitsE, srcL );

	/* packed row data */
	srcL += bbs_Int16Arr_memRead( cpA, &ptrA->cpsArrE, srcL, mspA );

	/* consistency check of the stored size against the object as read */
	if( bts_CompactMat_memSize( cpA, ptrA ) != storedSizeL )
	{
		bbs_ERR0( bbs_ERR_CORRUPT_DATA, "uint32 bts_CompactMat_memRead( const struct bts_CompactMat* ptrA, const void* memPtrA ):\n"
                  "size mismatch" ); 
	}

	/* exponent array is rebuilt, not read */
	bbs_Int16Arr_create( cpA, &ptrA->expArrE, ptrA->heightE, mspA );
	bbs_Int16Arr_fill( cpA, &ptrA->expArrE, 0 );

	return storedSizeL;
}

/* ------------------------------------------------------------------------- */
	
/* ========================================================================= */
/*                                                                           */
/* ---- \ghd{ exec functions } --------------------------------------------- */
/*                                                                           */
/* ========================================================================= */

/* ------------------------------------------------------------------------- */

/** Maps inVecA through the compact matrix: outVecA = M * inVecA.
 *
 *  Each row is evaluated with bts_CompactMat_fltDotPrdRow, which returns a
 *  16-bit mantissa and a 16-bit exponent. The mantissas are first stored in
 *  outVecA and the exponents in ptrA->expArrE. In a second pass all
 *  mantissas are shifted down (with rounding) to the largest exponent found,
 *  so that the whole output vector shares one exponent.
 *
 *  Note: although ptrA is declared const, its expArrE buffer is written as
 *  scratch storage (the const qualifier is cast away).
 *
 *  @param cpA         context pointer, forwarded to the row dot product
 *  @param ptrA        compact matrix; widthE input elements are used for the
 *                     norm, heightE output elements are produced
 *  @param inVecA      input vector
 *  @param outVecA     output vector, receives heightE mantissas
 *  @param outExpPtrA  receives the common exponent; may be NULL
 */
void bts_CompactMat_map( struct bbs_Context* cpA, 
						 const struct bts_CompactMat* ptrA, 
						 const int16* inVecA,
						 int16* outVecA,
						 int16* outExpPtrA )
{
	uint32 inNormBitsL = bbs_intLog2( bbs_vecNorm16( inVecA, ptrA->widthE ) ) + 1;
	uint32 iL;

	int16* expArrL = ( ( struct bts_CompactMat* )ptrA )->expArrE.arrPtrE;
	int16 maxExpL = -32767;

	/* pass 1: per-row mantissa/exponent, track the largest exponent */
	for( iL = 0; iL < ptrA->heightE; iL++ )
	{
		int32 fltL = bts_CompactMat_fltDotPrdRow( cpA, ( struct bts_CompactMat* )ptrA, inVecA, inNormBitsL, iL );
		outVecA[ iL ] = fltL >> 16; 
		expArrL[ iL ] = fltL & 0x0000FFFF;

		maxExpL = ( expArrL[ iL ] > maxExpL ) ? expArrL[ iL ] : maxExpL;
	}

	if( outExpPtrA != NULL ) *outExpPtrA = maxExpL;

	/* pass 2: bring every mantissa to the common exponent */
	for( iL = 0; iL < ptrA->heightE; iL++ )
	{
		int32 shrL = maxExpL - expArrL[ iL ];
		if( shrL >= 16 )
		{
			/* A 16-bit mantissa shifted down by 16 or more bits rounds to 0.
			 * Handling this explicitly avoids a shift by >= the width of int
			 * (undefined behaviour), which is possible because the exponent
			 * difference can be as large as 65534.
			 */
			outVecA[ iL ] = 0;
		}
		else if( shrL > 0 )
		{
			/* shift by shrL with round-to-nearest */
			outVecA[ iL ] = ( ( outVecA[ iL ] >> ( shrL - 1 ) ) + 1 ) >> 1;
		}
	}
}

/* ------------------------------------------------------------------------- */

/* ========================================================================= */