// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// output.h: processing the 32-bit accumulators output by the unpack
// stage, obtaining the final result matrix entries and storing them into
// the destination matrix.
#ifndef GEMMLOWP_INTERNAL_OUTPUT_H_
#define GEMMLOWP_INTERNAL_OUTPUT_H_
#include <algorithm>
#include <cmath>
#include <tuple>
#include <type_traits>
#include "../public/output_stages.h"
#include "fixedpoint.h"
namespace gemmlowp {
// A Fragment is a small fixed-size matrix typically stored in one or
// a few architecture-specific SIMD vectors. Besides plain old scalar types
// such as int32_t, Fragment types are what can be used as input/output data
// types for output pipeline stages.
//
// More details:
//
// In the generic scalar code in this file, we have only implemented
// evaluation of output stages for scalar inputs (e.g. plain int32_t values).
// Other files (e.g. output_neon.h) are to provide SIMD paths by implementing
// evaluation of output stages for SIMD vector types. However, this raises
// the question of how the different values ("lanes") in a SIMD vector
// correspond to different values in the whole matrices. For simple entry-wise
// output stages, this doesn't matter, but for other output stages depending
// on position within the whole matrix, this does matter. To solve this
// problem, rather than implementing evaluation of output stages for raw
// SIMD vector types, we wrap SIMD vector types in "fragment" structs that
// bring the additional structure of "shape", i.e. a mapping of SIMD lanes to
// matrix entries, and we specialize evaluation of output stages for such
// fragment types. The Fragment template struct here is how we generate
// all fragment structs. For example, in output_neon.h, it may be specialized
// with DataType=int32x4_t, Rows=4, Cols=1. MapOrder doesn't matter for
// vector shapes. While Fragment is only used for SIMD paths, we leave it
// here in this platform-generic file because this same template should
// cover the needs of any SIMD architectures.
template <typename tDataType, int tRows, int tCols, MapOrder tOrder>
struct Fragment {
typedef tDataType DataType;
static const int kRows = tRows;
static const int kCols = tCols;
static const MapOrder kOrder = tOrder;
Fragment() {}
Fragment(const DataType& d) : data(d) {}
operator DataType() const { return data; }
DataType data;
};
typedef Fragment<std::int32_t, 1, 1, MapOrder::ColMajor> FragmentInt32x1x1;
typedef Fragment<std::uint8_t, 1, 1, MapOrder::ColMajor> FragmentUint8x1x1;
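// For illustration, a SIMD back-end would define its own fragment typedefs in
// the same way. A minimal sketch (assumed names, not actual output_neon.h
// code):
//
//   // 4 int32 accumulators forming a 4x1 column of the result block.
//   typedef Fragment<int32x4_t, 4, 1, MapOrder::ColMajor>
//       NEONFragmentInt32x4x1;
//
// The shape information is what lets position-dependent output stages know
// that lane i of such a fragment corresponds to row 'row + i' of the block
// being evaluated.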
// OutputStageEvalImpl is the template that we specialize to provide
// implementations of each output stage for each type of input data.
//
// Each specialization provides an OutputType typedef and an Eval function
// returning OutputType. The OutputType typically depends on the InputType.
//
// There are two dimensions in which input data types can vary:
// 1. Different output stages may expect different data types. The
// only hard constraint is that the first stage accepts int32, as
// the unpack stage produces int32 accumulators.
// 2. For a given scalar data type such as int32, there is still the
// possibility of having SIMD vector types such as NEON int32x4_t,
// typically wrapped as "fragment" types, see struct Fragment.
// Thus, there can be several OutputStageEvalImpl specializations for a
// single OutputStageType, covering different InputTypes.
template <typename OutputStageType, typename InputType>
struct OutputStageEvalImpl {
// This generic template body should never be hit.
static_assert(
std::is_same<InputType, void>::value,
"Unimplemented: missing implementation of this output pipeline stage "
"for this data type. This would happen if some architecture-specific "
"SIMD back-end (output_$arch.h) were incomplete.");
OutputStageEvalImpl(const OutputStageType&) {}
};
// Implementation of OutputStageQuantizeDownInt32ToUint8Scale for scalar data
template <>
struct OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8Scale,
FragmentInt32x1x1> {
typedef FragmentInt32x1x1 InputType;
typedef FragmentInt32x1x1 OutputType;
typedef OutputStageQuantizeDownInt32ToUint8Scale OutputStage;
OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}
OutputType Eval(InputType input, int, int) const {
const std::int32_t result_shift = output_stage.result_shift;
const std::int32_t result_mult_int = output_stage.result_mult_int;
const std::int32_t result_offset = output_stage.result_offset;
const std::int32_t kRoundingTerm =
(result_shift < 1) ? 0 : (1 << (result_shift - 1));
return ((input + result_offset) * result_mult_int + kRoundingTerm) >>
result_shift;
}
const OutputStage& output_stage;
};
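// Worked example of the above formula, with arbitrary illustrative values:
// result_offset = 50, result_mult_int = 77, result_shift = 10. An input
// accumulator of 1000 becomes
//   ((1000 + 50) * 77 + (1 << 9)) >> 10 = (80850 + 512) >> 10 = 79,
// i.e. a rounding-to-nearest rescaling of the int32 accumulator toward the
// uint8 range. Note that the output here is still an int32 fragment; the
// actual saturating cast to uint8 is the separate
// OutputStageSaturatingCastToUint8 stage below.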
// Implementation of OutputStageQuantizeDownInt32ToUint8ScalePC for scalar
// data, with the per-channel parameters stored in a column vector (one entry
// per row of the result).
template <>
struct OutputStageEvalImpl<
    OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>,
    FragmentInt32x1x1> {
typedef FragmentInt32x1x1 InputType;
typedef FragmentInt32x1x1 OutputType;
typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>
OutputStage;
OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}
OutputType Eval(InputType input, int row, int col) const {
const std::int32_t result_shift = output_stage.result_shift;
const std::int32_t result_mult_int = output_stage.result_mult_int(row);
const std::int32_t result_offset = output_stage.result_offset(row);
const std::int32_t kRoundingTerm =
(result_shift < 1) ? 0 : (1 << (result_shift - 1));
return ((input + result_offset) * result_mult_int + kRoundingTerm) >>
result_shift;
}
const OutputStage& output_stage;
};
// Implementation of OutputStageQuantizeDownInt32ToUint8ScalePC for scalar
// data, with the per-channel parameters stored in a row vector (one entry
// per column of the result).
template <>
struct OutputStageEvalImpl<
    OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>,
    FragmentInt32x1x1> {
typedef FragmentInt32x1x1 InputType;
typedef FragmentInt32x1x1 OutputType;
typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>
OutputStage;
OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}
OutputType Eval(InputType input, int row, int col) const {
const std::int32_t result_shift = output_stage.result_shift;
const std::int32_t result_mult_int = output_stage.result_mult_int(col);
const std::int32_t result_offset = output_stage.result_offset(col);
const std::int32_t kRoundingTerm =
(result_shift < 1) ? 0 : (1 << (result_shift - 1));
return ((input + result_offset) * result_mult_int + kRoundingTerm) >>
result_shift;
}
const OutputStage& output_stage;
};
// Implementation of OutputStageSaturatingCastToUint8 for scalar data
template <>
struct OutputStageEvalImpl<OutputStageSaturatingCastToUint8,
FragmentInt32x1x1> {
typedef FragmentInt32x1x1 InputType;
typedef FragmentUint8x1x1 OutputType;
typedef OutputStageSaturatingCastToUint8 OutputStage;
OutputStageEvalImpl(const OutputStage&) {}
OutputType Eval(InputType input, int, int) const {
std::int32_t data = input.data;
return data > 255 ? 255 : data < 0 ? 0 : data;
}
};
// Implementation of OutputStageBiasAddition for scalar data
template <typename VectorType>
struct OutputStageEvalImpl<OutputStageBiasAddition<VectorType>,
FragmentInt32x1x1> {
typedef FragmentInt32x1x1 InputType;
typedef FragmentInt32x1x1 OutputType;
typedef OutputStageBiasAddition<VectorType> OutputStage;
OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}
OutputType Eval(InputType input, int row, int col) const {
if (VectorType::kShape == VectorShape::Row) {
return input + output_stage.bias_vector(col);
} else {
return input + output_stage.bias_vector(row);
}
}
const OutputStage& output_stage;
};
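// Worked example: with a column-shaped bias_vector b (one entry per row of
// the result), entry (row, col) becomes input + b(row), i.e. the same bias is
// broadcast across all columns of a given row. With a row-shaped bias_vector
// the roles are swapped and the bias is indexed by col instead.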
// Implementation of OutputStageClamp for scalar data
template <>
struct OutputStageEvalImpl<OutputStageClamp, FragmentInt32x1x1> {
typedef FragmentInt32x1x1 InputType;
typedef FragmentInt32x1x1 OutputType;
typedef OutputStageClamp OutputStage;
OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}
OutputType Eval(InputType input, int, int) const {
const std::int32_t min = output_stage.min;
const std::int32_t max = output_stage.max;
return std::min(std::max(input.data, min), max);
}
const OutputStage& output_stage;
};
// Implementation of OutputStageTanh for either scalar or SIMD data
template <typename tInputType>
struct OutputStageTanhEvalImpl {
typedef tInputType InputType;
typedef InputType OutputType;
typedef typename InputType::DataType DataType;
typedef OutputStageTanh OutputStage;
OutputStageTanhEvalImpl(const OutputStage& s) : output_stage(s) {
const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32;
const std::int32_t real_amplitude_as_int32 =
output_stage.real_amplitude_as_int32;
input_cutoff_min = real_zero_as_int32 - 8 * real_amplitude_as_int32;
input_cutoff_max = real_zero_as_int32 + 8 * real_amplitude_as_int32;
output_min = real_zero_as_int32 - real_amplitude_as_int32;
output_max = real_zero_as_int32 + real_amplitude_as_int32;
double inverse_amplitude_normalized_double = 1.0 / real_amplitude_as_int32;
inverse_amplitude_neg_exponent = 0;
while (inverse_amplitude_normalized_double < 0.5) {
inverse_amplitude_normalized_double *= 2;
inverse_amplitude_neg_exponent++;
}
inverse_amplitude_normalized =
ToFixedPoint<DataType, 0>(inverse_amplitude_normalized_double);
double amplitude_normalized_double = real_amplitude_as_int32;
amplitude_exponent = 0;
while (amplitude_normalized_double >= 1.0) {
amplitude_normalized_double *= 0.5;
amplitude_exponent++;
}
amplitude_normalized =
ToFixedPoint<DataType, 0>(amplitude_normalized_double);
}
OutputType Eval(InputType input, int, int) const {
const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32;
typedef FixedPoint<DataType, 3> F3;
typedef FixedPoint<DataType, 0> F0;
// fixed-point affine transformation
DataType input_centered =
Sub(input.data, Dup<DataType>(real_zero_as_int32));
F3 fixedpoint_input =
F3::FromRaw(input_centered) * inverse_amplitude_normalized;
// left shift
fixedpoint_input.raw() =
ShiftLeft(fixedpoint_input.raw(), 28 - inverse_amplitude_neg_exponent);
// fixed-point tanh and multiplication
F0 fixedpoint_output = tanh(fixedpoint_input) * amplitude_normalized;
// right shift
DataType int32_output =
Add(Dup<DataType>(real_zero_as_int32),
ShiftRight(fixedpoint_output.raw(), 31 - amplitude_exponent));
DataType mask_if_below_cutoff_min =
MaskIfLessThanOrEqual(input.data, Dup<DataType>(input_cutoff_min));
DataType mask_if_above_cutoff_max =
MaskIfGreaterThanOrEqual(input.data, Dup<DataType>(input_cutoff_max));
return SelectUsingMask(
mask_if_below_cutoff_min, Dup<DataType>(output_min),
SelectUsingMask(mask_if_above_cutoff_max, Dup<DataType>(output_max),
int32_output));
}
const OutputStage& output_stage;
std::int32_t input_cutoff_min, input_cutoff_max;
std::int32_t output_min, output_max;
FixedPoint<DataType, 0> inverse_amplitude_normalized;
int inverse_amplitude_neg_exponent;
FixedPoint<DataType, 0> amplitude_normalized;
int amplitude_exponent;
};
template <>
struct OutputStageEvalImpl<OutputStageTanh, FragmentInt32x1x1>
: OutputStageTanhEvalImpl<FragmentInt32x1x1> {
OutputStageEvalImpl(const OutputStageTanh& output_stage)
: OutputStageTanhEvalImpl(output_stage) {}
};
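// In real-number terms, under the interpretation that an int32 value x
// encodes the real value (x - real_zero_as_int32) / real_amplitude_as_int32,
// this stage approximates
//   output = real_zero_as_int32 +
//            real_amplitude_as_int32 *
//                tanh((input - real_zero_as_int32) / real_amplitude_as_int32)
// entirely in 32-bit fixed-point arithmetic. For example (illustrative
// values), with real_zero_as_int32 = 0 and real_amplitude_as_int32 = 1 << 20,
// inputs outside [-(8 << 20), 8 << 20] are clamped (tanh is already within
// one output quantum of +/-1 there) and outputs lie in
// [-(1 << 20), 1 << 20].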
// OutputPipelineOutputType is a helper to determine the output data type of
// a pipeline for a given input data type. It is a recursive template; see
// the explanation on OutputPipelineEvalImpl below.
template <typename OutputPipelineType, int FirstStage, typename InputType,
bool StopRecursion =
FirstStage == std::tuple_size<OutputPipelineType>::value>
struct OutputPipelineOutputType {
typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type
FirstStageType;
typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType
FirstStageOutputType;
typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage + 1,
FirstStageOutputType>::Type Type;
};
template <typename OutputPipelineType, int FirstStage, typename InputType>
struct OutputPipelineOutputType<OutputPipelineType, FirstStage, InputType,
true> {
typedef InputType Type;
};
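// For example, for the pipeline type
//   std::tuple<OutputStageQuantizeDownInt32ToUint8Scale,
//              OutputStageSaturatingCastToUint8>
// and InputType = FragmentInt32x1x1, this recursion resolves Type to
// FragmentUint8x1x1: the quantize-down stage maps FragmentInt32x1x1 to
// FragmentInt32x1x1, and the saturating cast then maps FragmentInt32x1x1 to
// FragmentUint8x1x1.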
// OutputPipelineEvalImpl is a helper to implement the evaluation of
// the whole pipeline. It is a recursive template to implement compile-time
// unrolling of the loop over all pipeline stages. The 'FirstStage' parameter
// is how we implement recursion: each instantiation only evaluates the
// pipeline from stage 'FirstStage' onwards. The StopRecursion parameter is
// just a helper used to terminate the recursion in the partial
// specialization below.
template <typename OutputPipelineType, int FirstStage, typename InputType,
bool StopRecursion =
FirstStage == std::tuple_size<OutputPipelineType>::value>
struct OutputPipelineEvalImpl {
typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type
FirstStageType;
typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType
FirstStageOutputType;
typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage,
InputType>::Type OutputType;
OutputPipelineEvalImpl(const OutputPipelineType& output_pipeline)
: head_impl(std::get<FirstStage>(output_pipeline)),
tail_impl(output_pipeline) {}
OutputType Eval(InputType input, int row, int col) const {
// Evaluate the first stage.
FirstStageOutputType first_stage_output = head_impl.Eval(input, row, col);
// Recurse into the remaining stages.
return tail_impl.Eval(first_stage_output, row, col);
}
const OutputStageEvalImpl<FirstStageType, InputType> head_impl;
const OutputPipelineEvalImpl<OutputPipelineType, FirstStage + 1,
FirstStageOutputType>
tail_impl;
};
// Specialization on 'StopRecursion' for terminating the recursion.
template <typename OutputPipelineType, int FirstStage, typename InputType>
struct OutputPipelineEvalImpl<OutputPipelineType, FirstStage, InputType, true> {
OutputPipelineEvalImpl(const OutputPipelineType&) {}
InputType Eval(InputType input, int, int) const {
// Terminating the recursion.
return input;
}
};
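// For the same two-stage pipeline as in the example above, the compile-time
// recursion unrolls Eval into the equivalent of (schematically):
//   FragmentInt32x1x1 quantized = quantize_down_impl.Eval(input, row, col);
//   return saturating_cast_impl.Eval(quantized, row, col);
// with no run-time loop or dispatch over stages.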
// StoreFinalOutput takes the final value at the end of the output pipeline and
// stores it into the destination matrix. It can be specialized for different
// data types; the generic implementation here is typically used only for plain
// old scalar (not SIMD) types.
template <typename OutputType, typename DstType>
void StoreFinalOutput(OutputType value, DstType* dst, int row, int col) {
*dst->data(row, col) = value;
}
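// A SIMD back-end is expected to provide overloads of StoreFinalOutput for
// its fragment types. A minimal sketch of what a NEON overload could look
// like (assumed fragment typedef, see output_neon.h for the real code):
//
//   template <typename DstType>
//   void StoreFinalOutput(NEONFragmentInt32x4x1 value, DstType* dst, int row,
//                         int col) {
//     // Store 4 consecutive rows of a single column in one vector store.
//     vst1q_s32(dst->data(row, col), value);
//   }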
template <typename OutputPipelineType, typename InputType>
struct OutputPipelineExecutor {
OutputPipelineExecutor(const OutputPipelineType& output_pipeline)
: output_pipeline_eval_impl_(output_pipeline) {}
  // Execute is the entry point into the output pipeline evaluation code. It
  // should be the only thing that unpack code calls. It takes the result of
  // the unpack stage and stores it into the destination matrix.
template <typename DstType>
void Execute(InputType input, DstType* dst, int row, int col) {
// Statically assert that the output pipeline matches the given destination
// matrix's scalar type.
typedef typename OutputPipelineOutputType<OutputPipelineType, 0,
FragmentInt32x1x1>::Type::DataType
ScalarOutputType;
typedef typename DstType::Scalar ScalarDstType;
static_assert(std::is_same<ScalarOutputType, ScalarDstType>::value,
"mismatched destination scalar type and output pipeline");
// Evaluate the output pipeline.
auto output = output_pipeline_eval_impl_.Eval(input, row, col);
// Store the result into the destination matrix.
StoreFinalOutput(output, dst, row, col);
}
const OutputPipelineEvalImpl<OutputPipelineType, 0, InputType>
output_pipeline_eval_impl_;
};
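// Typical usage from unpack code, as a simplified sketch ('accumulator',
// 'dst', 'r' and 'c' are assumed to be provided by the surrounding unpack
// loop):
//
//   typedef std::tuple<OutputStageQuantizeDownInt32ToUint8Scale,
//                      OutputStageSaturatingCastToUint8>
//       Pipeline;
//   Pipeline pipeline;  // ... stage parameters filled in by the caller ...
//   OutputPipelineExecutor<Pipeline, FragmentInt32x1x1> executor(pipeline);
//   // DstType::Scalar must be std::uint8_t to match this pipeline's output.
//   executor.Execute(FragmentInt32x1x1(accumulator), dst, r, c);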
} // namespace gemmlowp
#endif // GEMMLOWP_INTERNAL_OUTPUT_H_