// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// output.h: processing the 32-bit accumulators output by the unpack
// stage, obtaining the final result matrix entries and storing them into
// the destination matrix.

#ifndef GEMMLOWP_INTERNAL_OUTPUT_H_
#define GEMMLOWP_INTERNAL_OUTPUT_H_

#include <algorithm>  // for std::min / std::max used by the clamp stage
#include <cmath>
#include <tuple>
#include <type_traits>

#include "../public/output_stages.h"
#include "fixedpoint.h"

namespace gemmlowp {

// A Fragment is a small fixed-size matrix typically stored in one or
// a few architecture-specific SIMD vectors. Besides plain old scalar types
// such as int32_t, Fragment types are what can be used as input/output data
// types for output pipeline stages.
//
// More details:
//
// In the generic scalar code in this file, we have only implemented
// evaluation of output stages for scalar inputs (e.g. plain int32_t values).
// Other files (e.g. output_neon.h) provide SIMD paths by implementing
// evaluation of output stages for SIMD vector types. However, this raises
// the question of how the different values ("lanes") in a SIMD vector
// correspond to different values in the whole matrices. For simple entry-wise
// output stages, this doesn't matter, but for other output stages depending
// on position within the whole matrix, it does. To solve this problem,
// rather than implementing evaluation of output stages for raw SIMD vector
// types, we wrap SIMD vector types in "fragment" structs that bring the
// additional structure of "shape", i.e. the mapping of SIMD lanes to matrix
// entries, and we specialize evaluation of output stages for such fragment
// types. The Fragment template struct here is how we generate all fragment
// structs. For example, in output_neon.h, it may be specialized with
// DataType=int32x4_t, Rows=4, Cols=1. MapOrder doesn't matter for vector
// shapes. While Fragment is only used for SIMD paths, we leave it here in
// this platform-generic file because this same template should cover the
// needs of any SIMD architecture.
template <typename tDataType, int tRows, int tCols, MapOrder tOrder>
struct Fragment {
  typedef tDataType DataType;
  static const int kRows = tRows;
  static const int kCols = tCols;
  static const MapOrder kOrder = tOrder;

  Fragment() {}
  Fragment(const DataType& d) : data(d) {}
  operator DataType() const { return data; }

  DataType data;
};

typedef Fragment<std::int32_t, 1, 1, MapOrder::ColMajor> FragmentInt32x1x1;
typedef Fragment<std::uint8_t, 1, 1, MapOrder::ColMajor> FragmentUint8x1x1;
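
// Illustrative sketch (not part of this header): a SIMD back-end such as
// output_neon.h can define its own fragment typedefs along these lines,
// mapping four consecutive rows of one column to the four lanes of a NEON
// register (the typedef name here is illustrative only):
//
//   typedef Fragment<int32x4_t, 4, 1, MapOrder::ColMajor>
//       FragmentInt32x4x1;
//
// and then specialize the OutputStageEvalImpl templates below for such
// fragment types.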

// OutputStageEvalImpl is the template that we specialize to provide
// implementations of each output stage for each type of input data.
//
// Each specialization provides an OutputType typedef and an Eval function
// returning OutputType. The OutputType typically depends on the InputType.
//
// There are two dimensions in which input data types can vary:
//   1. Different output stages may expect different data types. The
//      only hard constraint is that the first stage accepts int32, as
//      the unpack stage produces int32 accumulators.
//   2. For a given scalar data type such as int32, there is still the
//      possibility of having SIMD vector types such as NEON int32x4_t,
//      typically wrapped as "fragment" types, see struct Fragment.
//
// Thus, there can be several OutputStageEvalImpl specializations for a
// single OutputStageType, for different InputType's.
template <typename OutputStageType, typename InputType>
struct OutputStageEvalImpl {
  // This generic template body should never be hit.
  static_assert(
      std::is_same<InputType, void>::value,
      "Unimplemented: missing implementation of this output pipeline stage "
      "for this data type. This would happen if some architecture-specific "
      "SIMD back-end (output_$arch.h) were incomplete.");

  OutputStageEvalImpl(const OutputStageType&) {}
};

// Implementation of OutputStageQuantizeDownInt32ToUint8Scale for scalar data
template <>
struct OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8Scale,
                           FragmentInt32x1x1> {
  typedef FragmentInt32x1x1 InputType;
  typedef FragmentInt32x1x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8Scale OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int, int) const {
    const std::int32_t result_shift = output_stage.result_shift;
    const std::int32_t result_mult_int = output_stage.result_mult_int;
    const std::int32_t result_offset = output_stage.result_offset;
    const std::int32_t kRoundingTerm =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    return ((input + result_offset) * result_mult_int + kRoundingTerm) >>
           result_shift;
  }

  const OutputStage& output_stage;
};
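
// Worked example of the arithmetic above (parameter values chosen purely for
// illustration): with result_offset = 10, result_mult_int = 3 and
// result_shift = 4, an accumulator value of 100 becomes
//
//   ((100 + 10) * 3 + (1 << 3)) >> 4 = (330 + 8) >> 4 = 338 >> 4 = 21,
//
// i.e. the offset is applied first, then the multiplier, then a
// rounding-to-nearest right shift. The per-channel variants below use the
// same rounding term.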

// Implementation of OutputStageQuantizeDownInt32ToUint8ScalePC for scalar
// data. The VectorShape::Col variant holds one (offset, multiplier) pair
// per row of the result matrix.
template <>
struct OutputStageEvalImpl<
    OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>,
    FragmentInt32x1x1> {
  typedef FragmentInt32x1x1 InputType;
  typedef FragmentInt32x1x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>
      OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    const std::int32_t result_shift = output_stage.result_shift;
    const std::int32_t result_mult_int = output_stage.result_mult_int(row);
    const std::int32_t result_offset = output_stage.result_offset(row);
    const std::int32_t kRoundingTerm =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    return ((input + result_offset) * result_mult_int + kRoundingTerm) >>
           result_shift;
  }

  const OutputStage& output_stage;
};

// The VectorShape::Row variant holds one (offset, multiplier) pair per
// column of the result matrix.
template <>
struct OutputStageEvalImpl<
    OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>,
    FragmentInt32x1x1> {
  typedef FragmentInt32x1x1 InputType;
  typedef FragmentInt32x1x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>
      OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    const std::int32_t result_shift = output_stage.result_shift;
    const std::int32_t result_mult_int = output_stage.result_mult_int(col);
    const std::int32_t result_offset = output_stage.result_offset(col);
    const std::int32_t kRoundingTerm =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    return ((input + result_offset) * result_mult_int + kRoundingTerm) >>
           result_shift;
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageSaturatingCastToUint8 for scalar data
template <>
struct OutputStageEvalImpl<OutputStageSaturatingCastToUint8,
                           FragmentInt32x1x1> {
  typedef FragmentInt32x1x1 InputType;
  typedef FragmentUint8x1x1 OutputType;
  typedef OutputStageSaturatingCastToUint8 OutputStage;

  OutputStageEvalImpl(const OutputStage&) {}

  OutputType Eval(InputType input, int, int) const {
    std::int32_t data = input.data;
    return data > 255 ? 255 : data < 0 ? 0 : data;
  }
};

// Implementation of OutputStageBiasAddition for scalar data
template <typename VectorType>
struct OutputStageEvalImpl<OutputStageBiasAddition<VectorType>,
                           FragmentInt32x1x1> {
  typedef FragmentInt32x1x1 InputType;
  typedef FragmentInt32x1x1 OutputType;
  typedef OutputStageBiasAddition<VectorType> OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    if (VectorType::kShape == VectorShape::Row) {
      return input + output_stage.bias_vector(col);
    } else {
      return input + output_stage.bias_vector(row);
    }
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageClamp for scalar data
template <>
struct OutputStageEvalImpl<OutputStageClamp, FragmentInt32x1x1> {
  typedef FragmentInt32x1x1 InputType;
  typedef FragmentInt32x1x1 OutputType;
  typedef OutputStageClamp OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int, int) const {
    const std::int32_t min = output_stage.min;
    const std::int32_t max = output_stage.max;
    return std::min(std::max(input.data, min), max);
  }

  const OutputStage& output_stage;
};
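
// The scalar implementations above all follow the same pattern. As an
// illustrative sketch (hypothetical, not part of gemmlowp), a new entry-wise
// output stage would only need a stage descriptor plus a matching
// OutputStageEvalImpl specialization for the scalar fragment type:
//
//   struct OutputStageNegate {};
//
//   template <>
//   struct OutputStageEvalImpl<OutputStageNegate, FragmentInt32x1x1> {
//     typedef FragmentInt32x1x1 InputType;
//     typedef FragmentInt32x1x1 OutputType;
//     OutputStageEvalImpl(const OutputStageNegate&) {}
//     OutputType Eval(InputType input, int, int) const {
//       return -input.data;
//     }
//   };
//
// SIMD back-ends would additionally specialize it for their fragment types.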

// Implementation of OutputStageTanh for either scalar or SIMD data
template <typename tInputType>
struct OutputStageTanhEvalImpl {
  typedef tInputType InputType;
  typedef InputType OutputType;
  typedef typename InputType::DataType DataType;
  typedef OutputStageTanh OutputStage;

  OutputStageTanhEvalImpl(const OutputStage& s) : output_stage(s) {
    const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32;
    const std::int32_t real_amplitude_as_int32 =
        output_stage.real_amplitude_as_int32;

    input_cutoff_min = real_zero_as_int32 - 8 * real_amplitude_as_int32;
    input_cutoff_max = real_zero_as_int32 + 8 * real_amplitude_as_int32;
    output_min = real_zero_as_int32 - real_amplitude_as_int32;
    output_max = real_zero_as_int32 + real_amplitude_as_int32;

    double inverse_amplitude_normalized_double = 1.0 / real_amplitude_as_int32;
    inverse_amplitude_neg_exponent = 0;
    while (inverse_amplitude_normalized_double < 0.5) {
      inverse_amplitude_normalized_double *= 2;
      inverse_amplitude_neg_exponent++;
    }
    inverse_amplitude_normalized =
        ToFixedPoint<DataType, 0>(inverse_amplitude_normalized_double);

    double amplitude_normalized_double = real_amplitude_as_int32;
    amplitude_exponent = 0;
    while (amplitude_normalized_double >= 1.0) {
      amplitude_normalized_double *= 0.5;
      amplitude_exponent++;
    }
    amplitude_normalized =
        ToFixedPoint<DataType, 0>(amplitude_normalized_double);
  }

  OutputType Eval(InputType input, int, int) const {
    const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32;

    typedef FixedPoint<DataType, 3> F3;
    typedef FixedPoint<DataType, 0> F0;

    // fixed-point affine transformation
    DataType input_centered =
        Sub(input.data, Dup<DataType>(real_zero_as_int32));
    F3 fixedpoint_input =
        F3::FromRaw(input_centered) * inverse_amplitude_normalized;
    // left shift
    fixedpoint_input.raw() =
        ShiftLeft(fixedpoint_input.raw(), 28 - inverse_amplitude_neg_exponent);
    // fixed-point tanh and multiplication
    F0 fixedpoint_output = tanh(fixedpoint_input) * amplitude_normalized;
    // right shift
    DataType int32_output =
        Add(Dup<DataType>(real_zero_as_int32),
            ShiftRight(fixedpoint_output.raw(), 31 - amplitude_exponent));

    DataType mask_if_below_cutoff_min =
        MaskIfLessThanOrEqual(input.data, Dup<DataType>(input_cutoff_min));
    DataType mask_if_above_cutoff_max =
        MaskIfGreaterThanOrEqual(input.data, Dup<DataType>(input_cutoff_max));

    return SelectUsingMask(
        mask_if_below_cutoff_min, Dup<DataType>(output_min),
        SelectUsingMask(mask_if_above_cutoff_max, Dup<DataType>(output_max),
                        int32_output));
  }

  const OutputStage& output_stage;
  std::int32_t input_cutoff_min, input_cutoff_max;
  std::int32_t output_min, output_max;
  FixedPoint<DataType, 0> inverse_amplitude_normalized;
  int inverse_amplitude_neg_exponent;
  FixedPoint<DataType, 0> amplitude_normalized;
  int amplitude_exponent;
};

template <>
struct OutputStageEvalImpl<OutputStageTanh, FragmentInt32x1x1>
    : OutputStageTanhEvalImpl<FragmentInt32x1x1> {
  OutputStageEvalImpl(const OutputStageTanh& output_stage)
      : OutputStageTanhEvalImpl(output_stage) {}
};

// OutputPipelineOutputType is a helper to determine the output data type of a
// pipeline, for a given input data type. It is a recursive template; see the
// explanation on OutputPipelineEvalImpl below.
template <typename OutputPipelineType, int FirstStage, typename InputType,
          bool StopRecursion =
              FirstStage == std::tuple_size<OutputPipelineType>::value>
struct OutputPipelineOutputType {
  typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type
      FirstStageType;
  typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType
      FirstStageOutputType;
  typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage + 1,
                                            FirstStageOutputType>::Type Type;
};

template <typename OutputPipelineType, int FirstStage, typename InputType>
struct OutputPipelineOutputType<OutputPipelineType, FirstStage, InputType,
                                true> {
  typedef InputType Type;
};
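
// For example, using only types defined in this file, for the two-stage
// pipeline
//
//   typedef std::tuple<OutputStageQuantizeDownInt32ToUint8Scale,
//                      OutputStageSaturatingCastToUint8> Pipeline;
//
// OutputPipelineOutputType<Pipeline, 0, FragmentInt32x1x1>::Type resolves to
// FragmentUint8x1x1: the quantize-down stage maps FragmentInt32x1x1 to itself
// and the saturating cast then maps it to FragmentUint8x1x1. That type's
// DataType is what OutputPipelineExecutor below checks against the
// destination matrix's scalar type.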

// OutputPipelineEvalImpl is a helper to implement the evaluation of
// the whole pipeline. It is a recursive template to implement compile-time
// unrolling of the loop over all pipeline stages. The 'FirstStage' parameter
// is how we implement recursion: each specialization implements only
// evaluation starting at 'FirstStage'. The StopRecursion parameter is just a
// helper to implement the termination of the recursion as a partial
// specialization below.
template <typename OutputPipelineType, int FirstStage, typename InputType,
          bool StopRecursion =
              FirstStage == std::tuple_size<OutputPipelineType>::value>
struct OutputPipelineEvalImpl {
  typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type
      FirstStageType;
  typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType
      FirstStageOutputType;
  typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage,
                                            InputType>::Type OutputType;

  OutputPipelineEvalImpl(const OutputPipelineType& output_pipeline)
      : head_impl(std::get<FirstStage>(output_pipeline)),
        tail_impl(output_pipeline) {}

  OutputType Eval(InputType input, int row, int col) const {
    // Evaluate the first stage.
    FirstStageOutputType first_stage_output = head_impl.Eval(input, row, col);
    // Recurse into the remaining stages.
    return tail_impl.Eval(first_stage_output, row, col);
  }

  const OutputStageEvalImpl<FirstStageType, InputType> head_impl;
  const OutputPipelineEvalImpl<OutputPipelineType, FirstStage + 1,
                               FirstStageOutputType>
      tail_impl;
};

// Specialization on 'StopRecursion' for terminating the recursion.
template <typename OutputPipelineType, int FirstStage, typename InputType>
struct OutputPipelineEvalImpl<OutputPipelineType, FirstStage, InputType, true> {
  OutputPipelineEvalImpl(const OutputPipelineType&) {}

  InputType Eval(InputType input, int, int) const {
    // Terminate the recursion: there are no more stages, so return the input
    // unchanged.
    return input;
  }
};

// StoreFinalOutput takes the final value at the end of the output pipeline and
// stores it into the destination matrix. It can be specialized for different
// data types; the generic implementation here is typically used only for plain
// old scalar (not SIMD) types.
template <typename OutputType, typename DstType>
void StoreFinalOutput(OutputType value, DstType* dst, int row, int col) {
  *dst->data(row, col) = value;
}

template <typename OutputPipelineType, typename InputType>
struct OutputPipelineExecutor {
  OutputPipelineExecutor(const OutputPipelineType& output_pipeline)
      : output_pipeline_eval_impl_(output_pipeline) {}

  // Execute is the entry point into the output pipeline evaluation code. It
  // should be the only thing that unpack code calls. It takes the result of
  // the unpack stage and stores it into the destination matrix.
  template <typename DstType>
  void Execute(InputType input, DstType* dst, int row, int col) {
    // Statically assert that the output pipeline matches the given destination
    // matrix's scalar type.
    typedef typename OutputPipelineOutputType<
        OutputPipelineType, 0, FragmentInt32x1x1>::Type::DataType
        ScalarOutputType;
    typedef typename DstType::Scalar ScalarDstType;
    static_assert(std::is_same<ScalarOutputType, ScalarDstType>::value,
                  "mismatched destination scalar type and output pipeline");

    // Evaluate the output pipeline.
    auto output = output_pipeline_eval_impl_.Eval(input, row, col);
    // Store the result into the destination matrix.
    StoreFinalOutput(output, dst, row, col);
  }

  const OutputPipelineEvalImpl<OutputPipelineType, 0, InputType>
      output_pipeline_eval_impl_;
};

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_OUTPUT_H_