// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// output_neon.h: optimized NEON specializations of the templates in output.h.

#ifndef GEMMLOWP_INTERNAL_OUTPUT_NEON_H_
#define GEMMLOWP_INTERNAL_OUTPUT_NEON_H_

#include "output.h"

#include <arm_neon.h>

namespace gemmlowp {

// Definitions of Fragment types wrapping NEON vector types.
typedef Fragment<int32x4_t, 4, 1, MapOrder::ColMajor> NEONFragmentInt32x4x1;
typedef Fragment<int32x4x4_t, 16, 1, MapOrder::ColMajor>
    NEONFragmentInt32x16x1;
typedef Fragment<uint8x8_t, 4, 1, MapOrder::ColMajor> NEONFragmentUint8x4x1;
typedef Fragment<uint8x16_t, 16, 1, MapOrder::ColMajor> NEONFragmentUint8x16x1;

// The code in unpack_neon.h will, whenever possible, process 16 entries at
// once (4 SIMD vectors of 4 entries each at once), to offer the compiler
// better optimization opportunities, reducing register dependencies. From the
// perspective of interfacing with the output pipeline, this takes the form of
// passing Fragment types wrapping int32x4x4_t data. In most cases, such data
// is handled simply by processing its 4 int32x4_t components separately. This
// partial specialization handles that for arbitrary output stages
// implementing an int32x4_t path. Only some output stages below override this
// with custom code that handles int32x4x4_t data all at once (see
// OutputStageSaturatingCastToUint8 below).
template <typename OutputStageType>
struct OutputStageEvalImpl<OutputStageType, NEONFragmentInt32x16x1> {
  typedef NEONFragmentInt32x16x1 InputType;
  typedef NEONFragmentInt32x16x1 OutputType;
  typedef OutputStageEvalImpl<OutputStageType, NEONFragmentInt32x4x1>
      ImplInt32x4;

  OutputStageEvalImpl(const OutputStageType& s) : impl_int32x4(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    OutputType output;
    for (int i = 0; i < 4; i++) {
      output.data.val[i] =
          impl_int32x4.Eval(input.data.val[i], row + 4 * i, col);
    }
    return output;
  }

  ImplInt32x4 impl_int32x4;
};
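// For reference, the scalar computation that the quantize-down stages below
// implement with NEON intrinsics is roughly:
//
//   rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1));
//   result = ((input + result_offset) * result_mult_int + rounding)
//            >> result_shift;
//
// In the specializations below, vaddq_s32 performs the offset addition,
// vmlaq_n_s32 / vmlaq_s32 perform the multiply-accumulate onto the rounding
// term, and vshlq_s32 with a negated shift count performs the final
// arithmetic right shift.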
// Implementation of OutputStageQuantizeDownInt32ToUint8Scale for
// NEONFragmentInt32x4x1
template <>
struct OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8Scale,
                           NEONFragmentInt32x4x1> {
  typedef NEONFragmentInt32x4x1 InputType;
  typedef NEONFragmentInt32x4x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8Scale OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int, int) const {
    const std::int32_t result_shift = output_stage.result_shift;
    const std::int32_t result_mult_int = output_stage.result_mult_int;
    const std::int32_t result_offset = output_stage.result_offset;
    const std::int32_t preshift_offset =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    const int32x4_t a = vaddq_s32(input, vdupq_n_s32(result_offset));
    const int32x4_t b =
        vmlaq_n_s32(vdupq_n_s32(preshift_offset), a, result_mult_int);
    return vshlq_s32(b, vdupq_n_s32(-result_shift));
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageQuantizeDownInt32ToUint8ScalePC for
// NEONFragmentInt32x4x1
template <>
struct OutputStageEvalImpl<
    OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>,
    NEONFragmentInt32x4x1> {
  typedef NEONFragmentInt32x4x1 InputType;
  typedef NEONFragmentInt32x4x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>
      OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    const std::int32_t result_shift = output_stage.result_shift;
    const std::int32_t preshift_offset =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    const int32x4_t result_mult_int =
        vld1q_s32(output_stage.result_mult_int.data(row));
    const int32x4_t result_offset =
        vld1q_s32(output_stage.result_offset.data(row));
    const int32x4_t a = vaddq_s32(input, result_offset);
    const int32x4_t b =
        vmlaq_s32(vdupq_n_s32(preshift_offset), a, result_mult_int);
    return vshlq_s32(b, vdupq_n_s32(-result_shift));
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageQuantizeDownInt32ToUint8ScalePC for
// NEONFragmentInt32x4x1
template <>
struct OutputStageEvalImpl<
    OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>,
    NEONFragmentInt32x4x1> {
  typedef NEONFragmentInt32x4x1 InputType;
  typedef NEONFragmentInt32x4x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>
      OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    const std::int32_t result_shift = output_stage.result_shift;
    const std::int32_t preshift_offset =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    // A Row-shaped (per-column) vector is indexed by col, for both the
    // multiplier and the offset.
    const int32x4_t result_mult_int =
        vld1q_s32(output_stage.result_mult_int.data(col));
    const int32x4_t result_offset =
        vld1q_s32(output_stage.result_offset.data(col));
    const int32x4_t a = vaddq_s32(input, result_offset);
    const int32x4_t b =
        vmlaq_s32(vdupq_n_s32(preshift_offset), a, result_mult_int);
    return vshlq_s32(b, vdupq_n_s32(-result_shift));
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageSaturatingCastToUint8 for NEONFragmentInt32x4x1
template <>
struct OutputStageEvalImpl<OutputStageSaturatingCastToUint8,
                           NEONFragmentInt32x4x1> {
  typedef NEONFragmentInt32x4x1 InputType;
  typedef NEONFragmentUint8x4x1 OutputType;
  typedef OutputStageSaturatingCastToUint8 OutputStage;

  OutputStageEvalImpl(const OutputStage&) {}

  OutputType Eval(InputType input, int, int) const {
    int16x8_t q16 = vcombine_s16(vqmovn_s32(input), vdup_n_s16(0));
    return vqmovun_s16(q16);
  }
};
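// A note on the saturating cast above (a sketch of the equivalent scalar
// logic, not part of the pipeline itself): vqmovn_s32 narrows each int32 lane
// to int16 with signed saturation, the int16x4 result is placed in the low
// half of an int16x8_t (the high half being zero-filled by vdup_n_s16(0)),
// and vqmovun_s16 then narrows to uint8 with saturation to [0, 255]. Per
// lane, this is equivalent to:
//
//   std::uint8_t result = static_cast<std::uint8_t>(
//       std::min(std::int32_t(255), std::max(std::int32_t(0), input)));
//
// Only the low 4 bytes of the resulting uint8x8_t carry data, which is the
// 50% register utilization issue discussed below.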
// In the case of OutputStageSaturatingCastToUint8, the handling of
// NEONFragmentInt32x16x1 data can be made much more efficient by handling
// it all at once, instead of as 4 separate int32x4 values as in the above
// generic partial specialization. This also avoids the poor (50%) register
// utilization of FragmentUint8x4x1: by handling 16 scalar values at once,
// we are able to fill a uint8x16_t.
template <>
struct OutputStageEvalImpl<OutputStageSaturatingCastToUint8,
                           NEONFragmentInt32x16x1> {
  typedef NEONFragmentInt32x16x1 InputType;
  typedef NEONFragmentUint8x16x1 OutputType;
  typedef OutputStageSaturatingCastToUint8 OutputStage;

  OutputStageEvalImpl(const OutputStage&) {}

  OutputType Eval(InputType input, int, int) const {
    int16x8_t q16[2];
    for (int i = 0; i < 2; i++) {
      q16[i] = vcombine_s16(vqmovn_s32(input.data.val[2 * i]),
                            vqmovn_s32(input.data.val[2 * i + 1]));
    }
    return vcombine_u8(vqmovun_s16(q16[0]), vqmovun_s16(q16[1]));
  }
};

// Implementation of OutputStageBiasAddition for NEONFragmentInt32x4x1
template <typename VectorType>
struct OutputStageEvalImpl<OutputStageBiasAddition<VectorType>,
                           NEONFragmentInt32x4x1> {
  typedef NEONFragmentInt32x4x1 InputType;
  typedef NEONFragmentInt32x4x1 OutputType;
  typedef OutputStageBiasAddition<VectorType> OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    int32x4_t bias;
    if (VectorType::kShape == VectorShape::Row) {
      // A Row-shaped bias vector has one entry per column, shared by all
      // 4 rows of this column-major fragment, hence the broadcast.
      bias = vdupq_n_s32(output_stage.bias_vector(col));
    } else {
      bias = vld1q_s32(output_stage.bias_vector.data(row));
    }
    return vaddq_s32(input, bias);
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageClamp for NEONFragmentInt32x4x1
template <>
struct OutputStageEvalImpl<OutputStageClamp, NEONFragmentInt32x4x1> {
  typedef NEONFragmentInt32x4x1 InputType;
  typedef NEONFragmentInt32x4x1 OutputType;
  typedef OutputStageClamp OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int, int) const {
    const int32x4_t min = vdupq_n_s32(output_stage.min);
    const int32x4_t max = vdupq_n_s32(output_stage.max);
    return vminq_s32(vmaxq_s32(input, min), max);
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageTanh for NEONFragmentInt32x4x1
template <>
struct OutputStageEvalImpl<OutputStageTanh, NEONFragmentInt32x4x1>
    : OutputStageTanhEvalImpl<NEONFragmentInt32x4x1> {
  OutputStageEvalImpl(const OutputStageTanh& output_stage)
      : OutputStageTanhEvalImpl(output_stage) {}
};

// Specialization of StoreFinalOutput for NEONFragmentUint8x4x1.
// This is quite inefficient, but we have no choice: instructions storing
// 32 bits at once also assume 32-bit alignment. In practice, this slowness
// is not a problem because we use the x16 path for most values.
template <typename DstType>
inline void StoreFinalOutput(NEONFragmentUint8x4x1 value, DstType* dst,
                             int row, int col) {
  vst1_lane_u8(dst->data(row + 0, col), value, 0);
  vst1_lane_u8(dst->data(row + 1, col), value, 1);
  vst1_lane_u8(dst->data(row + 2, col), value, 2);
  vst1_lane_u8(dst->data(row + 3, col), value, 3);
}

// Specialization of StoreFinalOutput for NEONFragmentUint8x16x1.
template <typename DstType>
inline void StoreFinalOutput(NEONFragmentUint8x16x1 value, DstType* dst,
                             int row, int col) {
  vst1q_u8(dst->data(row, col), value);
}

// Specialization of StoreFinalOutput for NEONFragmentInt32x4x1, storing into
// an int32 destination.
template <typename DstType>
inline void StoreFinalOutput(NEONFragmentInt32x4x1 value, DstType* dst,
                             int row, int col) {
  vst1q_s32(dst->data(row, col), value);
}
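// Layout note for the x16 fragments: NEONFragmentInt32x16x1 is a 16x1,
// column-major fragment, so its four int32x4_t components cover 16
// consecutive rows of a single column. That is why the specialization below
// stores value.data.val[i] at row offset 4 * i, mirroring the row + 4 * i
// indexing used by the generic int32x4x4_t partial specialization near the
// top of this file.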
// Specialization of StoreFinalOutput for NEONFragmentInt32x16x1, storing into
// an int32 destination.
template <typename DstType>
inline void StoreFinalOutput(NEONFragmentInt32x16x1 value, DstType* dst,
                             int row, int col) {
  for (int i = 0; i < 4; i++) {
    vst1q_s32(dst->data(row + 4 * i, col), value.data.val[i]);
  }
}

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_OUTPUT_NEON_H_