// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// unpack_neon.h: optimized NEON specializations of the templates in unpack.h.

#ifndef GEMMLOWP_INTERNAL_UNPACK_NEON_H_
#define GEMMLOWP_INTERNAL_UNPACK_NEON_H_

#include "output_neon.h"
#include "unpack.h"

#include <arm_neon.h>

namespace gemmlowp {

// Multiplies each lane of x by the constant fraction numerator / denominator,
// with rounding, by splitting the fraction into an integer quotient plus a
// remainder expressed as a Q31 fixed-point multiplier.
template <std::uint32_t numerator, std::uint32_t denominator>
int32x4_t RoundingMultiplyByConstantFraction(int32x4_t x) {
  static_assert(numerator > 0 && denominator > 0,
                "only supporting positive num/denom");

  if (numerator == denominator) {
    return x;
  }

  static const std::int32_t int_quotient =
      (numerator + denominator / 2) / denominator;
  static const std::int32_t remaining_numerator =
      numerator - int_quotient * denominator;
  static const std::int32_t scaled_remaining_numerator =
      static_cast<std::int32_t>(
          (static_cast<std::int64_t>(remaining_numerator) * (1ll << 31)) /
          denominator);
  // Note: vqrdmulh instruction is rounding doubling multiply high.
  const int32x4_t remaining_product =
      vqrdmulhq_n_s32(x, scaled_remaining_numerator);

  return vmlaq_n_s32(remaining_product, x, int_quotient);
}

template <typename tScalar, VectorShape tShape>
int32x4_t get_int32x4_t_and_inc(
    ConstIterator<VectorMap<tScalar, tShape>>* iterator) {
  const int32x4_t result = vld1q_s32(iterator->get());
  *iterator += 4;
  return result;
}

template <typename tScalar, VectorShape tShape>
int32x4_t get_int32x4_t_and_inc(
    ConstIterator<VectorDup<tScalar, tShape>>* iterator) {
  const int32x4_t result = vdupq_n_s32(**iterator);
  // Increment really does nothing for VectorDup.
  *iterator += 4;
  return result;
}
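// For illustration only: a plain-C++ sketch of what the NEON version above
// computes per lane. The helper below is hypothetical (it is not used
// anywhere in gemmlowp); it models vqrdmulhq_n_s32 as a rounding doubling
// multiply-high, i.e. roughly round(a * b / 2^31), omitting the saturating
// case (which only arises when both operands are INT32_MIN).
inline std::int32_t ScalarRoundingMultiplyByConstantFractionSketch(
    std::int32_t x, std::int32_t int_quotient,
    std::int32_t scaled_remaining_numerator) {
  // Rounding doubling multiply high, modeling vqrdmulh.
  const std::int64_t ab =
      static_cast<std::int64_t>(x) * scaled_remaining_numerator;
  const std::int64_t nudge = ab >= 0 ? (1ll << 30) : (1 - (1ll << 30));
  const std::int32_t remaining_product =
      static_cast<std::int32_t>((ab + nudge) / (1ll << 31));
  // Add the integer part of the quotient:
  //   x * numerator / denominator
  //     ~= x * int_quotient + x * remaining_numerator / denominator.
  return remaining_product + x * int_quotient;
}
// For example, with numerator = 255 and denominator = 127 (7-bit storage):
//   int_quotient               = (255 + 63) / 127 = 2
//   remaining_numerator        = 255 - 2 * 127    = 1
//   scaled_remaining_numerator = 2^31 / 127      ~= 16909320
// so x * 255/127 is approximated as 2*x + x/127, with rounding.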
template <typename BitDepthParams, typename PackedResultType,
          typename OutputScalar, typename LhsOffset, typename RhsOffset,
          typename OutputPipelineType>
struct UnpackResultImpl<BitDepthParams,
                        MatrixMap<OutputScalar, MapOrder::ColMajor>,
                        PackedResultType, LhsOffset, RhsOffset,
                        OutputPipelineType> {
  typedef MatrixMap<OutputScalar, MapOrder::ColMajor> ResultBlockType;

  static void Unpack(ResultBlockType* dst, const PackedResultType& src,
                     int depth, const std::int32_t* lhs_sums_of_each_slice,
                     const std::int32_t* rhs_sums_of_each_slice,
                     const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
                     const OutputPipelineType& output_pipeline) {
    ScopedProfilingLabel label("optimized path (NEON)");
    const int kLhsBits = BitDepthParams::LhsBitDepth::kBits;
    const int kRhsBits = BitDepthParams::RhsBitDepth::kBits;
    const std::int32_t kLhsMax = (1 << kLhsBits) - 1;
    const std::int32_t kRhsMax = (1 << kRhsBits) - 1;
    auto src_map = src.Map();
    OutputPipelineExecutor<OutputPipelineType, FragmentInt32x1x1>
        output_pipeline_executor_int32x1x1(output_pipeline);
    OutputPipelineExecutor<OutputPipelineType, NEONFragmentInt32x4x1>
        output_pipeline_executor_int32x4x1(output_pipeline);
    OutputPipelineExecutor<OutputPipelineType, NEONFragmentInt32x16x1>
        output_pipeline_executor_int32x16x1(output_pipeline);

    for (int c = 0; c < dst->cols(); c++) {
      const std::int32_t* src_ptr = src_map.data(0, c);
      const std::int32_t* sums_of_each_slice_ptr = lhs_sums_of_each_slice;
      auto lhs_offset_iter = const_iterator(lhs_offset);
      const std::int32_t rhs_offset_c = rhs_offset(c);
      const std::int32_t rhs_sums_of_each_slice_c = rhs_sums_of_each_slice[c];

      // Handle 16 values at once for higher performance.
      int dst_rows_aligned16 = RoundDown<16>(dst->rows());
      for (int r = 0; r < dst_rows_aligned16; r += 16) {
        // Compute the sum of the 4 terms,
        //   q = term_xx + term_x1 + term_1x + term_11.
        // Refer to the generic code in unpack.h.
        int32x4_t raw_xx[4];
        for (int i = 0; i < 4; i++) {
          raw_xx[i] = vld1q_s32(src_ptr);
          src_ptr += 4;
        }
        int32x4_t raw_x1[4];
        for (int i = 0; i < 4; i++) {
          const int32x4_t sum_x1 = vld1q_s32(sums_of_each_slice_ptr);
          raw_x1[i] = vmulq_n_s32(sum_x1, rhs_offset_c);
          sums_of_each_slice_ptr += 4;
        }
        int32x4_t raw_1x[4];
        int32x4_t term_11[4];
        for (int i = 0; i < 4; i++) {
          const int32x4_t lhs_offsets =
              get_int32x4_t_and_inc(&lhs_offset_iter);
          raw_1x[i] = vmulq_n_s32(lhs_offsets, rhs_sums_of_each_slice_c);
          term_11[i] = vmulq_n_s32(lhs_offsets, rhs_offset_c * depth);
        }
        int32x4_t term_xx[4];
        for (int i = 0; i < 4; i++) {
          term_xx[i] =
              RoundingMultiplyByConstantFraction<255 * 255, kLhsMax * kRhsMax>(
                  raw_xx[i]);
        }
        int32x4_t term_x1[4];
        for (int i = 0; i < 4; i++) {
          term_x1[i] =
              RoundingMultiplyByConstantFraction<255, kLhsMax>(raw_x1[i]);
        }
        int32x4_t term_1x[4];
        for (int i = 0; i < 4; i++) {
          term_1x[i] =
              RoundingMultiplyByConstantFraction<255, kRhsMax>(raw_1x[i]);
        }
        int32x4x4_t q;
        for (int i = 0; i < 4; i++) {
          q.val[i] = vaddq_s32(vaddq_s32(term_xx[i], term_x1[i]),
                               vaddq_s32(term_1x[i], term_11[i]));
        }
        NEONFragmentInt32x16x1 f(q);
        output_pipeline_executor_int32x16x1.Execute(f, dst, r, c);
      }
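      // Recap of the decomposition evaluated by the loop above and the
      // loops below (unpack.h holds the authoritative derivation): each
      // destination entry is
      //
      //   dst(r, c) = sum_d (lhs(r, d) + lhs_offset(r)) *
      //                     (rhs(d, c) + rhs_offset(c))
      //             = term_xx + term_x1 + term_1x + term_11
      //
      // where raw_xx is the packed accumulator sum_d lhs(r, d) * rhs(d, c),
      // raw_x1 = rhs_offset(c) * sum_d lhs(r, d) (from lhs_sums_of_each_slice),
      // raw_1x = lhs_offset(r) * sum_d rhs(d, c) (from rhs_sums_of_each_slice),
      // and term_11 = lhs_offset(r) * rhs_offset(c) * depth. The
      // RoundingMultiplyByConstantFraction calls rescale the raw terms from
      // the storage bit depth (kLhsMax, kRhsMax) to the 8-bit range.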
      // We have finished handling groups of 16 entries at once; now
      // try to handle 4 entries at once.
      int dst_rows_aligned4 = RoundDown<4>(dst->rows());
      for (int r = dst_rows_aligned16; r < dst_rows_aligned4; r += 4) {
        // Compute the sum of the 4 terms,
        //   q = term_xx + term_x1 + term_1x + term_11.
        // Refer to the generic code in unpack.h.
        const int32x4_t raw_xx = vld1q_s32(src_ptr);
        src_ptr += 4;
        const int32x4_t term_xx =
            RoundingMultiplyByConstantFraction<255 * 255, kLhsMax * kRhsMax>(
                raw_xx);
        const int32x4_t sum_x1 = vld1q_s32(sums_of_each_slice_ptr);
        const int32x4_t raw_x1 = vmulq_n_s32(sum_x1, rhs_offset_c);
        sums_of_each_slice_ptr += 4;
        const int32x4_t term_x1 =
            RoundingMultiplyByConstantFraction<255, kLhsMax>(raw_x1);
        const int32x4_t lhs_offsets = get_int32x4_t_and_inc(&lhs_offset_iter);
        const int32x4_t raw_1x =
            vmulq_n_s32(lhs_offsets, rhs_sums_of_each_slice_c);
        const int32x4_t term_1x =
            RoundingMultiplyByConstantFraction<255, kRhsMax>(raw_1x);
        const int32x4_t term_11 =
            vmulq_n_s32(lhs_offsets, rhs_offset_c * depth);
        int32x4_t q = vaddq_s32(vaddq_s32(term_xx, term_x1),
                                vaddq_s32(term_1x, term_11));
        NEONFragmentInt32x4x1 f(q);
        output_pipeline_executor_int32x4x1.Execute(f, dst, r, c);
      }

      // We have finished handling 4 entries at once; now handle
      // remaining entries one by one. This scalar code is similar
      // to the code in unpack.h, see comments there.
      for (int r = dst_rows_aligned4; r < dst->rows(); r++) {
        const std::int32_t raw_xx = src_map(r, c);
        const std::int32_t raw_x1 = lhs_sums_of_each_slice[r] * rhs_offset_c;
        const std::int32_t raw_1x = rhs_sums_of_each_slice_c * lhs_offset(r);
        const std::int32_t term_xx =
            RoundingMultiplyByConstantFraction<255 * 255, kLhsMax * kRhsMax>(
                raw_xx);
        const std::int32_t term_x1 =
            RoundingMultiplyByConstantFraction<255, kLhsMax>(raw_x1);
        const std::int32_t term_1x =
            RoundingMultiplyByConstantFraction<255, kRhsMax>(raw_1x);
        const std::int32_t term_11 = lhs_offset(r) * rhs_offset(c) * depth;
        FragmentInt32x1x1 sum = term_xx + term_x1 + term_1x + term_11;
        output_pipeline_executor_int32x1x1.Execute(sum, dst, r, c);
      }
    }
  }
};
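// Note: when both bit depths are 8 bits, kLhsMax == kRhsMax == 255, so every
// RoundingMultiplyByConstantFraction instantiation above has
// numerator == denominator and reduces to the identity; the rescaling only
// costs anything for narrower storage bit depths.

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_UNPACK_NEON_H_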