/****************************************************************************** * * Copyright (C) 2018 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ***************************************************************************** * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore */ /** ******************************************************************************* * @file * ihevce_scale_by_2_neon.c * * @brief * Contains definitions of functions for scale by 2 * * @author * Ittiam * * @par List of Functions: * * @remarks * None * ******************************************************************************** */ /*****************************************************************************/ /* File Includes */ /*****************************************************************************/ /* System include files */ #include <stdio.h> #include <string.h> #include <assert.h> #include <arm_neon.h> /* System user files */ #include "ihevc_typedefs.h" #include "ihevc_macros.h" #include "itt_video_api.h" #include "ihevce_ipe_instr_set_router.h" /*****************************************************************************/ /* Constant Macros */ /*****************************************************************************/ #define FILT_TAP_Q 7 /*****************************************************************************/ /* Function Definitions */ /*****************************************************************************/ static void ihevce_horz_scale_neon_w16( UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_dst, WORD32 dst_strd, WORD32 wd, WORD32 ht) { const int16x8_t prec = vdupq_n_s16(8192); const int16x8_t inv_prec = vdupq_n_s16(64); const uint8x8_t wt_0 = vdup_n_u8(66); const int8_t wt_1 = 40; const int8_t wt_2 = 9; WORD32 i, j; for(j = 0; j < ht; j++) { UWORD8 *pu1_src_tmp = pu1_src + j * src_strd - 3; UWORD8 *pu1_dst_tmp = pu1_dst + j * dst_strd; for(i = 0; i < wd;) { uint8x16x2_t src = vld2q_u8(pu1_src_tmp); uint8x8_t c, l0, r0, r3; int16x8_t p, q, r; int16x8_t sum; c = vext_u8(vget_low_u8(src.val[1]), vget_high_u8(src.val[1]), 1); l0 = vext_u8(vget_low_u8(src.val[0]), vget_high_u8(src.val[0]), 1); r0 = vext_u8(vget_low_u8(src.val[0]), vget_high_u8(src.val[0]), 2); r3 = vext_u8(vget_low_u8(src.val[0]), vget_high_u8(src.val[0]), 3); p = vreinterpretq_s16_u16(vmull_u8(c, wt_0)); // a[0] * 66 q = vreinterpretq_s16_u16(vaddl_u8(l0, r0)); q = vmulq_n_s16(q, wt_1); // (a[-1] + a[1]) * 40 r = vreinterpretq_s16_u16(vaddl_u8(r3, vget_low_u8(src.val[0]))); r = vmulq_n_s16(r, wt_2); // (a[-3] + a[3]) * 9 // a[0] * 66 + (a[-1] + a[1]) * 40 - (a[-3] + a[3]) * 9 sum = vsubq_s16(p, prec); q = vsubq_s16(q, r); sum = vaddq_s16(q, sum); sum = vrshrq_n_s16(sum, FILT_TAP_Q); sum = vaddq_s16(sum, inv_prec); // result c = vqmovun_s16(sum); vst1_u8(pu1_dst_tmp, c); i += 16; pu1_src_tmp += 16; pu1_dst_tmp += 8; } } } static void ihevce_vert_scale_neon_w16( UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_dst, WORD32 dst_strd, WORD32 wd, WORD32 ht) { const int16x8_t prec = vdupq_n_s16(8192); const int16x8_t inv_prec = vdupq_n_s16(64); const uint8x8_t wt_0 = vdup_n_u8(66); const int8_t wt_1 = 40; const int8_t wt_2 = 9; WORD32 i, j; #define LOAD_ROW() \ { \ src[mod8] = vld1q_u8(pu1_src_tmp); \ pu1_src_tmp += src_strd; \ mod8++; \ mod8 &= 7; \ } for(i = 0; i < wd; i += 16) { UWORD8 *pu1_src_tmp = pu1_src - 3 * src_strd + i; WORD32 lut_id = 0; UWORD8 mod8 = 0; uint8x16_t src[8]; LOAD_ROW() // r[-3] LOAD_ROW() // r[-2] LOAD_ROW() // r[-1] LOAD_ROW() // r[0] LOAD_ROW() // r[1] for(j = 0; j < ht; j += 2) { UWORD8 *pu1_dst_tmp = pu1_dst + (j >> 1) * dst_strd + i; UWORD8 c, t1, b1, t2, b2; int16x8_t p, q, r; int16x8_t sum; LOAD_ROW() // r[2] LOAD_ROW() // r[3] t2 = (lut_id & 7); t1 = (lut_id + 2) & 7; c = (lut_id + 3) & 7; b1 = (lut_id + 4) & 7; b2 = (lut_id + 6) & 7; lut_id += 2; // a[0] * 66 p = vreinterpretq_s16_u16(vmull_u8(vget_low_u8(src[c]), wt_0)); // (a[-1] + a[1]) * 40 q = vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(src[t1]), vget_low_u8(src[b1]))); q = vmulq_n_s16(q, wt_1); // (a[-3] + a[3]) * 9 r = vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(src[t2]), vget_low_u8(src[b2]))); r = vmulq_n_s16(r, wt_2); // a[0] * 66 + (a[-1] + a[1]) * 40 - (a[-3] + a[3]) * 9 sum = vsubq_s16(p, prec); q = vsubq_s16(q, r); sum = vaddq_s16(q, sum); sum = vrshrq_n_s16(sum, FILT_TAP_Q); sum = vaddq_s16(sum, inv_prec); vst1_u8(pu1_dst_tmp, vqmovun_s16(sum)); // a[0] * 66 p = vreinterpretq_s16_u16(vmull_u8(vget_high_u8(src[c]), wt_0)); // (a[-1] + a[1]) * 40 q = vreinterpretq_s16_u16(vaddl_u8(vget_high_u8(src[t1]), vget_high_u8(src[b1]))); q = vmulq_n_s16(q, wt_1); // (a[-3] + a[3]) * 9 r = vreinterpretq_s16_u16(vaddl_u8(vget_high_u8(src[t2]), vget_high_u8(src[b2]))); r = vmulq_n_s16(r, wt_2); // a[0] * 66 + (a[-1] + a[1]) * 40 - (a[-3] + a[3]) * 9 sum = vsubq_s16(p, prec); q = vsubq_s16(q, r); sum = vaddq_s16(q, sum); sum = vrshrq_n_s16(sum, FILT_TAP_Q); sum = vaddq_s16(sum, inv_prec); vst1_u8(pu1_dst_tmp + 8, vqmovun_s16(sum)); pu1_dst_tmp += 16; } } } void ihevce_scaling_filter_mxn_neon( UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_scrtch, WORD32 scrtch_strd, UWORD8 *pu1_dst, WORD32 dst_strd, WORD32 ht, WORD32 wd) { WORD32 i, j; assert(wd >= 16 && wd % 16 == 0); assert(ht % 2 == 0); for(j = 0; j < ht;) { UWORD8 *pu1_src_tmp = pu1_src + j * src_strd; UWORD8 *pu1_dst_tmp = pu1_dst + (j >> 1) * dst_strd; WORD32 rows = MIN(64, (ht - j)); for(i = 0; i < wd;) { WORD32 cols; if((wd - i) >= 64) cols = 64; else if((wd - i) >= 32) cols = 32; else cols = 16; ihevce_horz_scale_neon_w16( pu1_src_tmp - 3 * src_strd + i, src_strd, pu1_scrtch, scrtch_strd, cols, (3 + rows + 2)); ihevce_vert_scale_neon_w16( pu1_scrtch + 3 * scrtch_strd, scrtch_strd, pu1_dst_tmp + (i >> 1), dst_strd, (cols >> 1), rows); i += cols; } j += rows; } }