/****************************************************************************** * * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ******************************************************************************/ /** ******************************************************************************* * @file * ihevcd_frm_cvt_x86_intr.c * * @brief * Platform specific intrinsic implementation of certain functions * * @author * Ittiam * @par List of Functions: * - ihevcd_itrans_recon_dc * - ihevcd_fmt_conv_420sp_to_420p * * @remarks * None * ******************************************************************************* */ #include "string.h" #include "ihevc_typedefs.h" #include "ihevc_defs.h" #include "ihevc_macros.h" #include "ihevc_platform_macros.h" #include "ihevcd_function_selector.h" #include <string.h> #include <immintrin.h> void ihevcd_fmt_conv_420sp_to_420p_ssse3(UWORD8 *pu1_y_src, UWORD8 *pu1_uv_src, UWORD8 *pu1_y_dst, UWORD8 *pu1_u_dst, UWORD8 *pu1_v_dst, WORD32 wd, WORD32 ht, WORD32 src_y_strd, WORD32 src_uv_strd, WORD32 dst_y_strd, WORD32 dst_uv_strd, WORD32 is_u_first, WORD32 disable_luma_copy) { UWORD8 *pu1_src, *pu1_dst; UWORD8 *pu1_u_src, *pu1_v_src; WORD32 num_rows, num_cols, src_strd, dst_strd, cols, rows; WORD32 i, j; cols = 0; pu1_u_src = (UWORD8 *)pu1_uv_src; pu1_v_src = (UWORD8 *)pu1_uv_src + 1; if(0 == disable_luma_copy) { /* copy luma */ pu1_src = (UWORD8 *)pu1_y_src; pu1_dst = (UWORD8 *)pu1_y_dst; num_rows = ht; num_cols = wd; src_strd = src_y_strd; dst_strd = dst_y_strd; for(i = 0; i < num_rows; i++) { memcpy(pu1_dst, pu1_src, num_cols); pu1_dst += dst_strd; pu1_src += src_strd; } } /* de-interleave U and V and copy to destination */ if(!is_u_first) { UWORD8 *temp = pu1_u_dst; pu1_u_dst = pu1_v_dst; pu1_v_dst = temp; pu1_u_src = (UWORD8 *)pu1_uv_src + 1; pu1_v_src = (UWORD8 *)pu1_uv_src; } { __m128i src_uv0_8x16b, src_uv1_8x16b, src_u_8x16b, src_v_8x16b; __m128i temp0_8x16b, temp1_8x16b, alt_first_mask; UWORD8 FIRST_ALT_SHUFFLE[16] = { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F }; PREFETCH((char const *)(pu1_uv_src + (0 * src_uv_strd)), _MM_HINT_T0) PREFETCH((char const *)(pu1_uv_src + (1 * src_uv_strd)), _MM_HINT_T0) PREFETCH((char const *)(pu1_uv_src + (2 * src_uv_strd)), _MM_HINT_T0) PREFETCH((char const *)(pu1_uv_src + (3 * src_uv_strd)), _MM_HINT_T0) PREFETCH((char const *)(pu1_uv_src + (4 * src_uv_strd)), _MM_HINT_T0) PREFETCH((char const *)(pu1_uv_src + (5 * src_uv_strd)), _MM_HINT_T0) PREFETCH((char const *)(pu1_uv_src + (6 * src_uv_strd)), _MM_HINT_T0) PREFETCH((char const *)(pu1_uv_src + (7 * src_uv_strd)), _MM_HINT_T0) num_rows = ht >> 1; num_cols = wd >> 1; src_strd = src_uv_strd; dst_strd = dst_uv_strd; alt_first_mask = _mm_loadu_si128((__m128i *)&FIRST_ALT_SHUFFLE[0]); if(num_cols > 15) { cols = num_cols >> 4; for(i = 0; i < (num_rows >> 2); i++) { UWORD8 *pu1_uv_src_temp, *pu1_u_dst_temp, *pu1_v_dst_temp; PREFETCH((char const *)(pu1_uv_src + (8 * src_strd)), _MM_HINT_T0) PREFETCH((char const *)(pu1_uv_src + (9 * src_strd)), _MM_HINT_T0) PREFETCH((char const *)(pu1_uv_src + (10 * src_strd)), _MM_HINT_T0) PREFETCH((char const *)(pu1_uv_src + (11 * src_strd)), _MM_HINT_T0) pu1_uv_src_temp = pu1_uv_src; pu1_u_dst_temp = pu1_u_dst; pu1_v_dst_temp = pu1_v_dst; for(j = 0; j < cols; j++) { /**** Row 0 ***/ src_uv0_8x16b = _mm_loadu_si128((__m128i *)pu1_uv_src_temp); src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + 16)); temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask); temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask); src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b); src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b); _mm_storeu_si128((__m128i *)(pu1_u_dst_temp), src_u_8x16b); _mm_storeu_si128((__m128i *)(pu1_v_dst_temp), src_v_8x16b); /**** Row 1 ***/ src_uv0_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (1 * src_strd))); src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (1 * src_strd) + 16)); temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask); temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask); src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b); src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b); _mm_storeu_si128((__m128i *)(pu1_u_dst_temp + (1 * dst_strd)), src_u_8x16b); _mm_storeu_si128((__m128i *)(pu1_v_dst_temp + (1 * dst_strd)), src_v_8x16b); /**** Row 2 ***/ src_uv0_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (2 * src_strd))); src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (2 * src_strd) + 16)); temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask); temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask); src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b); src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b); _mm_storeu_si128((__m128i *)(pu1_u_dst_temp + (2 * dst_strd)), src_u_8x16b); _mm_storeu_si128((__m128i *)(pu1_v_dst_temp + (2 * dst_strd)), src_v_8x16b); /**** Row 3 ***/ src_uv0_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (3 * src_strd))); src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (3 * src_strd) + 16)); temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask); temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask); src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b); src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b); _mm_storeu_si128((__m128i *)(pu1_u_dst_temp + (3 * dst_strd)), src_u_8x16b); _mm_storeu_si128((__m128i *)(pu1_v_dst_temp + (3 * dst_strd)), src_v_8x16b); pu1_u_dst_temp += 16; pu1_v_dst_temp += 16; pu1_uv_src_temp += 32; } pu1_u_dst += 4 * dst_strd; pu1_v_dst += 4 * dst_strd; pu1_uv_src += 4 * src_strd; //pu1_v_src += src_strd; } rows = num_rows & 0x3; if(rows) { for(i = 0; i < rows; i++) { UWORD8 *pu1_uv_src_temp, *pu1_u_dst_temp, *pu1_v_dst_temp; pu1_uv_src_temp = pu1_uv_src; pu1_u_dst_temp = pu1_u_dst; pu1_v_dst_temp = pu1_v_dst; for(j = 0; j < cols; j++) { src_uv0_8x16b = _mm_loadu_si128((__m128i *)pu1_uv_src_temp); src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + 16)); temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask); temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask); src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b); src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b); _mm_storeu_si128((__m128i *)(pu1_u_dst_temp), src_u_8x16b); _mm_storeu_si128((__m128i *)(pu1_v_dst_temp), src_v_8x16b); pu1_u_dst_temp += 16; pu1_v_dst_temp += 16; pu1_uv_src_temp += 32; } pu1_u_dst += dst_strd; pu1_v_dst += dst_strd; pu1_uv_src += src_strd; } } pu1_u_dst -= (num_rows * dst_strd); pu1_v_dst -= (num_rows * dst_strd); num_cols &= 0x0F; } if(num_cols) { pu1_u_dst += (cols << 4); pu1_v_dst += (cols << 4); pu1_u_src += 2 * (cols << 4); pu1_v_src += 2 * (cols << 4); for(i = 0; i < num_rows; i++) { for(j = 0; j < num_cols; j++) { pu1_u_dst[j] = pu1_u_src[j * 2]; pu1_v_dst[j] = pu1_v_src[j * 2]; } pu1_u_dst += dst_strd; pu1_v_dst += dst_strd; pu1_u_src += src_strd; pu1_v_src += src_strd; } } } return; }