/****************************************************************************** * * Copyright (C) 2015 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ***************************************************************************** * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore */ /** ******************************************************************************* * @file * icv_variance_sse42.c * * @brief * This file contains the functions to compute variance * * @author * Ittiam * * @par List of Functions: * icv_variance_8x4_ssse3() * * @remarks * None * ******************************************************************************* */ /*****************************************************************************/ /* File Includes */ /*****************************************************************************/ /* System include files */ #include <stdio.h> #include <stdint.h> #include <string.h> #include <stdlib.h> #include <assert.h> #include <immintrin.h> /* User include files */ #include "icv_datatypes.h" #include "icv_macros.h" #include "icv_platform_macros.h" #include "icv.h" /** ******************************************************************************* * * @brief * Computes variance of a given 8x4 block * * @par Description * Compute variance of a given 8x4 block * * @param[in] pu1_src * Source * * @param[in] src_strd * Source stride * * @param[in] wd * Assumed to be 8 * * @param[in] ht * Assumed to be 4 * * @returns * Variance * * @remarks * ******************************************************************************* */ WORD32 icv_variance_8x4_ssse3(UWORD8 *pu1_src, WORD32 src_strd, WORD32 wd, WORD32 ht) { WORD32 sum; WORD32 sum_sqr; WORD32 blk_sz; WORD32 vrnc; __m128 src_r0, src_r1; __m128i ssrc_r0, ssrc_r1, ssrc_r2, ssrc_r3; __m128i sum_r0, sum_r1; __m128i sqr_r0, sqr_r1, sqr_r2, sqr_r3; __m128i vsum, vsum_sqr; __m128i zero; UNUSED(wd); UNUSED(ht); ASSERT(wd == 8); ASSERT(ht == 4); sum = 0; sum_sqr = 0; blk_sz = 8 * 4; zero = _mm_setzero_si128(); /* Load source */ src_r0 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src)); pu1_src += src_strd; src_r1 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src)); pu1_src += src_strd; src_r0 = _mm_loadh_pi (src_r0, (__m64 *) (pu1_src)); pu1_src += src_strd; src_r1 = _mm_loadh_pi (src_r1, (__m64 *) (pu1_src)); pu1_src += src_strd; /* Compute sum of all elements */ /* Use SAD with 0, since there is no pairwise addition */ sum_r0 = _mm_sad_epu8((__m128i)src_r0, zero); sum_r1 = _mm_sad_epu8((__m128i)src_r1, zero); /* Accumulate SAD */ vsum = _mm_add_epi64(sum_r0, sum_r1); vsum = _mm_add_epi64(vsum, _mm_srli_si128(vsum, 8)); sum = _mm_cvtsi128_si32(vsum); /* Unpack to 16 bits */ ssrc_r0 = _mm_unpacklo_epi8((__m128i)src_r0, zero); ssrc_r1 = _mm_unpacklo_epi8((__m128i)src_r1, zero); ssrc_r2 = _mm_unpackhi_epi8((__m128i)src_r0, zero); ssrc_r3 = _mm_unpackhi_epi8((__m128i)src_r1, zero); /* Compute sum of squares */ sqr_r0 = _mm_madd_epi16(ssrc_r0, ssrc_r0); sqr_r1 = _mm_madd_epi16(ssrc_r1, ssrc_r1); sqr_r2 = _mm_madd_epi16(ssrc_r2, ssrc_r2); sqr_r3 = _mm_madd_epi16(ssrc_r3, ssrc_r3); vsum_sqr = _mm_add_epi32(sqr_r0, sqr_r1); vsum_sqr = _mm_add_epi32(vsum_sqr, sqr_r2); vsum_sqr = _mm_add_epi32(vsum_sqr, sqr_r3); vsum_sqr = _mm_add_epi32(vsum_sqr, _mm_srli_si128(vsum_sqr, 8)); vsum_sqr = _mm_add_epi32(vsum_sqr, _mm_srli_si128(vsum_sqr, 4)); sum_sqr = _mm_cvtsi128_si32(vsum_sqr); /* Compute variance */ vrnc = ((sum_sqr * blk_sz) - (sum * sum)) / (blk_sz * blk_sz); return vrnc; }