Android Nougat 7.1.1_r28 / external/opencv3/modules/core/src/arithm.cpp
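The source below contains OpenCV's element-wise arithmetic kernels (add, subtract, min, max, absdiff and the bitwise operations) as bundled with this Android release. For reference, here is a minimal usage sketch of the public cv:: entry points that these kernels back; the snippet is illustrative only and is not part of arithm.cpp:

#include <opencv2/core.hpp>

int main()
{
    // Two single-channel 8-bit matrices filled with constants.
    cv::Mat a(4, 4, CV_8UC1, cv::Scalar(200));
    cv::Mat b(4, 4, CV_8UC1, cv::Scalar(100));

    cv::Mat sum, diff, bits, lo, hi;
    cv::add(a, b, sum);          // saturating add for CV_8U (add8u kernel)
    cv::absdiff(a, b, diff);     // per-element |a - b| (absdiff8u kernel)
    cv::bitwise_and(a, b, bits); // byte-wise AND (and8u kernel)
    cv::min(a, b, lo);           // per-element minimum (min8u kernel)
    cv::max(a, b, hi);           // per-element maximum (max8u kernel)
    return 0;
}
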
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* ////////////////////////////////////////////////////////////////////
//
//  Arithmetic and logical operations: +, -, *, /, &, |, ^, ~, abs ...
//
// */

#include "precomp.hpp"
#include "opencl_kernels_core.hpp"

namespace cv
{

struct NOP {};

#if CV_SSE2 || CV_NEON

#define FUNCTOR_TEMPLATE(name)          \
    template<typename T> struct name {}

FUNCTOR_TEMPLATE(VLoadStore128);
#if CV_SSE2
FUNCTOR_TEMPLATE(VLoadStore64);
FUNCTOR_TEMPLATE(VLoadStore128Aligned);
#if CV_AVX2
FUNCTOR_TEMPLATE(VLoadStore256);
FUNCTOR_TEMPLATE(VLoadStore256Aligned);
#endif
#endif

#endif

template<typename T, class Op, class VOp>
void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2,
            T* dst, size_t step, Size sz)
{
#if CV_SSE2 || CV_NEON
    VOp vop;
#endif
    Op op;

    for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1),
                        src2 = (const T *)((const uchar *)src2 + step2),
                        dst = (T *)((uchar *)dst + step) )
    {
        int x = 0;

#if CV_NEON || CV_SSE2
#if CV_AVX2
        if( USE_AVX2 )
        {
            for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) )
            {
                typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x);
                r0 = vop(r0, VLoadStore256<T>::load(src2 + x));
                VLoadStore256<T>::store(dst + x, r0);
            }
        }
#else
#if CV_SSE2
        if( USE_SSE2 )
        {
#endif // CV_SSE2
            for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) )
            {
                typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x               );
                typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 16/sizeof(T));
                r0 = vop(r0, VLoadStore128<T>::load(src2 + x               ));
                r1 = vop(r1, VLoadStore128<T>::load(src2 + x + 16/sizeof(T)));
                VLoadStore128<T>::store(dst + x               , r0);
                VLoadStore128<T>::store(dst + x + 16/sizeof(T), r1);
            }
#if CV_SSE2
        }
#endif // CV_SSE2
#endif // CV_AVX2
#endif // CV_NEON || CV_SSE2

#if CV_AVX2
        // nothing
#elif CV_SSE2
        if( USE_SSE2 )
        {
            for( ; x <= sz.width - 8/(int)sizeof(T); x += 8/sizeof(T) )
            {
                typename VLoadStore64<T>::reg_type r = VLoadStore64<T>::load(src1 + x);
                r = vop(r, VLoadStore64<T>::load(src2 + x));
                VLoadStore64<T>::store(dst + x, r);
            }
        }
#endif

#if CV_ENABLE_UNROLLED
        for( ; x <= sz.width - 4; x += 4 )
        {
            T v0 = op(src1[x], src2[x]);
            T v1 = op(src1[x+1], src2[x+1]);
            dst[x] = v0; dst[x+1] = v1;
            v0 = op(src1[x+2], src2[x+2]);
            v1 = op(src1[x+3], src2[x+3]);
            dst[x+2] = v0; dst[x+3] = v1;
        }
#endif

        for( ; x < sz.width; x++ )
            dst[x] = op(src1[x], src2[x]);
    }
}

template<typename T, class Op, class Op32>
void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
              T* dst, size_t step, Size sz)
{
#if CV_SSE2 || CV_NEON
    Op32 op32;
#endif
    Op op;

    for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1),
                        src2 = (const T *)((const uchar *)src2 + step2),
                        dst = (T *)((uchar *)dst + step) )
    {
        int x = 0;

#if CV_AVX2
        if( USE_AVX2 )
        {
            if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 )
            {
                for( ; x <= sz.width - 8; x += 8 )
                {
                    typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x);
                    r0 = op32(r0, VLoadStore256Aligned<T>::load(src2 + x));
                    VLoadStore256Aligned<T>::store(dst + x, r0);
                }
            }
        }
#elif CV_SSE2
        if( USE_SSE2 )
        {
            if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
            {
                for( ; x <= sz.width - 8; x += 8 )
                {
                    typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x    );
                    typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 4);
                    r0 = op32(r0, VLoadStore128Aligned<T>::load(src2 + x    ));
                    r1 = op32(r1, VLoadStore128Aligned<T>::load(src2 + x + 4));
                    VLoadStore128Aligned<T>::store(dst + x    , r0);
                    VLoadStore128Aligned<T>::store(dst + x + 4, r1);
                }
            }
        }
#endif // CV_AVX2

#if CV_NEON || CV_SSE2
#if CV_AVX2
        if( USE_AVX2 )
        {
            for( ; x <= sz.width - 8; x += 8 )
            {
                typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x);
                r0 = op32(r0, VLoadStore256<T>::load(src2 + x));
                VLoadStore256<T>::store(dst + x, r0);
            }
        }
#else
#if CV_SSE2
        if( USE_SSE2 )
        {
#endif // CV_SSE2
            for( ; x <= sz.width - 8; x += 8 )
            {
                typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x    );
                typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 4);
                r0 = op32(r0, VLoadStore128<T>::load(src2 + x    ));
                r1 = op32(r1, VLoadStore128<T>::load(src2 + x + 4));
                VLoadStore128<T>::store(dst + x    , r0);
                VLoadStore128<T>::store(dst + x + 4, r1);
            }
#if CV_SSE2
        }
#endif // CV_SSE2
#endif // CV_AVX2
#endif // CV_NEON || CV_SSE2

#if CV_ENABLE_UNROLLED
        for( ; x <= sz.width - 4; x += 4 )
        {
            T v0 = op(src1[x], src2[x]);
            T v1 = op(src1[x+1], src2[x+1]);
            dst[x] = v0; dst[x+1] = v1;
            v0 = op(src1[x+2], src2[x+2]);
            v1 = op(src1[x+3], src2[x+3]);
            dst[x+2] = v0; dst[x+3] = v1;
        }
#endif

        for( ; x < sz.width; x++ )
            dst[x] = op(src1[x], src2[x]);
    }
}

template<typename T, class Op, class Op64>
void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2,
              T* dst, size_t step, Size sz)
{
#if CV_SSE2
    Op64 op64;
#endif
    Op op;

    for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1),
                        src2 = (const T *)((const uchar *)src2 + step2),
                        dst = (T *)((uchar *)dst + step) )
    {
        int x = 0;

#if CV_AVX2
        if( USE_AVX2 )
        {
            if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 )
            {
                for( ; x <= sz.width - 4; x += 4 )
                {
                    typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x);
                    r0 = op64(r0, VLoadStore256Aligned<T>::load(src2 + x));
                    VLoadStore256Aligned<T>::store(dst + x, r0);
                }
            }
        }
#elif CV_SSE2
        if( USE_SSE2 )
        {
            if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
            {
                for( ; x <= sz.width - 4; x += 4 )
                {
                    typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x    );
                    typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 2);
                    r0 = op64(r0, VLoadStore128Aligned<T>::load(src2 + x    ));
                    r1 = op64(r1, VLoadStore128Aligned<T>::load(src2 + x + 2));
                    VLoadStore128Aligned<T>::store(dst + x    , r0);
                    VLoadStore128Aligned<T>::store(dst + x + 2, r1);
                }
            }
        }
#endif

        for( ; x <= sz.width - 4; x += 4 )
        {
            T v0 = op(src1[x], src2[x]);
            T v1 = op(src1[x+1], src2[x+1]);
            dst[x] = v0; dst[x+1] = v1;
            v0 = op(src1[x+2], src2[x+2]);
            v1 = op(src1[x+3], src2[x+3]);
            dst[x+2] = v0; dst[x+3] = v1;
        }

        for( ; x < sz.width; x++ )
            dst[x] = op(src1[x], src2[x]);
    }
}

#if CV_AVX2
#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body) \
    template <>                                                                           \
    struct name<template_arg> \
{ \ typedef register_type reg_type; \ static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \ static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \ } #define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \ template <> \ struct name
{ \ typedef register_type reg_type; \ static reg_type load(const template_arg * p) { return load_body (p); } \ static void store(template_arg * p, reg_type v) { store_body (p, v); } \ } #define FUNCTOR_CLOSURE_2arg(name, template_arg, body) \ template<> \ struct name
\ { \ VLoadStore256
::reg_type operator()( \ const VLoadStore256
::reg_type & a, \ const VLoadStore256
::reg_type & b) const \ { \ body; \ } \ } #define FUNCTOR_CLOSURE_1arg(name, template_arg, body) \ template<> \ struct name
\ { \ VLoadStore256
::reg_type operator()( \ const VLoadStore256
::reg_type & a, \ const VLoadStore256
::reg_type & ) const \ { \ body; \ } \ } FUNCTOR_LOADSTORE_CAST(VLoadStore256, uchar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); FUNCTOR_LOADSTORE_CAST(VLoadStore256, schar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); FUNCTOR_LOADSTORE_CAST(VLoadStore256, ushort, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); FUNCTOR_LOADSTORE_CAST(VLoadStore256, short, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); FUNCTOR_LOADSTORE_CAST(VLoadStore256, int, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); FUNCTOR_LOADSTORE( VLoadStore256, float, __m256 , _mm256_loadu_ps , _mm256_storeu_ps ); FUNCTOR_LOADSTORE( VLoadStore256, double, __m256d, _mm256_loadu_pd , _mm256_storeu_pd ); FUNCTOR_LOADSTORE_CAST(VLoadStore256Aligned, int, __m256i, _mm256_load_si256, _mm256_store_si256); FUNCTOR_LOADSTORE( VLoadStore256Aligned, float, __m256 , _mm256_load_ps , _mm256_store_ps ); FUNCTOR_LOADSTORE( VLoadStore256Aligned, double, __m256d, _mm256_load_pd , _mm256_store_pd ); FUNCTOR_TEMPLATE(VAdd); FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm256_adds_epu8 (a, b)); FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm256_adds_epi8 (a, b)); FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm256_adds_epu16(a, b)); FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm256_adds_epi16(a, b)); FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm256_add_epi32 (a, b)); FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm256_add_ps (a, b)); FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm256_add_pd (a, b)); FUNCTOR_TEMPLATE(VSub); FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm256_subs_epu8 (a, b)); FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm256_subs_epi8 (a, b)); FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm256_subs_epu16(a, b)); FUNCTOR_CLOSURE_2arg(VSub, short, return _mm256_subs_epi16(a, b)); FUNCTOR_CLOSURE_2arg(VSub, int, return _mm256_sub_epi32 (a, b)); FUNCTOR_CLOSURE_2arg(VSub, float, return _mm256_sub_ps (a, b)); FUNCTOR_CLOSURE_2arg(VSub, double, return _mm256_sub_pd (a, b)); FUNCTOR_TEMPLATE(VMin); FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm256_min_epu8 (a, b)); FUNCTOR_CLOSURE_2arg(VMin, schar, return _mm256_min_epi8 (a, b)); FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm256_min_epi16(a, b)); FUNCTOR_CLOSURE_2arg(VMin, short, return _mm256_min_epi16(a, b)); FUNCTOR_CLOSURE_2arg(VMin, int, return _mm256_min_epi32(a, b)); FUNCTOR_CLOSURE_2arg(VMin, float, return _mm256_min_ps (a, b)); FUNCTOR_CLOSURE_2arg(VMin, double, return _mm256_min_pd (a, b)); FUNCTOR_TEMPLATE(VMax); FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm256_max_epu8 (a, b)); FUNCTOR_CLOSURE_2arg(VMax, schar, return _mm256_max_epi8 (a, b)); FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm256_max_epu16(a, b)); FUNCTOR_CLOSURE_2arg(VMax, short, return _mm256_max_epi16(a, b)); FUNCTOR_CLOSURE_2arg(VMax, int, return _mm256_max_epi32(a, b)); FUNCTOR_CLOSURE_2arg(VMax, float, return _mm256_max_ps (a, b)); FUNCTOR_CLOSURE_2arg(VMax, double, return _mm256_max_pd (a, b)); static unsigned int CV_DECL_ALIGNED(32) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; static unsigned int CV_DECL_ALIGNED(32) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; FUNCTOR_TEMPLATE(VAbsDiff); FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, return _mm256_add_epi8(_mm256_subs_epu8(a, b), _mm256_subs_epu8(b, a)); ); FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, __m256i d = _mm256_subs_epi8(a, b); __m256i m = _mm256_cmpgt_epi8(b, a); return _mm256_subs_epi8(_mm256_xor_si256(d, m), m); ); 
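// Note on the VAbsDiff closures: for unsigned lanes |a - b| is built as
// saturating(a - b) + saturating(b - a); for the signed 8-bit lanes the
// difference is taken with saturation and the sign is fixed up via the
// (b > a) comparison mask.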
FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, return _mm256_add_epi16(_mm256_subs_epu16(a, b), _mm256_subs_epu16(b, a)); ); FUNCTOR_CLOSURE_2arg(VAbsDiff, short, __m256i M = _mm256_max_epi16(a, b); __m256i m = _mm256_min_epi16(a, b); return _mm256_subs_epi16(M, m); ); FUNCTOR_CLOSURE_2arg(VAbsDiff, int, __m256i d = _mm256_sub_epi32(a, b); __m256i m = _mm256_cmpgt_epi32(b, a); return _mm256_sub_epi32(_mm256_xor_si256(d, m), m); ); FUNCTOR_CLOSURE_2arg(VAbsDiff, float, return _mm256_and_ps(_mm256_sub_ps(a, b), *(const __m256*)v32f_absmask); ); FUNCTOR_CLOSURE_2arg(VAbsDiff, double, return _mm256_and_pd(_mm256_sub_pd(a, b), *(const __m256d*)v64f_absmask); ); FUNCTOR_TEMPLATE(VAnd); FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm256_and_si256(a, b)); FUNCTOR_TEMPLATE(VOr); FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm256_or_si256 (a, b)); FUNCTOR_TEMPLATE(VXor); FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm256_xor_si256(a, b)); FUNCTOR_TEMPLATE(VNot); FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm256_xor_si256(_mm256_set1_epi32(-1), a)); #elif CV_SSE2 #define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)\ template <> \ struct name
{ \ typedef register_type reg_type; \ static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \ static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \ } #define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\ template <> \ struct name
{ \ typedef register_type reg_type; \ static reg_type load(const template_arg * p) { return load_body (p); } \ static void store(template_arg * p, reg_type v) { store_body (p, v); } \ } #define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\ template<> \ struct name
\ { \ VLoadStore128
::reg_type operator()( \ const VLoadStore128
::reg_type & a, \ const VLoadStore128
::reg_type & b) const \ { \ body; \ } \ } #define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\ template<> \ struct name
\ { \ VLoadStore128
::reg_type operator()( \ const VLoadStore128
::reg_type & a, \ const VLoadStore128
::reg_type & ) const \ { \ body; \ } \ } FUNCTOR_LOADSTORE_CAST(VLoadStore128, uchar, __m128i, _mm_loadu_si128, _mm_storeu_si128); FUNCTOR_LOADSTORE_CAST(VLoadStore128, schar, __m128i, _mm_loadu_si128, _mm_storeu_si128); FUNCTOR_LOADSTORE_CAST(VLoadStore128, ushort, __m128i, _mm_loadu_si128, _mm_storeu_si128); FUNCTOR_LOADSTORE_CAST(VLoadStore128, short, __m128i, _mm_loadu_si128, _mm_storeu_si128); FUNCTOR_LOADSTORE_CAST(VLoadStore128, int, __m128i, _mm_loadu_si128, _mm_storeu_si128); FUNCTOR_LOADSTORE( VLoadStore128, float, __m128 , _mm_loadu_ps , _mm_storeu_ps ); FUNCTOR_LOADSTORE( VLoadStore128, double, __m128d, _mm_loadu_pd , _mm_storeu_pd ); FUNCTOR_LOADSTORE_CAST(VLoadStore64, uchar, __m128i, _mm_loadl_epi64, _mm_storel_epi64); FUNCTOR_LOADSTORE_CAST(VLoadStore64, schar, __m128i, _mm_loadl_epi64, _mm_storel_epi64); FUNCTOR_LOADSTORE_CAST(VLoadStore64, ushort, __m128i, _mm_loadl_epi64, _mm_storel_epi64); FUNCTOR_LOADSTORE_CAST(VLoadStore64, short, __m128i, _mm_loadl_epi64, _mm_storel_epi64); FUNCTOR_LOADSTORE_CAST(VLoadStore128Aligned, int, __m128i, _mm_load_si128, _mm_store_si128); FUNCTOR_LOADSTORE( VLoadStore128Aligned, float, __m128 , _mm_load_ps , _mm_store_ps ); FUNCTOR_LOADSTORE( VLoadStore128Aligned, double, __m128d, _mm_load_pd , _mm_store_pd ); FUNCTOR_TEMPLATE(VAdd); FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm_adds_epu8 (a, b)); FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm_adds_epi8 (a, b)); FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm_adds_epu16(a, b)); FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm_adds_epi16(a, b)); FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm_add_epi32 (a, b)); FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm_add_ps (a, b)); FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm_add_pd (a, b)); FUNCTOR_TEMPLATE(VSub); FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm_subs_epu8 (a, b)); FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm_subs_epi8 (a, b)); FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm_subs_epu16(a, b)); FUNCTOR_CLOSURE_2arg(VSub, short, return _mm_subs_epi16(a, b)); FUNCTOR_CLOSURE_2arg(VSub, int, return _mm_sub_epi32 (a, b)); FUNCTOR_CLOSURE_2arg(VSub, float, return _mm_sub_ps (a, b)); FUNCTOR_CLOSURE_2arg(VSub, double, return _mm_sub_pd (a, b)); FUNCTOR_TEMPLATE(VMin); FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm_min_epu8(a, b)); FUNCTOR_CLOSURE_2arg(VMin, schar, __m128i m = _mm_cmpgt_epi8(a, b); return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); ); FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm_subs_epu16(a, _mm_subs_epu16(a, b))); FUNCTOR_CLOSURE_2arg(VMin, short, return _mm_min_epi16(a, b)); FUNCTOR_CLOSURE_2arg(VMin, int, __m128i m = _mm_cmpgt_epi32(a, b); return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); ); FUNCTOR_CLOSURE_2arg(VMin, float, return _mm_min_ps(a, b)); FUNCTOR_CLOSURE_2arg(VMin, double, return _mm_min_pd(a, b)); FUNCTOR_TEMPLATE(VMax); FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm_max_epu8(a, b)); FUNCTOR_CLOSURE_2arg(VMax, schar, __m128i m = _mm_cmpgt_epi8(b, a); return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); ); FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm_adds_epu16(_mm_subs_epu16(a, b), b)); FUNCTOR_CLOSURE_2arg(VMax, short, return _mm_max_epi16(a, b)); FUNCTOR_CLOSURE_2arg(VMax, int, __m128i m = _mm_cmpgt_epi32(b, a); return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); ); FUNCTOR_CLOSURE_2arg(VMax, float, return _mm_max_ps(a, b)); FUNCTOR_CLOSURE_2arg(VMax, double, return _mm_max_pd(a, b)); static unsigned int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 
0x7fffffff }; static unsigned int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; FUNCTOR_TEMPLATE(VAbsDiff); FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, return _mm_add_epi8(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); ); FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, __m128i d = _mm_subs_epi8(a, b); __m128i m = _mm_cmpgt_epi8(b, a); return _mm_subs_epi8(_mm_xor_si128(d, m), m); ); FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, return _mm_add_epi16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a)); ); FUNCTOR_CLOSURE_2arg(VAbsDiff, short, __m128i M = _mm_max_epi16(a, b); __m128i m = _mm_min_epi16(a, b); return _mm_subs_epi16(M, m); ); FUNCTOR_CLOSURE_2arg(VAbsDiff, int, __m128i d = _mm_sub_epi32(a, b); __m128i m = _mm_cmpgt_epi32(b, a); return _mm_sub_epi32(_mm_xor_si128(d, m), m); ); FUNCTOR_CLOSURE_2arg(VAbsDiff, float, return _mm_and_ps(_mm_sub_ps(a,b), *(const __m128*)v32f_absmask); ); FUNCTOR_CLOSURE_2arg(VAbsDiff, double, return _mm_and_pd(_mm_sub_pd(a,b), *(const __m128d*)v64f_absmask); ); FUNCTOR_TEMPLATE(VAnd); FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm_and_si128(a, b)); FUNCTOR_TEMPLATE(VOr); FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm_or_si128 (a, b)); FUNCTOR_TEMPLATE(VXor); FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm_xor_si128(a, b)); FUNCTOR_TEMPLATE(VNot); FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a)); #endif #if CV_NEON #define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\ template <> \ struct name
{ \ typedef register_type reg_type; \ static reg_type load(const template_arg * p) { return load_body (p);}; \ static void store(template_arg * p, reg_type v) { store_body (p, v);}; \ } #define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\ template<> \ struct name
\ { \ VLoadStore128
::reg_type operator()( \ VLoadStore128
::reg_type a, \ VLoadStore128
::reg_type b) const \ { \ return body; \ }; \ } #define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\ template<> \ struct name
\ { \ VLoadStore128
::reg_type operator()( \ VLoadStore128
::reg_type a, \ VLoadStore128
::reg_type ) const \ { \ return body; \ }; \ } FUNCTOR_LOADSTORE(VLoadStore128, uchar, uint8x16_t, vld1q_u8 , vst1q_u8 ); FUNCTOR_LOADSTORE(VLoadStore128, schar, int8x16_t, vld1q_s8 , vst1q_s8 ); FUNCTOR_LOADSTORE(VLoadStore128, ushort, uint16x8_t, vld1q_u16, vst1q_u16); FUNCTOR_LOADSTORE(VLoadStore128, short, int16x8_t, vld1q_s16, vst1q_s16); FUNCTOR_LOADSTORE(VLoadStore128, int, int32x4_t, vld1q_s32, vst1q_s32); FUNCTOR_LOADSTORE(VLoadStore128, float, float32x4_t, vld1q_f32, vst1q_f32); FUNCTOR_TEMPLATE(VAdd); FUNCTOR_CLOSURE_2arg(VAdd, uchar, vqaddq_u8 (a, b)); FUNCTOR_CLOSURE_2arg(VAdd, schar, vqaddq_s8 (a, b)); FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b)); FUNCTOR_CLOSURE_2arg(VAdd, short, vqaddq_s16(a, b)); FUNCTOR_CLOSURE_2arg(VAdd, int, vaddq_s32 (a, b)); FUNCTOR_CLOSURE_2arg(VAdd, float, vaddq_f32 (a, b)); FUNCTOR_TEMPLATE(VSub); FUNCTOR_CLOSURE_2arg(VSub, uchar, vqsubq_u8 (a, b)); FUNCTOR_CLOSURE_2arg(VSub, schar, vqsubq_s8 (a, b)); FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b)); FUNCTOR_CLOSURE_2arg(VSub, short, vqsubq_s16(a, b)); FUNCTOR_CLOSURE_2arg(VSub, int, vsubq_s32 (a, b)); FUNCTOR_CLOSURE_2arg(VSub, float, vsubq_f32 (a, b)); FUNCTOR_TEMPLATE(VMin); FUNCTOR_CLOSURE_2arg(VMin, uchar, vminq_u8 (a, b)); FUNCTOR_CLOSURE_2arg(VMin, schar, vminq_s8 (a, b)); FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b)); FUNCTOR_CLOSURE_2arg(VMin, short, vminq_s16(a, b)); FUNCTOR_CLOSURE_2arg(VMin, int, vminq_s32(a, b)); FUNCTOR_CLOSURE_2arg(VMin, float, vminq_f32(a, b)); FUNCTOR_TEMPLATE(VMax); FUNCTOR_CLOSURE_2arg(VMax, uchar, vmaxq_u8 (a, b)); FUNCTOR_CLOSURE_2arg(VMax, schar, vmaxq_s8 (a, b)); FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b)); FUNCTOR_CLOSURE_2arg(VMax, short, vmaxq_s16(a, b)); FUNCTOR_CLOSURE_2arg(VMax, int, vmaxq_s32(a, b)); FUNCTOR_CLOSURE_2arg(VMax, float, vmaxq_f32(a, b)); FUNCTOR_TEMPLATE(VAbsDiff); FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, vabdq_u8 (a, b)); FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, vqabsq_s8 (vqsubq_s8(a, b))); FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b)); FUNCTOR_CLOSURE_2arg(VAbsDiff, short, vqabsq_s16(vqsubq_s16(a, b))); FUNCTOR_CLOSURE_2arg(VAbsDiff, int, vabdq_s32 (a, b)); FUNCTOR_CLOSURE_2arg(VAbsDiff, float, vabdq_f32 (a, b)); FUNCTOR_TEMPLATE(VAnd); FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b)); FUNCTOR_TEMPLATE(VOr); FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b)); FUNCTOR_TEMPLATE(VXor); FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b)); FUNCTOR_TEMPLATE(VNot); FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a )); #endif #if CV_SSE2 || CV_NEON #define IF_SIMD(op) op #else #define IF_SIMD(op) NOP #endif template<> inline uchar OpAdd
::operator ()(uchar a, uchar b) const { return CV_FAST_CAST_8U(a + b); } template<> inline uchar OpSub
::operator ()(uchar a, uchar b) const { return CV_FAST_CAST_8U(a - b); } template
struct OpAbsDiff { typedef T type1; typedef T type2; typedef T rtype; T operator()(T a, T b) const { return (T)std::abs(a - b); } }; template<> inline short OpAbsDiff
::operator ()(short a, short b) const { return saturate_cast
(std::abs(a - b)); } template<> inline schar OpAbsDiff
::operator ()(schar a, schar b) const { return saturate_cast
(std::abs(a - b)); } template
struct OpAbsDiffS { typedef T type1; typedef WT type2; typedef T rtype; T operator()(T a, WT b) const { return saturate_cast
(std::abs(a - b)); } }; template
struct OpAnd { typedef T type1; typedef T type2; typedef T rtype; T operator()( T a, T b ) const { return a & b; } }; template
struct OpOr { typedef T type1; typedef T type2; typedef T rtype; T operator()( T a, T b ) const { return a | b; } }; template
struct OpXor { typedef T type1; typedef T type2; typedef T rtype; T operator()( T a, T b ) const { return a ^ b; } }; template
struct OpNot { typedef T type1; typedef T type2; typedef T rtype; T operator()( T a, T ) const { return ~a; } }; #if (ARITHM_USE_IPP == 1) static inline void fixSteps(Size sz, size_t elemSize, size_t& step1, size_t& step2, size_t& step) { if( sz.height == 1 ) step1 = step2 = step = sz.width*elemSize; } #endif static void add8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, Size sz, void* ) { #if (ARITHM_USE_IPP == 1) CV_IPP_CHECK() { fixSteps(sz, sizeof(dst[0]), step1, step2, step); if (0 <= ippiAdd_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0)) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } #endif (vBinOp
, IF_SIMD(VAdd
)>(src1, step1, src2, step2, dst, step, sz)); } static void add8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, Size sz, void* ) { vBinOp
, IF_SIMD(VAdd
)>(src1, step1, src2, step2, dst, step, sz); } static void add16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, Size sz, void* ) { #if (ARITHM_USE_IPP == 1) CV_IPP_CHECK() { fixSteps(sz, sizeof(dst[0]), step1, step2, step); if (0 <= ippiAdd_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0)) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } #endif (vBinOp
, IF_SIMD(VAdd
)>(src1, step1, src2, step2, dst, step, sz)); } static void add16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, Size sz, void* ) { #if (ARITHM_USE_IPP == 1) CV_IPP_CHECK() { fixSteps(sz, sizeof(dst[0]), step1, step2, step); if (0 <= ippiAdd_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0)) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } #endif (vBinOp
, IF_SIMD(VAdd
)>(src1, step1, src2, step2, dst, step, sz)); } static void add32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, Size sz, void* ) { vBinOp32
, IF_SIMD(VAdd
)>(src1, step1, src2, step2, dst, step, sz); } static void add32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, Size sz, void* ) { #if (ARITHM_USE_IPP == 1) CV_IPP_CHECK() { fixSteps(sz, sizeof(dst[0]), step1, step2, step); if (0 <= ippiAdd_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } #endif (vBinOp32
, IF_SIMD(VAdd
)>(src1, step1, src2, step2, dst, step, sz)); } static void add64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, Size sz, void* ) { vBinOp64
, IF_SIMD(VAdd
)>(src1, step1, src2, step2, dst, step, sz); } static void sub8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, Size sz, void* ) { #if (ARITHM_USE_IPP == 1) CV_IPP_CHECK() { fixSteps(sz, sizeof(dst[0]), step1, step2, step); if (0 <= ippiSub_8u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0)) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } #endif (vBinOp
, IF_SIMD(VSub
)>(src1, step1, src2, step2, dst, step, sz)); } static void sub8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, Size sz, void* ) { vBinOp
, IF_SIMD(VSub
)>(src1, step1, src2, step2, dst, step, sz); } static void sub16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, Size sz, void* ) { #if (ARITHM_USE_IPP == 1) CV_IPP_CHECK() { fixSteps(sz, sizeof(dst[0]), step1, step2, step); if (0 <= ippiSub_16u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0)) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } #endif (vBinOp
, IF_SIMD(VSub
)>(src1, step1, src2, step2, dst, step, sz)); } static void sub16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, Size sz, void* ) { #if (ARITHM_USE_IPP == 1) CV_IPP_CHECK() { fixSteps(sz, sizeof(dst[0]), step1, step2, step); if (0 <= ippiSub_16s_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0)) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } #endif (vBinOp
, IF_SIMD(VSub
)>(src1, step1, src2, step2, dst, step, sz)); } static void sub32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, Size sz, void* ) { vBinOp32
, IF_SIMD(VSub
)>(src1, step1, src2, step2, dst, step, sz); } static void sub32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, Size sz, void* ) { #if (ARITHM_USE_IPP == 1) CV_IPP_CHECK() { fixSteps(sz, sizeof(dst[0]), step1, step2, step); if (0 <= ippiSub_32f_C1R(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz))) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } #endif (vBinOp32
, IF_SIMD(VSub
)>(src1, step1, src2, step2, dst, step, sz)); } static void sub64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, Size sz, void* ) { vBinOp64
, IF_SIMD(VSub
)>(src1, step1, src2, step2, dst, step, sz); } template<> inline uchar OpMin
::operator ()(uchar a, uchar b) const { return CV_MIN_8U(a, b); } template<> inline uchar OpMax
::operator ()(uchar a, uchar b) const { return CV_MAX_8U(a, b); } static void max8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, Size sz, void* ) { #if (ARITHM_USE_IPP == 1) CV_IPP_CHECK() { uchar* s1 = (uchar*)src1; uchar* s2 = (uchar*)src2; uchar* d = dst; fixSteps(sz, sizeof(dst[0]), step1, step2, step); int i = 0; for(; i < sz.height; i++) { if (0 > ippsMaxEvery_8u(s1, s2, d, sz.width)) break; s1 += step1; s2 += step2; d += step; } if (i == sz.height) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } #endif vBinOp
, IF_SIMD(VMax
)>(src1, step1, src2, step2, dst, step, sz); } static void max8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, Size sz, void* ) { vBinOp
, IF_SIMD(VMax
)>(src1, step1, src2, step2, dst, step, sz); } static void max16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, Size sz, void* ) { #if (ARITHM_USE_IPP == 1) CV_IPP_CHECK() { ushort* s1 = (ushort*)src1; ushort* s2 = (ushort*)src2; ushort* d = dst; fixSteps(sz, sizeof(dst[0]), step1, step2, step); int i = 0; for(; i < sz.height; i++) { if (0 > ippsMaxEvery_16u(s1, s2, d, sz.width)) break; s1 = (ushort*)((uchar*)s1 + step1); s2 = (ushort*)((uchar*)s2 + step2); d = (ushort*)((uchar*)d + step); } if (i == sz.height) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } #endif vBinOp
, IF_SIMD(VMax
)>(src1, step1, src2, step2, dst, step, sz); } static void max16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, Size sz, void* ) { vBinOp
, IF_SIMD(VMax
)>(src1, step1, src2, step2, dst, step, sz); } static void max32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, Size sz, void* ) { vBinOp32
, IF_SIMD(VMax
)>(src1, step1, src2, step2, dst, step, sz); } static void max32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, Size sz, void* ) { #if (ARITHM_USE_IPP == 1) CV_IPP_CHECK() { float* s1 = (float*)src1; float* s2 = (float*)src2; float* d = dst; fixSteps(sz, sizeof(dst[0]), step1, step2, step); int i = 0; for(; i < sz.height; i++) { if (0 > ippsMaxEvery_32f(s1, s2, d, sz.width)) break; s1 = (float*)((uchar*)s1 + step1); s2 = (float*)((uchar*)s2 + step2); d = (float*)((uchar*)d + step); } if (i == sz.height) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } #endif vBinOp32
, IF_SIMD(VMax
)>(src1, step1, src2, step2, dst, step, sz); } static void max64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, Size sz, void* ) { #if ARITHM_USE_IPP == 1 CV_IPP_CHECK() { double* s1 = (double*)src1; double* s2 = (double*)src2; double* d = dst; fixSteps(sz, sizeof(dst[0]), step1, step2, step); int i = 0; for(; i < sz.height; i++) { if (0 > ippsMaxEvery_64f(s1, s2, d, sz.width)) break; s1 = (double*)((uchar*)s1 + step1); s2 = (double*)((uchar*)s2 + step2); d = (double*)((uchar*)d + step); } if (i == sz.height) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } #endif vBinOp64
, IF_SIMD(VMax
)>(src1, step1, src2, step2, dst, step, sz); } static void min8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, Size sz, void* ) { #if (ARITHM_USE_IPP == 1) CV_IPP_CHECK() { uchar* s1 = (uchar*)src1; uchar* s2 = (uchar*)src2; uchar* d = dst; fixSteps(sz, sizeof(dst[0]), step1, step2, step); int i = 0; for(; i < sz.height; i++) { if (0 > ippsMinEvery_8u(s1, s2, d, sz.width)) break; s1 += step1; s2 += step2; d += step; } if (i == sz.height) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } #endif vBinOp
, IF_SIMD(VMin
)>(src1, step1, src2, step2, dst, step, sz); } static void min8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, Size sz, void* ) { vBinOp
, IF_SIMD(VMin
)>(src1, step1, src2, step2, dst, step, sz); } static void min16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, Size sz, void* ) { #if (ARITHM_USE_IPP == 1) CV_IPP_CHECK() { ushort* s1 = (ushort*)src1; ushort* s2 = (ushort*)src2; ushort* d = dst; fixSteps(sz, sizeof(dst[0]), step1, step2, step); int i = 0; for(; i < sz.height; i++) { if (0 > ippsMinEvery_16u(s1, s2, d, sz.width)) break; s1 = (ushort*)((uchar*)s1 + step1); s2 = (ushort*)((uchar*)s2 + step2); d = (ushort*)((uchar*)d + step); } if (i == sz.height) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } #endif vBinOp
, IF_SIMD(VMin
)>(src1, step1, src2, step2, dst, step, sz); } static void min16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, Size sz, void* ) { vBinOp
, IF_SIMD(VMin
)>(src1, step1, src2, step2, dst, step, sz); } static void min32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, Size sz, void* ) { vBinOp32
, IF_SIMD(VMin
)>(src1, step1, src2, step2, dst, step, sz); } static void min32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, Size sz, void* ) { #if (ARITHM_USE_IPP == 1) CV_IPP_CHECK() { float* s1 = (float*)src1; float* s2 = (float*)src2; float* d = dst; fixSteps(sz, sizeof(dst[0]), step1, step2, step); int i = 0; for(; i < sz.height; i++) { if (0 > ippsMinEvery_32f(s1, s2, d, sz.width)) break; s1 = (float*)((uchar*)s1 + step1); s2 = (float*)((uchar*)s2 + step2); d = (float*)((uchar*)d + step); } if (i == sz.height) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } #endif vBinOp32
, IF_SIMD(VMin
)>(src1, step1, src2, step2, dst, step, sz); } static void min64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, Size sz, void* ) { #if ARITHM_USE_IPP == 1 CV_IPP_CHECK() { double* s1 = (double*)src1; double* s2 = (double*)src2; double* d = dst; fixSteps(sz, sizeof(dst[0]), step1, step2, step); int i = 0; for(; i < sz.height; i++) { if (0 > ippsMinEvery_64f(s1, s2, d, sz.width)) break; s1 = (double*)((uchar*)s1 + step1); s2 = (double*)((uchar*)s2 + step2); d = (double*)((uchar*)d + step); } if (i == sz.height) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } #endif vBinOp64
, IF_SIMD(VMin
)>(src1, step1, src2, step2, dst, step, sz); } static void absdiff8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, Size sz, void* ) { #if (ARITHM_USE_IPP == 1) CV_IPP_CHECK() { fixSteps(sz, sizeof(dst[0]), step1, step2, step); if (0 <= ippiAbsDiff_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } #endif (vBinOp
, IF_SIMD(VAbsDiff
)>(src1, step1, src2, step2, dst, step, sz)); } static void absdiff8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, Size sz, void* ) { vBinOp
, IF_SIMD(VAbsDiff
)>(src1, step1, src2, step2, dst, step, sz); } static void absdiff16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, Size sz, void* ) { #if (ARITHM_USE_IPP == 1) CV_IPP_CHECK() { fixSteps(sz, sizeof(dst[0]), step1, step2, step); if (0 <= ippiAbsDiff_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } #endif (vBinOp
, IF_SIMD(VAbsDiff
)>(src1, step1, src2, step2, dst, step, sz)); } static void absdiff16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, Size sz, void* ) { vBinOp
, IF_SIMD(VAbsDiff
)>(src1, step1, src2, step2, dst, step, sz); } static void absdiff32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, Size sz, void* ) { vBinOp32
, IF_SIMD(VAbsDiff
)>(src1, step1, src2, step2, dst, step, sz); } static void absdiff32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, Size sz, void* ) { #if (ARITHM_USE_IPP == 1) CV_IPP_CHECK() { fixSteps(sz, sizeof(dst[0]), step1, step2, step); if (0 <= ippiAbsDiff_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } #endif (vBinOp32
, IF_SIMD(VAbsDiff
)>(src1, step1, src2, step2, dst, step, sz)); } static void absdiff64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, Size sz, void* ) { vBinOp64
, IF_SIMD(VAbsDiff
)>(src1, step1, src2, step2, dst, step, sz); } static void and8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, Size sz, void* ) { #if (ARITHM_USE_IPP == 1) CV_IPP_CHECK() { fixSteps(sz, sizeof(dst[0]), step1, step2, step); if (0 <= ippiAnd_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } #endif (vBinOp
, IF_SIMD(VAnd
)>(src1, step1, src2, step2, dst, step, sz)); } static void or8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, Size sz, void* ) { #if (ARITHM_USE_IPP == 1) CV_IPP_CHECK() { fixSteps(sz, sizeof(dst[0]), step1, step2, step); if (0 <= ippiOr_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } #endif (vBinOp
, IF_SIMD(VOr
)>(src1, step1, src2, step2, dst, step, sz)); } static void xor8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, Size sz, void* ) { #if (ARITHM_USE_IPP == 1) CV_IPP_CHECK() { fixSteps(sz, sizeof(dst[0]), step1, step2, step); if (0 <= ippiXor_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } #endif (vBinOp
, IF_SIMD(VXor
)>(src1, step1, src2, step2, dst, step, sz)); } static void not8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, Size sz, void* ) { #if (ARITHM_USE_IPP == 1) CV_IPP_CHECK() { fixSteps(sz, sizeof(dst[0]), step1, step2, step); (void)src2; if (0 <= ippiNot_8u_C1R(src1, (int)step1, dst, (int)step, ippiSize(sz))) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } #endif (vBinOp
, IF_SIMD(VNot
)>(src1, step1, src2, step2, dst, step, sz)); } /****************************************************************************************\ * logical operations * \****************************************************************************************/ void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize ) { int scn = (int)sc.total(), cn = CV_MAT_CN(buftype); size_t esz = CV_ELEM_SIZE(buftype); getConvertFunc(sc.depth(), buftype)(sc.ptr(), 1, 0, 1, scbuf, 1, Size(std::min(cn, scn), 1), 0); // unroll the scalar if( scn < cn ) { CV_Assert( scn == 1 ); size_t esz1 = CV_ELEM_SIZE1(buftype); for( size_t i = esz1; i < esz; i++ ) scbuf[i] = scbuf[i - esz1]; } for( size_t i = esz; i < blocksize*esz; i++ ) scbuf[i] = scbuf[i - esz]; } enum { OCL_OP_ADD=0, OCL_OP_SUB=1, OCL_OP_RSUB=2, OCL_OP_ABSDIFF=3, OCL_OP_MUL=4, OCL_OP_MUL_SCALE=5, OCL_OP_DIV_SCALE=6, OCL_OP_RECIP_SCALE=7, OCL_OP_ADDW=8, OCL_OP_AND=9, OCL_OP_OR=10, OCL_OP_XOR=11, OCL_OP_NOT=12, OCL_OP_MIN=13, OCL_OP_MAX=14, OCL_OP_RDIV_SCALE=15 }; #ifdef HAVE_OPENCL static const char* oclop2str[] = { "OP_ADD", "OP_SUB", "OP_RSUB", "OP_ABSDIFF", "OP_MUL", "OP_MUL_SCALE", "OP_DIV_SCALE", "OP_RECIP_SCALE", "OP_ADDW", "OP_AND", "OP_OR", "OP_XOR", "OP_NOT", "OP_MIN", "OP_MAX", "OP_RDIV_SCALE", 0 }; static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, InputArray _mask, bool bitwise, int oclop, bool haveScalar ) { bool haveMask = !_mask.empty(); int srctype = _src1.type(); int srcdepth = CV_MAT_DEPTH(srctype); int cn = CV_MAT_CN(srctype); const ocl::Device d = ocl::Device::getDefault(); bool doubleSupport = d.doubleFPConfig() > 0; if( oclop < 0 || ((haveMask || haveScalar) && cn > 4) || (!doubleSupport && srcdepth == CV_64F && !bitwise)) return false; char opts[1024]; int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst); int scalarcn = kercn == 3 ? 4 : kercn; int rowsPerWI = d.isIntel() ? 4 : 1; sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d -D rowsPerWI=%d", haveMask ? "MASK_" : "", haveScalar ? "UNARY_OP" : "BINARY_OP", oclop2str[oclop], bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) : ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "", bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, 1)) : ocl::typeToStr(CV_MAKETYPE(srcdepth, 1)), bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, scalarcn)) : ocl::typeToStr(CV_MAKETYPE(srcdepth, scalarcn)), kercn, rowsPerWI); ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts); if (k.empty()) return false; UMat src1 = _src1.getUMat(), src2; UMat dst = _dst.getUMat(), mask = _mask.getUMat(); ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn); ocl::KernelArg dstarg = haveMask ? 
ocl::KernelArg::ReadWrite(dst, cn, kercn) : ocl::KernelArg::WriteOnly(dst, cn, kercn); ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1); if( haveScalar ) { size_t esz = CV_ELEM_SIZE1(srctype)*scalarcn; double buf[4] = {0,0,0,0}; if( oclop != OCL_OP_NOT ) { Mat src2sc = _src2.getMat(); convertAndUnrollScalar(src2sc, srctype, (uchar*)buf, 1); } ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz); if( !haveMask ) k.args(src1arg, dstarg, scalararg); else k.args(src1arg, maskarg, dstarg, scalararg); } else { src2 = _src2.getUMat(); ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn); if( !haveMask ) k.args(src1arg, src2arg, dstarg); else k.args(src1arg, src2arg, maskarg, dstarg); } size_t globalsize[] = { src1.cols * cn / kercn, (src1.rows + rowsPerWI - 1) / rowsPerWI }; return k.run(2, globalsize, 0, false); } #endif static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst, InputArray _mask, const BinaryFunc* tab, bool bitwise, int oclop ) { const _InputArray *psrc1 = &_src1, *psrc2 = &_src2; int kind1 = psrc1->kind(), kind2 = psrc2->kind(); int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1); int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2); int dims1 = psrc1->dims(), dims2 = psrc2->dims(); Size sz1 = dims1 <= 2 ? psrc1->size() : Size(); Size sz2 = dims2 <= 2 ? psrc2->size() : Size(); #ifdef HAVE_OPENCL bool use_opencl = (kind1 == _InputArray::UMAT || kind2 == _InputArray::UMAT) && dims1 <= 2 && dims2 <= 2; #endif bool haveMask = !_mask.empty(), haveScalar = false; BinaryFunc func; if( dims1 <= 2 && dims2 <= 2 && kind1 == kind2 && sz1 == sz2 && type1 == type2 && !haveMask ) { _dst.create(sz1, type1); CV_OCL_RUN(use_opencl, ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, false)) if( bitwise ) { func = *tab; cn = (int)CV_ELEM_SIZE(type1); } else func = tab[depth1]; Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(); Size sz = getContinuousSize(src1, src2, dst); size_t len = sz.width*(size_t)cn; if( len == (size_t)(int)len ) { sz.width = (int)len; func(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, 0); return; } } if( oclop == OCL_OP_NOT ) haveScalar = true; else if( (kind1 == _InputArray::MATX) + (kind2 == _InputArray::MATX) == 1 || !psrc1->sameSize(*psrc2) || type1 != type2 ) { if( checkScalar(*psrc1, type2, kind1, kind2) ) { // src1 is a scalar; swap it with src2 swap(psrc1, psrc2); swap(type1, type2); swap(depth1, depth2); swap(cn, cn2); swap(sz1, sz2); } else if( !checkScalar(*psrc2, type1, kind2, kind1) ) CV_Error( CV_StsUnmatchedSizes, "The operation is neither 'array op array' (where arrays have the same size and type), " "nor 'array op scalar', nor 'scalar op array'" ); haveScalar = true; } else { CV_Assert( psrc1->sameSize(*psrc2) && type1 == type2 ); } size_t esz = CV_ELEM_SIZE(type1); size_t blocksize0 = (BLOCK_SIZE + esz-1)/esz; BinaryFunc copymask = 0; bool reallocate = false; if( haveMask ) { int mtype = _mask.type(); CV_Assert( (mtype == CV_8U || mtype == CV_8S) && _mask.sameSize(*psrc1)); copymask = getCopyMaskFunc(esz); reallocate = !_dst.sameSize(*psrc1) || _dst.type() != type1; } AutoBuffer
_buf; uchar *scbuf = 0, *maskbuf = 0; _dst.createSameSize(*psrc1, type1); // if this is mask operation and dst has been reallocated, // we have to clear the destination if( haveMask && reallocate ) _dst.setTo(0.); CV_OCL_RUN(use_opencl, ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, haveScalar)) Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(); Mat dst = _dst.getMat(), mask = _mask.getMat(); if( bitwise ) { func = *tab; cn = (int)esz; } else func = tab[depth1]; if( !haveScalar ) { const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 }; uchar* ptrs[4]; NAryMatIterator it(arrays, ptrs); size_t total = it.size, blocksize = total; if( blocksize*cn > INT_MAX ) blocksize = INT_MAX/cn; if( haveMask ) { blocksize = std::min(blocksize, blocksize0); _buf.allocate(blocksize*esz); maskbuf = _buf; } for( size_t i = 0; i < it.nplanes; i++, ++it ) { for( size_t j = 0; j < total; j += blocksize ) { int bsz = (int)MIN(total - j, blocksize); func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, Size(bsz*cn, 1), 0 ); if( haveMask ) { copymask( maskbuf, 0, ptrs[3], 0, ptrs[2], 0, Size(bsz, 1), &esz ); ptrs[3] += bsz; } bsz *= (int)esz; ptrs[0] += bsz; ptrs[1] += bsz; ptrs[2] += bsz; } } } else { const Mat* arrays[] = { &src1, &dst, &mask, 0 }; uchar* ptrs[3]; NAryMatIterator it(arrays, ptrs); size_t total = it.size, blocksize = std::min(total, blocksize0); _buf.allocate(blocksize*(haveMask ? 2 : 1)*esz + 32); scbuf = _buf; maskbuf = alignPtr(scbuf + blocksize*esz, 16); convertAndUnrollScalar( src2, src1.type(), scbuf, blocksize); for( size_t i = 0; i < it.nplanes; i++, ++it ) { for( size_t j = 0; j < total; j += blocksize ) { int bsz = (int)MIN(total - j, blocksize); func( ptrs[0], 0, scbuf, 0, haveMask ? maskbuf : ptrs[1], 0, Size(bsz*cn, 1), 0 ); if( haveMask ) { copymask( maskbuf, 0, ptrs[2], 0, ptrs[1], 0, Size(bsz, 1), &esz ); ptrs[2] += bsz; } bsz *= (int)esz; ptrs[0] += bsz; ptrs[1] += bsz; } } } } static BinaryFunc* getMaxTab() { static BinaryFunc maxTab[] = { (BinaryFunc)GET_OPTIMIZED(max8u), (BinaryFunc)GET_OPTIMIZED(max8s), (BinaryFunc)GET_OPTIMIZED(max16u), (BinaryFunc)GET_OPTIMIZED(max16s), (BinaryFunc)GET_OPTIMIZED(max32s), (BinaryFunc)GET_OPTIMIZED(max32f), (BinaryFunc)max64f, 0 }; return maxTab; } static BinaryFunc* getMinTab() { static BinaryFunc minTab[] = { (BinaryFunc)GET_OPTIMIZED(min8u), (BinaryFunc)GET_OPTIMIZED(min8s), (BinaryFunc)GET_OPTIMIZED(min16u), (BinaryFunc)GET_OPTIMIZED(min16s), (BinaryFunc)GET_OPTIMIZED(min32s), (BinaryFunc)GET_OPTIMIZED(min32f), (BinaryFunc)min64f, 0 }; return minTab; } } void cv::bitwise_and(InputArray a, InputArray b, OutputArray c, InputArray mask) { BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(and8u); binary_op(a, b, c, mask, &f, true, OCL_OP_AND); } void cv::bitwise_or(InputArray a, InputArray b, OutputArray c, InputArray mask) { BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(or8u); binary_op(a, b, c, mask, &f, true, OCL_OP_OR); } void cv::bitwise_xor(InputArray a, InputArray b, OutputArray c, InputArray mask) { BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(xor8u); binary_op(a, b, c, mask, &f, true, OCL_OP_XOR); } void cv::bitwise_not(InputArray a, OutputArray c, InputArray mask) { BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(not8u); binary_op(a, a, c, mask, &f, true, OCL_OP_NOT); } void cv::max( InputArray src1, InputArray src2, OutputArray dst ) { binary_op(src1, src2, dst, noArray(), getMaxTab(), false, OCL_OP_MAX ); } void cv::min( InputArray src1, InputArray src2, OutputArray dst ) { binary_op(src1, src2, dst, noArray(), getMinTab(), false, 
OCL_OP_MIN ); } void cv::max(const Mat& src1, const Mat& src2, Mat& dst) { OutputArray _dst(dst); binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX ); } void cv::min(const Mat& src1, const Mat& src2, Mat& dst) { OutputArray _dst(dst); binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN ); } void cv::max(const UMat& src1, const UMat& src2, UMat& dst) { OutputArray _dst(dst); binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX ); } void cv::min(const UMat& src1, const UMat& src2, UMat& dst) { OutputArray _dst(dst); binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN ); } /****************************************************************************************\ * add/subtract * \****************************************************************************************/ namespace cv { static int actualScalarDepth(const double* data, int len) { int i = 0, minval = INT_MAX, maxval = INT_MIN; for(; i < len; ++i) { int ival = cvRound(data[i]); if( ival != data[i] ) break; minval = MIN(minval, ival); maxval = MAX(maxval, ival); } return i < len ? CV_64F : minval >= 0 && maxval <= (int)UCHAR_MAX ? CV_8U : minval >= (int)SCHAR_MIN && maxval <= (int)SCHAR_MAX ? CV_8S : minval >= 0 && maxval <= (int)USHRT_MAX ? CV_16U : minval >= (int)SHRT_MIN && maxval <= (int)SHRT_MAX ? CV_16S : CV_32S; } #ifdef HAVE_OPENCL static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, InputArray _mask, int wtype, void* usrdata, int oclop, bool haveScalar ) { const ocl::Device d = ocl::Device::getDefault(); bool doubleSupport = d.doubleFPConfig() > 0; int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1); bool haveMask = !_mask.empty(); if ( (haveMask || haveScalar) && cn > 4 ) return false; int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32S, CV_MAT_DEPTH(wtype)); if (!doubleSupport) wdepth = std::min(wdepth, CV_32F); wtype = CV_MAKETYPE(wdepth, cn); int type2 = haveScalar ? wtype : _src2.type(), depth2 = CV_MAT_DEPTH(type2); if (!doubleSupport && (depth2 == CV_64F || depth1 == CV_64F)) return false; int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst); int scalarcn = kercn == 3 ? 4 : kercn, rowsPerWI = d.isIntel() ? 4 : 1; char cvtstr[4][32], opts[1024]; sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s " "-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D wdepth=%d -D convertToWT1=%s " "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d -D rowsPerWI=%d -D convertFromU=%s", (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"), oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)), ocl::typeToStr(depth1), ocl::typeToStr(CV_MAKETYPE(depth2, kercn)), ocl::typeToStr(depth2), ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)), ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)), ocl::typeToStr(CV_MAKETYPE(wdepth, scalarcn)), ocl::typeToStr(wdepth), wdepth, ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]), ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]), ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]), doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn, rowsPerWI, oclop == OCL_OP_ABSDIFF && wdepth == CV_32S && ddepth == wdepth ? 
ocl::convertTypeStr(CV_8U, ddepth, kercn, cvtstr[3]) : "noconvert"); size_t usrdata_esz = CV_ELEM_SIZE(wdepth); const uchar* usrdata_p = (const uchar*)usrdata; const double* usrdata_d = (const double*)usrdata; float usrdata_f[3]; int i, n = oclop == OCL_OP_MUL_SCALE || oclop == OCL_OP_DIV_SCALE || oclop == OCL_OP_RDIV_SCALE || oclop == OCL_OP_RECIP_SCALE ? 1 : oclop == OCL_OP_ADDW ? 3 : 0; if( n > 0 && wdepth == CV_32F ) { for( i = 0; i < n; i++ ) usrdata_f[i] = (float)usrdata_d[i]; usrdata_p = (const uchar*)usrdata_f; } ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts); if (k.empty()) return false; UMat src1 = _src1.getUMat(), src2; UMat dst = _dst.getUMat(), mask = _mask.getUMat(); ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn); ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cn, kercn) : ocl::KernelArg::WriteOnly(dst, cn, kercn); ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1); if( haveScalar ) { size_t esz = CV_ELEM_SIZE1(wtype)*scalarcn; double buf[4]={0,0,0,0}; Mat src2sc = _src2.getMat(); if( !src2sc.empty() ) convertAndUnrollScalar(src2sc, wtype, (uchar*)buf, 1); ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz); if( !haveMask ) { if(n == 0) k.args(src1arg, dstarg, scalararg); else if(n == 1) k.args(src1arg, dstarg, scalararg, ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz)); else CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters"); } else k.args(src1arg, maskarg, dstarg, scalararg); } else { src2 = _src2.getUMat(); ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn); if( !haveMask ) { if (n == 0) k.args(src1arg, src2arg, dstarg); else if (n == 1) k.args(src1arg, src2arg, dstarg, ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz)); else if (n == 3) k.args(src1arg, src2arg, dstarg, ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz), ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz, usrdata_esz), ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz*2, usrdata_esz)); else CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters"); } else k.args(src1arg, src2arg, maskarg, dstarg); } size_t globalsize[] = { src1.cols * cn / kercn, (src1.rows + rowsPerWI - 1) / rowsPerWI }; return k.run(2, globalsize, NULL, false); } #endif static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, InputArray _mask, int dtype, BinaryFunc* tab, bool muldiv=false, void* usrdata=0, int oclop=-1 ) { const _InputArray *psrc1 = &_src1, *psrc2 = &_src2; int kind1 = psrc1->kind(), kind2 = psrc2->kind(); bool haveMask = !_mask.empty(); bool reallocate = false; int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1); int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2); int wtype, dims1 = psrc1->dims(), dims2 = psrc2->dims(); Size sz1 = dims1 <= 2 ? psrc1->size() : Size(); Size sz2 = dims2 <= 2 ? 
psrc2->size() : Size(); #ifdef HAVE_OPENCL bool use_opencl = OCL_PERFORMANCE_CHECK(_dst.isUMat()) && dims1 <= 2 && dims2 <= 2; #endif bool src1Scalar = checkScalar(*psrc1, type2, kind1, kind2); bool src2Scalar = checkScalar(*psrc2, type1, kind2, kind1); if( (kind1 == kind2 || cn == 1) && sz1 == sz2 && dims1 <= 2 && dims2 <= 2 && type1 == type2 && !haveMask && ((!_dst.fixedType() && (dtype < 0 || CV_MAT_DEPTH(dtype) == depth1)) || (_dst.fixedType() && _dst.type() == type1)) && ((src1Scalar && src2Scalar) || (!src1Scalar && !src2Scalar)) ) { _dst.createSameSize(*psrc1, type1); CV_OCL_RUN(use_opencl, ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, (!usrdata ? type1 : std::max(depth1, CV_32F)), usrdata, oclop, false)) Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(); Size sz = getContinuousSize(src1, src2, dst, src1.channels()); tab[depth1](src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, usrdata); return; } bool haveScalar = false, swapped12 = false; if( dims1 != dims2 || sz1 != sz2 || cn != cn2 || (kind1 == _InputArray::MATX && (sz1 == Size(1,4) || sz1 == Size(1,1))) || (kind2 == _InputArray::MATX && (sz2 == Size(1,4) || sz2 == Size(1,1))) ) { if( checkScalar(*psrc1, type2, kind1, kind2) ) { // src1 is a scalar; swap it with src2 swap(psrc1, psrc2); swap(sz1, sz2); swap(type1, type2); swap(depth1, depth2); swap(cn, cn2); swap(dims1, dims2); swapped12 = true; if( oclop == OCL_OP_SUB ) oclop = OCL_OP_RSUB; if ( oclop == OCL_OP_DIV_SCALE ) oclop = OCL_OP_RDIV_SCALE; } else if( !checkScalar(*psrc2, type1, kind2, kind1) ) CV_Error( CV_StsUnmatchedSizes, "The operation is neither 'array op array' " "(where arrays have the same size and the same number of channels), " "nor 'array op scalar', nor 'scalar op array'" ); haveScalar = true; CV_Assert(type2 == CV_64F && (sz2.height == 1 || sz2.height == 4)); if (!muldiv) { Mat sc = psrc2->getMat(); depth2 = actualScalarDepth(sc.ptr
(), cn);
            if( depth2 == CV_64F && (depth1 < CV_32S || depth1 == CV_32F) )
                depth2 = CV_32F;
        }
        else
            depth2 = CV_64F;
    }

    if( dtype < 0 )
    {
        if( _dst.fixedType() )
            dtype = _dst.type();
        else
        {
            if( !haveScalar && type1 != type2 )
                CV_Error(CV_StsBadArg,
                     "When the input arrays in add/subtract/multiply/divide functions have different types, "
                     "the output array type must be explicitly specified");
            dtype = type1;
        }
    }
    dtype = CV_MAT_DEPTH(dtype);

    if( depth1 == depth2 && dtype == depth1 )
        wtype = dtype;
    else if( !muldiv )
    {
        wtype = depth1 <= CV_8S && depth2 <= CV_8S ? CV_16S :
                depth1 <= CV_32S && depth2 <= CV_32S ? CV_32S : std::max(depth1, depth2);
        wtype = std::max(wtype, dtype);

        // when the result of addition should be converted to an integer type,
        // and just one of the input arrays is floating-point, it makes sense to convert that input to integer type before the operation,
        // instead of converting the other input to floating-point and then converting the operation result back to integers.
        if( dtype < CV_32F && (depth1 < CV_32F || depth2 < CV_32F) )
            wtype = CV_32S;
    }
    else
    {
        wtype = std::max(depth1, std::max(depth2, CV_32F));
        wtype = std::max(wtype, dtype);
    }

    dtype = CV_MAKETYPE(dtype, cn);
    wtype = CV_MAKETYPE(wtype, cn);

    if( haveMask )
    {
        int mtype = _mask.type();
        CV_Assert( (mtype == CV_8UC1 || mtype == CV_8SC1) && _mask.sameSize(*psrc1) );
        reallocate = !_dst.sameSize(*psrc1) || _dst.type() != dtype;
    }

    _dst.createSameSize(*psrc1, dtype);
    if( reallocate )
        _dst.setTo(0.);

    CV_OCL_RUN(use_opencl,
               ocl_arithm_op(*psrc1, *psrc2, _dst, _mask,
                             wtype, usrdata, oclop, haveScalar))

    BinaryFunc cvtsrc1 = type1 == wtype ? 0 : getConvertFunc(type1, wtype);
    BinaryFunc cvtsrc2 = type2 == type1 ? cvtsrc1 : type2 == wtype ? 0 : getConvertFunc(type2, wtype);
    BinaryFunc cvtdst = dtype == wtype ? 0 : getConvertFunc(wtype, dtype);

    size_t esz1 = CV_ELEM_SIZE(type1), esz2 = CV_ELEM_SIZE(type2);
    size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype);
    size_t blocksize0 = (size_t)(BLOCK_SIZE + wsz-1)/wsz;
    BinaryFunc copymask = getCopyMaskFunc(dsz);
    Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(), mask = _mask.getMat();

    AutoBuffer<uchar>
_buf; uchar *buf, *maskbuf = 0, *buf1 = 0, *buf2 = 0, *wbuf = 0; size_t bufesz = (cvtsrc1 ? wsz : 0) + (cvtsrc2 || haveScalar ? wsz : 0) + (cvtdst ? wsz : 0) + (haveMask ? dsz : 0); BinaryFunc func = tab[CV_MAT_DEPTH(wtype)]; if( !haveScalar ) { const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 }; uchar* ptrs[4]; NAryMatIterator it(arrays, ptrs); size_t total = it.size, blocksize = total; if( haveMask || cvtsrc1 || cvtsrc2 || cvtdst ) blocksize = std::min(blocksize, blocksize0); _buf.allocate(bufesz*blocksize + 64); buf = _buf; if( cvtsrc1 ) buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16); if( cvtsrc2 ) buf2 = buf, buf = alignPtr(buf + blocksize*wsz, 16); wbuf = maskbuf = buf; if( cvtdst ) buf = alignPtr(buf + blocksize*wsz, 16); if( haveMask ) maskbuf = buf; for( size_t i = 0; i < it.nplanes; i++, ++it ) { for( size_t j = 0; j < total; j += blocksize ) { int bsz = (int)MIN(total - j, blocksize); Size bszn(bsz*cn, 1); const uchar *sptr1 = ptrs[0], *sptr2 = ptrs[1]; uchar* dptr = ptrs[2]; if( cvtsrc1 ) { cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 ); sptr1 = buf1; } if( ptrs[0] == ptrs[1] ) sptr2 = sptr1; else if( cvtsrc2 ) { cvtsrc2( sptr2, 1, 0, 1, buf2, 1, bszn, 0 ); sptr2 = buf2; } if( !haveMask && !cvtdst ) func( sptr1, 1, sptr2, 1, dptr, 1, bszn, usrdata ); else { func( sptr1, 1, sptr2, 1, wbuf, 0, bszn, usrdata ); if( !haveMask ) cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 ); else if( !cvtdst ) { copymask( wbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz ); ptrs[3] += bsz; } else { cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 ); copymask( maskbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz ); ptrs[3] += bsz; } } ptrs[0] += bsz*esz1; ptrs[1] += bsz*esz2; ptrs[2] += bsz*dsz; } } } else { const Mat* arrays[] = { &src1, &dst, &mask, 0 }; uchar* ptrs[3]; NAryMatIterator it(arrays, ptrs); size_t total = it.size, blocksize = std::min(total, blocksize0); _buf.allocate(bufesz*blocksize + 64); buf = _buf; if( cvtsrc1 ) buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16); buf2 = buf; buf = alignPtr(buf + blocksize*wsz, 16); wbuf = maskbuf = buf; if( cvtdst ) buf = alignPtr(buf + blocksize*wsz, 16); if( haveMask ) maskbuf = buf; convertAndUnrollScalar( src2, wtype, buf2, blocksize); for( size_t i = 0; i < it.nplanes; i++, ++it ) { for( size_t j = 0; j < total; j += blocksize ) { int bsz = (int)MIN(total - j, blocksize); Size bszn(bsz*cn, 1); const uchar *sptr1 = ptrs[0]; const uchar* sptr2 = buf2; uchar* dptr = ptrs[1]; if( cvtsrc1 ) { cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 ); sptr1 = buf1; } if( swapped12 ) std::swap(sptr1, sptr2); if( !haveMask && !cvtdst ) func( sptr1, 1, sptr2, 1, dptr, 1, bszn, usrdata ); else { func( sptr1, 1, sptr2, 1, wbuf, 1, bszn, usrdata ); if( !haveMask ) cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 ); else if( !cvtdst ) { copymask( wbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz ); ptrs[2] += bsz; } else { cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 ); copymask( maskbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz ); ptrs[2] += bsz; } } ptrs[0] += bsz*esz1; ptrs[1] += bsz*dsz; } } } } static BinaryFunc* getAddTab() { static BinaryFunc addTab[] = { (BinaryFunc)GET_OPTIMIZED(add8u), (BinaryFunc)GET_OPTIMIZED(add8s), (BinaryFunc)GET_OPTIMIZED(add16u), (BinaryFunc)GET_OPTIMIZED(add16s), (BinaryFunc)GET_OPTIMIZED(add32s), (BinaryFunc)GET_OPTIMIZED(add32f), (BinaryFunc)add64f, 0 }; return addTab; } static BinaryFunc* getSubTab() { static BinaryFunc subTab[] = { (BinaryFunc)GET_OPTIMIZED(sub8u), (BinaryFunc)GET_OPTIMIZED(sub8s), (BinaryFunc)GET_OPTIMIZED(sub16u), 
        (BinaryFunc)GET_OPTIMIZED(sub16s), (BinaryFunc)GET_OPTIMIZED(sub32s),
        (BinaryFunc)GET_OPTIMIZED(sub32f), (BinaryFunc)sub64f,
        0
    };

    return subTab;
}

static BinaryFunc* getAbsDiffTab()
{
    static BinaryFunc absDiffTab[] =
    {
        (BinaryFunc)GET_OPTIMIZED(absdiff8u), (BinaryFunc)GET_OPTIMIZED(absdiff8s),
        (BinaryFunc)GET_OPTIMIZED(absdiff16u), (BinaryFunc)GET_OPTIMIZED(absdiff16s),
        (BinaryFunc)GET_OPTIMIZED(absdiff32s),
        (BinaryFunc)GET_OPTIMIZED(absdiff32f), (BinaryFunc)absdiff64f,
        0
    };

    return absDiffTab;
}

}

void cv::add( InputArray src1, InputArray src2, OutputArray dst,
          InputArray mask, int dtype )
{
    arithm_op(src1, src2, dst, mask, dtype, getAddTab(), false, 0, OCL_OP_ADD );
}

void cv::subtract( InputArray _src1, InputArray _src2, OutputArray _dst,
               InputArray mask, int dtype )
{
#ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::useTegra())
    {
        int kind1 = _src1.kind(), kind2 = _src2.kind();
        Mat src1 = _src1.getMat(), src2 = _src2.getMat();
        bool src1Scalar = checkScalar(src1, _src2.type(), kind1, kind2);
        bool src2Scalar = checkScalar(src2, _src1.type(), kind2, kind1);

        if (!src1Scalar && !src2Scalar &&
            src1.depth() == CV_8U && src2.type() == src1.type() &&
            src1.dims == 2 && src2.size() == src1.size() &&
            mask.empty())
        {
            if (dtype < 0)
            {
                if (_dst.fixedType())
                {
                    dtype = _dst.depth();
                }
                else
                {
                    dtype = src1.depth();
                }
            }

            dtype = CV_MAT_DEPTH(dtype);

            if (!_dst.fixedType() || dtype == _dst.depth())
            {
                _dst.create(src1.size(), CV_MAKE_TYPE(dtype, src1.channels()));

                if (dtype == CV_16S)
                {
                    Mat dst = _dst.getMat();
                    if(tegra::subtract_8u8u16s(src1, src2, dst))
                        return;
                }
                else if (dtype == CV_32F)
                {
                    Mat dst = _dst.getMat();
                    if(tegra::subtract_8u8u32f(src1, src2, dst))
                        return;
                }
                else if (dtype == CV_8S)
                {
                    Mat dst = _dst.getMat();
                    if(tegra::subtract_8u8u8s(src1, src2, dst))
                        return;
                }
            }
        }
    }
#endif
    arithm_op(_src1, _src2, _dst, mask, dtype, getSubTab(), false, 0, OCL_OP_SUB );
}

void cv::absdiff( InputArray src1, InputArray src2, OutputArray dst )
{
    arithm_op(src1, src2, dst, noArray(), -1, getAbsDiffTab(), false, 0, OCL_OP_ABSDIFF);
}

/****************************************************************************************\
*                                    multiply/divide                                    *
\****************************************************************************************/

namespace cv
{
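// The element-wise multiply/divide kernels below follow a common pattern:
// a Mul_SIMD/Div_SIMD/Recip_SIMD functor vectorizes as much of each row as the
// available instruction set allows and returns how many elements it has already
// written, after which the scalar template (mul_, div_i, div_f, ...) finishes
// the remaining tail with saturate_cast<> to the destination type.
//
// Illustrative use of the public entry points that end up here (a minimal
// sketch; the Mat names are placeholders, not part of this file):
//
//     cv::Mat a = cv::Mat::ones(4, 4, CV_8UC1) * 10;
//     cv::Mat b = cv::Mat::ones(4, 4, CV_8UC1) * 3;
//     cv::Mat prod, quot;
//     cv::multiply(a, b, prod, 0.5);   // prod = saturate(0.5 * a .* b)
//     cv::divide(a, b, quot, 2.0);     // quot = saturate(2.0 * a ./ b)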
template <typename T, typename WT>
struct Mul_SIMD
{
    int operator() (const T *, const T *, T *, int, WT) const
    {
        return 0;
    }
};

#if CV_NEON

template <>
struct Mul_SIMD<uchar, float>
{ int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, float scale) const { int x = 0; if( scale == 1.0f ) for ( ; x <= width - 8; x += 8) { uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x)); float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); vst1_u8(dst + x, vqmovn_u16(v_dst)); } else { float32x4_t v_scale = vdupq_n_f32(scale); for ( ; x <= width - 8; x += 8) { uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x)); float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); v_dst1 = vmulq_f32(v_dst1, v_scale); float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); v_dst2 = vmulq_f32(v_dst2, v_scale); uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); vst1_u8(dst + x, vqmovn_u16(v_dst)); } } return x; } }; template <> struct Mul_SIMD
{ int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const { int x = 0; if( scale == 1.0f ) for ( ; x <= width - 8; x += 8) { int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); vst1_s8(dst + x, vqmovn_s16(v_dst)); } else { float32x4_t v_scale = vdupq_n_f32(scale); for ( ; x <= width - 8; x += 8) { int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); v_dst1 = vmulq_f32(v_dst1, v_scale); float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); v_dst2 = vmulq_f32(v_dst2, v_scale); int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); vst1_s8(dst + x, vqmovn_s16(v_dst)); } } return x; } }; template <> struct Mul_SIMD
{ int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const { int x = 0; if( scale == 1.0f ) for ( ; x <= width - 8; x += 8) { uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); vst1q_u16(dst + x, v_dst); } else { float32x4_t v_scale = vdupq_n_f32(scale); for ( ; x <= width - 8; x += 8) { uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); v_dst1 = vmulq_f32(v_dst1, v_scale); float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); v_dst2 = vmulq_f32(v_dst2, v_scale); uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); vst1q_u16(dst + x, v_dst); } } return x; } }; template <> struct Mul_SIMD
{ int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const { int x = 0; if( scale == 1.0f ) for ( ; x <= width - 8; x += 8) { int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); vst1q_s16(dst + x, v_dst); } else { float32x4_t v_scale = vdupq_n_f32(scale); for ( ; x <= width - 8; x += 8) { int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); v_dst1 = vmulq_f32(v_dst1, v_scale); float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); v_dst2 = vmulq_f32(v_dst2, v_scale); int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); vst1q_s16(dst + x, v_dst); } } return x; } }; template <> struct Mul_SIMD
<float, float>
{
    int operator() (const float * src1, const float * src2, float * dst, int width, float scale) const
    {
        int x = 0;

        if( scale == 1.0f )
            for ( ; x <= width - 8; x += 8)
            {
                float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
                float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
                vst1q_f32(dst + x, v_dst1);
                vst1q_f32(dst + x + 4, v_dst2);
            }
        else
        {
            float32x4_t v_scale = vdupq_n_f32(scale);
            for ( ; x <= width - 8; x += 8)
            {
                float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
                v_dst1 = vmulq_f32(v_dst1, v_scale);

                float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
                v_dst2 = vmulq_f32(v_dst2, v_scale);

                vst1q_f32(dst + x, v_dst1);
                vst1q_f32(dst + x + 4, v_dst2);
            }
        }

        return x;
    }
};

#elif CV_SSE2

#if CV_SSE4_1

template <>
struct Mul_SIMD<ushort, float>
{ Mul_SIMD() { haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); } int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const { int x = 0; if (!haveSSE) return x; __m128i v_zero = _mm_setzero_si128(); if( scale != 1.0f ) { __m128 v_scale = _mm_set1_ps(scale); for ( ; x <= width - 8; x += 8) { __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x)); __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x)); __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero))); v_dst1 = _mm_mul_ps(v_dst1, v_scale); __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero))); v_dst2 = _mm_mul_ps(v_dst2, v_scale); __m128i v_dsti = _mm_packus_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); _mm_storeu_si128((__m128i *)(dst + x), v_dsti); } } return x; } bool haveSSE; }; #endif template <> struct Mul_SIMD
{ Mul_SIMD() { haveSSE = checkHardwareSupport(CV_CPU_SSE2); } int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const { int x = 0; if (!haveSSE) return x; __m128i v_zero = _mm_setzero_si128(); if( scale == 1.0f ) for ( ; x <= width - 8; x += 8) { __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x)); __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x)); v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero)); } else { __m128 v_scale = _mm_set1_ps(scale); for ( ; x <= width - 8; x += 8) { __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x)); __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x)); v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); v_dst1 = _mm_mul_ps(v_dst1, v_scale); __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); v_dst2 = _mm_mul_ps(v_dst2, v_scale); __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero)); } } return x; } bool haveSSE; }; template <> struct Mul_SIMD
{ Mul_SIMD() { haveSSE = checkHardwareSupport(CV_CPU_SSE2); } int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const { int x = 0; if (!haveSSE) return x; __m128i v_zero = _mm_setzero_si128(); if( scale != 1.0f ) { __m128 v_scale = _mm_set1_ps(scale); for ( ; x <= width - 8; x += 8) { __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x)); __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x)); __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); v_dst1 = _mm_mul_ps(v_dst1, v_scale); __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); v_dst2 = _mm_mul_ps(v_dst2, v_scale); __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); _mm_storeu_si128((__m128i *)(dst + x), v_dsti); } } return x; } bool haveSSE; }; #endif template
<typename T, typename WT> static void
mul_( const T* src1, size_t step1, const T* src2, size_t step2,
      T* dst, size_t step, Size size, WT scale )
{
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    Mul_SIMD<T, WT> vop;

    if( scale == (WT)1. )
    {
        for( ; size.height--; src1 += step1, src2 += step2, dst += step )
        {
            int i = vop(src1, src2, dst, size.width, scale);
            #if CV_ENABLE_UNROLLED
            for(; i <= size.width - 4; i += 4 )
            {
                T t0;
                T t1;
                t0 = saturate_cast<T>(src1[i  ] * src2[i  ]);
                t1 = saturate_cast<T>(src1[i+1] * src2[i+1]);
                dst[i  ] = t0;
                dst[i+1] = t1;

                t0 = saturate_cast<T>(src1[i+2] * src2[i+2]);
                t1 = saturate_cast<T>(src1[i+3] * src2[i+3]);
                dst[i+2] = t0;
                dst[i+3] = t1;
            }
            #endif
            for( ; i < size.width; i++ )
                dst[i] = saturate_cast<T>(src1[i] * src2[i]);
        }
    }
    else
    {
        for( ; size.height--; src1 += step1, src2 += step2, dst += step )
        {
            int i = vop(src1, src2, dst, size.width, scale);
            #if CV_ENABLE_UNROLLED
            for(; i <= size.width - 4; i += 4 )
            {
                T t0 = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
                T t1 = saturate_cast<T>(scale*(WT)src1[i+1]*src2[i+1]);
                dst[i] = t0; dst[i+1] = t1;

                t0 = saturate_cast<T>(scale*(WT)src1[i+2]*src2[i+2]);
                t1 = saturate_cast<T>(scale*(WT)src1[i+3]*src2[i+3]);
                dst[i+2] = t0; dst[i+3] = t1;
            }
            #endif
            for( ; i < size.width; i++ )
                dst[i] = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
        }
    }
}
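// Division kernels. Throughout this file OpenCV defines x/0 == 0 for the
// element-wise operations: the SIMD specializations below mask zero divisors
// with v_select(), and the scalar tails in div_i/div_f/recip_i/recip_f test
// "denom != 0" explicitly. The user-supplied scale is folded into the
// numerator, so for integer inputs each element is computed in float as
// saturate_cast<T>(num * (float)scale / denom).
//
// For example, with uchar inputs src1 = {10, 7}, src2 = {4, 0} and scale = 2:
//     dst[0] = saturate_cast<uchar>(10 * 2.f / 4) = 5
//     dst[1] = 0                                      // divisor is zero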
template <typename T>
struct Div_SIMD
{
    int operator() (const T *, const T *, T *, int, double) const
    {
        return 0;
    }
};

template <typename T>
struct Recip_SIMD
{
    int operator() (const T *, T *, int, double) const
    {
        return 0;
    }
};

#if CV_SIMD128

template <>
struct Div_SIMD<uchar>
{ bool haveSIMD; Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const { int x = 0; if (!haveSIMD) return x; v_float32x4 v_scale = v_setall_f32((float)scale); v_uint16x8 v_zero = v_setzero_u16(); for ( ; x <= width - 8; x += 8) { v_uint16x8 v_src1 = v_load_expand(src1 + x); v_uint16x8 v_src2 = v_load_expand(src2 + x); v_uint32x4 t0, t1, t2, t3; v_expand(v_src1, t0, t1); v_expand(v_src2, t2, t3); v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); f0 = f0 * v_scale / f2; f1 = f1 * v_scale / f3; v_int32x4 i0 = v_round(f0), i1 = v_round(f1); v_uint16x8 res = v_pack_u(i0, i1); res = v_select(v_src2 == v_zero, v_zero, res); v_pack_store(dst + x, res); } return x; } }; template <> struct Div_SIMD
{ bool haveSIMD; Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const { int x = 0; if (!haveSIMD) return x; v_float32x4 v_scale = v_setall_f32((float)scale); v_int16x8 v_zero = v_setzero_s16(); for ( ; x <= width - 8; x += 8) { v_int16x8 v_src1 = v_load_expand(src1 + x); v_int16x8 v_src2 = v_load_expand(src2 + x); v_int32x4 t0, t1, t2, t3; v_expand(v_src1, t0, t1); v_expand(v_src2, t2, t3); v_float32x4 f0 = v_cvt_f32(t0); v_float32x4 f1 = v_cvt_f32(t1); v_float32x4 f2 = v_cvt_f32(t2); v_float32x4 f3 = v_cvt_f32(t3); f0 = f0 * v_scale / f2; f1 = f1 * v_scale / f3; v_int32x4 i0 = v_round(f0), i1 = v_round(f1); v_int16x8 res = v_pack(i0, i1); res = v_select(v_src2 == v_zero, v_zero, res); v_pack_store(dst + x, res); } return x; } }; template <> struct Div_SIMD
{ bool haveSIMD; Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const { int x = 0; if (!haveSIMD) return x; v_float32x4 v_scale = v_setall_f32((float)scale); v_uint16x8 v_zero = v_setzero_u16(); for ( ; x <= width - 8; x += 8) { v_uint16x8 v_src1 = v_load(src1 + x); v_uint16x8 v_src2 = v_load(src2 + x); v_uint32x4 t0, t1, t2, t3; v_expand(v_src1, t0, t1); v_expand(v_src2, t2, t3); v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); f0 = f0 * v_scale / f2; f1 = f1 * v_scale / f3; v_int32x4 i0 = v_round(f0), i1 = v_round(f1); v_uint16x8 res = v_pack_u(i0, i1); res = v_select(v_src2 == v_zero, v_zero, res); v_store(dst + x, res); } return x; } }; template <> struct Div_SIMD
{ bool haveSIMD; Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const { int x = 0; if (!haveSIMD) return x; v_float32x4 v_scale = v_setall_f32((float)scale); v_int16x8 v_zero = v_setzero_s16(); for ( ; x <= width - 8; x += 8) { v_int16x8 v_src1 = v_load(src1 + x); v_int16x8 v_src2 = v_load(src2 + x); v_int32x4 t0, t1, t2, t3; v_expand(v_src1, t0, t1); v_expand(v_src2, t2, t3); v_float32x4 f0 = v_cvt_f32(t0); v_float32x4 f1 = v_cvt_f32(t1); v_float32x4 f2 = v_cvt_f32(t2); v_float32x4 f3 = v_cvt_f32(t3); f0 = f0 * v_scale / f2; f1 = f1 * v_scale / f3; v_int32x4 i0 = v_round(f0), i1 = v_round(f1); v_int16x8 res = v_pack(i0, i1); res = v_select(v_src2 == v_zero, v_zero, res); v_store(dst + x, res); } return x; } }; template <> struct Div_SIMD
{ bool haveSIMD; Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const { int x = 0; if (!haveSIMD) return x; v_float32x4 v_scale = v_setall_f32((float)scale); v_int32x4 v_zero = v_setzero_s32(); for ( ; x <= width - 8; x += 8) { v_int32x4 t0 = v_load(src1 + x); v_int32x4 t1 = v_load(src1 + x + 4); v_int32x4 t2 = v_load(src2 + x); v_int32x4 t3 = v_load(src2 + x + 4); v_float32x4 f0 = v_cvt_f32(t0); v_float32x4 f1 = v_cvt_f32(t1); v_float32x4 f2 = v_cvt_f32(t2); v_float32x4 f3 = v_cvt_f32(t3); f0 = f0 * v_scale / f2; f1 = f1 * v_scale / f3; v_int32x4 res0 = v_round(f0), res1 = v_round(f1); res0 = v_select(t2 == v_zero, v_zero, res0); res1 = v_select(t3 == v_zero, v_zero, res1); v_store(dst + x, res0); v_store(dst + x + 4, res1); } return x; } }; template <> struct Div_SIMD
{ bool haveSIMD; Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const float * src1, const float * src2, float * dst, int width, double scale) const { int x = 0; if (!haveSIMD) return x; v_float32x4 v_scale = v_setall_f32((float)scale); v_float32x4 v_zero = v_setzero_f32(); for ( ; x <= width - 8; x += 8) { v_float32x4 f0 = v_load(src1 + x); v_float32x4 f1 = v_load(src1 + x + 4); v_float32x4 f2 = v_load(src2 + x); v_float32x4 f3 = v_load(src2 + x + 4); v_float32x4 res0 = f0 * v_scale / f2; v_float32x4 res1 = f1 * v_scale / f3; res0 = v_select(f2 == v_zero, v_zero, res0); res1 = v_select(f3 == v_zero, v_zero, res1); v_store(dst + x, res0); v_store(dst + x + 4, res1); } return x; } }; ///////////////////////// RECIPROCAL ////////////////////// template <> struct Recip_SIMD
{ bool haveSIMD; Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const uchar * src2, uchar * dst, int width, double scale) const { int x = 0; if (!haveSIMD) return x; v_float32x4 v_scale = v_setall_f32((float)scale); v_uint16x8 v_zero = v_setzero_u16(); for ( ; x <= width - 8; x += 8) { v_uint16x8 v_src2 = v_load_expand(src2 + x); v_uint32x4 t0, t1; v_expand(v_src2, t0, t1); v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); f0 = v_scale / f0; f1 = v_scale / f1; v_int32x4 i0 = v_round(f0), i1 = v_round(f1); v_uint16x8 res = v_pack_u(i0, i1); res = v_select(v_src2 == v_zero, v_zero, res); v_pack_store(dst + x, res); } return x; } }; template <> struct Recip_SIMD
{ bool haveSIMD; Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const schar * src2, schar * dst, int width, double scale) const { int x = 0; if (!haveSIMD) return x; v_float32x4 v_scale = v_setall_f32((float)scale); v_int16x8 v_zero = v_setzero_s16(); for ( ; x <= width - 8; x += 8) { v_int16x8 v_src2 = v_load_expand(src2 + x); v_int32x4 t0, t1; v_expand(v_src2, t0, t1); v_float32x4 f0 = v_cvt_f32(t0); v_float32x4 f1 = v_cvt_f32(t1); f0 = v_scale / f0; f1 = v_scale / f1; v_int32x4 i0 = v_round(f0), i1 = v_round(f1); v_int16x8 res = v_pack(i0, i1); res = v_select(v_src2 == v_zero, v_zero, res); v_pack_store(dst + x, res); } return x; } }; template <> struct Recip_SIMD
{ bool haveSIMD; Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const ushort * src2, ushort * dst, int width, double scale) const { int x = 0; if (!haveSIMD) return x; v_float32x4 v_scale = v_setall_f32((float)scale); v_uint16x8 v_zero = v_setzero_u16(); for ( ; x <= width - 8; x += 8) { v_uint16x8 v_src2 = v_load(src2 + x); v_uint32x4 t0, t1; v_expand(v_src2, t0, t1); v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); f0 = v_scale / f0; f1 = v_scale / f1; v_int32x4 i0 = v_round(f0), i1 = v_round(f1); v_uint16x8 res = v_pack_u(i0, i1); res = v_select(v_src2 == v_zero, v_zero, res); v_store(dst + x, res); } return x; } }; template <> struct Recip_SIMD
{ bool haveSIMD; Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const short * src2, short * dst, int width, double scale) const { int x = 0; if (!haveSIMD) return x; v_float32x4 v_scale = v_setall_f32((float)scale); v_int16x8 v_zero = v_setzero_s16(); for ( ; x <= width - 8; x += 8) { v_int16x8 v_src2 = v_load(src2 + x); v_int32x4 t0, t1; v_expand(v_src2, t0, t1); v_float32x4 f0 = v_cvt_f32(t0); v_float32x4 f1 = v_cvt_f32(t1); f0 = v_scale / f0; f1 = v_scale / f1; v_int32x4 i0 = v_round(f0), i1 = v_round(f1); v_int16x8 res = v_pack(i0, i1); res = v_select(v_src2 == v_zero, v_zero, res); v_store(dst + x, res); } return x; } }; template <> struct Recip_SIMD
{ bool haveSIMD; Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const int * src2, int * dst, int width, double scale) const { int x = 0; if (!haveSIMD) return x; v_float32x4 v_scale = v_setall_f32((float)scale); v_int32x4 v_zero = v_setzero_s32(); for ( ; x <= width - 8; x += 8) { v_int32x4 t0 = v_load(src2 + x); v_int32x4 t1 = v_load(src2 + x + 4); v_float32x4 f0 = v_cvt_f32(t0); v_float32x4 f1 = v_cvt_f32(t1); f0 = v_scale / f0; f1 = v_scale / f1; v_int32x4 res0 = v_round(f0), res1 = v_round(f1); res0 = v_select(t0 == v_zero, v_zero, res0); res1 = v_select(t1 == v_zero, v_zero, res1); v_store(dst + x, res0); v_store(dst + x + 4, res1); } return x; } }; template <> struct Recip_SIMD
{ bool haveSIMD; Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const float * src2, float * dst, int width, double scale) const { int x = 0; if (!haveSIMD) return x; v_float32x4 v_scale = v_setall_f32((float)scale); v_float32x4 v_zero = v_setzero_f32(); for ( ; x <= width - 8; x += 8) { v_float32x4 f0 = v_load(src2 + x); v_float32x4 f1 = v_load(src2 + x + 4); v_float32x4 res0 = v_scale / f0; v_float32x4 res1 = v_scale / f1; res0 = v_select(f0 == v_zero, v_zero, res0); res1 = v_select(f1 == v_zero, v_zero, res1); v_store(dst + x, res0); v_store(dst + x + 4, res1); } return x; } }; #if CV_SIMD128_64F template <> struct Div_SIMD
{ bool haveSIMD; Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const { int x = 0; if (!haveSIMD) return x; v_float64x2 v_scale = v_setall_f64(scale); v_float64x2 v_zero = v_setzero_f64(); for ( ; x <= width - 4; x += 4) { v_float64x2 f0 = v_load(src1 + x); v_float64x2 f1 = v_load(src1 + x + 2); v_float64x2 f2 = v_load(src2 + x); v_float64x2 f3 = v_load(src2 + x + 2); v_float64x2 res0 = f0 * v_scale / f2; v_float64x2 res1 = f1 * v_scale / f3; res0 = v_select(f0 == v_zero, v_zero, res0); res1 = v_select(f1 == v_zero, v_zero, res1); v_store(dst + x, res0); v_store(dst + x + 2, res1); } return x; } }; template <> struct Recip_SIMD
{ bool haveSIMD; Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } int operator() (const double * src2, double * dst, int width, double scale) const { int x = 0; if (!haveSIMD) return x; v_float64x2 v_scale = v_setall_f64(scale); v_float64x2 v_zero = v_setzero_f64(); for ( ; x <= width - 4; x += 4) { v_float64x2 f0 = v_load(src2 + x); v_float64x2 f1 = v_load(src2 + x + 2); v_float64x2 res0 = v_scale / f0; v_float64x2 res1 = v_scale / f1; res0 = v_select(f0 == v_zero, v_zero, res0); res1 = v_select(f1 == v_zero, v_zero, res1); v_store(dst + x, res0); v_store(dst + x + 2, res1); } return x; } }; #endif #endif template
<typename T> static void
div_i( const T* src1, size_t step1, const T* src2, size_t step2,
       T* dst, size_t step, Size size, double scale )
{
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    Div_SIMD<T> vop;
    float scale_f = (float)scale;

    for( ; size.height--; src1 += step1, src2 += step2, dst += step )
    {
        int i = vop(src1, src2, dst, size.width, scale);
        for( ; i < size.width; i++ )
        {
            T num = src1[i], denom = src2[i];
            dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
        }
    }
}
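// div_i handles the integer types (the quotient is formed in float and then
// saturated back to T), while div_f below keeps the computation in the
// floating-point type itself.  The recip_i/recip_f variants compute
// scale / src2 only -- they back cv::divide(double, InputArray, ...) --
// which is why their first two (src1) parameters are unused.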
template <typename T> static void
div_f( const T* src1, size_t step1, const T* src2, size_t step2,
       T* dst, size_t step, Size size, double scale )
{
    T scale_f = (T)scale;
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    Div_SIMD<T> vop;

    for( ; size.height--; src1 += step1, src2 += step2, dst += step )
    {
        int i = vop(src1, src2, dst, size.width, scale);
        for( ; i < size.width; i++ )
        {
            T num = src1[i], denom = src2[i];
            dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
        }
    }
}

template <typename T> static void
recip_i( const T*, size_t, const T* src2, size_t step2,
         T* dst, size_t step, Size size, double scale )
{
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    Recip_SIMD<T> vop;
    float scale_f = (float)scale;

    for( ; size.height--; src2 += step2, dst += step )
    {
        int i = vop(src2, dst, size.width, scale);
        for( ; i < size.width; i++ )
        {
            T denom = src2[i];
            dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
        }
    }
}

template <typename T> static void
recip_f( const T*, size_t, const T* src2, size_t step2,
         T* dst, size_t step, Size size, double scale )
{
    T scale_f = (T)scale;
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    Recip_SIMD<T> vop;

    for( ; size.height--; src2 += step2, dst += step )
    {
        int i = vop(src2, dst, size.width, scale);
        for( ; i < size.width; i++ )
        {
            T denom = src2[i];
            dst[i] = denom != 0 ? saturate_cast<T>
(scale_f/denom) : (T)0; } } } static void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, Size sz, void* scale) { float fscale = (float)*(const double*)scale; #if defined HAVE_IPP CV_IPP_CHECK() { if (std::fabs(fscale - 1) <= FLT_EPSILON) { if (ippiMul_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } } #endif mul_(src1, step1, src2, step2, dst, step, sz, fscale); } static void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, Size sz, void* scale) { mul_(src1, step1, src2, step2, dst, step, sz, (float)*(const double*)scale); } static void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, Size sz, void* scale) { float fscale = (float)*(const double*)scale; #if defined HAVE_IPP CV_IPP_CHECK() { if (std::fabs(fscale - 1) <= FLT_EPSILON) { if (ippiMul_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } } #endif mul_(src1, step1, src2, step2, dst, step, sz, fscale); } static void mul16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, Size sz, void* scale) { float fscale = (float)*(const double*)scale; #if defined HAVE_IPP CV_IPP_CHECK() { if (std::fabs(fscale - 1) <= FLT_EPSILON) { if (ippiMul_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } } #endif mul_(src1, step1, src2, step2, dst, step, sz, fscale); } static void mul32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, Size sz, void* scale) { mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void mul32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, Size sz, void* scale) { float fscale = (float)*(const double*)scale; #if defined HAVE_IPP CV_IPP_CHECK() { if (std::fabs(fscale - 1) <= FLT_EPSILON) { if (ippiMul_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)) >= 0) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); } } #endif mul_(src1, step1, src2, step2, dst, step, sz, fscale); } static void mul64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, Size sz, void* scale) { mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, Size sz, void* scale) { if( src1 ) div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); else recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, Size sz, void* scale) { div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, Size sz, void* scale) { div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void div16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, Size sz, void* scale) { div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void div32s( const int* 
src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, Size sz, void* scale) { div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void div32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, Size sz, void* scale) { div_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void div64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, Size sz, void* scale) { div_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, Size sz, void* scale) { recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, Size sz, void* scale) { recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, Size sz, void* scale) { recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, Size sz, void* scale) { recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, Size sz, void* scale) { recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, Size sz, void* scale) { recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static void recip64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, Size sz, void* scale) { recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); } static BinaryFunc* getMulTab() { static BinaryFunc mulTab[] = { (BinaryFunc)mul8u, (BinaryFunc)mul8s, (BinaryFunc)mul16u, (BinaryFunc)mul16s, (BinaryFunc)mul32s, (BinaryFunc)mul32f, (BinaryFunc)mul64f, 0 }; return mulTab; } static BinaryFunc* getDivTab() { static BinaryFunc divTab[] = { (BinaryFunc)div8u, (BinaryFunc)div8s, (BinaryFunc)div16u, (BinaryFunc)div16s, (BinaryFunc)div32s, (BinaryFunc)div32f, (BinaryFunc)div64f, 0 }; return divTab; } static BinaryFunc* getRecipTab() { static BinaryFunc recipTab[] = { (BinaryFunc)recip8u, (BinaryFunc)recip8s, (BinaryFunc)recip16u, (BinaryFunc)recip16s, (BinaryFunc)recip32s, (BinaryFunc)recip32f, (BinaryFunc)recip64f, 0 }; return recipTab; } } void cv::multiply(InputArray src1, InputArray src2, OutputArray dst, double scale, int dtype) { arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(), true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? 
              OCL_OP_MUL : OCL_OP_MUL_SCALE);
}

void cv::divide(InputArray src1, InputArray src2,
                OutputArray dst, double scale, int dtype)
{
    arithm_op(src1, src2, dst, noArray(), dtype, getDivTab(), true, &scale, OCL_OP_DIV_SCALE);
}

void cv::divide(double scale, InputArray src2,
                OutputArray dst, int dtype)
{
    arithm_op(src2, src2, dst, noArray(), dtype, getRecipTab(), true, &scale, OCL_OP_RECIP_SCALE);
}

/****************************************************************************************\
*                                    addWeighted                                        *
\****************************************************************************************/

namespace cv
{
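// The addWeighted kernels compute dst = saturate(src1*alpha + src2*beta + gamma)
// element by element.  AddWeighted_SIMD is specialized per element type below
// (SSE2/SSE4.1 or NEON); addWeighted_<T, WT> does the blending in the wider
// work type WT and handles whatever the SIMD functor left over.
//
// Minimal usage sketch of the public wrapper that dispatches here (the Mat
// names are placeholders):
//
//     cv::Mat bg, fg, blended;                 // same size and type
//     // ... fill bg and fg ...
//     cv::addWeighted(bg, 0.7, fg, 0.3, 0.0, blended);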
template <typename T, typename WT>
struct AddWeighted_SIMD
{
    int operator() (const T *, const T *, T *, int, WT, WT, WT) const
    {
        return 0;
    }
};

#if CV_SSE2

template <>
struct AddWeighted_SIMD<schar, float>
{ AddWeighted_SIMD() { haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); } int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const { int x = 0; if (!haveSSE2) return x; __m128i v_zero = _mm_setzero_si128(); __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), v_gamma = _mm_set1_ps(gamma); for( ; x <= width - 8; x += 8 ) { __m128i v_src1 = _mm_loadl_epi64((const __m128i *)(src1 + x)); __m128i v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); __m128i v_src1_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); __m128i v_src2_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1_p), 16)), v_alpha); v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2_p), 16)), v_beta)); __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1_p), 16)), v_alpha); v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2_p), 16)), v_beta)); __m128i v_dst16 = _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0), _mm_cvtps_epi32(v_dstf1)); _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst16, v_zero)); } return x; } bool haveSSE2; }; template <> struct AddWeighted_SIMD
{ AddWeighted_SIMD() { haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); } int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const { int x = 0; if (!haveSSE2) return x; __m128i v_zero = _mm_setzero_si128(); __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), v_gamma = _mm_set1_ps(gamma); for( ; x <= width - 8; x += 8 ) { __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), v_alpha); v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)), v_beta)); __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), v_alpha); v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)), v_beta)); _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0), _mm_cvtps_epi32(v_dstf1))); } return x; } bool haveSSE2; }; #if CV_SSE4_1 template <> struct AddWeighted_SIMD
{ AddWeighted_SIMD() { haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); } int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const { int x = 0; if (!haveSSE4_1) return x; __m128i v_zero = _mm_setzero_si128(); __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), v_gamma = _mm_set1_ps(gamma); for( ; x <= width - 8; x += 8 ) { __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), v_alpha); v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)), v_beta)); __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), v_alpha); v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)), v_beta)); _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(_mm_cvtps_epi32(v_dstf0), _mm_cvtps_epi32(v_dstf1))); } return x; } bool haveSSE4_1; }; #endif #elif CV_NEON template <> struct AddWeighted_SIMD
{ int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const { int x = 0; float32x4_t g = vdupq_n_f32 (gamma); for( ; x <= width - 8; x += 8 ) { int8x8_t in1 = vld1_s8(src1 + x); int16x8_t in1_16 = vmovl_s8(in1); float32x4_t in1_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in1_16))); float32x4_t in1_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in1_16))); int8x8_t in2 = vld1_s8(src2+x); int16x8_t in2_16 = vmovl_s8(in2); float32x4_t in2_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in2_16))); float32x4_t in2_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in2_16))); float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta)); float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta)); out_f_l = vaddq_f32(out_f_l, g); out_f_h = vaddq_f32(out_f_h, g); int16x4_t out_16_l = vqmovn_s32(cv_vrndq_s32_f32(out_f_l)); int16x4_t out_16_h = vqmovn_s32(cv_vrndq_s32_f32(out_f_h)); int16x8_t out_16 = vcombine_s16(out_16_l, out_16_h); int8x8_t out = vqmovn_s16(out_16); vst1_s8(dst + x, out); } return x; } }; template <> struct AddWeighted_SIMD
{ int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const { int x = 0; float32x4_t g = vdupq_n_f32(gamma); for( ; x <= width - 8; x += 8 ) { uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), alpha); float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))), beta); uint16x4_t v_dst1 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), alpha); v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))), beta); uint16x4_t v_dst2 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2)); } return x; } }; template <> struct AddWeighted_SIMD
{ int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const { int x = 0; float32x4_t g = vdupq_n_f32(gamma); for( ; x <= width - 8; x += 8 ) { int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), alpha); float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))), beta); int16x4_t v_dst1 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), alpha); v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))), beta); int16x4_t v_dst2 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); vst1q_s16(dst + x, vcombine_s16(v_dst1, v_dst2)); } return x; } }; #endif template
<typename T, typename WT> static void
addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2,
              T* dst, size_t step, Size size, void* _scalars )
{
    const double* scalars = (const double*)_scalars;
    WT alpha = (WT)scalars[0], beta = (WT)scalars[1], gamma = (WT)scalars[2];
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    AddWeighted_SIMD<T, WT> vop;

    for( ; size.height--; src1 += step1, src2 += step2, dst += step )
    {
        int x = vop(src1, src2, dst, size.width, alpha, beta, gamma);
        #if CV_ENABLE_UNROLLED
        for( ; x <= size.width - 4; x += 4 )
        {
            T t0 = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
            T t1 = saturate_cast<T>(src1[x+1]*alpha + src2[x+1]*beta + gamma);
            dst[x] = t0; dst[x+1] = t1;

            t0 = saturate_cast<T>(src1[x+2]*alpha + src2[x+2]*beta + gamma);
            t1 = saturate_cast<T>(src1[x+3]*alpha + src2[x+3]*beta + gamma);
            dst[x+2] = t0; dst[x+3] = t1;
        }
        #endif
        for( ; x < size.width; x++ )
            dst[x] = saturate_cast<T>
(src1[x]*alpha + src2[x]*beta + gamma); } } static void addWeighted8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, Size size, void* _scalars ) { const double* scalars = (const double*)_scalars; float alpha = (float)scalars[0], beta = (float)scalars[1], gamma = (float)scalars[2]; for( ; size.height--; src1 += step1, src2 += step2, dst += step ) { int x = 0; #if CV_SSE2 if( USE_SSE2 ) { __m128 a4 = _mm_set1_ps(alpha), b4 = _mm_set1_ps(beta), g4 = _mm_set1_ps(gamma); __m128i z = _mm_setzero_si128(); for( ; x <= size.width - 8; x += 8 ) { __m128i u = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src1 + x)), z); __m128i v = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src2 + x)), z); __m128 u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(u, z)); __m128 u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(u, z)); __m128 v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v, z)); __m128 v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, z)); u0 = _mm_add_ps(_mm_mul_ps(u0, a4), _mm_mul_ps(v0, b4)); u1 = _mm_add_ps(_mm_mul_ps(u1, a4), _mm_mul_ps(v1, b4)); u0 = _mm_add_ps(u0, g4); u1 = _mm_add_ps(u1, g4); u = _mm_packs_epi32(_mm_cvtps_epi32(u0), _mm_cvtps_epi32(u1)); u = _mm_packus_epi16(u, u); _mm_storel_epi64((__m128i*)(dst + x), u); } } #elif CV_NEON float32x4_t g = vdupq_n_f32 (gamma); for( ; x <= size.width - 8; x += 8 ) { uint8x8_t in1 = vld1_u8(src1+x); uint16x8_t in1_16 = vmovl_u8(in1); float32x4_t in1_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in1_16))); float32x4_t in1_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in1_16))); uint8x8_t in2 = vld1_u8(src2+x); uint16x8_t in2_16 = vmovl_u8(in2); float32x4_t in2_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in2_16))); float32x4_t in2_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in2_16))); float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta)); float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta)); out_f_l = vaddq_f32(out_f_l, g); out_f_h = vaddq_f32(out_f_h, g); uint16x4_t out_16_l = vqmovun_s32(cv_vrndq_s32_f32(out_f_l)); uint16x4_t out_16_h = vqmovun_s32(cv_vrndq_s32_f32(out_f_h)); uint16x8_t out_16 = vcombine_u16(out_16_l, out_16_h); uint8x8_t out = vqmovn_u16(out_16); vst1_u8(dst+x, out); } #endif #if CV_ENABLE_UNROLLED for( ; x <= size.width - 4; x += 4 ) { float t0, t1; t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma; t1 = CV_8TO32F(src1[x+1])*alpha + CV_8TO32F(src2[x+1])*beta + gamma; dst[x] = saturate_cast
<uchar>(t0); dst[x+1] = saturate_cast<uchar>(t1);

            t0 = CV_8TO32F(src1[x+2])*alpha + CV_8TO32F(src2[x+2])*beta + gamma;
            t1 = CV_8TO32F(src1[x+3])*alpha + CV_8TO32F(src2[x+3])*beta + gamma;
            dst[x+2] = saturate_cast<uchar>(t0); dst[x+3] = saturate_cast<uchar>(t1);
        }
        #endif

        for( ; x < size.width; x++ )
        {
            float t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma;
            dst[x] = saturate_cast<uchar>(t0);
        }
    }
}
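// addWeighted8u above is the one depth with a hand-written kernel (with its own
// SSE2 and NEON paths), since 8-bit blending is by far the most common case.
// The remaining depths are thin wrappers that pick the work type for
// addWeighted_<T, WT>: float for the 8- and 16-bit types, double for int and
// float inputs.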
static void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
                           schar* dst, size_t step, Size sz, void* scalars )
{
    addWeighted_<schar, float>(src1, step1, src2, step2, dst, step, sz, scalars);
}

static void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
                            ushort* dst, size_t step, Size sz, void* scalars )
{
    addWeighted_<ushort, float>(src1, step1, src2, step2, dst, step, sz, scalars);
}

static void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2,
                            short* dst, size_t step, Size sz, void* scalars )
{
    addWeighted_<short, float>(src1, step1, src2, step2, dst, step, sz, scalars);
}

static void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2,
                            int* dst, size_t step, Size sz, void* scalars )
{
    addWeighted_<int, double>(src1, step1, src2, step2, dst, step, sz, scalars);
}

static void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2,
                            float* dst, size_t step, Size sz, void* scalars )
{
    addWeighted_<float, double>