Commit b4bcdd10 authored by Maksim Shabunin

HAL: improvements

- added new functions from the core module: split, merge, add, sub, mul, div, ...
- added a function replacement mechanism
- added an example HAL replacement library
parent e8742be3
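How the pieces below fit together: the build accepts two cache variables, OPENCV_HAL_HEADERS and OPENCV_HAL_LIBS, generates custom_hal.hpp from a template so the user header gets included into the hal module, and the arithmetic primitives (add8u, and8u, ...) first try the hal_<op> hook via CALL_HAL before falling back to IPP or the SIMD/scalar code. A minimal sketch of such a replacement header, following the convention of the simple_hal example further down (my_add8u and its body are illustrative, not part of the commit):

#include <algorithm>
#include "opencv2/hal/interface.hpp"

// Hypothetical replacement for the 8-bit addition primitive; steps are byte strides,
// and the return code tells CALL_HAL whether to accept this result or fall through.
inline int my_add8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                    uchar* dst, size_t step, int width, int height)
{
    for (; height--; src1 += step1, src2 += step2, dst += step)
        for (int x = 0; x < width; x++)
            dst[x] = (uchar)std::min(src1[x] + src2[x], 255); // saturating add
    return cv::hal::Error::Ok;
}

#undef hal_add8u
#define hal_add8u my_add8u

The header path goes into OPENCV_HAL_HEADERS and the compiled library into OPENCV_HAL_LIBS; the root CMakeLists (first hunk below) resolves both to absolute paths, and the hal module includes the header through the generated custom_hal.hpp and links the library.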
......@@ -587,6 +587,11 @@ include(cmake/OpenCVFindMatlab.cmake)
include(cmake/OpenCVDetectVTK.cmake)
if (OPENCV_HAL_HEADERS AND OPENCV_HAL_LIBS)
get_filename_component(OPENCV_HAL_HEADERS "${OPENCV_HAL_HEADERS}" ABSOLUTE)
get_filename_component(OPENCV_HAL_LIBS "${OPENCV_HAL_LIBS}" ABSOLUTE)
endif()
# ----------------------------------------------------------------------------
# Add CUDA libraries (needed for apps/tools, samples)
# ----------------------------------------------------------------------------
......
#ifndef _CUSTOM_HAL_INCLUDED_
#define _CUSTOM_HAL_INCLUDED_
@OPENCV_HAL_HEADERS_INCLUDES@
#endif
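When OPENCV_HAL_HEADERS is set, the hal module's CMakeLists (below) substitutes a real #include for the placeholder, so the configured file would presumably look like this (path illustrative):

#ifndef _CUSTOM_HAL_INCLUDED_
#define _CUSTOM_HAL_INCLUDED_
#include "/path/to/simple.hpp"
#endif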
......@@ -762,6 +762,4 @@ inline float32x2_t cv_vsqrt_f32(float32x2_t val)
} // cv
#include "sse_utils.hpp"
#endif //__OPENCV_CORE_BASE_HPP__
......@@ -277,37 +277,6 @@ execution time.
*/
CV_EXPORTS_W int64 getCPUTickCount();
/** @brief Available CPU features.
remember to keep this list identical to the one in cvdef.h
*/
enum CpuFeatures {
CPU_MMX = 1,
CPU_SSE = 2,
CPU_SSE2 = 3,
CPU_SSE3 = 4,
CPU_SSSE3 = 5,
CPU_SSE4_1 = 6,
CPU_SSE4_2 = 7,
CPU_POPCNT = 8,
CPU_AVX = 10,
CPU_AVX2 = 11,
CPU_FMA3 = 12,
CPU_AVX_512F = 13,
CPU_AVX_512BW = 14,
CPU_AVX_512CD = 15,
CPU_AVX_512DQ = 16,
CPU_AVX_512ER = 17,
CPU_AVX_512IFMA512 = 18,
CPU_AVX_512PF = 19,
CPU_AVX_512VBMI = 20,
CPU_AVX_512VL = 21,
CPU_NEON = 100
};
/** @brief Returns true if the specified feature is supported by the host hardware.
The function returns true if the host hardware supports the specified feature. When user calls
......
(source diff for this file not shown: too large to display)
......@@ -42,6 +42,7 @@
//M*/
#include "precomp.hpp"
#include "opencl_kernels_core.hpp"
#ifdef __APPLE__
......@@ -49,776 +50,37 @@
#define CV_NEON 0
#endif
namespace cv
{
/****************************************************************************************\
* split & merge *
\****************************************************************************************/
#if CV_NEON
template<typename T> struct VSplit2;
template<typename T> struct VSplit3;
template<typename T> struct VSplit4;
#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type> \
{ \
void operator()(const data_type* src, data_type* dst0, \
data_type* dst1) const \
{ \
reg_type r = load_func(src); \
store_func(dst0, r.val[0]); \
store_func(dst1, r.val[1]); \
} \
}
#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type> \
{ \
void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
data_type* dst2) const \
{ \
reg_type r = load_func(src); \
store_func(dst0, r.val[0]); \
store_func(dst1, r.val[1]); \
store_func(dst2, r.val[2]); \
} \
}
#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type> \
{ \
void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
data_type* dst2, data_type* dst3) const \
{ \
reg_type r = load_func(src); \
store_func(dst0, r.val[0]); \
store_func(dst1, r.val[1]); \
store_func(dst2, r.val[2]); \
store_func(dst3, r.val[3]); \
} \
}
SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar , uint8x16x2_t, vld2q_u8 , vst1q_u8 );
SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort, uint16x8x2_t, vld2q_u16, vst1q_u16);
SPLIT2_KERNEL_TEMPLATE(VSplit2, int , int32x4x2_t, vld2q_s32, vst1q_s32);
SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 , int64x1x2_t, vld2_s64 , vst1_s64 );
SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar , uint8x16x3_t, vld3q_u8 , vst1q_u8 );
SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort, uint16x8x3_t, vld3q_u16, vst1q_u16);
SPLIT3_KERNEL_TEMPLATE(VSplit3, int , int32x4x3_t, vld3q_s32, vst1q_s32);
SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 , int64x1x3_t, vld3_s64 , vst1_s64 );
SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar , uint8x16x4_t, vld4q_u8 , vst1q_u8 );
SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort, uint16x8x4_t, vld4q_u16, vst1q_u16);
SPLIT4_KERNEL_TEMPLATE(VSplit4, int , int32x4x4_t, vld4q_s32, vst1q_s32);
SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 , int64x1x4_t, vld4_s64 , vst1_s64 );
#elif CV_SSE2
template <typename T>
struct VSplit2
{
VSplit2() : support(false) { }
void operator()(const T *, T *, T *) const { }
bool support;
};
template <typename T>
struct VSplit3
{
VSplit3() : support(false) { }
void operator()(const T *, T *, T *, T *) const { }
bool support;
};
template <typename T>
struct VSplit4
{
VSplit4() : support(false) { }
void operator()(const T *, T *, T *, T *, T *) const { }
bool support;
};
#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
template <> \
struct VSplit2<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VSplit2() \
{ \
support = checkHardwareSupport(CV_CPU_SSE2); \
} \
\
void operator()(const data_type * src, \
data_type * dst0, data_type * dst1) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
\
_mm_deinterleave(v_src0, v_src1, v_src2, v_src3); \
\
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
} \
\
bool support; \
}
#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
template <> \
struct VSplit3<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VSplit3() \
{ \
support = checkHardwareSupport(CV_CPU_SSE2); \
} \
\
void operator()(const data_type * src, \
data_type * dst0, data_type * dst1, data_type * dst2) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
\
_mm_deinterleave(v_src0, v_src1, v_src2, \
v_src3, v_src4, v_src5); \
\
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
} \
\
bool support; \
}
#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
template <> \
struct VSplit4<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VSplit4() \
{ \
support = checkHardwareSupport(CV_CPU_SSE2); \
} \
\
void operator()(const data_type * src, data_type * dst0, data_type * dst1, \
data_type * dst2, data_type * dst3) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \
reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \
\
_mm_deinterleave(v_src0, v_src1, v_src2, v_src3, \
v_src4, v_src5, v_src6, v_src7); \
\
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
_mm_storeu_##flavor((cast_type *)(dst3), v_src6); \
_mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7); \
} \
\
bool support; \
}
SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT3_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT4_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
#endif
template<typename T> static void
split_( const T* src, T** dst, int len, int cn )
{
int k = cn % 4 ? cn % 4 : 4;
int i, j;
if( k == 1 )
{
T* dst0 = dst[0];
if(cn == 1)
{
memcpy(dst0, src, len * sizeof(T));
}
else
{
for( i = 0, j = 0 ; i < len; i++, j += cn )
dst0[i] = src[j];
}
}
else if( k == 2 )
{
T *dst0 = dst[0], *dst1 = dst[1];
i = j = 0;
#if CV_NEON
if(cn == 2)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 2 * inc_i;
VSplit2<T> vsplit;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i);
}
#elif CV_SSE2
if (cn == 2)
{
int inc_i = 32/sizeof(T);
int inc_j = 2 * inc_i;
VSplit2<T> vsplit;
if (vsplit.support)
{
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i);
}
}
#endif
for( ; i < len; i++, j += cn )
{
dst0[i] = src[j];
dst1[i] = src[j+1];
}
}
else if( k == 3 )
{
T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2];
i = j = 0;
#if CV_NEON
if(cn == 3)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 3 * inc_i;
VSplit3<T> vsplit;
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
}
#elif CV_SSE2
if (cn == 3)
{
int inc_i = 32/sizeof(T);
int inc_j = 3 * inc_i;
VSplit3<T> vsplit;
if (vsplit.support)
{
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
}
}
#endif
for( ; i < len; i++, j += cn )
{
dst0[i] = src[j];
dst1[i] = src[j+1];
dst2[i] = src[j+2];
}
}
else
{
T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3];
i = j = 0;
#if CV_NEON
if(cn == 4)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 4 * inc_i;
VSplit4<T> vsplit;
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
}
#elif CV_SSE2
if (cn == 4)
{
int inc_i = 32/sizeof(T);
int inc_j = 4 * inc_i;
VSplit4<T> vsplit;
if (vsplit.support)
{
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
}
}
#endif
for( ; i < len; i++, j += cn )
{
dst0[i] = src[j]; dst1[i] = src[j+1];
dst2[i] = src[j+2]; dst3[i] = src[j+3];
}
}
for( ; k < cn; k += 4 )
{
T *dst0 = dst[k], *dst1 = dst[k+1], *dst2 = dst[k+2], *dst3 = dst[k+3];
for( i = 0, j = k; i < len; i++, j += cn )
{
dst0[i] = src[j]; dst1[i] = src[j+1];
dst2[i] = src[j+2]; dst3[i] = src[j+3];
}
}
}
#if CV_NEON
template<typename T> struct VMerge2;
template<typename T> struct VMerge3;
template<typename T> struct VMerge4;
#define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
void operator()(const data_type* src0, const data_type* src1, \
data_type* dst){ \
reg_type r; \
r.val[0] = load_func(src0); \
r.val[1] = load_func(src1); \
store_func(dst, r); \
} \
}
#define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
void operator()(const data_type* src0, const data_type* src1, \
const data_type* src2, data_type* dst){ \
reg_type r; \
r.val[0] = load_func(src0); \
r.val[1] = load_func(src1); \
r.val[2] = load_func(src2); \
store_func(dst, r); \
} \
}
#define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
void operator()(const data_type* src0, const data_type* src1, \
const data_type* src2, const data_type* src3, \
data_type* dst){ \
reg_type r; \
r.val[0] = load_func(src0); \
r.val[1] = load_func(src1); \
r.val[2] = load_func(src2); \
r.val[3] = load_func(src3); \
store_func(dst, r); \
} \
}
MERGE2_KERNEL_TEMPLATE(VMerge2, uchar , uint8x16x2_t, vld1q_u8 , vst2q_u8 );
MERGE2_KERNEL_TEMPLATE(VMerge2, ushort, uint16x8x2_t, vld1q_u16, vst2q_u16);
MERGE2_KERNEL_TEMPLATE(VMerge2, int , int32x4x2_t, vld1q_s32, vst2q_s32);
MERGE2_KERNEL_TEMPLATE(VMerge2, int64 , int64x1x2_t, vld1_s64 , vst2_s64 );
MERGE3_KERNEL_TEMPLATE(VMerge3, uchar , uint8x16x3_t, vld1q_u8 , vst3q_u8 );
MERGE3_KERNEL_TEMPLATE(VMerge3, ushort, uint16x8x3_t, vld1q_u16, vst3q_u16);
MERGE3_KERNEL_TEMPLATE(VMerge3, int , int32x4x3_t, vld1q_s32, vst3q_s32);
MERGE3_KERNEL_TEMPLATE(VMerge3, int64 , int64x1x3_t, vld1_s64 , vst3_s64 );
MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 );
MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16);
MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, vst4q_s32);
MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 );
#elif CV_SSE2
template <typename T>
struct VMerge2
{
VMerge2() : support(false) { }
void operator()(const T *, const T *, T *) const { }
bool support;
};
template <typename T>
struct VMerge3
{
VMerge3() : support(false) { }
void operator()(const T *, const T *, const T *, T *) const { }
bool support;
};
template <typename T>
struct VMerge4
{
VMerge4() : support(false) { }
void operator()(const T *, const T *, const T *, const T *, T *) const { }
bool support;
};
#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <> \
struct VMerge2<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VMerge2() \
{ \
support = checkHardwareSupport(se); \
} \
\
void operator()(const data_type * src0, const data_type * src1, \
data_type * dst) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
\
_mm_interleave(v_src0, v_src1, v_src2, v_src3); \
\
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
} \
\
bool support; \
}
#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <> \
struct VMerge3<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VMerge3() \
{ \
support = checkHardwareSupport(se); \
} \
\
void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\
data_type * dst) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \
reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \
\
_mm_interleave(v_src0, v_src1, v_src2, \
v_src3, v_src4, v_src5); \
\
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \
} \
\
bool support; \
}
#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <> \
struct VMerge4<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VMerge4() \
{ \
support = checkHardwareSupport(se); \
} \
\
void operator()(const data_type * src0, const data_type * src1, \
const data_type * src2, const data_type * src3, \
data_type * dst) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \
reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \
reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3)); \
reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC)); \
\
_mm_interleave(v_src0, v_src1, v_src2, v_src3, \
v_src4, v_src5, v_src6, v_src7); \
\
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7); \
} \
\
bool support; \
}
MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
#if CV_SSE4_1
MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
#endif
MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
#endif
template<typename T> static void
merge_( const T** src, T* dst, int len, int cn )
{
int k = cn % 4 ? cn % 4 : 4;
int i, j;
if( k == 1 )
{
const T* src0 = src[0];
for( i = j = 0; i < len; i++, j += cn )
dst[j] = src0[i];
}
else if( k == 2 )
{
const T *src0 = src[0], *src1 = src[1];
i = j = 0;
#if CV_NEON
if(cn == 2)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 2 * inc_i;
VMerge2<T> vmerge;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, dst + j);
}
#elif CV_SSE2
if(cn == 2)
{
int inc_i = 32/sizeof(T);
int inc_j = 2 * inc_i;
VMerge2<T> vmerge;
if (vmerge.support)
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, dst + j);
}
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i];
dst[j+1] = src1[i];
}
}
else if( k == 3 )
{
const T *src0 = src[0], *src1 = src[1], *src2 = src[2];
i = j = 0;
#if CV_NEON
if(cn == 3)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 3 * inc_i;
VMerge3<T> vmerge;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, dst + j);
}
#elif CV_SSE2
if(cn == 3)
{
int inc_i = 32/sizeof(T);
int inc_j = 3 * inc_i;
VMerge3<T> vmerge;
if (vmerge.support)
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, dst + j);
}
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i];
dst[j+1] = src1[i];
dst[j+2] = src2[i];
}
}
else
{
const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
i = j = 0;
#if CV_NEON
if(cn == 4)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 4 * inc_i;
VMerge4<T> vmerge;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
}
#elif CV_SSE2
if(cn == 4)
{
int inc_i = 32/sizeof(T);
int inc_j = 4 * inc_i;
VMerge4<T> vmerge;
if (vmerge.support)
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
}
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i]; dst[j+1] = src1[i];
dst[j+2] = src2[i]; dst[j+3] = src3[i];
}
}
for( ; k < cn; k += 4 )
{
const T *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
for( i = 0, j = k; i < len; i++, j += cn )
{
dst[j] = src0[i]; dst[j+1] = src1[i];
dst[j+2] = src2[i]; dst[j+3] = src3[i];
}
}
}
static void split8u(const uchar* src, uchar** dst, int len, int cn )
{
split_(src, dst, len, cn);
}
static void split16u(const ushort* src, ushort** dst, int len, int cn )
{
split_(src, dst, len, cn);
}
static void split32s(const int* src, int** dst, int len, int cn )
{
split_(src, dst, len, cn);
}
static void split64s(const int64* src, int64** dst, int len, int cn )
{
split_(src, dst, len, cn);
}
static void merge8u(const uchar** src, uchar* dst, int len, int cn )
{
merge_(src, dst, len, cn);
}
static void merge16u(const ushort** src, ushort* dst, int len, int cn )
{
merge_(src, dst, len, cn);
}
static void merge32s(const int** src, int* dst, int len, int cn )
{
merge_(src, dst, len, cn);
}
static void merge64s(const int64** src, int64* dst, int len, int cn )
{
merge_(src, dst, len, cn);
}
typedef void (*SplitFunc)(const uchar* src, uchar** dst, int len, int cn);
typedef void (*MergeFunc)(const uchar** src, uchar* dst, int len, int cn);
static SplitFunc getSplitFunc(int depth)
{
static SplitFunc splitTab[] =
{
(SplitFunc)GET_OPTIMIZED(split8u), (SplitFunc)GET_OPTIMIZED(split8u), (SplitFunc)GET_OPTIMIZED(split16u), (SplitFunc)GET_OPTIMIZED(split16u),
(SplitFunc)GET_OPTIMIZED(split32s), (SplitFunc)GET_OPTIMIZED(split32s), (SplitFunc)GET_OPTIMIZED(split64s), 0
(SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u),
(SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split64s), 0
};
return splitTab[depth];
}
typedef void (*MergeFunc)(const uchar** src, uchar* dst, int len, int cn);
static MergeFunc getMergeFunc(int depth)
{
static MergeFunc mergeTab[] =
{
(MergeFunc)GET_OPTIMIZED(merge8u), (MergeFunc)GET_OPTIMIZED(merge8u), (MergeFunc)GET_OPTIMIZED(merge16u), (MergeFunc)GET_OPTIMIZED(merge16u),
(MergeFunc)GET_OPTIMIZED(merge32s), (MergeFunc)GET_OPTIMIZED(merge32s), (MergeFunc)GET_OPTIMIZED(merge64s), 0
(MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u),
(MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge64s), 0
};
return mergeTab[depth];
}
}
void cv::split(const Mat& src, Mat* mv)
{
int k, depth = src.depth(), cn = src.channels();
......
......@@ -83,6 +83,11 @@ typedef void (*BinaryFunc)(const uchar* src1, size_t step1,
uchar* dst, size_t step, Size sz,
void*);
typedef void (*BinaryFuncC)(const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height,
void*);
BinaryFunc getConvertFunc(int sdepth, int ddepth);
BinaryFunc getCopyMaskFunc(size_t esz);
......@@ -114,46 +119,6 @@ extern const uchar g_Saturate8u[];
void deleteThreadAllocData();
#endif
template<typename T1, typename T2=T1, typename T3=T1> struct OpAdd
{
typedef T1 type1;
typedef T2 type2;
typedef T3 rtype;
T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a + b); }
};
template<typename T1, typename T2=T1, typename T3=T1> struct OpSub
{
typedef T1 type1;
typedef T2 type2;
typedef T3 rtype;
T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a - b); }
};
template<typename T1, typename T2=T1, typename T3=T1> struct OpRSub
{
typedef T1 type1;
typedef T2 type2;
typedef T3 rtype;
T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(b - a); }
};
template<typename T> struct OpMin
{
typedef T type1;
typedef T type2;
typedef T rtype;
T operator ()(const T a, const T b) const { return std::min(a, b); }
};
template<typename T> struct OpMax
{
typedef T type1;
typedef T type2;
typedef T rtype;
T operator ()(const T a, const T b) const { return std::max(a, b); }
};
inline Size getContinuousSize_( int flags, int cols, int rows, int widthScale )
{
int64 sz = (int64)cols * rows * widthScale;
......@@ -201,11 +166,6 @@ struct NoVec
size_t operator()(const void*, const void*, void*, size_t) const { return 0; }
};
extern volatile bool USE_SSE2;
extern volatile bool USE_SSE4_2;
extern volatile bool USE_AVX;
extern volatile bool USE_AVX2;
enum { BLOCK_SIZE = 1024 };
#if defined HAVE_IPP && (IPP_VERSION_X100 >= 700)
......
......@@ -86,45 +86,6 @@ Mutex* __initialization_mutex_initializer = &getInitializationMutex();
#undef max
#undef abs
#include <tchar.h>
#if defined _MSC_VER
#if _MSC_VER >= 1400
#include <intrin.h>
#elif defined _M_IX86
static void __cpuid(int* cpuid_data, int)
{
__asm
{
push ebx
push edi
mov edi, cpuid_data
mov eax, 1
cpuid
mov [edi], eax
mov [edi + 4], ebx
mov [edi + 8], ecx
mov [edi + 12], edx
pop edi
pop ebx
}
}
static void __cpuidex(int* cpuid_data, int, int)
{
__asm
{
push edi
mov edi, cpuid_data
mov eax, 7
mov ecx, 0
cpuid
mov [edi], eax
mov [edi + 4], ebx
mov [edi + 8], ecx
mov [edi + 12], edx
pop edi
}
}
#endif
#endif
#ifdef WINRT
#include <wrl/client.h>
......@@ -237,160 +198,15 @@ void Exception::formatMessage()
msg = format("%s:%d: error: (%d) %s\n", file.c_str(), line, code, err.c_str());
}
struct HWFeatures
{
enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE };
HWFeatures(void)
{
memset( have, 0, sizeof(have) );
x86_family = 0;
}
static HWFeatures initialize(void)
{
HWFeatures f;
int cpuid_data[4] = { 0, 0, 0, 0 };
#if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
__cpuid(cpuid_data, 1);
#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
#ifdef __x86_64__
asm __volatile__
(
"movl $1, %%eax\n\t"
"cpuid\n\t"
:[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
:
: "cc"
);
#else
asm volatile
(
"pushl %%ebx\n\t"
"movl $1,%%eax\n\t"
"cpuid\n\t"
"popl %%ebx\n\t"
: "=a"(cpuid_data[0]), "=c"(cpuid_data[2]), "=d"(cpuid_data[3])
:
: "cc"
);
#endif
#endif
f.x86_family = (cpuid_data[0] >> 8) & 15;
if( f.x86_family >= 6 )
{
f.have[CV_CPU_MMX] = (cpuid_data[3] & (1 << 23)) != 0;
f.have[CV_CPU_SSE] = (cpuid_data[3] & (1<<25)) != 0;
f.have[CV_CPU_SSE2] = (cpuid_data[3] & (1<<26)) != 0;
f.have[CV_CPU_SSE3] = (cpuid_data[2] & (1<<0)) != 0;
f.have[CV_CPU_SSSE3] = (cpuid_data[2] & (1<<9)) != 0;
f.have[CV_CPU_FMA3] = (cpuid_data[2] & (1<<12)) != 0;
f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0;
f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0;
f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0)); // OS uses XSAVE/XRSTOR and the CPU supports AVX
// make the second call to the cpuid command in order to get
// information about extended features like AVX2
#if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
__cpuidex(cpuid_data, 7, 0);
#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
#ifdef __x86_64__
asm __volatile__
(
"movl $7, %%eax\n\t"
"movl $0, %%ecx\n\t"
"cpuid\n\t"
:[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
:
: "cc"
);
#else
asm volatile
(
"pushl %%ebx\n\t"
"movl $7,%%eax\n\t"
"movl $0,%%ecx\n\t"
"cpuid\n\t"
"movl %%ebx, %0\n\t"
"popl %%ebx\n\t"
: "=r"(cpuid_data[1]), "=c"(cpuid_data[2])
:
: "cc"
);
#endif
#endif
f.have[CV_CPU_AVX2] = (cpuid_data[1] & (1<<5)) != 0;
f.have[CV_CPU_AVX_512F] = (cpuid_data[1] & (1<<16)) != 0;
f.have[CV_CPU_AVX_512DQ] = (cpuid_data[1] & (1<<17)) != 0;
f.have[CV_CPU_AVX_512IFMA512] = (cpuid_data[1] & (1<<21)) != 0;
f.have[CV_CPU_AVX_512PF] = (cpuid_data[1] & (1<<26)) != 0;
f.have[CV_CPU_AVX_512ER] = (cpuid_data[1] & (1<<27)) != 0;
f.have[CV_CPU_AVX_512CD] = (cpuid_data[1] & (1<<28)) != 0;
f.have[CV_CPU_AVX_512BW] = (cpuid_data[1] & (1<<30)) != 0;
f.have[CV_CPU_AVX_512VL] = (cpuid_data[1] & (1<<31)) != 0;
f.have[CV_CPU_AVX_512VBMI] = (cpuid_data[2] & (1<<1)) != 0;
}
#if defined ANDROID || defined __linux__
#ifdef __aarch64__
f.have[CV_CPU_NEON] = true;
#else
int cpufile = open("/proc/self/auxv", O_RDONLY);
if (cpufile >= 0)
{
Elf32_auxv_t auxv;
const size_t size_auxv_t = sizeof(auxv);
while ((size_t)read(cpufile, &auxv, size_auxv_t) == size_auxv_t)
{
if (auxv.a_type == AT_HWCAP)
{
f.have[CV_CPU_NEON] = (auxv.a_un.a_val & 4096) != 0;
break;
}
}
close(cpufile);
}
#endif
#elif (defined __clang__ || defined __APPLE__) && (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__))
f.have[CV_CPU_NEON] = true;
#endif
return f;
}
int x86_family;
bool have[MAX_FEATURE+1];
};
static HWFeatures featuresEnabled = HWFeatures::initialize(), featuresDisabled = HWFeatures();
static HWFeatures* currentFeatures = &featuresEnabled;
bool checkHardwareSupport(int feature)
{
CV_DbgAssert( 0 <= feature && feature <= CV_HARDWARE_MAX_FEATURE );
return currentFeatures->have[feature];
return cv::hal::checkHardwareSupport(feature);
}
volatile bool useOptimizedFlag = true;
volatile bool USE_SSE2 = featuresEnabled.have[CV_CPU_SSE2];
volatile bool USE_SSE4_2 = featuresEnabled.have[CV_CPU_SSE4_2];
volatile bool USE_AVX = featuresEnabled.have[CV_CPU_AVX];
volatile bool USE_AVX2 = featuresEnabled.have[CV_CPU_AVX2];
void setUseOptimized( bool flag )
{
useOptimizedFlag = flag;
currentFeatures = flag ? &featuresEnabled : &featuresDisabled;
USE_SSE2 = currentFeatures->have[CV_CPU_SSE2];
cv::hal::setUseOptimized(flag);
ipp::setUseIPP(flag);
#ifdef HAVE_OPENCL
......@@ -403,7 +219,7 @@ void setUseOptimized( bool flag )
bool useOptimized(void)
{
return useOptimizedFlag;
return cv::hal::useOptimized();
}
int64 getTickCount(void)
......@@ -683,12 +499,12 @@ redirectError( CvErrorCallback errCallback, void* userdata, void** prevUserdata)
CV_IMPL int cvCheckHardwareSupport(int feature)
{
CV_DbgAssert( 0 <= feature && feature <= CV_HARDWARE_MAX_FEATURE );
return cv::currentFeatures->have[feature];
return cv::hal::checkHardwareSupport(feature);
}
CV_IMPL int cvUseOptimized( int flag )
{
int prevMode = cv::useOptimizedFlag;
int prevMode = cv::useOptimized();
cv::setUseOptimized( flag != 0 );
return prevMode;
}
......
......@@ -2,10 +2,20 @@ set(the_description "The Hardware Acceleration Layer (HAL) module")
set(OPENCV_MODULE_TYPE STATIC)
if(OPENCV_HAL_HEADERS AND OPENCV_HAL_LIBS)
set(OPENCV_HAL_HEADERS_INCLUDES "#include \"${OPENCV_HAL_HEADERS}\"")
set(DEPS "${OPENCV_HAL_LIBS}")
else()
set(OPENCV_HAL_HEADERS_INCLUDES "// using default HAL")
set(DEPS "")
endif()
configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/custom_hal.hpp.in" "${CMAKE_BINARY_DIR}/custom_hal.hpp" @ONLY)
if(UNIX)
if(CMAKE_COMPILER_IS_GNUCXX OR CV_ICC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
endif()
endif()
ocv_define_module(hal)
ocv_define_module(hal ${DEPS})
......@@ -46,6 +46,7 @@
#define __OPENCV_HAL_HPP__
#include "opencv2/hal/defs.h"
#include "opencv2/hal/interface.hpp"
/**
@defgroup hal Hardware Acceleration Layer
......@@ -58,22 +59,19 @@
@}
*/
namespace cv { namespace hal {
//! @addtogroup hal
//! @{
namespace Error {
enum
class Failure
{
Ok = 0,
Unknown = -1
public:
Failure(int code_ = Error::Unknown) : code(code_) {}
public:
int code;
};
}
int normHamming(const uchar* a, int n);
int normHamming(const uchar* a, const uchar* b, int n);
......@@ -104,8 +102,194 @@ void sqrt(const double* src, double* dst, int len);
void invSqrt(const float* src, float* dst, int len);
void invSqrt(const double* src, double* dst, int len);
void split8u(const uchar* src, uchar** dst, int len, int cn );
void split16u(const ushort* src, ushort** dst, int len, int cn );
void split32s(const int* src, int** dst, int len, int cn );
void split64s(const int64* src, int64** dst, int len, int cn );
void merge8u(const uchar** src, uchar* dst, int len, int cn );
void merge16u(const ushort** src, ushort* dst, int len, int cn );
void merge32s(const int** src, int* dst, int len, int cn );
void merge64s(const int64** src, int64* dst, int len, int cn );
void add8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
void add8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
void add16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
void add16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
void add32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
void add32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
void add64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
void sub8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
void sub8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
void sub16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
void sub16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
void sub32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
void sub32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
void sub64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
void max8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
void max8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
void max16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
void max16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
void max32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
void max32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
void max64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
void min8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
void min8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
void min16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
void min16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
void min32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
void min32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
void min64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
void absdiff8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
void absdiff8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
void absdiff16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
void absdiff16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
void absdiff32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
void absdiff32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
void absdiff64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
void and8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
void or8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
void xor8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
void not8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
void mul16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
void mul32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
void mul32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
void mul64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
void div16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
void div32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
void div32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
void div64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
void recip16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
void recip32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
void recip32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
void recip64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
void addWeighted8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _scalars );
void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scalars );
void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scalars );
void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scalars );
void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scalars );
void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars );
void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars );
//! @}
}} //cv::hal
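A usage sketch for these exported primitives (assuming the module's umbrella header is opencv2/hal.hpp, as the include guard suggests; steps are byte strides and the trailing void* extra argument is unused here, passed as 0):

#include "opencv2/hal.hpp"

void add_one_row_example()
{
    uchar a[4] = {1, 2, 3, 250}, b[4] = {1, 1, 1, 10}, sum[4];
    cv::hal::add8u(a, 4, b, 4, sum, 4, 4, 1, 0); // addition saturates: sum[3] == 255
}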
namespace cv {
template<typename T1, typename T2=T1, typename T3=T1> struct OpAdd
{
typedef T1 type1;
typedef T2 type2;
typedef T3 rtype;
T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a + b); }
};
template<typename T1, typename T2=T1, typename T3=T1> struct OpSub
{
typedef T1 type1;
typedef T2 type2;
typedef T3 rtype;
T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a - b); }
};
template<typename T1, typename T2=T1, typename T3=T1> struct OpRSub
{
typedef T1 type1;
typedef T2 type2;
typedef T3 rtype;
T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(b - a); }
};
template<typename T> struct OpMin
{
typedef T type1;
typedef T type2;
typedef T rtype;
T operator ()(const T a, const T b) const { return std::min(a, b); }
};
template<typename T> struct OpMax
{
typedef T type1;
typedef T type2;
typedef T rtype;
T operator ()(const T a, const T b) const { return std::max(a, b); }
};
template<typename T> struct OpAbsDiff
{
typedef T type1;
typedef T type2;
typedef T rtype;
T operator()(T a, T b) const { return (T)std::abs(a - b); }
};
template<typename T, typename WT=T> struct OpAbsDiffS
{
typedef T type1;
typedef WT type2;
typedef T rtype;
T operator()(T a, WT b) const { return saturate_cast<T>(std::abs(a - b)); }
};
template<typename T> struct OpAnd
{
typedef T type1;
typedef T type2;
typedef T rtype;
T operator()( T a, T b ) const { return a & b; }
};
template<typename T> struct OpOr
{
typedef T type1;
typedef T type2;
typedef T rtype;
T operator()( T a, T b ) const { return a | b; }
};
template<typename T> struct OpXor
{
typedef T type1;
typedef T type2;
typedef T rtype;
T operator()( T a, T b ) const { return a ^ b; }
};
template<typename T> struct OpNot
{
typedef T type1;
typedef T type2;
typedef T rtype;
T operator()( T a, T ) const { return ~a; }
};
}
#endif //__OPENCV_HAL_HPP__
......@@ -53,6 +53,7 @@
#endif
#include <limits.h>
#include "opencv2/hal/interface.hpp"
#if defined __ICL
# define CV_ICC __ICL
......@@ -117,9 +118,38 @@
#define CV_CPU_NEON 100
// when adding to this list remember to update the enum in core/utility.cpp
// when adding to this list remember to update the following enum
#define CV_HARDWARE_MAX_FEATURE 255
/** @brief Available CPU features.
*/
enum CpuFeatures {
CPU_MMX = 1,
CPU_SSE = 2,
CPU_SSE2 = 3,
CPU_SSE3 = 4,
CPU_SSSE3 = 5,
CPU_SSE4_1 = 6,
CPU_SSE4_2 = 7,
CPU_POPCNT = 8,
CPU_AVX = 10,
CPU_AVX2 = 11,
CPU_FMA3 = 12,
CPU_AVX_512F = 13,
CPU_AVX_512BW = 14,
CPU_AVX_512CD = 15,
CPU_AVX_512DQ = 16,
CPU_AVX_512ER = 17,
CPU_AVX_512IFMA512 = 18,
CPU_AVX_512PF = 19,
CPU_AVX_512VBMI = 20,
CPU_AVX_512VL = 21,
CPU_NEON = 100
};
// do not include SSE/AVX/NEON headers for NVCC compiler
#ifndef __CUDACC__
......@@ -257,49 +287,6 @@
# define CV_VFP 0
#endif
/* primitive types */
/*
schar - signed 1 byte integer
uchar - unsigned 1 byte integer
short - signed 2 byte integer
ushort - unsigned 2 byte integer
int - signed 4 byte integer
uint - unsigned 4 byte integer
int64 - signed 8 byte integer
uint64 - unsigned 8 byte integer
*/
#if !defined _MSC_VER && !defined __BORLANDC__
# if defined __cplusplus && __cplusplus >= 201103L && !defined __APPLE__
# include <cstdint>
typedef std::uint32_t uint;
# else
# include <stdint.h>
typedef uint32_t uint;
# endif
#else
typedef unsigned uint;
#endif
typedef signed char schar;
#ifndef __IPL_H__
typedef unsigned char uchar;
typedef unsigned short ushort;
#endif
#if defined _MSC_VER || defined __BORLANDC__
typedef __int64 int64;
typedef unsigned __int64 uint64;
# define CV_BIG_INT(n) n##I64
# define CV_BIG_UINT(n) n##UI64
#else
typedef int64_t int64;
typedef uint64_t uint64;
# define CV_BIG_INT(n) n##LL
# define CV_BIG_UINT(n) n##ULL
#endif
/* fundamental constants */
#define CV_PI 3.1415926535897932384626433832795
#define CV_2PI 6.283185307179586476925286766559
......@@ -321,6 +308,19 @@ typedef union Cv64suf
}
Cv64suf;
namespace cv { namespace hal {
bool checkHardwareSupport(int feature);
void setUseOptimized(bool onoff);
bool useOptimized();
}}
#define USE_SSE2 (cv::hal::checkHardwareSupport(CV_CPU_SSE))
#define USE_SSE4_2 (cv::hal::checkHardwareSupport(CV_CPU_SSE4_2))
#define USE_AVX (cv::hal::checkHardwareSupport(CV_CPU_AVX))
#define USE_AVX2 (cv::hal::checkHardwareSupport(CV_CPU_AVX2))
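The volatile USE_* globals removed from core's precomp.hpp become the macros above; user code can query the same switches directly, for example:

#include "opencv2/hal/defs.h"

// Sketch: combine the global optimization switch with a per-feature check.
inline bool canUseSse2()
{
    return cv::hal::useOptimized() && cv::hal::checkHardwareSupport(CV_CPU_SSE2);
}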
/****************************************************************************************\
* fast math *
......
#ifndef _HAL_INTERFACE_HPP_INCLUDED_
#define _HAL_INTERFACE_HPP_INCLUDED_
#define CV_HAL_ERROR_OK 0
#define CV_HAL_ERROR_NI 1
#define CV_HAL_ERROR_UNKNOWN -1
#define CV_HAL_CMP_EQ 0
#define CV_HAL_CMP_GT 1
#define CV_HAL_CMP_GE 2
#define CV_HAL_CMP_LT 3
#define CV_HAL_CMP_LE 4
#define CV_HAL_CMP_NE 5
#ifdef __cplusplus
namespace cv { namespace hal {
namespace Error {
enum
{
Ok = 0,
NotImplemented = 1,
Unknown = -1
};
}
enum
{
CMP_EQ = 0,
CMP_GT = 1,
CMP_GE = 2,
CMP_LT = 3,
CMP_LE = 4,
CMP_NE = 5
};
}}
#endif
#ifdef __cplusplus
#include <cstddef>
#else
#include <stddef.h>
#endif
/* primitive types */
/*
schar - signed 1 byte integer
uchar - unsigned 1 byte integer
short - signed 2 byte integer
ushort - unsigned 2 byte integer
int - signed 4 byte integer
uint - unsigned 4 byte integer
int64 - signed 8 byte integer
uint64 - unsigned 8 byte integer
*/
#if !defined _MSC_VER && !defined __BORLANDC__
# if defined __cplusplus && __cplusplus >= 201103L && !defined __APPLE__
# include <cstdint>
typedef std::uint32_t uint;
# else
# include <stdint.h>
typedef uint32_t uint;
# endif
#else
typedef unsigned uint;
#endif
typedef signed char schar;
#ifndef __IPL_H__
typedef unsigned char uchar;
typedef unsigned short ushort;
#endif
#if defined _MSC_VER || defined __BORLANDC__
typedef __int64 int64;
typedef unsigned __int64 uint64;
# define CV_BIG_INT(n) n##I64
# define CV_BIG_UINT(n) n##UI64
#else
typedef int64_t int64;
typedef uint64_t uint64;
# define CV_BIG_INT(n) n##LL
# define CV_BIG_UINT(n) n##ULL
#endif
#endif
......@@ -46,6 +46,8 @@
# error sse_utils.hpp header must be compiled as C++
#endif
#include "opencv2/hal/defs.h"
#if CV_SSE2
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
......
cmake_minimum_required(VERSION 2.8.8 FATAL_ERROR)
if(UNIX)
if(CMAKE_COMPILER_IS_GNUCXX OR CV_ICC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
endif()
endif()
add_library(simple_hal simple.cpp)
set(OPENCV_HAL_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../..")
target_include_directories(simple_hal PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${OPENCV_HAL_DIR}/include)
#include "simple.hpp"
int slow_and8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height)
{
for(; height--; src1 = src1 + step1, src2 = src2 + step2, dst = dst + step)
for(int x = 0 ; x < width; x++ )
dst[x] = src1[x] & src2[x];
return cv::hal::Error::Ok;
}
int slow_or8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height)
{
for(; height--; src1 = src1 + step1, src2 = src2 + step2, dst = dst + step)
for(int x = 0 ; x < width; x++ )
dst[x] = src1[x] | src2[x];
return cv::hal::Error::Ok;
}
int slow_xor8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height)
{
for(; height--; src1 = src1 + step1, src2 = src2 + step2, dst = dst + step)
for(int x = 0 ; x < width; x++ )
dst[x] = src1[x] ^ src2[x];
return cv::hal::Error::Ok;
}
int slow_not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height)
{
for(; height--; src1 = src1 + step1, src2 = src2 + step2, dst = dst + step)
for(int x = 0 ; x < width; x++ )
dst[x] = ~src1[x];
return cv::hal::Error::Ok;
}
#ifndef _SIMPLE_HPP_INCLUDED_
#define _SIMPLE_HPP_INCLUDED_
#include "opencv2/hal/interface.hpp"
int slow_and8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height);
int slow_or8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height);
int slow_xor8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height);
int slow_not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height);
#undef hal_and8u
#define hal_and8u slow_and8u
#undef hal_or8u
#define hal_or8u slow_or8u
#undef hal_xor8u
#define hal_xor8u slow_xor8u
#undef hal_not8u
#define hal_not8u slow_not8u
#endif
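To try the library, OpenCV would be configured with OPENCV_HAL_HEADERS pointing at this simple.hpp and OPENCV_HAL_LIBS pointing at the built simple_hal library; the root CMakeLists (first hunk) resolves both to absolute paths, and the hal module then pulls in the header via the generated custom_hal.hpp and links the library as a dependency.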
......@@ -7,11 +7,13 @@
// copy or use the software.
//
//
// License Agreement
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
......@@ -41,7 +43,1089 @@
//M*/
#include "precomp.hpp"
#include "arithm_simd.hpp"
#include "arithm_core.hpp"
#include "opencv2/hal/replacement.hpp"
namespace cv { namespace hal {
}}
//=======================================
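// CALL_HAL is the replacement hook: a custom HAL header redirects hal_<op> to its own
// function (see the simple_hal example); Error::Ok means the HAL result is final, while
// Error::NotImplemented makes the call fall through to the IPP/vectorized paths below.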
#undef CALL_HAL
#define CALL_HAL(fun) \
int res = fun(src1, step1, src2, step2, dst, step, width, height); \
if (res == Error::Ok) \
return; \
else if (res != Error::NotImplemented) \
throw Failure(res);
#if (ARITHM_USE_IPP == 1)
static inline void fixSteps(int width, int height, size_t elemSize, size_t& step1, size_t& step2, size_t& step)
{
if( height == 1 )
step1 = step2 = step = width*elemSize;
}
#define CALL_IPP_BIN_12(fun) \
CV_IPP_CHECK() \
{ \
fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \
if (0 <= fun(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0)) \
{ \
CV_IMPL_ADD(CV_IMPL_IPP); \
return; \
} \
setIppErrorStatus(); \
}
#else
#define CALL_IPP_BIN_12(fun)
#endif
//=======================================
// Add
//=======================================
void add8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_add8u)
CALL_IPP_BIN_12(ippiAdd_8u_C1RSfs)
(vBinOp<uchar, cv::OpAdd<uchar>, IF_SIMD(VAdd<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
}
void add8s( const schar* src1, size_t step1,
const schar* src2, size_t step2,
schar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_add8s)
vBinOp<schar, cv::OpAdd<schar>, IF_SIMD(VAdd<schar>)>(src1, step1, src2, step2, dst, step, width, height);
}
void add16u( const ushort* src1, size_t step1,
const ushort* src2, size_t step2,
ushort* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_add16u)
CALL_IPP_BIN_12(ippiAdd_16u_C1RSfs)
(vBinOp<ushort, cv::OpAdd<ushort>, IF_SIMD(VAdd<ushort>)>(src1, step1, src2, step2, dst, step, width, height));
}
void add16s( const short* src1, size_t step1,
const short* src2, size_t step2,
short* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_add16s)
CALL_IPP_BIN_12(ippiAdd_16s_C1RSfs)
(vBinOp<short, cv::OpAdd<short>, IF_SIMD(VAdd<short>)>(src1, step1, src2, step2, dst, step, width, height));
}
void add32s( const int* src1, size_t step1,
const int* src2, size_t step2,
int* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_add32s)
vBinOp32<int, cv::OpAdd<int>, IF_SIMD(VAdd<int>)>(src1, step1, src2, step2, dst, step, width, height);
}
void add32f( const float* src1, size_t step1,
const float* src2, size_t step2,
float* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_add32f)
CALL_IPP_BIN_12(ippiAdd_32f_C1R)
(vBinOp32<float, cv::OpAdd<float>, IF_SIMD(VAdd<float>)>(src1, step1, src2, step2, dst, step, width, height));
}
void add64f( const double* src1, size_t step1,
const double* src2, size_t step2,
double* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_add64f)
vBinOp64<double, cv::OpAdd<double>, IF_SIMD(VAdd<double>)>(src1, step1, src2, step2, dst, step, width, height);
}
//=======================================
#if (ARITHM_USE_IPP == 1)
#define CALL_IPP_BIN_21(fun) \
CV_IPP_CHECK() \
{ \
fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \
if (0 <= fun(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0)) \
{ \
CV_IMPL_ADD(CV_IMPL_IPP); \
return; \
} \
setIppErrorStatus(); \
}
#else
#define CALL_IPP_BIN_21(fun)
#endif
//=======================================
// Subtract
//=======================================
void sub8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_sub8u)
CALL_IPP_BIN_21(ippiSub_8u_C1RSfs)
(vBinOp<uchar, cv::OpSub<uchar>, IF_SIMD(VSub<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
}
void sub8s( const schar* src1, size_t step1,
const schar* src2, size_t step2,
schar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_sub8s)
vBinOp<schar, cv::OpSub<schar>, IF_SIMD(VSub<schar>)>(src1, step1, src2, step2, dst, step, width, height);
}
void sub16u( const ushort* src1, size_t step1,
const ushort* src2, size_t step2,
ushort* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_sub16u)
CALL_IPP_BIN_21(ippiSub_16u_C1RSfs)
(vBinOp<ushort, cv::OpSub<ushort>, IF_SIMD(VSub<ushort>)>(src1, step1, src2, step2, dst, step, width, height));
}
void sub16s( const short* src1, size_t step1,
const short* src2, size_t step2,
short* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_sub16s)
CALL_IPP_BIN_21(ippiSub_16s_C1RSfs)
(vBinOp<short, cv::OpSub<short>, IF_SIMD(VSub<short>)>(src1, step1, src2, step2, dst, step, width, height));
}
void sub32s( const int* src1, size_t step1,
const int* src2, size_t step2,
int* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_sub32s)
vBinOp32<int, cv::OpSub<int>, IF_SIMD(VSub<int>)>(src1, step1, src2, step2, dst, step, width, height);
}
void sub32f( const float* src1, size_t step1,
const float* src2, size_t step2,
float* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_sub32f)
CALL_IPP_BIN_21(ippiSub_32f_C1R)
(vBinOp32<float, cv::OpSub<float>, IF_SIMD(VSub<float>)>(src1, step1, src2, step2, dst, step, width, height));
}
void sub64f( const double* src1, size_t step1,
const double* src2, size_t step2,
double* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_sub64f)
vBinOp64<double, cv::OpSub<double>, IF_SIMD(VSub<double>)>(src1, step1, src2, step2, dst, step, width, height);
}
//=======================================
#if (ARITHM_USE_IPP == 1)
#define CALL_IPP_MIN_MAX(fun, type) \
CV_IPP_CHECK() \
{ \
type* s1 = (type*)src1; \
type* s2 = (type*)src2; \
type* d = dst; \
fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \
int i = 0; \
for(; i < height; i++) \
{ \
if (0 > fun(s1, s2, d, width)) \
break; \
s1 = (type*)((uchar*)s1 + step1); \
s2 = (type*)((uchar*)s2 + step2); \
d = (type*)((uchar*)d + step); \
} \
if (i == height) \
{ \
CV_IMPL_ADD(CV_IMPL_IPP); \
return; \
} \
setIppErrorStatus(); \
}
#else
#define CALL_IPP_MIN_MAX(fun, type)
#endif
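// CALL_IPP_MIN_MAX uses the 1-D ippsMinEvery/ippsMaxEvery signal primitives, which
// have no row-stride parameter, so the image is processed row by row; if any row
// fails, the IPP attempt is abandoned and the generic path below runs instead.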
//=======================================
// Max
//=======================================
void max8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_max8u)
CALL_IPP_MIN_MAX(ippsMaxEvery_8u, uchar)
vBinOp<uchar, cv::OpMax<uchar>, IF_SIMD(VMax<uchar>)>(src1, step1, src2, step2, dst, step, width, height);
}
void max8s( const schar* src1, size_t step1,
const schar* src2, size_t step2,
schar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_max8s)
vBinOp<schar, cv::OpMax<schar>, IF_SIMD(VMax<schar>)>(src1, step1, src2, step2, dst, step, width, height);
}
void max16u( const ushort* src1, size_t step1,
const ushort* src2, size_t step2,
ushort* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_max16u)
CALL_IPP_MIN_MAX(ippsMaxEvery_16u, ushort)
vBinOp<ushort, cv::OpMax<ushort>, IF_SIMD(VMax<ushort>)>(src1, step1, src2, step2, dst, step, width, height);
}
void max16s( const short* src1, size_t step1,
const short* src2, size_t step2,
short* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_max16s)
vBinOp<short, cv::OpMax<short>, IF_SIMD(VMax<short>)>(src1, step1, src2, step2, dst, step, width, height);
}
void max32s( const int* src1, size_t step1,
const int* src2, size_t step2,
int* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_max32s)
vBinOp32<int, cv::OpMax<int>, IF_SIMD(VMax<int>)>(src1, step1, src2, step2, dst, step, width, height);
}
void max32f( const float* src1, size_t step1,
const float* src2, size_t step2,
float* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_max32f)
CALL_IPP_MIN_MAX(ippsMaxEvery_32f, float)
vBinOp32<float, cv::OpMax<float>, IF_SIMD(VMax<float>)>(src1, step1, src2, step2, dst, step, width, height);
}
void max64f( const double* src1, size_t step1,
const double* src2, size_t step2,
double* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_max64f)
CALL_IPP_MIN_MAX(ippsMaxEvery_64f, double)
vBinOp64<double, cv::OpMax<double>, IF_SIMD(VMax<double>)>(src1, step1, src2, step2, dst, step, width, height);
}
//=======================================
// Min
//=======================================
void min8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_min8u)
CALL_IPP_MIN_MAX(ippsMinEvery_8u, uchar)
vBinOp<uchar, cv::OpMin<uchar>, IF_SIMD(VMin<uchar>)>(src1, step1, src2, step2, dst, step, width, height);
}
void min8s( const schar* src1, size_t step1,
const schar* src2, size_t step2,
schar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_min8s)
vBinOp<schar, cv::OpMin<schar>, IF_SIMD(VMin<schar>)>(src1, step1, src2, step2, dst, step, width, height);
}
void min16u( const ushort* src1, size_t step1,
const ushort* src2, size_t step2,
ushort* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_min16u)
CALL_IPP_MIN_MAX(ippsMinEvery_16u, ushort)
vBinOp<ushort, cv::OpMin<ushort>, IF_SIMD(VMin<ushort>)>(src1, step1, src2, step2, dst, step, width, height);
}
void min16s( const short* src1, size_t step1,
const short* src2, size_t step2,
short* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_min16s)
vBinOp<short, cv::OpMin<short>, IF_SIMD(VMin<short>)>(src1, step1, src2, step2, dst, step, width, height);
}
void min32s( const int* src1, size_t step1,
const int* src2, size_t step2,
int* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_min32s)
vBinOp32<int, cv::OpMin<int>, IF_SIMD(VMin<int>)>(src1, step1, src2, step2, dst, step, width, height);
}
void min32f( const float* src1, size_t step1,
const float* src2, size_t step2,
float* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_min32f)
CALL_IPP_MIN_MAX(ippsMinEvery_32f, float)
vBinOp32<float, cv::OpMin<float>, IF_SIMD(VMin<float>)>(src1, step1, src2, step2, dst, step, width, height);
}
void min64f( const double* src1, size_t step1,
const double* src2, size_t step2,
double* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_min64f)
CALL_IPP_MIN_MAX(ippsMinEvery_64f, double)
vBinOp64<double, cv::OpMin<double>, IF_SIMD(VMin<double>)>(src1, step1, src2, step2, dst, step, width, height);
}
//=======================================
// AbsDiff
//=======================================
void absdiff8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_absdiff8u)
CALL_IPP_BIN_12(ippiAbsDiff_8u_C1R)
(vBinOp<uchar, cv::OpAbsDiff<uchar>, IF_SIMD(VAbsDiff<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
}
void absdiff8s( const schar* src1, size_t step1,
const schar* src2, size_t step2,
schar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_absdiff8s)
vBinOp<schar, cv::OpAbsDiff<schar>, IF_SIMD(VAbsDiff<schar>)>(src1, step1, src2, step2, dst, step, width, height);
}
void absdiff16u( const ushort* src1, size_t step1,
const ushort* src2, size_t step2,
ushort* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_absdiff16u)
CALL_IPP_BIN_12(ippiAbsDiff_16u_C1R)
(vBinOp<ushort, cv::OpAbsDiff<ushort>, IF_SIMD(VAbsDiff<ushort>)>(src1, step1, src2, step2, dst, step, width, height));
}
void absdiff16s( const short* src1, size_t step1,
const short* src2, size_t step2,
short* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_absdiff16s)
vBinOp<short, cv::OpAbsDiff<short>, IF_SIMD(VAbsDiff<short>)>(src1, step1, src2, step2, dst, step, width, height);
}
void absdiff32s( const int* src1, size_t step1,
const int* src2, size_t step2,
int* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_absdiff32s)
vBinOp32<int, cv::OpAbsDiff<int>, IF_SIMD(VAbsDiff<int>)>(src1, step1, src2, step2, dst, step, width, height);
}
void absdiff32f( const float* src1, size_t step1,
const float* src2, size_t step2,
float* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_absdiff32f)
CALL_IPP_BIN_12(ippiAbsDiff_32f_C1R)
(vBinOp32<float, cv::OpAbsDiff<float>, IF_SIMD(VAbsDiff<float>)>(src1, step1, src2, step2, dst, step, width, height));
}
void absdiff64f( const double* src1, size_t step1,
const double* src2, size_t step2,
double* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_absdiff64f)
vBinOp64<double, cv::OpAbsDiff<double>, IF_SIMD(VAbsDiff<double>)>(src1, step1, src2, step2, dst, step, width, height);
}
//=======================================
// Logical
//=======================================
void and8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_and8u)
CALL_IPP_BIN_12(ippiAnd_8u_C1R)
(vBinOp<uchar, cv::OpAnd<uchar>, IF_SIMD(VAnd<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
}
void or8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_or8u)
CALL_IPP_BIN_12(ippiOr_8u_C1R)
(vBinOp<uchar, cv::OpOr<uchar>, IF_SIMD(VOr<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
}
void xor8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_xor8u)
CALL_IPP_BIN_12(ippiXor_8u_C1R)
(vBinOp<uchar, cv::OpXor<uchar>, IF_SIMD(VXor<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
}
void not8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_not8u)
CALL_IPP_BIN_12(ippiNot_8u_C1R)
(vBinOp<uchar, cv::OpNot<uchar>, IF_SIMD(VNot<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
}
//=======================================
#undef CALL_HAL
#define CALL_HAL(fun) \
int res = fun(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); \
if (res == Error::Ok) \
return; \
else if (res != Error::NotImplemented) \
throw Failure(res);
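// For the comparison kernels CALL_HAL expands to
//   fun(src1, step1, src2, step2, dst, step, width, height, cmpop)
// and the replacement must return Error::Ok when it handled the call,
// Error::NotImplemented to fall through to the built-in code below, or any other
// status to raise Failure. A minimal sketch of a user-supplied hal_cmp8u
// (hypothetical replacement that only accelerates CMP_EQ):
//
//   int hal_cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
//                 uchar* dst, size_t step, int width, int height, int cmpop)
//   {
//       if (cmpop != CMP_EQ)
//           return Error::NotImplemented;  // let the default implementation run
//       for (; height--; src1 += step1, src2 += step2, dst += step)
//           for (int x = 0; x < width; x++)
//               dst[x] = src1[x] == src2[x] ? (uchar)255 : (uchar)0;
//       return Error::Ok;
//   }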
#if ARITHM_USE_IPP
inline static IppCmpOp convert_cmp(int _cmpop)
{
return _cmpop == CMP_EQ ? ippCmpEq :
_cmpop == CMP_GT ? ippCmpGreater :
_cmpop == CMP_GE ? ippCmpGreaterEq :
_cmpop == CMP_LT ? ippCmpLess :
_cmpop == CMP_LE ? ippCmpLessEq :
(IppCmpOp)-1;
}
#define CALL_IPP_CMP(fun) \
CV_IPP_CHECK() \
{ \
IppCmpOp op = convert_cmp(*(int *)_cmpop); \
if( op >= 0 ) \
{ \
fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \
if (0 <= fun(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), op)) \
{ \
CV_IMPL_ADD(CV_IMPL_IPP); \
return; \
} \
setIppErrorStatus(); \
} \
}
#else
#define CALL_IPP_CMP(fun)
#endif
//=======================================
// Compare
//=======================================
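// Strategy used by the hand-written cmp8u/cmp16s kernels below: CMP_GE and CMP_LT are
// folded into CMP_LE/CMP_GT by swapping the operands, so only a "greater than" and an
// "equal" SIMD mask are needed; XOR-ing that mask with m (0 or 255) then yields the
// complementary relation (LE from GT, NE from EQ). convert_cmp() maps CMP_NE to -1,
// so the IPP path is skipped for that case and for any unknown code.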
void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* _cmpop)
{
CALL_HAL(hal_cmp8u)
CALL_IPP_CMP(ippiCompare_8u_C1R)
//vz optimized cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);
int code = *(int*)_cmpop;
step1 /= sizeof(src1[0]);
step2 /= sizeof(src2[0]);
if( code == CMP_GE || code == CMP_LT )
{
std::swap(src1, src2);
std::swap(step1, step2);
code = code == CMP_GE ? CMP_LE : CMP_GT;
}
if( code == CMP_GT || code == CMP_LE )
{
int m = code == CMP_GT ? 0 : 255;
for( ; height--; src1 += step1, src2 += step2, dst += step )
{
int x =0;
#if CV_SSE2
if( USE_SSE2 )
{
__m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi8 (-1);
__m128i c128 = _mm_set1_epi8 (-128);
for( ; x <= width - 16; x += 16 )
{
__m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
__m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
// SSE2 has no unsigned 8-bit compare, so bias both operands by 128 and use the signed compare instead
r00 = _mm_sub_epi8(r00,c128);
r10 = _mm_sub_epi8(r10,c128);
r00 =_mm_xor_si128(_mm_cmpgt_epi8(r00, r10), m128);
_mm_storeu_si128((__m128i*)(dst + x),r00);
}
}
#elif CV_NEON
uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255);
for( ; x <= width - 16; x += 16 )
{
vst1q_u8(dst+x, veorq_u8(vcgtq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask));
}
#endif
for( ; x < width; x++ ){
dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
}
}
}
else if( code == CMP_EQ || code == CMP_NE )
{
int m = code == CMP_EQ ? 0 : 255;
for( ; height--; src1 += step1, src2 += step2, dst += step )
{
int x = 0;
#if CV_SSE2
if( USE_SSE2 )
{
__m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi8 (-1);
for( ; x <= width - 16; x += 16 )
{
__m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
__m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
r00 = _mm_xor_si128 ( _mm_cmpeq_epi8 (r00, r10), m128);
_mm_storeu_si128((__m128i*)(dst + x), r00);
}
}
#elif CV_NEON
uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255);
for( ; x <= width - 16; x += 16 )
{
vst1q_u8(dst+x, veorq_u8(vceqq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask));
}
#endif
for( ; x < width; x++ )
dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
}
}
}
void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* _cmpop)
{
CALL_HAL(hal_cmp8s)
cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);
}
void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* _cmpop)
{
CALL_HAL(hal_cmp16u)
CALL_IPP_CMP(ippiCompare_16u_C1R)
cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);
}
void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* _cmpop)
{
CALL_HAL(hal_cmp16s)
CALL_IPP_CMP(ippiCompare_16s_C1R)
//vz optimized cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);
int code = *(int*)_cmpop;
step1 /= sizeof(src1[0]);
step2 /= sizeof(src2[0]);
if( code == CMP_GE || code == CMP_LT )
{
std::swap(src1, src2);
std::swap(step1, step2);
code = code == CMP_GE ? CMP_LE : CMP_GT;
}
if( code == CMP_GT || code == CMP_LE )
{
int m = code == CMP_GT ? 0 : 255;
for( ; height--; src1 += step1, src2 += step2, dst += step )
{
int x =0;
#if CV_SSE2
if( USE_SSE2)
{
__m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi16 (-1);
for( ; x <= width - 16; x += 16 )
{
__m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
__m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128);
__m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8));
__m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8));
r01 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r01, r11), m128);
r11 = _mm_packs_epi16(r00, r01);
_mm_storeu_si128((__m128i*)(dst + x), r11);
}
if( x <= width-8)
{
__m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
__m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128);
r10 = _mm_packs_epi16(r00, r00);
_mm_storel_epi64((__m128i*)(dst + x), r10);
x += 8;
}
}
#elif CV_NEON
uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255);
for( ; x <= width - 16; x += 16 )
{
int16x8_t in1 = vld1q_s16(src1 + x);
int16x8_t in2 = vld1q_s16(src2 + x);
uint8x8_t t1 = vmovn_u16(vcgtq_s16(in1, in2));
in1 = vld1q_s16(src1 + x + 8);
in2 = vld1q_s16(src2 + x + 8);
uint8x8_t t2 = vmovn_u16(vcgtq_s16(in1, in2));
vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask));
}
#endif
for( ; x < width; x++ ){
dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
}
}
}
else if( code == CMP_EQ || code == CMP_NE )
{
int m = code == CMP_EQ ? 0 : 255;
for( ; height--; src1 += step1, src2 += step2, dst += step )
{
int x = 0;
#if CV_SSE2
if( USE_SSE2 )
{
__m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi16 (-1);
for( ; x <= width - 16; x += 16 )
{
__m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
__m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128);
__m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8));
__m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8));
r01 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r01, r11), m128);
r11 = _mm_packs_epi16(r00, r01);
_mm_storeu_si128((__m128i*)(dst + x), r11);
}
if( x <= width - 8)
{
__m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
__m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128);
r10 = _mm_packs_epi16(r00, r00);
_mm_storel_epi64((__m128i*)(dst + x), r10);
x += 8;
}
}
#elif CV_NEON
uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255);
for( ; x <= width - 16; x += 16 )
{
int16x8_t in1 = vld1q_s16(src1 + x);
int16x8_t in2 = vld1q_s16(src2 + x);
uint8x8_t t1 = vmovn_u16(vceqq_s16(in1, in2));
in1 = vld1q_s16(src1 + x + 8);
in2 = vld1q_s16(src2 + x + 8);
uint8x8_t t2 = vmovn_u16(vceqq_s16(in1, in2));
vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask));
}
#endif
for( ; x < width; x++ )
dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
}
}
}
void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* _cmpop)
{
CALL_HAL(hal_cmp32s)
cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);
}
void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* _cmpop)
{
CALL_HAL(hal_cmp32f)
CALL_IPP_CMP(ippiCompare_32f_C1R)
cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);
}
void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* _cmpop)
{
CALL_HAL(hal_cmp64f)
cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);
}
//=======================================
#undef CALL_HAL
#define CALL_HAL(fun) \
int res = fun(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); \
if (res == Error::Ok) \
return; \
else if (res != Error::NotImplemented) \
throw Failure(res);
#if defined HAVE_IPP
#define CALL_IPP_MUL(fun) \
CV_IPP_CHECK() \
{ \
if (std::fabs(fscale - 1) <= FLT_EPSILON) \
{ \
if (fun(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0) >= 0) \
{ \
CV_IMPL_ADD(CV_IMPL_IPP); \
return; \
} \
setIppErrorStatus(); \
} \
}
#else
#define CALL_IPP_MUL(fun)
#endif
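// For mul/div the extra void* argument carries the scale factor as a double.
// CALL_IPP_MUL only takes the IPP path when that scale is exactly 1 (within
// FLT_EPSILON); scaled multiplication always goes through the generic mul_().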
//=======================================
// Multiply
//=======================================
void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_mul8u)
float fscale = (float)*(const double*)scale;
CALL_IPP_MUL(ippiMul_8u_C1RSfs)
mul_(src1, step1, src2, step2, dst, step, width, height, fscale);
}
void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
schar* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_mul8s)
mul_(src1, step1, src2, step2, dst, step, width, height, (float)*(const double*)scale);
}
void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
ushort* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_mul16u)
float fscale = (float)*(const double*)scale;
CALL_IPP_MUL(ippiMul_16u_C1RSfs)
mul_(src1, step1, src2, step2, dst, step, width, height, fscale);
}
void mul16s( const short* src1, size_t step1, const short* src2, size_t step2,
short* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_mul16s)
float fscale = (float)*(const double*)scale;
CALL_IPP_MUL(ippiMul_16s_C1RSfs)
mul_(src1, step1, src2, step2, dst, step, width, height, fscale);
}
void mul32s( const int* src1, size_t step1, const int* src2, size_t step2,
int* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_mul32s)
mul_(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
void mul32f( const float* src1, size_t step1, const float* src2, size_t step2,
float* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_mul32f)
float fscale = (float)*(const double*)scale;
CALL_IPP_MUL(ippiMul_32f_C1R)
mul_(src1, step1, src2, step2, dst, step, width, height, fscale);
}
void mul64f( const double* src1, size_t step1, const double* src2, size_t step2,
double* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_mul64f)
mul_(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
//=======================================
// Divide
//=======================================
void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_div8u)
if( src1 )
div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
else
recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
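// A NULL src1 is the internal convention for "reciprocal" mode (scale / src2), so
// div8u forwards to recip_i in that case; the remaining div* kernels assume src1 is valid.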
void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
schar* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_div8s)
div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
ushort* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_div16u)
div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
void div16s( const short* src1, size_t step1, const short* src2, size_t step2,
short* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_div16s)
div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
void div32s( const int* src1, size_t step1, const int* src2, size_t step2,
int* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_div32s)
div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
void div32f( const float* src1, size_t step1, const float* src2, size_t step2,
float* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_div32f)
div_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
void div64f( const double* src1, size_t step1, const double* src2, size_t step2,
double* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_div64f)
div_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
//=======================================
// Reciprocal
//=======================================
void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_recip8u)
recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
schar* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_recip8s)
recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
ushort* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_recip16u)
recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
void recip16s( const short* src1, size_t step1, const short* src2, size_t step2,
short* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_recip16s)
recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
void recip32s( const int* src1, size_t step1, const int* src2, size_t step2,
int* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_recip32s)
recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
void recip32f( const float* src1, size_t step1, const float* src2, size_t step2,
float* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_recip32f)
recip_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
void recip64f( const double* src1, size_t step1, const double* src2, size_t step2,
double* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_recip64f)
recip_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
//=======================================
#undef CALL_HAL
#define CALL_HAL(fun) \
int res = fun(src1, step1, src2, step2, dst, step, width, height, scalars); \
if (res == Error::Ok) \
return; \
else if (res != Error::NotImplemented) \
throw Failure(res);
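// Here the void* argument points to three doubles: alpha, beta and gamma, i.e.
// dst = saturate(src1*alpha + src2*beta + gamma).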
//=======================================
// Add weighted
//=======================================
void
addWeighted8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height,
void* scalars )
{
CALL_HAL(hal_addWeighted8u)
const double* scalars_ = (const double*)scalars;
float alpha = (float)scalars_[0], beta = (float)scalars_[1], gamma = (float)scalars_[2];
for( ; height--; src1 += step1, src2 += step2, dst += step )
{
int x = 0;
#if CV_SSE2
if( USE_SSE2 )
{
__m128 a4 = _mm_set1_ps(alpha), b4 = _mm_set1_ps(beta), g4 = _mm_set1_ps(gamma);
__m128i z = _mm_setzero_si128();
for( ; x <= width - 8; x += 8 )
{
__m128i u = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src1 + x)), z);
__m128i v = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src2 + x)), z);
__m128 u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(u, z));
__m128 u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(u, z));
__m128 v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v, z));
__m128 v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, z));
u0 = _mm_add_ps(_mm_mul_ps(u0, a4), _mm_mul_ps(v0, b4));
u1 = _mm_add_ps(_mm_mul_ps(u1, a4), _mm_mul_ps(v1, b4));
u0 = _mm_add_ps(u0, g4); u1 = _mm_add_ps(u1, g4);
u = _mm_packs_epi32(_mm_cvtps_epi32(u0), _mm_cvtps_epi32(u1));
u = _mm_packus_epi16(u, u);
_mm_storel_epi64((__m128i*)(dst + x), u);
}
}
#elif CV_NEON
float32x4_t g = vdupq_n_f32 (gamma);
for( ; x <= width - 8; x += 8 )
{
uint8x8_t in1 = vld1_u8(src1+x);
uint16x8_t in1_16 = vmovl_u8(in1);
float32x4_t in1_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in1_16)));
float32x4_t in1_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in1_16)));
uint8x8_t in2 = vld1_u8(src2+x);
uint16x8_t in2_16 = vmovl_u8(in2);
float32x4_t in2_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in2_16)));
float32x4_t in2_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in2_16)));
float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta));
float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta));
out_f_l = vaddq_f32(out_f_l, g);
out_f_h = vaddq_f32(out_f_h, g);
uint16x4_t out_16_l = vqmovun_s32(cv_vrndq_s32_f32(out_f_l));
uint16x4_t out_16_h = vqmovun_s32(cv_vrndq_s32_f32(out_f_h));
uint16x8_t out_16 = vcombine_u16(out_16_l, out_16_h);
uint8x8_t out = vqmovn_u16(out_16);
vst1_u8(dst+x, out);
}
#endif
#if CV_ENABLE_UNROLLED
for( ; x <= width - 4; x += 4 )
{
float t0, t1;
t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma;
t1 = CV_8TO32F(src1[x+1])*alpha + CV_8TO32F(src2[x+1])*beta + gamma;
dst[x] = saturate_cast<uchar>(t0);
dst[x+1] = saturate_cast<uchar>(t1);
t0 = CV_8TO32F(src1[x+2])*alpha + CV_8TO32F(src2[x+2])*beta + gamma;
t1 = CV_8TO32F(src1[x+3])*alpha + CV_8TO32F(src2[x+3])*beta + gamma;
dst[x+2] = saturate_cast<uchar>(t0);
dst[x+3] = saturate_cast<uchar>(t1);
}
#endif
for( ; x < width; x++ )
{
float t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma;
dst[x] = saturate_cast<uchar>(t0);
}
}
}
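// addWeighted8u widens 8 pixels at a time to float (SSE2: unpack through zero,
// NEON: vmovl/vcvt), evaluates alpha*src1 + beta*src2 + gamma, and packs back with
// unsigned saturation; leftover pixels use the CV_8TO32F lookup table and saturate_cast.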
void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
schar* dst, size_t step, int width, int height, void* scalars )
{
CALL_HAL(hal_addWeighted8s)
addWeighted_<schar, float>(src1, step1, src2, step2, dst, step, width, height, scalars);
}
void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
ushort* dst, size_t step, int width, int height, void* scalars )
{
CALL_HAL(hal_addWeighted16u)
addWeighted_<ushort, float>(src1, step1, src2, step2, dst, step, width, height, scalars);
}
void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2,
short* dst, size_t step, int width, int height, void* scalars )
{
CALL_HAL(hal_addWeighted16s)
addWeighted_<short, float>(src1, step1, src2, step2, dst, step, width, height, scalars);
}
void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2,
int* dst, size_t step, int width, int height, void* scalars )
{
CALL_HAL(hal_addWeighted32s)
addWeighted_<int, double>(src1, step1, src2, step2, dst, step, width, height, scalars);
}
void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2,
float* dst, size_t step, int width, int height, void* scalars )
{
CALL_HAL(hal_addWeighted32f)
addWeighted_<float, double>(src1, step1, src2, step2, dst, step, width, height, scalars);
}
void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2,
double* dst, size_t step, int width, int height, void* scalars )
{
CALL_HAL(hal_addWeighted64f)
addWeighted_<double, double>(src1, step1, src2, step2, dst, step, width, height, scalars);
}
}} // cv::hal::
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_HAL_ARITHM_CORE_HPP__
#define __OPENCV_HAL_ARITHM_CORE_HPP__
#include "arithm_simd.hpp"
const uchar g_Saturate8u[] =
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255
};
#define CV_FAST_CAST_8U(t) (assert(-256 <= (t) && (t) <= 512), g_Saturate8u[(t)+256])
#define CV_MIN_8U(a,b) ((a) - CV_FAST_CAST_8U((a) - (b)))
#define CV_MAX_8U(a,b) ((a) + CV_FAST_CAST_8U((b) - (a)))
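// g_Saturate8u[t + 256] clamps t in [-256, 512] to [0, 255] without branching.
// CV_MIN_8U(a,b) = a - sat(a-b): if a > b the table returns a-b and the result is b,
// otherwise it returns 0 and the result is a (e.g. a=10, b=200: 10 - sat(-190) = 10).
// CV_MAX_8U(a,b) = a + sat(b-a) works symmetrically.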
const float g_8x32fTab[] =
{
-128.f, -127.f, -126.f, -125.f, -124.f, -123.f, -122.f, -121.f,
-120.f, -119.f, -118.f, -117.f, -116.f, -115.f, -114.f, -113.f,
-112.f, -111.f, -110.f, -109.f, -108.f, -107.f, -106.f, -105.f,
-104.f, -103.f, -102.f, -101.f, -100.f, -99.f, -98.f, -97.f,
-96.f, -95.f, -94.f, -93.f, -92.f, -91.f, -90.f, -89.f,
-88.f, -87.f, -86.f, -85.f, -84.f, -83.f, -82.f, -81.f,
-80.f, -79.f, -78.f, -77.f, -76.f, -75.f, -74.f, -73.f,
-72.f, -71.f, -70.f, -69.f, -68.f, -67.f, -66.f, -65.f,
-64.f, -63.f, -62.f, -61.f, -60.f, -59.f, -58.f, -57.f,
-56.f, -55.f, -54.f, -53.f, -52.f, -51.f, -50.f, -49.f,
-48.f, -47.f, -46.f, -45.f, -44.f, -43.f, -42.f, -41.f,
-40.f, -39.f, -38.f, -37.f, -36.f, -35.f, -34.f, -33.f,
-32.f, -31.f, -30.f, -29.f, -28.f, -27.f, -26.f, -25.f,
-24.f, -23.f, -22.f, -21.f, -20.f, -19.f, -18.f, -17.f,
-16.f, -15.f, -14.f, -13.f, -12.f, -11.f, -10.f, -9.f,
-8.f, -7.f, -6.f, -5.f, -4.f, -3.f, -2.f, -1.f,
0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f,
8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f,
16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f,
24.f, 25.f, 26.f, 27.f, 28.f, 29.f, 30.f, 31.f,
32.f, 33.f, 34.f, 35.f, 36.f, 37.f, 38.f, 39.f,
40.f, 41.f, 42.f, 43.f, 44.f, 45.f, 46.f, 47.f,
48.f, 49.f, 50.f, 51.f, 52.f, 53.f, 54.f, 55.f,
56.f, 57.f, 58.f, 59.f, 60.f, 61.f, 62.f, 63.f,
64.f, 65.f, 66.f, 67.f, 68.f, 69.f, 70.f, 71.f,
72.f, 73.f, 74.f, 75.f, 76.f, 77.f, 78.f, 79.f,
80.f, 81.f, 82.f, 83.f, 84.f, 85.f, 86.f, 87.f,
88.f, 89.f, 90.f, 91.f, 92.f, 93.f, 94.f, 95.f,
96.f, 97.f, 98.f, 99.f, 100.f, 101.f, 102.f, 103.f,
104.f, 105.f, 106.f, 107.f, 108.f, 109.f, 110.f, 111.f,
112.f, 113.f, 114.f, 115.f, 116.f, 117.f, 118.f, 119.f,
120.f, 121.f, 122.f, 123.f, 124.f, 125.f, 126.f, 127.f,
128.f, 129.f, 130.f, 131.f, 132.f, 133.f, 134.f, 135.f,
136.f, 137.f, 138.f, 139.f, 140.f, 141.f, 142.f, 143.f,
144.f, 145.f, 146.f, 147.f, 148.f, 149.f, 150.f, 151.f,
152.f, 153.f, 154.f, 155.f, 156.f, 157.f, 158.f, 159.f,
160.f, 161.f, 162.f, 163.f, 164.f, 165.f, 166.f, 167.f,
168.f, 169.f, 170.f, 171.f, 172.f, 173.f, 174.f, 175.f,
176.f, 177.f, 178.f, 179.f, 180.f, 181.f, 182.f, 183.f,
184.f, 185.f, 186.f, 187.f, 188.f, 189.f, 190.f, 191.f,
192.f, 193.f, 194.f, 195.f, 196.f, 197.f, 198.f, 199.f,
200.f, 201.f, 202.f, 203.f, 204.f, 205.f, 206.f, 207.f,
208.f, 209.f, 210.f, 211.f, 212.f, 213.f, 214.f, 215.f,
216.f, 217.f, 218.f, 219.f, 220.f, 221.f, 222.f, 223.f,
224.f, 225.f, 226.f, 227.f, 228.f, 229.f, 230.f, 231.f,
232.f, 233.f, 234.f, 235.f, 236.f, 237.f, 238.f, 239.f,
240.f, 241.f, 242.f, 243.f, 244.f, 245.f, 246.f, 247.f,
248.f, 249.f, 250.f, 251.f, 252.f, 253.f, 254.f, 255.f
};
#define CV_8TO32F(x) g_8x32fTab[(x)+128]
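// g_8x32fTab maps any integer in [-128, 255] to its float value by table lookup,
// so CV_8TO32F(x) avoids a per-pixel int-to-float conversion in the 8-bit kernels.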
namespace cv {
template<> inline uchar OpAdd<uchar>::operator ()(uchar a, uchar b) const
{ return CV_FAST_CAST_8U(a + b); }
template<> inline uchar OpSub<uchar>::operator ()(uchar a, uchar b) const
{ return CV_FAST_CAST_8U(a - b); }
template<> inline short OpAbsDiff<short>::operator ()(short a, short b) const
{ return saturate_cast<short>(std::abs(a - b)); }
template<> inline schar OpAbsDiff<schar>::operator ()(schar a, schar b) const
{ return saturate_cast<schar>(std::abs(a - b)); }
template<> inline uchar OpMin<uchar>::operator ()(uchar a, uchar b) const { return CV_MIN_8U(a, b); }
template<> inline uchar OpMax<uchar>::operator ()(uchar a, uchar b) const { return CV_MAX_8U(a, b); }
}
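// The specializations above switch 8-bit add/sub/min/max to the table-driven
// CV_FAST_CAST_8U / CV_MIN_8U / CV_MAX_8U forms and make absdiff on signed types saturate.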
namespace cv { namespace hal {
template<typename T, class Op, class VOp>
void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, int width, int height)
{
#if CV_SSE2 || CV_NEON
VOp vop;
#endif
Op op;
for( ; height--; src1 = (const T *)((const uchar *)src1 + step1),
src2 = (const T *)((const uchar *)src2 + step2),
dst = (T *)((uchar *)dst + step) )
{
int x = 0;
#if CV_NEON || CV_SSE2
#if CV_AVX2
if( USE_AVX2 )
{
for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) )
{
typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x);
r0 = vop(r0, VLoadStore256<T>::load(src2 + x));
VLoadStore256<T>::store(dst + x, r0);
}
}
#else
#if CV_SSE2
if( USE_SSE2 )
{
#endif // CV_SSE2
for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) )
{
typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x );
typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 16/sizeof(T));
r0 = vop(r0, VLoadStore128<T>::load(src2 + x ));
r1 = vop(r1, VLoadStore128<T>::load(src2 + x + 16/sizeof(T)));
VLoadStore128<T>::store(dst + x , r0);
VLoadStore128<T>::store(dst + x + 16/sizeof(T), r1);
}
#if CV_SSE2
}
#endif // CV_SSE2
#endif // CV_AVX2
#endif // CV_NEON || CV_SSE2
#if CV_AVX2
// nothing
#elif CV_SSE2
if( USE_SSE2 )
{
for( ; x <= width - 8/(int)sizeof(T); x += 8/sizeof(T) )
{
typename VLoadStore64<T>::reg_type r = VLoadStore64<T>::load(src1 + x);
r = vop(r, VLoadStore64<T>::load(src2 + x));
VLoadStore64<T>::store(dst + x, r);
}
}
#endif
#if CV_ENABLE_UNROLLED
for( ; x <= width - 4; x += 4 )
{
T v0 = op(src1[x], src2[x]);
T v1 = op(src1[x+1], src2[x+1]);
dst[x] = v0; dst[x+1] = v1;
v0 = op(src1[x+2], src2[x+2]);
v1 = op(src1[x+3], src2[x+3]);
dst[x+2] = v0; dst[x+3] = v1;
}
#endif
for( ; x < width; x++ )
dst[x] = op(src1[x], src2[x]);
}
}
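// vBinOp processes each row with the widest vector path available (256-bit with AVX2,
// otherwise 128-bit SSE2/NEON), then a 64-bit SSE2 tail, a 4x-unrolled scalar loop,
// and finally a plain scalar remainder.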
template<typename T, class Op, class Op32>
void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
T* dst, size_t step, int width, int height)
{
#if CV_SSE2 || CV_NEON
Op32 op32;
#endif
Op op;
for( ; height--; src1 = (const T *)((const uchar *)src1 + step1),
src2 = (const T *)((const uchar *)src2 + step2),
dst = (T *)((uchar *)dst + step) )
{
int x = 0;
#if CV_AVX2
if( USE_AVX2 )
{
if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 )
{
for( ; x <= width - 8; x += 8 )
{
typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x);
r0 = op32(r0, VLoadStore256Aligned<T>::load(src2 + x));
VLoadStore256Aligned<T>::store(dst + x, r0);
}
}
}
#elif CV_SSE2
if( USE_SSE2 )
{
if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
{
for( ; x <= width - 8; x += 8 )
{
typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x );
typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 4);
r0 = op32(r0, VLoadStore128Aligned<T>::load(src2 + x ));
r1 = op32(r1, VLoadStore128Aligned<T>::load(src2 + x + 4));
VLoadStore128Aligned<T>::store(dst + x , r0);
VLoadStore128Aligned<T>::store(dst + x + 4, r1);
}
}
}
#endif // CV_AVX2
#if CV_NEON || CV_SSE2
#if CV_AVX2
if( USE_AVX2 )
{
for( ; x <= width - 8; x += 8 )
{
typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x);
r0 = op32(r0, VLoadStore256<T>::load(src2 + x));
VLoadStore256<T>::store(dst + x, r0);
}
}
#else
#if CV_SSE2
if( USE_SSE2 )
{
#endif // CV_SSE2
for( ; x <= width - 8; x += 8 )
{
typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x );
typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 4);
r0 = op32(r0, VLoadStore128<T>::load(src2 + x ));
r1 = op32(r1, VLoadStore128<T>::load(src2 + x + 4));
VLoadStore128<T>::store(dst + x , r0);
VLoadStore128<T>::store(dst + x + 4, r1);
}
#if CV_SSE2
}
#endif // CV_SSE2
#endif // CV_AVX2
#endif // CV_NEON || CV_SSE2
#if CV_ENABLE_UNROLLED
for( ; x <= width - 4; x += 4 )
{
T v0 = op(src1[x], src2[x]);
T v1 = op(src1[x+1], src2[x+1]);
dst[x] = v0; dst[x+1] = v1;
v0 = op(src1[x+2], src2[x+2]);
v1 = op(src1[x+3], src2[x+3]);
dst[x+2] = v0; dst[x+3] = v1;
}
#endif
for( ; x < width; x++ )
dst[x] = op(src1[x], src2[x]);
}
}
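// vBinOp32 follows the same pattern for 32-bit elements but adds aligned-load fast
// paths when src1, src2 and dst are all 16/32-byte aligned; vBinOp64 below vectorizes
// only in that aligned case and otherwise falls back to the unrolled scalar loop.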
template<typename T, class Op, class Op64>
void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2,
T* dst, size_t step, int width, int height)
{
#if CV_SSE2
Op64 op64;
#endif
Op op;
for( ; height--; src1 = (const T *)((const uchar *)src1 + step1),
src2 = (const T *)((const uchar *)src2 + step2),
dst = (T *)((uchar *)dst + step) )
{
int x = 0;
#if CV_AVX2
if( USE_AVX2 )
{
if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 )
{
for( ; x <= width - 4; x += 4 )
{
typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x);
r0 = op64(r0, VLoadStore256Aligned<T>::load(src2 + x));
VLoadStore256Aligned<T>::store(dst + x, r0);
}
}
}
#elif CV_SSE2
if( USE_SSE2 )
{
if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
{
for( ; x <= width - 4; x += 4 )
{
typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x );
typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 2);
r0 = op64(r0, VLoadStore128Aligned<T>::load(src2 + x ));
r1 = op64(r1, VLoadStore128Aligned<T>::load(src2 + x + 2));
VLoadStore128Aligned<T>::store(dst + x , r0);
VLoadStore128Aligned<T>::store(dst + x + 2, r1);
}
}
}
#endif
for( ; x <= width - 4; x += 4 )
{
T v0 = op(src1[x], src2[x]);
T v1 = op(src1[x+1], src2[x+1]);
dst[x] = v0; dst[x+1] = v1;
v0 = op(src1[x+2], src2[x+2]);
v1 = op(src1[x+3], src2[x+3]);
dst[x+2] = v0; dst[x+3] = v1;
}
for( ; x < width; x++ )
dst[x] = op(src1[x], src2[x]);
}
}
template<typename T> static void
cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
uchar* dst, size_t step, int width, int height, int code)
{
step1 /= sizeof(src1[0]);
step2 /= sizeof(src2[0]);
if( code == CMP_GE || code == CMP_LT )
{
std::swap(src1, src2);
std::swap(step1, step2);
code = code == CMP_GE ? CMP_LE : CMP_GT;
}
Cmp_SIMD<T> vop(code);
if( code == CMP_GT || code == CMP_LE )
{
int m = code == CMP_GT ? 0 : 255;
for( ; height--; src1 += step1, src2 += step2, dst += step )
{
int x = vop(src1, src2, dst, width);
#if CV_ENABLE_UNROLLED
for( ; x <= width - 4; x += 4 )
{
int t0, t1;
t0 = -(src1[x] > src2[x]) ^ m;
t1 = -(src1[x+1] > src2[x+1]) ^ m;
dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
t0 = -(src1[x+2] > src2[x+2]) ^ m;
t1 = -(src1[x+3] > src2[x+3]) ^ m;
dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
}
#endif
for( ; x < width; x++ )
dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
}
}
else if( code == CMP_EQ || code == CMP_NE )
{
int m = code == CMP_EQ ? 0 : 255;
for( ; height--; src1 += step1, src2 += step2, dst += step )
{
int x = 0;
#if CV_ENABLE_UNROLLED
for( ; x <= width - 4; x += 4 )
{
int t0, t1;
t0 = -(src1[x] == src2[x]) ^ m;
t1 = -(src1[x+1] == src2[x+1]) ^ m;
dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
t0 = -(src1[x+2] == src2[x+2]) ^ m;
t1 = -(src1[x+3] == src2[x+3]) ^ m;
dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
}
#endif
for( ; x < width; x++ )
dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
}
}
}
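// cmp_ is the generic fallback used by cmp8s/cmp16u/cmp32s/cmp32f/cmp64f: it applies
// the same GE/LT -> LE/GT operand swap as the hand-written kernels and lets Cmp_SIMD
// handle the vectorizable prefix of each row when a specialization exists.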
template<typename T, typename WT> static void
mul_( const T* src1, size_t step1, const T* src2, size_t step2,
T* dst, size_t step, int width, int height, WT scale )
{
step1 /= sizeof(src1[0]);
step2 /= sizeof(src2[0]);
step /= sizeof(dst[0]);
Mul_SIMD<T, WT> vop;
if( scale == (WT)1. )
{
for( ; height--; src1 += step1, src2 += step2, dst += step )
{
int i = vop(src1, src2, dst, width, scale);
#if CV_ENABLE_UNROLLED
for(; i <= width - 4; i += 4 )
{
T t0;
T t1;
t0 = saturate_cast<T>(src1[i ] * src2[i ]);
t1 = saturate_cast<T>(src1[i+1] * src2[i+1]);
dst[i ] = t0;
dst[i+1] = t1;
t0 = saturate_cast<T>(src1[i+2] * src2[i+2]);
t1 = saturate_cast<T>(src1[i+3] * src2[i+3]);
dst[i+2] = t0;
dst[i+3] = t1;
}
#endif
for( ; i < width; i++ )
dst[i] = saturate_cast<T>(src1[i] * src2[i]);
}
}
else
{
for( ; height--; src1 += step1, src2 += step2, dst += step )
{
int i = vop(src1, src2, dst, width, scale);
#if CV_ENABLE_UNROLLED
for(; i <= width - 4; i += 4 )
{
T t0 = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
T t1 = saturate_cast<T>(scale*(WT)src1[i+1]*src2[i+1]);
dst[i] = t0; dst[i+1] = t1;
t0 = saturate_cast<T>(scale*(WT)src1[i+2]*src2[i+2]);
t1 = saturate_cast<T>(scale*(WT)src1[i+3]*src2[i+3]);
dst[i+2] = t0; dst[i+3] = t1;
}
#endif
for( ; i < width; i++ )
dst[i] = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
}
}
}
template<typename T> static void
div_i( const T* src1, size_t step1, const T* src2, size_t step2,
T* dst, size_t step, int width, int height, double scale )
{
step1 /= sizeof(src1[0]);
step2 /= sizeof(src2[0]);
step /= sizeof(dst[0]);
Div_SIMD<T> vop;
float scale_f = (float)scale;
for( ; height--; src1 += step1, src2 += step2, dst += step )
{
int i = vop(src1, src2, dst, width, scale);
for( ; i < width; i++ )
{
T num = src1[i], denom = src2[i];
dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
}
}
}
template<typename T> static void
div_f( const T* src1, size_t step1, const T* src2, size_t step2,
T* dst, size_t step, int width, int height, double scale )
{
T scale_f = (T)scale;
step1 /= sizeof(src1[0]);
step2 /= sizeof(src2[0]);
step /= sizeof(dst[0]);
Div_SIMD<T> vop;
for( ; height--; src1 += step1, src2 += step2, dst += step )
{
int i = vop(src1, src2, dst, width, scale);
for( ; i < width; i++ )
{
T num = src1[i], denom = src2[i];
dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
}
}
}
template<typename T> static void
recip_i( const T*, size_t, const T* src2, size_t step2,
T* dst, size_t step, int width, int height, double scale )
{
step2 /= sizeof(src2[0]);
step /= sizeof(dst[0]);
Recip_SIMD<T> vop;
float scale_f = (float)scale;
for( ; height--; src2 += step2, dst += step )
{
int i = vop(src2, dst, width, scale);
for( ; i < width; i++ )
{
T denom = src2[i];
dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
}
}
}
template<typename T> static void
recip_f( const T*, size_t, const T* src2, size_t step2,
T* dst, size_t step, int width, int height, double scale )
{
T scale_f = (T)scale;
step2 /= sizeof(src2[0]);
step /= sizeof(dst[0]);
Recip_SIMD<T> vop;
for( ; height--; src2 += step2, dst += step )
{
int i = vop(src2, dst, width, scale);
for( ; i < width; i++ )
{
T denom = src2[i];
dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
}
}
}
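// div_i/div_f and recip_i/recip_f share one convention: the scale is folded into the
// per-pixel division (through float for the integer types) and a zero denominator
// always produces 0 in the destination.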
template<typename T, typename WT> static void
addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2,
T* dst, size_t step, int width, int height, void* _scalars )
{
const double* scalars = (const double*)_scalars;
WT alpha = (WT)scalars[0], beta = (WT)scalars[1], gamma = (WT)scalars[2];
step1 /= sizeof(src1[0]);
step2 /= sizeof(src2[0]);
step /= sizeof(dst[0]);
AddWeighted_SIMD<T, WT> vop;
for( ; height--; src1 += step1, src2 += step2, dst += step )
{
int x = vop(src1, src2, dst, width, alpha, beta, gamma);
#if CV_ENABLE_UNROLLED
for( ; x <= width - 4; x += 4 )
{
T t0 = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
T t1 = saturate_cast<T>(src1[x+1]*alpha + src2[x+1]*beta + gamma);
dst[x] = t0; dst[x+1] = t1;
t0 = saturate_cast<T>(src1[x+2]*alpha + src2[x+2]*beta + gamma);
t1 = saturate_cast<T>(src1[x+3]*alpha + src2[x+3]*beta + gamma);
dst[x+2] = t0; dst[x+3] = t1;
}
#endif
for( ; x < width; x++ )
dst[x] = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
}
}
}} // cv::hal::
#endif // __OPENCV_HAL_ARITHM_CORE_HPP__
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_HAL_ARITHM_SIMD_HPP__
#define __OPENCV_HAL_ARITHM_SIMD_HPP__
namespace cv { namespace hal {
struct NOP {};
#if CV_SSE2 || CV_NEON
#define IF_SIMD(op) op
#else
#define IF_SIMD(op) NOP
#endif
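// IF_SIMD(op) expands to the SIMD functor when SSE2 or NEON is available and to the
// empty NOP struct otherwise, so vBinOp and friends compile without a vector path.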
#if CV_SSE2 || CV_NEON
#define FUNCTOR_TEMPLATE(name) \
template<typename T> struct name {}
FUNCTOR_TEMPLATE(VLoadStore128);
#if CV_SSE2
FUNCTOR_TEMPLATE(VLoadStore64);
FUNCTOR_TEMPLATE(VLoadStore128Aligned);
#if CV_AVX2
FUNCTOR_TEMPLATE(VLoadStore256);
FUNCTOR_TEMPLATE(VLoadStore256Aligned);
#endif
#endif
#endif
#if CV_AVX2
#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body) \
template <> \
struct name<template_arg>{ \
typedef register_type reg_type; \
static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \
static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \
}
#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \
template <> \
struct name<template_arg>{ \
typedef register_type reg_type; \
static reg_type load(const template_arg * p) { return load_body (p); } \
static void store(template_arg * p, reg_type v) { store_body (p, v); } \
}
#define FUNCTOR_CLOSURE_2arg(name, template_arg, body) \
template<> \
struct name<template_arg> \
{ \
VLoadStore256<template_arg>::reg_type operator()( \
const VLoadStore256<template_arg>::reg_type & a, \
const VLoadStore256<template_arg>::reg_type & b) const \
{ \
body; \
} \
}
#define FUNCTOR_CLOSURE_1arg(name, template_arg, body) \
template<> \
struct name<template_arg> \
{ \
VLoadStore256<template_arg>::reg_type operator()( \
const VLoadStore256<template_arg>::reg_type & a, \
const VLoadStore256<template_arg>::reg_type & ) const \
{ \
body; \
} \
}
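// The FUNCTOR_* macros generate, per element type, a load/store wrapper over an AVX2
// register (VLoadStore256*) and the binary/unary operation functors used by vBinOp;
// the CV_SSE2 branch below provides the equivalent 128-bit definitions.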
FUNCTOR_LOADSTORE_CAST(VLoadStore256, uchar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
FUNCTOR_LOADSTORE_CAST(VLoadStore256, schar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
FUNCTOR_LOADSTORE_CAST(VLoadStore256, ushort, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
FUNCTOR_LOADSTORE_CAST(VLoadStore256, short, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
FUNCTOR_LOADSTORE_CAST(VLoadStore256, int, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
FUNCTOR_LOADSTORE( VLoadStore256, float, __m256 , _mm256_loadu_ps , _mm256_storeu_ps );
FUNCTOR_LOADSTORE( VLoadStore256, double, __m256d, _mm256_loadu_pd , _mm256_storeu_pd );
FUNCTOR_LOADSTORE_CAST(VLoadStore256Aligned, int, __m256i, _mm256_load_si256, _mm256_store_si256);
FUNCTOR_LOADSTORE( VLoadStore256Aligned, float, __m256 , _mm256_load_ps , _mm256_store_ps );
FUNCTOR_LOADSTORE( VLoadStore256Aligned, double, __m256d, _mm256_load_pd , _mm256_store_pd );
FUNCTOR_TEMPLATE(VAdd);
FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm256_adds_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm256_adds_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm256_adds_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm256_adds_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm256_add_epi32 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm256_add_ps (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm256_add_pd (a, b));
FUNCTOR_TEMPLATE(VSub);
FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm256_subs_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm256_subs_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm256_subs_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VSub, short, return _mm256_subs_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VSub, int, return _mm256_sub_epi32 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, float, return _mm256_sub_ps (a, b));
FUNCTOR_CLOSURE_2arg(VSub, double, return _mm256_sub_pd (a, b));
FUNCTOR_TEMPLATE(VMin);
FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm256_min_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VMin, schar, return _mm256_min_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm256_min_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VMin, short, return _mm256_min_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VMin, int, return _mm256_min_epi32(a, b));
FUNCTOR_CLOSURE_2arg(VMin, float, return _mm256_min_ps (a, b));
FUNCTOR_CLOSURE_2arg(VMin, double, return _mm256_min_pd (a, b));
FUNCTOR_TEMPLATE(VMax);
FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm256_max_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VMax, schar, return _mm256_max_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm256_max_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VMax, short, return _mm256_max_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VMax, int, return _mm256_max_epi32(a, b));
FUNCTOR_CLOSURE_2arg(VMax, float, return _mm256_max_ps (a, b));
FUNCTOR_CLOSURE_2arg(VMax, double, return _mm256_max_pd (a, b));
static unsigned int CV_DECL_ALIGNED(32) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff,
0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
static unsigned int CV_DECL_ALIGNED(32) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff,
0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
FUNCTOR_TEMPLATE(VAbsDiff);
FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar,
return _mm256_add_epi8(_mm256_subs_epu8(a, b), _mm256_subs_epu8(b, a));
);
FUNCTOR_CLOSURE_2arg(VAbsDiff, schar,
__m256i d = _mm256_subs_epi8(a, b);
__m256i m = _mm256_cmpgt_epi8(b, a);
return _mm256_subs_epi8(_mm256_xor_si256(d, m), m);
);
FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort,
return _mm256_add_epi16(_mm256_subs_epu16(a, b), _mm256_subs_epu16(b, a));
);
FUNCTOR_CLOSURE_2arg(VAbsDiff, short,
__m256i M = _mm256_max_epi16(a, b);
__m256i m = _mm256_min_epi16(a, b);
return _mm256_subs_epi16(M, m);
);
FUNCTOR_CLOSURE_2arg(VAbsDiff, int,
__m256i d = _mm256_sub_epi32(a, b);
__m256i m = _mm256_cmpgt_epi32(b, a);
return _mm256_sub_epi32(_mm256_xor_si256(d, m), m);
);
FUNCTOR_CLOSURE_2arg(VAbsDiff, float,
return _mm256_and_ps(_mm256_sub_ps(a, b), *(const __m256*)v32f_absmask);
);
FUNCTOR_CLOSURE_2arg(VAbsDiff, double,
return _mm256_and_pd(_mm256_sub_pd(a, b), *(const __m256d*)v64f_absmask);
);
FUNCTOR_TEMPLATE(VAnd);
FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm256_and_si256(a, b));
FUNCTOR_TEMPLATE(VOr);
FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm256_or_si256 (a, b));
FUNCTOR_TEMPLATE(VXor);
FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm256_xor_si256(a, b));
FUNCTOR_TEMPLATE(VNot);
FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm256_xor_si256(_mm256_set1_epi32(-1), a));
#elif CV_SSE2
#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)\
template <> \
struct name<template_arg>{ \
typedef register_type reg_type; \
static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \
static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \
}
#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\
template <> \
struct name<template_arg>{ \
typedef register_type reg_type; \
static reg_type load(const template_arg * p) { return load_body (p); } \
static void store(template_arg * p, reg_type v) { store_body (p, v); } \
}
#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\
template<> \
struct name<template_arg> \
{ \
VLoadStore128<template_arg>::reg_type operator()( \
const VLoadStore128<template_arg>::reg_type & a, \
const VLoadStore128<template_arg>::reg_type & b) const \
{ \
body; \
} \
}
#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\
template<> \
struct name<template_arg> \
{ \
VLoadStore128<template_arg>::reg_type operator()( \
const VLoadStore128<template_arg>::reg_type & a, \
const VLoadStore128<template_arg>::reg_type & ) const \
{ \
body; \
} \
}
FUNCTOR_LOADSTORE_CAST(VLoadStore128, uchar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE_CAST(VLoadStore128, schar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE_CAST(VLoadStore128, ushort, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE_CAST(VLoadStore128, short, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE_CAST(VLoadStore128, int, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE( VLoadStore128, float, __m128 , _mm_loadu_ps , _mm_storeu_ps );
FUNCTOR_LOADSTORE( VLoadStore128, double, __m128d, _mm_loadu_pd , _mm_storeu_pd );
FUNCTOR_LOADSTORE_CAST(VLoadStore64, uchar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
FUNCTOR_LOADSTORE_CAST(VLoadStore64, schar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
FUNCTOR_LOADSTORE_CAST(VLoadStore64, ushort, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
FUNCTOR_LOADSTORE_CAST(VLoadStore64, short, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
FUNCTOR_LOADSTORE_CAST(VLoadStore128Aligned, int, __m128i, _mm_load_si128, _mm_store_si128);
FUNCTOR_LOADSTORE( VLoadStore128Aligned, float, __m128 , _mm_load_ps , _mm_store_ps );
FUNCTOR_LOADSTORE( VLoadStore128Aligned, double, __m128d, _mm_load_pd , _mm_store_pd );
FUNCTOR_TEMPLATE(VAdd);
FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm_adds_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm_adds_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm_adds_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm_adds_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm_add_epi32 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm_add_ps (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm_add_pd (a, b));
FUNCTOR_TEMPLATE(VSub);
FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm_subs_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm_subs_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm_subs_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VSub, short, return _mm_subs_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VSub, int, return _mm_sub_epi32 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, float, return _mm_sub_ps (a, b));
FUNCTOR_CLOSURE_2arg(VSub, double, return _mm_sub_pd (a, b));
FUNCTOR_TEMPLATE(VMin);
FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm_min_epu8(a, b));
FUNCTOR_CLOSURE_2arg(VMin, schar,
__m128i m = _mm_cmpgt_epi8(a, b);
return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
);
FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm_subs_epu16(a, _mm_subs_epu16(a, b)));
FUNCTOR_CLOSURE_2arg(VMin, short, return _mm_min_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VMin, int,
__m128i m = _mm_cmpgt_epi32(a, b);
return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
);
FUNCTOR_CLOSURE_2arg(VMin, float, return _mm_min_ps(a, b));
FUNCTOR_CLOSURE_2arg(VMin, double, return _mm_min_pd(a, b));
FUNCTOR_TEMPLATE(VMax);
FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm_max_epu8(a, b));
FUNCTOR_CLOSURE_2arg(VMax, schar,
__m128i m = _mm_cmpgt_epi8(b, a);
return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
);
FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm_adds_epu16(_mm_subs_epu16(a, b), b));
FUNCTOR_CLOSURE_2arg(VMax, short, return _mm_max_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VMax, int,
__m128i m = _mm_cmpgt_epi32(b, a);
return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
);
FUNCTOR_CLOSURE_2arg(VMax, float, return _mm_max_ps(a, b));
FUNCTOR_CLOSURE_2arg(VMax, double, return _mm_max_pd(a, b));
static unsigned int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
static unsigned int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
FUNCTOR_TEMPLATE(VAbsDiff);
FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar,
return _mm_add_epi8(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
);
FUNCTOR_CLOSURE_2arg(VAbsDiff, schar,
__m128i d = _mm_subs_epi8(a, b);
__m128i m = _mm_cmpgt_epi8(b, a);
return _mm_subs_epi8(_mm_xor_si128(d, m), m);
);
FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort,
return _mm_add_epi16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
);
FUNCTOR_CLOSURE_2arg(VAbsDiff, short,
__m128i M = _mm_max_epi16(a, b);
__m128i m = _mm_min_epi16(a, b);
return _mm_subs_epi16(M, m);
);
FUNCTOR_CLOSURE_2arg(VAbsDiff, int,
__m128i d = _mm_sub_epi32(a, b);
__m128i m = _mm_cmpgt_epi32(b, a);
return _mm_sub_epi32(_mm_xor_si128(d, m), m);
);
FUNCTOR_CLOSURE_2arg(VAbsDiff, float,
return _mm_and_ps(_mm_sub_ps(a,b), *(const __m128*)v32f_absmask);
);
FUNCTOR_CLOSURE_2arg(VAbsDiff, double,
return _mm_and_pd(_mm_sub_pd(a,b), *(const __m128d*)v64f_absmask);
);
FUNCTOR_TEMPLATE(VAnd);
FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm_and_si128(a, b));
FUNCTOR_TEMPLATE(VOr);
FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm_or_si128 (a, b));
FUNCTOR_TEMPLATE(VXor);
FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm_xor_si128(a, b));
FUNCTOR_TEMPLATE(VNot);
FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a));
#endif
#if CV_NEON
#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\
template <> \
struct name<template_arg>{ \
typedef register_type reg_type; \
static reg_type load(const template_arg * p) { return load_body (p);}; \
static void store(template_arg * p, reg_type v) { store_body (p, v);}; \
}
#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\
template<> \
struct name<template_arg> \
{ \
VLoadStore128<template_arg>::reg_type operator()( \
VLoadStore128<template_arg>::reg_type a, \
VLoadStore128<template_arg>::reg_type b) const \
{ \
return body; \
}; \
}
#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\
template<> \
struct name<template_arg> \
{ \
VLoadStore128<template_arg>::reg_type operator()( \
VLoadStore128<template_arg>::reg_type a, \
VLoadStore128<template_arg>::reg_type ) const \
{ \
return body; \
}; \
}
FUNCTOR_LOADSTORE(VLoadStore128, uchar, uint8x16_t, vld1q_u8 , vst1q_u8 );
FUNCTOR_LOADSTORE(VLoadStore128, schar, int8x16_t, vld1q_s8 , vst1q_s8 );
FUNCTOR_LOADSTORE(VLoadStore128, ushort, uint16x8_t, vld1q_u16, vst1q_u16);
FUNCTOR_LOADSTORE(VLoadStore128, short, int16x8_t, vld1q_s16, vst1q_s16);
FUNCTOR_LOADSTORE(VLoadStore128, int, int32x4_t, vld1q_s32, vst1q_s32);
FUNCTOR_LOADSTORE(VLoadStore128, float, float32x4_t, vld1q_f32, vst1q_f32);
FUNCTOR_TEMPLATE(VAdd);
FUNCTOR_CLOSURE_2arg(VAdd, uchar, vqaddq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, schar, vqaddq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd, short, vqaddq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd, int, vaddq_s32 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, float, vaddq_f32 (a, b));
FUNCTOR_TEMPLATE(VSub);
FUNCTOR_CLOSURE_2arg(VSub, uchar, vqsubq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, schar, vqsubq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VSub, short, vqsubq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VSub, int, vsubq_s32 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, float, vsubq_f32 (a, b));
FUNCTOR_TEMPLATE(VMin);
FUNCTOR_CLOSURE_2arg(VMin, uchar, vminq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VMin, schar, vminq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VMin, short, vminq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VMin, int, vminq_s32(a, b));
FUNCTOR_CLOSURE_2arg(VMin, float, vminq_f32(a, b));
FUNCTOR_TEMPLATE(VMax);
FUNCTOR_CLOSURE_2arg(VMax, uchar, vmaxq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VMax, schar, vmaxq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VMax, short, vmaxq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VMax, int, vmaxq_s32(a, b));
FUNCTOR_CLOSURE_2arg(VMax, float, vmaxq_f32(a, b));
FUNCTOR_TEMPLATE(VAbsDiff);
FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, vabdq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, vqabsq_s8 (vqsubq_s8(a, b)));
FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b));
FUNCTOR_CLOSURE_2arg(VAbsDiff, short, vqabsq_s16(vqsubq_s16(a, b)));
FUNCTOR_CLOSURE_2arg(VAbsDiff, int, vabdq_s32 (a, b));
FUNCTOR_CLOSURE_2arg(VAbsDiff, float, vabdq_f32 (a, b));
FUNCTOR_TEMPLATE(VAnd);
FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b));
FUNCTOR_TEMPLATE(VOr);
FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b));
FUNCTOR_TEMPLATE(VXor);
FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b));
FUNCTOR_TEMPLATE(VNot);
FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a ));
#endif
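// Cmp_SIMD vectorizes the element-wise comparison kernels. The generic template is a
// no-op that reports 0 processed elements, so the caller falls back to the scalar loop;
// the NEON/SSE2 specializations below handle CMP_GT, CMP_LE, CMP_EQ and CMP_NE and
// return how many elements they consumed.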
template <typename T>
struct Cmp_SIMD
{
explicit Cmp_SIMD(int)
{
}
int operator () (const T *, const T *, uchar *, int) const
{
return 0;
}
};
#if CV_NEON
template <>
struct Cmp_SIMD<schar>
{
explicit Cmp_SIMD(int code_) :
code(code_)
{
// CV_Assert(code == CMP_GT || code == CMP_LE ||
// code == CMP_EQ || code == CMP_NE);
v_mask = vdupq_n_u8(255);
}
int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
{
int x = 0;
if (code == CMP_GT)
for ( ; x <= width - 16; x += 16)
vst1q_u8(dst + x, vcgtq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
else if (code == CMP_LE)
for ( ; x <= width - 16; x += 16)
vst1q_u8(dst + x, vcleq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
else if (code == CMP_EQ)
for ( ; x <= width - 16; x += 16)
vst1q_u8(dst + x, vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
else if (code == CMP_NE)
for ( ; x <= width - 16; x += 16)
vst1q_u8(dst + x, veorq_u8(vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)), v_mask));
return x;
}
int code;
uint8x16_t v_mask;
};
template <>
struct Cmp_SIMD<ushort>
{
explicit Cmp_SIMD(int code_) :
code(code_)
{
// CV_Assert(code == CMP_GT || code == CMP_LE ||
// code == CMP_EQ || code == CMP_NE);
v_mask = vdup_n_u8(255);
}
int operator () (const ushort * src1, const ushort * src2, uchar * dst, int width) const
{
int x = 0;
if (code == CMP_GT)
for ( ; x <= width - 8; x += 8)
{
uint16x8_t v_dst = vcgtq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
vst1_u8(dst + x, vmovn_u16(v_dst));
}
else if (code == CMP_LE)
for ( ; x <= width - 8; x += 8)
{
uint16x8_t v_dst = vcleq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
vst1_u8(dst + x, vmovn_u16(v_dst));
}
else if (code == CMP_EQ)
for ( ; x <= width - 8; x += 8)
{
uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
vst1_u8(dst + x, vmovn_u16(v_dst));
}
else if (code == CMP_NE)
for ( ; x <= width - 8; x += 8)
{
uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
vst1_u8(dst + x, veor_u8(vmovn_u16(v_dst), v_mask));
}
return x;
}
int code;
uint8x8_t v_mask;
};
template <>
struct Cmp_SIMD<int>
{
explicit Cmp_SIMD(int code_) :
code(code_)
{
// CV_Assert(code == CMP_GT || code == CMP_LE ||
// code == CMP_EQ || code == CMP_NE);
v_mask = vdup_n_u8(255);
}
int operator () (const int * src1, const int * src2, uchar * dst, int width) const
{
int x = 0;
if (code == CMP_GT)
for ( ; x <= width - 8; x += 8)
{
uint32x4_t v_dst1 = vcgtq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
uint32x4_t v_dst2 = vcgtq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
}
else if (code == CMP_LE)
for ( ; x <= width - 8; x += 8)
{
uint32x4_t v_dst1 = vcleq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
uint32x4_t v_dst2 = vcleq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
}
else if (code == CMP_EQ)
for ( ; x <= width - 8; x += 8)
{
uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
}
else if (code == CMP_NE)
for ( ; x <= width - 8; x += 8)
{
uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)));
vst1_u8(dst + x, veor_u8(v_dst, v_mask));
}
return x;
}
int code;
uint8x8_t v_mask;
};
template <>
struct Cmp_SIMD<float>
{
explicit Cmp_SIMD(int code_) :
code(code_)
{
// CV_Assert(code == CMP_GT || code == CMP_LE ||
// code == CMP_EQ || code == CMP_NE);
v_mask = vdup_n_u8(255);
}
int operator () (const float * src1, const float * src2, uchar * dst, int width) const
{
int x = 0;
if (code == CMP_GT)
for ( ; x <= width - 8; x += 8)
{
uint32x4_t v_dst1 = vcgtq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
uint32x4_t v_dst2 = vcgtq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
}
else if (code == CMP_LE)
for ( ; x <= width - 8; x += 8)
{
uint32x4_t v_dst1 = vcleq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
uint32x4_t v_dst2 = vcleq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
}
else if (code == CMP_EQ)
for ( ; x <= width - 8; x += 8)
{
uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
}
else if (code == CMP_NE)
for ( ; x <= width - 8; x += 8)
{
uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)));
vst1_u8(dst + x, veor_u8(v_dst, v_mask));
}
return x;
}
int code;
uint8x8_t v_mask;
};
#elif CV_SSE2
template <>
struct Cmp_SIMD<schar>
{
explicit Cmp_SIMD(int code_) :
code(code_)
{
// CV_Assert(code == CMP_GT || code == CMP_LE ||
// code == CMP_EQ || code == CMP_NE);
haveSSE = checkHardwareSupport(CV_CPU_SSE2);
v_mask = _mm_set1_epi8(-1);
}
int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
{
int x = 0;
if (!haveSSE)
return x;
if (code == CMP_GT)
for ( ; x <= width - 16; x += 16)
_mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
_mm_loadu_si128((const __m128i *)(src2 + x))));
else if (code == CMP_LE)
for ( ; x <= width - 16; x += 16)
{
__m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
_mm_loadu_si128((const __m128i *)(src2 + x)));
_mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt));
}
else if (code == CMP_EQ)
for ( ; x <= width - 16; x += 16)
_mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
_mm_loadu_si128((const __m128i *)(src2 + x))));
else if (code == CMP_NE)
for ( ; x <= width - 16; x += 16)
{
__m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
_mm_loadu_si128((const __m128i *)(src2 + x)));
_mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq));
}
return x;
}
int code;
__m128i v_mask;
bool haveSSE;
};
template <>
struct Cmp_SIMD<int>
{
explicit Cmp_SIMD(int code_) :
code(code_)
{
// CV_Assert(code == CMP_GT || code == CMP_LE ||
// code == CMP_EQ || code == CMP_NE);
haveSSE = checkHardwareSupport(CV_CPU_SSE2);
v_mask = _mm_set1_epi32(0xffffffff);
}
int operator () (const int * src1, const int * src2, uchar * dst, int width) const
{
int x = 0;
if (!haveSSE)
return x;
if (code == CMP_GT)
for ( ; x <= width - 8; x += 8)
{
__m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
_mm_loadu_si128((const __m128i *)(src2 + x)));
__m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
_mm_loadu_si128((const __m128i *)(src2 + x + 4)));
_mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
}
else if (code == CMP_LE)
for ( ; x <= width - 8; x += 8)
{
__m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
_mm_loadu_si128((const __m128i *)(src2 + x)));
__m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
_mm_loadu_si128((const __m128i *)(src2 + x + 4)));
_mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask));
}
else if (code == CMP_EQ)
for ( ; x <= width - 8; x += 8)
{
__m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
_mm_loadu_si128((const __m128i *)(src2 + x)));
__m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
_mm_loadu_si128((const __m128i *)(src2 + x + 4)));
_mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
}
else if (code == CMP_NE)
for ( ; x <= width - 8; x += 8)
{
__m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
_mm_loadu_si128((const __m128i *)(src2 + x)));
__m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
_mm_loadu_si128((const __m128i *)(src2 + x + 4)));
_mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)));
}
return x;
}
int code;
__m128i v_mask;
bool haveSSE;
};
#endif
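// Mul_SIMD computes dst = saturate_cast<T>(src1 * src2 * scale) for a prefix of the row
// and returns the number of elements written; the remaining tail is left to the scalar code.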
template <typename T, typename WT>
struct Mul_SIMD
{
int operator() (const T *, const T *, T *, int, WT) const
{
return 0;
}
};
#if CV_NEON
template <>
struct Mul_SIMD<uchar, float>
{
int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, float scale) const
{
int x = 0;
if( scale == 1.0f )
for ( ; x <= width - 8; x += 8)
{
uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x));
uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x));
float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
vst1_u8(dst + x, vqmovn_u16(v_dst));
}
else
{
float32x4_t v_scale = vdupq_n_f32(scale);
for ( ; x <= width - 8; x += 8)
{
uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x));
uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x));
float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
v_dst1 = vmulq_f32(v_dst1, v_scale);
float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
v_dst2 = vmulq_f32(v_dst2, v_scale);
uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
vst1_u8(dst + x, vqmovn_u16(v_dst));
}
}
return x;
}
};
template <>
struct Mul_SIMD<schar, float>
{
int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const
{
int x = 0;
if( scale == 1.0f )
for ( ; x <= width - 8; x += 8)
{
int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x));
int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x));
float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
vst1_s8(dst + x, vqmovn_s16(v_dst));
}
else
{
float32x4_t v_scale = vdupq_n_f32(scale);
for ( ; x <= width - 8; x += 8)
{
int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x));
int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x));
float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
v_dst1 = vmulq_f32(v_dst1, v_scale);
float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
v_dst2 = vmulq_f32(v_dst2, v_scale);
int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
vst1_s8(dst + x, vqmovn_s16(v_dst));
}
}
return x;
}
};
template <>
struct Mul_SIMD<ushort, float>
{
int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const
{
int x = 0;
if( scale == 1.0f )
for ( ; x <= width - 8; x += 8)
{
uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
vst1q_u16(dst + x, v_dst);
}
else
{
float32x4_t v_scale = vdupq_n_f32(scale);
for ( ; x <= width - 8; x += 8)
{
uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
v_dst1 = vmulq_f32(v_dst1, v_scale);
float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
v_dst2 = vmulq_f32(v_dst2, v_scale);
uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
vst1q_u16(dst + x, v_dst);
}
}
return x;
}
};
template <>
struct Mul_SIMD<short, float>
{
int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const
{
int x = 0;
if( scale == 1.0f )
for ( ; x <= width - 8; x += 8)
{
int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
vst1q_s16(dst + x, v_dst);
}
else
{
float32x4_t v_scale = vdupq_n_f32(scale);
for ( ; x <= width - 8; x += 8)
{
int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
v_dst1 = vmulq_f32(v_dst1, v_scale);
float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
v_dst2 = vmulq_f32(v_dst2, v_scale);
int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
vst1q_s16(dst + x, v_dst);
}
}
return x;
}
};
template <>
struct Mul_SIMD<float, float>
{
int operator() (const float * src1, const float * src2, float * dst, int width, float scale) const
{
int x = 0;
if( scale == 1.0f )
for ( ; x <= width - 8; x += 8)
{
float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
vst1q_f32(dst + x, v_dst1);
vst1q_f32(dst + x + 4, v_dst2);
}
else
{
float32x4_t v_scale = vdupq_n_f32(scale);
for ( ; x <= width - 8; x += 8)
{
float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
v_dst1 = vmulq_f32(v_dst1, v_scale);
float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
v_dst2 = vmulq_f32(v_dst2, v_scale);
vst1q_f32(dst + x, v_dst1);
vst1q_f32(dst + x + 4, v_dst2);
}
}
return x;
}
};
#elif CV_SSE2
#if CV_SSE4_1
template <>
struct Mul_SIMD<ushort, float>
{
Mul_SIMD()
{
haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
}
int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const
{
int x = 0;
if (!haveSSE)
return x;
__m128i v_zero = _mm_setzero_si128();
if( scale != 1.0f )
{
__m128 v_scale = _mm_set1_ps(scale);
for ( ; x <= width - 8; x += 8)
{
__m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
__m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));
__m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)),
_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)));
v_dst1 = _mm_mul_ps(v_dst1, v_scale);
__m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)),
_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)));
v_dst2 = _mm_mul_ps(v_dst2, v_scale);
__m128i v_dsti = _mm_packus_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
_mm_storeu_si128((__m128i *)(dst + x), v_dsti);
}
}
return x;
}
bool haveSSE;
};
#endif
template <>
struct Mul_SIMD<schar, float>
{
Mul_SIMD()
{
haveSSE = checkHardwareSupport(CV_CPU_SSE2);
}
int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const
{
int x = 0;
if (!haveSSE)
return x;
__m128i v_zero = _mm_setzero_si128();
if( scale == 1.0f )
for ( ; x <= width - 8; x += 8)
{
__m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x));
__m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));
v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
__m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
__m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
__m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
_mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero));
}
else
{
__m128 v_scale = _mm_set1_ps(scale);
for ( ; x <= width - 8; x += 8)
{
__m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x));
__m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));
v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
__m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
v_dst1 = _mm_mul_ps(v_dst1, v_scale);
__m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
v_dst2 = _mm_mul_ps(v_dst2, v_scale);
__m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
_mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero));
}
}
return x;
}
bool haveSSE;
};
template <>
struct Mul_SIMD<short, float>
{
Mul_SIMD()
{
haveSSE = checkHardwareSupport(CV_CPU_SSE2);
}
int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const
{
int x = 0;
if (!haveSSE)
return x;
__m128i v_zero = _mm_setzero_si128();
if( scale != 1.0f )
{
__m128 v_scale = _mm_set1_ps(scale);
for ( ; x <= width - 8; x += 8)
{
__m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
__m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));
__m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
v_dst1 = _mm_mul_ps(v_dst1, v_scale);
__m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
v_dst2 = _mm_mul_ps(v_dst2, v_scale);
__m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
_mm_storeu_si128((__m128i *)(dst + x), v_dsti);
}
}
return x;
}
bool haveSSE;
};
#endif
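// Div_SIMD and Recip_SIMD use the universal intrinsics (CV_SIMD128) to vectorize scaled
// division: dst = saturate_cast<T>(src1 * scale / src2). Elements whose divisor is zero
// produce 0 (see the v_select on the divisor below).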
template <typename T>
struct Div_SIMD
{
int operator() (const T *, const T *, T *, int, double) const
{
return 0;
}
};
template <typename T>
struct Recip_SIMD
{
int operator() (const T *, T *, int, double) const
{
return 0;
}
};
#if CV_SIMD128
template <>
struct Div_SIMD<uchar>
{
bool haveSIMD;
Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
v_float32x4 v_scale = v_setall_f32((float)scale);
v_uint16x8 v_zero = v_setzero_u16();
for ( ; x <= width - 8; x += 8)
{
v_uint16x8 v_src1 = v_load_expand(src1 + x);
v_uint16x8 v_src2 = v_load_expand(src2 + x);
v_uint32x4 t0, t1, t2, t3;
v_expand(v_src1, t0, t1);
v_expand(v_src2, t2, t3);
v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));
f0 = f0 * v_scale / f2;
f1 = f1 * v_scale / f3;
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
v_uint16x8 res = v_pack_u(i0, i1);
res = v_select(v_src2 == v_zero, v_zero, res);
v_pack_store(dst + x, res);
}
return x;
}
};
template <>
struct Div_SIMD<schar>
{
bool haveSIMD;
Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
v_float32x4 v_scale = v_setall_f32((float)scale);
v_int16x8 v_zero = v_setzero_s16();
for ( ; x <= width - 8; x += 8)
{
v_int16x8 v_src1 = v_load_expand(src1 + x);
v_int16x8 v_src2 = v_load_expand(src2 + x);
v_int32x4 t0, t1, t2, t3;
v_expand(v_src1, t0, t1);
v_expand(v_src2, t2, t3);
v_float32x4 f0 = v_cvt_f32(t0);
v_float32x4 f1 = v_cvt_f32(t1);
v_float32x4 f2 = v_cvt_f32(t2);
v_float32x4 f3 = v_cvt_f32(t3);
f0 = f0 * v_scale / f2;
f1 = f1 * v_scale / f3;
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
v_int16x8 res = v_pack(i0, i1);
res = v_select(v_src2 == v_zero, v_zero, res);
v_pack_store(dst + x, res);
}
return x;
}
};
template <>
struct Div_SIMD<ushort>
{
bool haveSIMD;
Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
v_float32x4 v_scale = v_setall_f32((float)scale);
v_uint16x8 v_zero = v_setzero_u16();
for ( ; x <= width - 8; x += 8)
{
v_uint16x8 v_src1 = v_load(src1 + x);
v_uint16x8 v_src2 = v_load(src2 + x);
v_uint32x4 t0, t1, t2, t3;
v_expand(v_src1, t0, t1);
v_expand(v_src2, t2, t3);
v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));
f0 = f0 * v_scale / f2;
f1 = f1 * v_scale / f3;
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
v_uint16x8 res = v_pack_u(i0, i1);
res = v_select(v_src2 == v_zero, v_zero, res);
v_store(dst + x, res);
}
return x;
}
};
template <>
struct Div_SIMD<short>
{
bool haveSIMD;
Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
v_float32x4 v_scale = v_setall_f32((float)scale);
v_int16x8 v_zero = v_setzero_s16();
for ( ; x <= width - 8; x += 8)
{
v_int16x8 v_src1 = v_load(src1 + x);
v_int16x8 v_src2 = v_load(src2 + x);
v_int32x4 t0, t1, t2, t3;
v_expand(v_src1, t0, t1);
v_expand(v_src2, t2, t3);
v_float32x4 f0 = v_cvt_f32(t0);
v_float32x4 f1 = v_cvt_f32(t1);
v_float32x4 f2 = v_cvt_f32(t2);
v_float32x4 f3 = v_cvt_f32(t3);
f0 = f0 * v_scale / f2;
f1 = f1 * v_scale / f3;
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
v_int16x8 res = v_pack(i0, i1);
res = v_select(v_src2 == v_zero, v_zero, res);
v_store(dst + x, res);
}
return x;
}
};
template <>
struct Div_SIMD<int>
{
bool haveSIMD;
Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
v_float32x4 v_scale = v_setall_f32((float)scale);
v_int32x4 v_zero = v_setzero_s32();
for ( ; x <= width - 8; x += 8)
{
v_int32x4 t0 = v_load(src1 + x);
v_int32x4 t1 = v_load(src1 + x + 4);
v_int32x4 t2 = v_load(src2 + x);
v_int32x4 t3 = v_load(src2 + x + 4);
v_float32x4 f0 = v_cvt_f32(t0);
v_float32x4 f1 = v_cvt_f32(t1);
v_float32x4 f2 = v_cvt_f32(t2);
v_float32x4 f3 = v_cvt_f32(t3);
f0 = f0 * v_scale / f2;
f1 = f1 * v_scale / f3;
v_int32x4 res0 = v_round(f0), res1 = v_round(f1);
res0 = v_select(t2 == v_zero, v_zero, res0);
res1 = v_select(t3 == v_zero, v_zero, res1);
v_store(dst + x, res0);
v_store(dst + x + 4, res1);
}
return x;
}
};
template <>
struct Div_SIMD<float>
{
bool haveSIMD;
Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const float * src1, const float * src2, float * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
v_float32x4 v_scale = v_setall_f32((float)scale);
v_float32x4 v_zero = v_setzero_f32();
for ( ; x <= width - 8; x += 8)
{
v_float32x4 f0 = v_load(src1 + x);
v_float32x4 f1 = v_load(src1 + x + 4);
v_float32x4 f2 = v_load(src2 + x);
v_float32x4 f3 = v_load(src2 + x + 4);
v_float32x4 res0 = f0 * v_scale / f2;
v_float32x4 res1 = f1 * v_scale / f3;
res0 = v_select(f2 == v_zero, v_zero, res0);
res1 = v_select(f3 == v_zero, v_zero, res1);
v_store(dst + x, res0);
v_store(dst + x + 4, res1);
}
return x;
}
};
///////////////////////// RECIPROCAL //////////////////////
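// Recip_SIMD is the src1-less variant: dst = saturate_cast<T>(scale / src2), again forcing
// the result to 0 wherever the divisor is 0.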
template <>
struct Recip_SIMD<uchar>
{
bool haveSIMD;
Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const uchar * src2, uchar * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
v_float32x4 v_scale = v_setall_f32((float)scale);
v_uint16x8 v_zero = v_setzero_u16();
for ( ; x <= width - 8; x += 8)
{
v_uint16x8 v_src2 = v_load_expand(src2 + x);
v_uint32x4 t0, t1;
v_expand(v_src2, t0, t1);
v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
f0 = v_scale / f0;
f1 = v_scale / f1;
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
v_uint16x8 res = v_pack_u(i0, i1);
res = v_select(v_src2 == v_zero, v_zero, res);
v_pack_store(dst + x, res);
}
return x;
}
};
template <>
struct Recip_SIMD<schar>
{
bool haveSIMD;
Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const schar * src2, schar * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
v_float32x4 v_scale = v_setall_f32((float)scale);
v_int16x8 v_zero = v_setzero_s16();
for ( ; x <= width - 8; x += 8)
{
v_int16x8 v_src2 = v_load_expand(src2 + x);
v_int32x4 t0, t1;
v_expand(v_src2, t0, t1);
v_float32x4 f0 = v_cvt_f32(t0);
v_float32x4 f1 = v_cvt_f32(t1);
f0 = v_scale / f0;
f1 = v_scale / f1;
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
v_int16x8 res = v_pack(i0, i1);
res = v_select(v_src2 == v_zero, v_zero, res);
v_pack_store(dst + x, res);
}
return x;
}
};
template <>
struct Recip_SIMD<ushort>
{
bool haveSIMD;
Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const ushort * src2, ushort * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
v_float32x4 v_scale = v_setall_f32((float)scale);
v_uint16x8 v_zero = v_setzero_u16();
for ( ; x <= width - 8; x += 8)
{
v_uint16x8 v_src2 = v_load(src2 + x);
v_uint32x4 t0, t1;
v_expand(v_src2, t0, t1);
v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
f0 = v_scale / f0;
f1 = v_scale / f1;
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
v_uint16x8 res = v_pack_u(i0, i1);
res = v_select(v_src2 == v_zero, v_zero, res);
v_store(dst + x, res);
}
return x;
}
};
template <>
struct Recip_SIMD<short>
{
bool haveSIMD;
Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const short * src2, short * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
v_float32x4 v_scale = v_setall_f32((float)scale);
v_int16x8 v_zero = v_setzero_s16();
for ( ; x <= width - 8; x += 8)
{
v_int16x8 v_src2 = v_load(src2 + x);
v_int32x4 t0, t1;
v_expand(v_src2, t0, t1);
v_float32x4 f0 = v_cvt_f32(t0);
v_float32x4 f1 = v_cvt_f32(t1);
f0 = v_scale / f0;
f1 = v_scale / f1;
v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
v_int16x8 res = v_pack(i0, i1);
res = v_select(v_src2 == v_zero, v_zero, res);
v_store(dst + x, res);
}
return x;
}
};
template <>
struct Recip_SIMD<int>
{
bool haveSIMD;
Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const int * src2, int * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
v_float32x4 v_scale = v_setall_f32((float)scale);
v_int32x4 v_zero = v_setzero_s32();
for ( ; x <= width - 8; x += 8)
{
v_int32x4 t0 = v_load(src2 + x);
v_int32x4 t1 = v_load(src2 + x + 4);
v_float32x4 f0 = v_cvt_f32(t0);
v_float32x4 f1 = v_cvt_f32(t1);
f0 = v_scale / f0;
f1 = v_scale / f1;
v_int32x4 res0 = v_round(f0), res1 = v_round(f1);
res0 = v_select(t0 == v_zero, v_zero, res0);
res1 = v_select(t1 == v_zero, v_zero, res1);
v_store(dst + x, res0);
v_store(dst + x + 4, res1);
}
return x;
}
};
template <>
struct Recip_SIMD<float>
{
bool haveSIMD;
Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const float * src2, float * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
v_float32x4 v_scale = v_setall_f32((float)scale);
v_float32x4 v_zero = v_setzero_f32();
for ( ; x <= width - 8; x += 8)
{
v_float32x4 f0 = v_load(src2 + x);
v_float32x4 f1 = v_load(src2 + x + 4);
v_float32x4 res0 = v_scale / f0;
v_float32x4 res1 = v_scale / f1;
res0 = v_select(f0 == v_zero, v_zero, res0);
res1 = v_select(f1 == v_zero, v_zero, res1);
v_store(dst + x, res0);
v_store(dst + x + 4, res1);
}
return x;
}
};
#if CV_SIMD128_64F
template <>
struct Div_SIMD<double>
{
bool haveSIMD;
Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
v_float64x2 v_scale = v_setall_f64(scale);
v_float64x2 v_zero = v_setzero_f64();
for ( ; x <= width - 4; x += 4)
{
v_float64x2 f0 = v_load(src1 + x);
v_float64x2 f1 = v_load(src1 + x + 2);
v_float64x2 f2 = v_load(src2 + x);
v_float64x2 f3 = v_load(src2 + x + 2);
v_float64x2 res0 = f0 * v_scale / f2;
v_float64x2 res1 = f1 * v_scale / f3;
            res0 = v_select(f2 == v_zero, v_zero, res0);  // zero divisor -> zero result,
            res1 = v_select(f3 == v_zero, v_zero, res1);  // matching the other Div_SIMD specializations
v_store(dst + x, res0);
v_store(dst + x + 2, res1);
}
return x;
}
};
template <>
struct Recip_SIMD<double>
{
bool haveSIMD;
Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
int operator() (const double * src2, double * dst, int width, double scale) const
{
int x = 0;
if (!haveSIMD)
return x;
v_float64x2 v_scale = v_setall_f64(scale);
v_float64x2 v_zero = v_setzero_f64();
for ( ; x <= width - 4; x += 4)
{
v_float64x2 f0 = v_load(src2 + x);
v_float64x2 f1 = v_load(src2 + x + 2);
v_float64x2 res0 = v_scale / f0;
v_float64x2 res1 = v_scale / f1;
res0 = v_select(f0 == v_zero, v_zero, res0);
res1 = v_select(f1 == v_zero, v_zero, res1);
v_store(dst + x, res0);
v_store(dst + x + 2, res1);
}
return x;
}
};
#endif
#endif
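// AddWeighted_SIMD vectorizes dst = saturate_cast<T>(src1*alpha + src2*beta + gamma).
// As with the other kernels, the generic template does nothing and each specialization
// returns the count of elements it processed.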
template <typename T, typename WT>
struct AddWeighted_SIMD
{
int operator() (const T *, const T *, T *, int, WT, WT, WT) const
{
return 0;
}
};
#if CV_SSE2
template <>
struct AddWeighted_SIMD<schar, float>
{
AddWeighted_SIMD()
{
haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
}
int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
{
int x = 0;
if (!haveSSE2)
return x;
__m128i v_zero = _mm_setzero_si128();
__m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
v_gamma = _mm_set1_ps(gamma);
for( ; x <= width - 8; x += 8 )
{
__m128i v_src1 = _mm_loadl_epi64((const __m128i *)(src1 + x));
__m128i v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));
__m128i v_src1_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
__m128i v_src2_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
__m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1_p), 16)), v_alpha);
v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2_p), 16)), v_beta));
__m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1_p), 16)), v_alpha);
v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2_p), 16)), v_beta));
__m128i v_dst16 = _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
_mm_cvtps_epi32(v_dstf1));
_mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst16, v_zero));
}
return x;
}
bool haveSSE2;
};
template <>
struct AddWeighted_SIMD<short, float>
{
AddWeighted_SIMD()
{
haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
}
int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
{
int x = 0;
if (!haveSSE2)
return x;
__m128i v_zero = _mm_setzero_si128();
__m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
v_gamma = _mm_set1_ps(gamma);
for( ; x <= width - 8; x += 8 )
{
__m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
__m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
__m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), v_alpha);
v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)), v_beta));
__m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), v_alpha);
v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)), v_beta));
_mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
_mm_cvtps_epi32(v_dstf1)));
}
return x;
}
bool haveSSE2;
};
#if CV_SSE4_1
template <>
struct AddWeighted_SIMD<ushort, float>
{
AddWeighted_SIMD()
{
haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
}
int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
{
int x = 0;
if (!haveSSE4_1)
return x;
__m128i v_zero = _mm_setzero_si128();
__m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
v_gamma = _mm_set1_ps(gamma);
for( ; x <= width - 8; x += 8 )
{
__m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
__m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
__m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), v_alpha);
v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)), v_beta));
__m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), v_alpha);
v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)), v_beta));
_mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(_mm_cvtps_epi32(v_dstf0),
_mm_cvtps_epi32(v_dstf1)));
}
return x;
}
bool haveSSE4_1;
};
#endif
#elif CV_NEON
template <>
struct AddWeighted_SIMD<schar, float>
{
int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
{
int x = 0;
float32x4_t g = vdupq_n_f32 (gamma);
for( ; x <= width - 8; x += 8 )
{
int8x8_t in1 = vld1_s8(src1 + x);
int16x8_t in1_16 = vmovl_s8(in1);
float32x4_t in1_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in1_16)));
float32x4_t in1_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in1_16)));
int8x8_t in2 = vld1_s8(src2+x);
int16x8_t in2_16 = vmovl_s8(in2);
float32x4_t in2_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in2_16)));
float32x4_t in2_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in2_16)));
float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta));
float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta));
out_f_l = vaddq_f32(out_f_l, g);
out_f_h = vaddq_f32(out_f_h, g);
int16x4_t out_16_l = vqmovn_s32(cv_vrndq_s32_f32(out_f_l));
int16x4_t out_16_h = vqmovn_s32(cv_vrndq_s32_f32(out_f_h));
int16x8_t out_16 = vcombine_s16(out_16_l, out_16_h);
int8x8_t out = vqmovn_s16(out_16);
vst1_s8(dst + x, out);
}
return x;
}
};
template <>
struct AddWeighted_SIMD<ushort, float>
{
int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
{
int x = 0;
float32x4_t g = vdupq_n_f32(gamma);
for( ; x <= width - 8; x += 8 )
{
uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), alpha);
float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))), beta);
uint16x4_t v_dst1 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), alpha);
v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))), beta);
uint16x4_t v_dst2 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2));
}
return x;
}
};
template <>
struct AddWeighted_SIMD<short, float>
{
int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
{
int x = 0;
float32x4_t g = vdupq_n_f32(gamma);
for( ; x <= width - 8; x += 8 )
{
int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), alpha);
float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))), beta);
int16x4_t v_dst1 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), alpha);
v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))), beta);
int16x4_t v_dst2 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
vst1q_s16(dst + x, vcombine_s16(v_dst1, v_dst2));
}
return x;
}
};
#endif
}}
#endif // __OPENCV_HAL_ARITHM_SIMD_HPP__
#include "precomp.hpp"
#if defined WIN32 || defined _WIN32 || defined WINCE
#include <windows.h>
#if defined _MSC_VER
#if _MSC_VER >= 1400
#include <intrin.h>
#elif defined _M_IX86
static void __cpuid(int* cpuid_data, int)
{
__asm
{
push ebx
push edi
mov edi, cpuid_data
mov eax, 1
cpuid
mov [edi], eax
mov [edi + 4], ebx
mov [edi + 8], ecx
mov [edi + 12], edx
pop edi
pop ebx
}
}
static void __cpuidex(int* cpuid_data, int, int)
{
__asm
{
push edi
mov edi, cpuid_data
mov eax, 7
mov ecx, 0
cpuid
mov [edi], eax
mov [edi + 4], ebx
mov [edi + 8], ecx
mov [edi + 12], edx
pop edi
}
}
#endif
#endif
#endif
#if defined ANDROID || defined __linux__
# include <unistd.h>
# include <fcntl.h>
# include <elf.h>
# include <linux/auxvec.h>
#endif
#if defined __linux__ || defined __APPLE__ || defined __EMSCRIPTEN__
#include <unistd.h>
#include <stdio.h>
#include <sys/types.h>
#if defined ANDROID
#include <sys/sysconf.h>
#endif
#endif
#ifdef ANDROID
# include <android/log.h>
#endif
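// HWFeatures detects CPU capabilities once at startup: on x86 it reads CPUID leaf 1
// (MMX/SSE*/AVX bits in EDX/ECX) and leaf 7, subleaf 0 (AVX2/AVX-512 bits in EBX/ECX);
// on Linux/Android ARM it parses AT_HWCAP from /proc/self/auxv to detect NEON, while
// AArch64 and Apple ARM builds assume NEON is present.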
struct HWFeatures
{
enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE };
HWFeatures(void)
{
memset( have, 0, sizeof(have) );
x86_family = 0;
}
static HWFeatures initialize(void)
{
HWFeatures f;
int cpuid_data[4] = { 0, 0, 0, 0 };
#if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
__cpuid(cpuid_data, 1);
#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
#ifdef __x86_64__
asm __volatile__
(
"movl $1, %%eax\n\t"
"cpuid\n\t"
:[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
:
: "cc"
);
#else
asm volatile
(
"pushl %%ebx\n\t"
"movl $1,%%eax\n\t"
"cpuid\n\t"
"popl %%ebx\n\t"
: "=a"(cpuid_data[0]), "=c"(cpuid_data[2]), "=d"(cpuid_data[3])
:
: "cc"
);
#endif
#endif
f.x86_family = (cpuid_data[0] >> 8) & 15;
if( f.x86_family >= 6 )
{
f.have[CV_CPU_MMX] = (cpuid_data[3] & (1 << 23)) != 0;
f.have[CV_CPU_SSE] = (cpuid_data[3] & (1<<25)) != 0;
f.have[CV_CPU_SSE2] = (cpuid_data[3] & (1<<26)) != 0;
f.have[CV_CPU_SSE3] = (cpuid_data[2] & (1<<0)) != 0;
f.have[CV_CPU_SSSE3] = (cpuid_data[2] & (1<<9)) != 0;
f.have[CV_CPU_FMA3] = (cpuid_data[2] & (1<<12)) != 0;
f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0;
f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0;
f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0;
f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX
// make the second call to the cpuid command in order to get
// information about extended features like AVX2
#if defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
__cpuidex(cpuid_data, 7, 0);
#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
#ifdef __x86_64__
asm __volatile__
(
"movl $7, %%eax\n\t"
"movl $0, %%ecx\n\t"
"cpuid\n\t"
:[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3])
:
: "cc"
);
#else
asm volatile
(
"pushl %%ebx\n\t"
"movl $7,%%eax\n\t"
"movl $0,%%ecx\n\t"
"cpuid\n\t"
"movl %%ebx, %0\n\t"
"popl %%ebx\n\t"
: "=r"(cpuid_data[1]), "=c"(cpuid_data[2])
:
: "cc"
);
#endif
#endif
f.have[CV_CPU_AVX2] = (cpuid_data[1] & (1<<5)) != 0;
f.have[CV_CPU_AVX_512F] = (cpuid_data[1] & (1<<16)) != 0;
f.have[CV_CPU_AVX_512DQ] = (cpuid_data[1] & (1<<17)) != 0;
f.have[CV_CPU_AVX_512IFMA512] = (cpuid_data[1] & (1<<21)) != 0;
f.have[CV_CPU_AVX_512PF] = (cpuid_data[1] & (1<<26)) != 0;
f.have[CV_CPU_AVX_512ER] = (cpuid_data[1] & (1<<27)) != 0;
f.have[CV_CPU_AVX_512CD] = (cpuid_data[1] & (1<<28)) != 0;
f.have[CV_CPU_AVX_512BW] = (cpuid_data[1] & (1<<30)) != 0;
f.have[CV_CPU_AVX_512VL] = (cpuid_data[1] & (1<<31)) != 0;
f.have[CV_CPU_AVX_512VBMI] = (cpuid_data[2] & (1<<1)) != 0;
}
#if defined ANDROID || defined __linux__
#ifdef __aarch64__
f.have[CV_CPU_NEON] = true;
#else
int cpufile = open("/proc/self/auxv", O_RDONLY);
if (cpufile >= 0)
{
Elf32_auxv_t auxv;
const size_t size_auxv_t = sizeof(auxv);
while ((size_t)read(cpufile, &auxv, size_auxv_t) == size_auxv_t)
{
if (auxv.a_type == AT_HWCAP)
{
f.have[CV_CPU_NEON] = (auxv.a_un.a_val & 4096) != 0;
break;
}
}
close(cpufile);
}
#endif
#elif (defined __clang__ || defined __APPLE__) && (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__))
f.have[CV_CPU_NEON] = true;
#endif
return f;
}
int x86_family;
bool have[MAX_FEATURE+1];
};
static HWFeatures featuresEnabled = HWFeatures::initialize(), featuresDisabled = HWFeatures();
static HWFeatures* currentFeatures = &featuresEnabled;
volatile bool useOptimizedFlag = true;
namespace cv { namespace hal {
bool checkHardwareSupport(int feature)
{
// CV_DbgAssert( 0 <= feature && feature <= CV_HARDWARE_MAX_FEATURE );
return currentFeatures->have[feature];
}
void setUseOptimized( bool flag )
{
useOptimizedFlag = flag;
currentFeatures = flag ? &featuresEnabled : &featuresDisabled;
}
bool useOptimized(void)
{
return useOptimizedFlag;
}
}}
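// Typical dispatch pattern for the functions above (illustrative sketch only): a kernel
// queries the cached feature table before taking a vectorized path, and setUseOptimized(false)
// swaps in the empty table so every such check fails, e.g.
//
//     if (cv::hal::checkHardwareSupport(CV_CPU_SSE2))
//     {
//         // ... SSE2 path ...
//     }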
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
namespace cv { namespace hal {
#if CV_NEON
template<typename T> struct VMerge2;
template<typename T> struct VMerge3;
template<typename T> struct VMerge4;
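// VMerge2/3/4 interleave 2, 3 or 4 single-channel planes into one packed row. On NEON this
// is a plain vld1q load of each plane followed by a vst2q/vst3q/vst4q interleaving store;
// the SSE2 variants below load two registers per plane and shuffle them with the
// _mm_interleave helper passed into the macro, guarded by a runtime checkHardwareSupport() flag.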
#define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
void operator()(const data_type* src0, const data_type* src1, \
data_type* dst){ \
reg_type r; \
r.val[0] = load_func(src0); \
r.val[1] = load_func(src1); \
store_func(dst, r); \
} \
}
#define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
void operator()(const data_type* src0, const data_type* src1, \
const data_type* src2, data_type* dst){ \
reg_type r; \
r.val[0] = load_func(src0); \
r.val[1] = load_func(src1); \
r.val[2] = load_func(src2); \
store_func(dst, r); \
} \
}
#define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type>{ \
void operator()(const data_type* src0, const data_type* src1, \
const data_type* src2, const data_type* src3, \
data_type* dst){ \
reg_type r; \
r.val[0] = load_func(src0); \
r.val[1] = load_func(src1); \
r.val[2] = load_func(src2); \
r.val[3] = load_func(src3); \
store_func(dst, r); \
} \
}
MERGE2_KERNEL_TEMPLATE(VMerge2, uchar , uint8x16x2_t, vld1q_u8 , vst2q_u8 );
MERGE2_KERNEL_TEMPLATE(VMerge2, ushort, uint16x8x2_t, vld1q_u16, vst2q_u16);
MERGE2_KERNEL_TEMPLATE(VMerge2, int , int32x4x2_t, vld1q_s32, vst2q_s32);
MERGE2_KERNEL_TEMPLATE(VMerge2, int64 , int64x1x2_t, vld1_s64 , vst2_s64 );
MERGE3_KERNEL_TEMPLATE(VMerge3, uchar , uint8x16x3_t, vld1q_u8 , vst3q_u8 );
MERGE3_KERNEL_TEMPLATE(VMerge3, ushort, uint16x8x3_t, vld1q_u16, vst3q_u16);
MERGE3_KERNEL_TEMPLATE(VMerge3, int , int32x4x3_t, vld1q_s32, vst3q_s32);
MERGE3_KERNEL_TEMPLATE(VMerge3, int64 , int64x1x3_t, vld1_s64 , vst3_s64 );
MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 );
MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16);
MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, vst4q_s32);
MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 );
#elif CV_SSE2
template <typename T>
struct VMerge2
{
VMerge2() : support(false) { }
void operator()(const T *, const T *, T *) const { }
bool support;
};
template <typename T>
struct VMerge3
{
VMerge3() : support(false) { }
void operator()(const T *, const T *, const T *, T *) const { }
bool support;
};
template <typename T>
struct VMerge4
{
VMerge4() : support(false) { }
void operator()(const T *, const T *, const T *, const T *, T *) const { }
bool support;
};
#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <> \
struct VMerge2<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VMerge2() \
{ \
support = checkHardwareSupport(se); \
} \
\
void operator()(const data_type * src0, const data_type * src1, \
data_type * dst) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
\
_mm_interleave(v_src0, v_src1, v_src2, v_src3); \
\
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
} \
\
bool support; \
}
#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <> \
struct VMerge3<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VMerge3() \
{ \
support = checkHardwareSupport(se); \
} \
\
void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\
data_type * dst) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \
reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \
\
_mm_interleave(v_src0, v_src1, v_src2, \
v_src3, v_src4, v_src5); \
\
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \
} \
\
bool support; \
}
#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
template <> \
struct VMerge4<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VMerge4() \
{ \
support = checkHardwareSupport(se); \
} \
\
void operator()(const data_type * src0, const data_type * src1, \
const data_type * src2, const data_type * src3, \
data_type * dst) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \
reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \
reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3)); \
reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC)); \
\
_mm_interleave(v_src0, v_src1, v_src2, v_src3, \
v_src4, v_src5, v_src6, v_src7); \
\
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6); \
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7); \
} \
\
bool support; \
}
MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
#if CV_SSE4_1
MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
#endif
MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2);
#endif
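// merge_ below handles the first (cn % 4 ? cn % 4 : 4) channels with an optional
// vectorized pass (NEON interleaved stores or the SSE2 interleave helpers, when
// supported) followed by a scalar tail, then interleaves any remaining channels
// in groups of four in the final loop.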
template<typename T> static void
merge_( const T** src, T* dst, int len, int cn )
{
int k = cn % 4 ? cn % 4 : 4;
int i, j;
if( k == 1 )
{
const T* src0 = src[0];
for( i = j = 0; i < len; i++, j += cn )
dst[j] = src0[i];
}
else if( k == 2 )
{
const T *src0 = src[0], *src1 = src[1];
i = j = 0;
#if CV_NEON
if(cn == 2)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 2 * inc_i;
VMerge2<T> vmerge;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, dst + j);
}
#elif CV_SSE2
if(cn == 2)
{
int inc_i = 32/sizeof(T);
int inc_j = 2 * inc_i;
VMerge2<T> vmerge;
if (vmerge.support)
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, dst + j);
}
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i];
dst[j+1] = src1[i];
}
}
else if( k == 3 )
{
const T *src0 = src[0], *src1 = src[1], *src2 = src[2];
i = j = 0;
#if CV_NEON
if(cn == 3)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 3 * inc_i;
VMerge3<T> vmerge;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, dst + j);
}
#elif CV_SSE2
if(cn == 3)
{
int inc_i = 32/sizeof(T);
int inc_j = 3 * inc_i;
VMerge3<T> vmerge;
if (vmerge.support)
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, dst + j);
}
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i];
dst[j+1] = src1[i];
dst[j+2] = src2[i];
}
}
else
{
const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
i = j = 0;
#if CV_NEON
if(cn == 4)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 4 * inc_i;
VMerge4<T> vmerge;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
}
#elif CV_SSE2
if(cn == 4)
{
int inc_i = 32/sizeof(T);
int inc_j = 4 * inc_i;
VMerge4<T> vmerge;
if (vmerge.support)
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
}
#endif
for( ; i < len; i++, j += cn )
{
dst[j] = src0[i]; dst[j+1] = src1[i];
dst[j+2] = src2[i]; dst[j+3] = src3[i];
}
}
for( ; k < cn; k += 4 )
{
const T *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
for( i = 0, j = k; i < len; i++, j += cn )
{
dst[j] = src0[i]; dst[j+1] = src1[i];
dst[j+2] = src2[i]; dst[j+3] = src3[i];
}
}
}
void merge8u(const uchar** src, uchar* dst, int len, int cn )
{
merge_(src, dst, len, cn);
}
void merge16u(const ushort** src, ushort* dst, int len, int cn )
{
merge_(src, dst, len, cn);
}
void merge32s(const int** src, int* dst, int len, int cn )
{
merge_(src, dst, len, cn);
}
void merge64s(const int64** src, int64* dst, int len, int cn )
{
merge_(src, dst, len, cn);
}
}}
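A minimal caller sketch for the merge entry points above (not part of the diff; it assumes only the opencv2/hal.hpp header used elsewhere in this commit and the cv::hal::merge8u signature shown above):
#include "opencv2/hal.hpp"

// Interleave three 8-bit planes into a packed BGR buffer of len*3 bytes.
static void interleaveBGR(const uchar* b, const uchar* g, const uchar* r,
                          uchar* bgr, int len)
{
    const uchar* planes[3] = { b, g, r };
    cv::hal::merge8u(planes, bgr, len, 3);
}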
......@@ -47,3 +47,13 @@
#include <cstdlib>
#include <limits>
#include <float.h>
#include <cstring>
#include <cassert>
#include "opencv2/hal/sse_utils.hpp"
#if defined HAVE_IPP && (IPP_VERSION_X100 >= 700)
#define ARITHM_USE_IPP 1
#else
#define ARITHM_USE_IPP 0
#endif
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_HAL_REPLACEMENT_HPP__
#define __OPENCV_HAL_REPLACEMENT_HPP__
#include "opencv2/hal.hpp"
inline int hal_t_add8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_add8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_add16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_add16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_add32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_add32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_add64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_sub8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_sub8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_sub16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_sub16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_sub32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_sub32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_sub64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_max8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_max8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_max16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_max16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_max32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_max32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_max64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_min8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_min8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_min16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_min16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_min32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_min32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_min64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_absdiff8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_absdiff8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_absdiff16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_absdiff16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_absdiff32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_absdiff32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_absdiff64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_and8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_or8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_xor8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_not8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; }
#define hal_add8u hal_t_add8u
#define hal_add8s hal_t_add8s
#define hal_add16u hal_t_add16u
#define hal_add16s hal_t_add16s
#define hal_add32s hal_t_add32s
#define hal_add32f hal_t_add32f
#define hal_add64f hal_t_add64f
#define hal_sub8u hal_t_sub8u
#define hal_sub8s hal_t_sub8s
#define hal_sub16u hal_t_sub16u
#define hal_sub16s hal_t_sub16s
#define hal_sub32s hal_t_sub32s
#define hal_sub32f hal_t_sub32f
#define hal_sub64f hal_t_sub64f
#define hal_max8u hal_t_max8u
#define hal_max8s hal_t_max8s
#define hal_max16u hal_t_max16u
#define hal_max16s hal_t_max16s
#define hal_max32s hal_t_max32s
#define hal_max32f hal_t_max32f
#define hal_max64f hal_t_max64f
#define hal_min8u hal_t_min8u
#define hal_min8s hal_t_min8s
#define hal_min16u hal_t_min16u
#define hal_min16s hal_t_min16s
#define hal_min32s hal_t_min32s
#define hal_min32f hal_t_min32f
#define hal_min64f hal_t_min64f
#define hal_absdiff8u hal_t_absdiff8u
#define hal_absdiff8s hal_t_absdiff8s
#define hal_absdiff16u hal_t_absdiff16u
#define hal_absdiff16s hal_t_absdiff16s
#define hal_absdiff32s hal_t_absdiff32s
#define hal_absdiff32f hal_t_absdiff32f
#define hal_absdiff64f hal_t_absdiff64f
#define hal_and8u hal_t_and8u
#define hal_or8u hal_t_or8u
#define hal_xor8u hal_t_xor8u
#define hal_not8u hal_t_not8u
inline int hal_t_cmp8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_cmp8s(const schar*, size_t, const schar*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_cmp16u(const ushort*, size_t, const ushort*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_cmp16s(const short*, size_t, const short*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_cmp32s(const int*, size_t, const int*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_cmp32f(const float*, size_t, const float*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; }
inline int hal_t_cmp64f(const double*, size_t, const double*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; }
#define hal_cmp8u hal_t_cmp8u
#define hal_cmp8s hal_t_cmp8s
#define hal_cmp16u hal_t_cmp16u
#define hal_cmp16s hal_t_cmp16s
#define hal_cmp32s hal_t_cmp32s
#define hal_cmp32f hal_t_cmp32f
#define hal_cmp64f hal_t_cmp64f
inline int hal_t_mul8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
inline int hal_t_mul8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
inline int hal_t_mul16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
inline int hal_t_mul16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
inline int hal_t_mul32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
inline int hal_t_mul32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
inline int hal_t_mul64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
inline int hal_t_div8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
inline int hal_t_div8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
inline int hal_t_div16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
inline int hal_t_div16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
inline int hal_t_div32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
inline int hal_t_div32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
inline int hal_t_div64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
inline int hal_t_recip8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
inline int hal_t_recip8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
inline int hal_t_recip16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
inline int hal_t_recip16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
inline int hal_t_recip32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
inline int hal_t_recip32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
inline int hal_t_recip64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; }
#define hal_mul8u hal_t_mul8u
#define hal_mul8s hal_t_mul8s
#define hal_mul16u hal_t_mul16u
#define hal_mul16s hal_t_mul16s
#define hal_mul32s hal_t_mul32s
#define hal_mul32f hal_t_mul32f
#define hal_mul64f hal_t_mul64f
#define hal_div8u hal_t_div8u
#define hal_div8s hal_t_div8s
#define hal_div16u hal_t_div16u
#define hal_div16s hal_t_div16s
#define hal_div32s hal_t_div32s
#define hal_div32f hal_t_div32f
#define hal_div64f hal_t_div64f
#define hal_recip8u hal_t_recip8u
#define hal_recip8s hal_t_recip8s
#define hal_recip16u hal_t_recip16u
#define hal_recip16s hal_t_recip16s
#define hal_recip32s hal_t_recip32s
#define hal_recip32f hal_t_recip32f
#define hal_recip64f hal_t_recip64f
inline int hal_t_addWeighted8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; }
inline int hal_t_addWeighted8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; }
inline int hal_t_addWeighted16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; }
inline int hal_t_addWeighted16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; }
inline int hal_t_addWeighted32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; }
inline int hal_t_addWeighted32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; }
inline int hal_t_addWeighted64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; }
#define hal_addWeighted8u hal_t_addWeighted8u
#define hal_addWeighted8s hal_t_addWeighted8s
#define hal_addWeighted16u hal_t_addWeighted16u
#define hal_addWeighted16s hal_t_addWeighted16s
#define hal_addWeighted32s hal_t_addWeighted32s
#define hal_addWeighted32f hal_t_addWeighted32f
#define hal_addWeighted64f hal_t_addWeighted64f
#include "custom_hal.hpp"
#endif
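The header above supplies do-nothing defaults and then pulls in custom_hal.hpp, which is generated from OPENCV_HAL_HEADERS. A hypothetical replacement header could re-point one of the hal_* macros at a vendor routine and leave the rest on the defaults; the my_-prefixed names below are illustrative only, and the success return value of 0 is an assumption:
#include "opencv2/hal.hpp"

// Assumed vendor-provided kernel; returns 0 on success (not part of this commit).
int my_vendor_add_u8(const uchar* a, size_t astep, const uchar* b, size_t bstep,
                     uchar* c, size_t cstep, int width, int height);

inline int my_hal_add8u(const uchar* a, size_t astep, const uchar* b, size_t bstep,
                        uchar* c, size_t cstep, int width, int height)
{
    if (my_vendor_add_u8(a, astep, b, bstep, c, cstep, width, height) != 0)
        return cv::hal::Error::NotImplemented;   // let OpenCV fall back to its own code
    return 0;                                    // assumed success code
}

#undef hal_add8u
#define hal_add8u my_hal_add8u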
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
namespace cv { namespace hal {
#if CV_NEON
template<typename T> struct VSplit2;
template<typename T> struct VSplit3;
template<typename T> struct VSplit4;
#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type> \
{ \
void operator()(const data_type* src, data_type* dst0, \
data_type* dst1) const \
{ \
reg_type r = load_func(src); \
store_func(dst0, r.val[0]); \
store_func(dst1, r.val[1]); \
} \
}
#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type> \
{ \
void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
data_type* dst2) const \
{ \
reg_type r = load_func(src); \
store_func(dst0, r.val[0]); \
store_func(dst1, r.val[1]); \
store_func(dst2, r.val[2]); \
} \
}
#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \
template<> \
struct name<data_type> \
{ \
void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
data_type* dst2, data_type* dst3) const \
{ \
reg_type r = load_func(src); \
store_func(dst0, r.val[0]); \
store_func(dst1, r.val[1]); \
store_func(dst2, r.val[2]); \
store_func(dst3, r.val[3]); \
} \
}
SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar , uint8x16x2_t, vld2q_u8 , vst1q_u8 );
SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort, uint16x8x2_t, vld2q_u16, vst1q_u16);
SPLIT2_KERNEL_TEMPLATE(VSplit2, int , int32x4x2_t, vld2q_s32, vst1q_s32);
SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 , int64x1x2_t, vld2_s64 , vst1_s64 );
SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar , uint8x16x3_t, vld3q_u8 , vst1q_u8 );
SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort, uint16x8x3_t, vld3q_u16, vst1q_u16);
SPLIT3_KERNEL_TEMPLATE(VSplit3, int , int32x4x3_t, vld3q_s32, vst1q_s32);
SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 , int64x1x3_t, vld3_s64 , vst1_s64 );
SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar , uint8x16x4_t, vld4q_u8 , vst1q_u8 );
SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort, uint16x8x4_t, vld4q_u16, vst1q_u16);
SPLIT4_KERNEL_TEMPLATE(VSplit4, int , int32x4x4_t, vld4q_s32, vst1q_s32);
SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 , int64x1x4_t, vld4_s64 , vst1_s64 );
#elif CV_SSE2
template <typename T>
struct VSplit2
{
VSplit2() : support(false) { }
void operator()(const T *, T *, T *) const { }
bool support;
};
template <typename T>
struct VSplit3
{
VSplit3() : support(false) { }
void operator()(const T *, T *, T *, T *) const { }
bool support;
};
template <typename T>
struct VSplit4
{
VSplit4() : support(false) { }
void operator()(const T *, T *, T *, T *, T *) const { }
bool support;
};
#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
template <> \
struct VSplit2<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VSplit2() \
{ \
support = checkHardwareSupport(CV_CPU_SSE2); \
} \
\
void operator()(const data_type * src, \
data_type * dst0, data_type * dst1) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
\
_mm_deinterleave(v_src0, v_src1, v_src2, v_src3); \
\
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
} \
\
bool support; \
}
#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
template <> \
struct VSplit3<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VSplit3() \
{ \
support = checkHardwareSupport(CV_CPU_SSE2); \
} \
\
void operator()(const data_type * src, \
data_type * dst0, data_type * dst1, data_type * dst2) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
\
_mm_deinterleave(v_src0, v_src1, v_src2, \
v_src3, v_src4, v_src5); \
\
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
} \
\
bool support; \
}
#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \
template <> \
struct VSplit4<data_type> \
{ \
enum \
{ \
ELEMS_IN_VEC = 16 / sizeof(data_type) \
}; \
\
VSplit4() \
{ \
support = checkHardwareSupport(CV_CPU_SSE2); \
} \
\
void operator()(const data_type * src, data_type * dst0, data_type * dst1, \
data_type * dst2, data_type * dst3) const \
{ \
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \
reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \
\
_mm_deinterleave(v_src0, v_src1, v_src2, v_src3, \
v_src4, v_src5, v_src6, v_src7); \
\
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
_mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
_mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
_mm_storeu_##flavor((cast_type *)(dst3), v_src6); \
_mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7); \
} \
\
bool support; \
}
SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT3_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
SPLIT4_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps);
#endif
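// split_ below mirrors merge_: the first (cn % 4 ? cn % 4 : 4) channels get an
// optional vectorized de-interleaving pass (NEON/SSE2) plus a scalar tail, the
// single-channel case degenerates to memcpy, and any remaining channels are
// extracted in groups of four by the final loop.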
template<typename T> static void
split_( const T* src, T** dst, int len, int cn )
{
int k = cn % 4 ? cn % 4 : 4;
int i, j;
if( k == 1 )
{
T* dst0 = dst[0];
if(cn == 1)
{
memcpy(dst0, src, len * sizeof(T));
}
else
{
for( i = 0, j = 0 ; i < len; i++, j += cn )
dst0[i] = src[j];
}
}
else if( k == 2 )
{
T *dst0 = dst[0], *dst1 = dst[1];
i = j = 0;
#if CV_NEON
if(cn == 2)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 2 * inc_i;
VSplit2<T> vsplit;
for( ; i < len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i);
}
#elif CV_SSE2
if (cn == 2)
{
int inc_i = 32/sizeof(T);
int inc_j = 2 * inc_i;
VSplit2<T> vsplit;
if (vsplit.support)
{
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i);
}
}
#endif
for( ; i < len; i++, j += cn )
{
dst0[i] = src[j];
dst1[i] = src[j+1];
}
}
else if( k == 3 )
{
T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2];
i = j = 0;
#if CV_NEON
if(cn == 3)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 3 * inc_i;
VSplit3<T> vsplit;
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
}
#elif CV_SSE2
if (cn == 3)
{
int inc_i = 32/sizeof(T);
int inc_j = 3 * inc_i;
VSplit3<T> vsplit;
if (vsplit.support)
{
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
}
}
#endif
for( ; i < len; i++, j += cn )
{
dst0[i] = src[j];
dst1[i] = src[j+1];
dst2[i] = src[j+2];
}
}
else
{
T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3];
i = j = 0;
#if CV_NEON
if(cn == 4)
{
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
int inc_j = 4 * inc_i;
VSplit4<T> vsplit;
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
}
#elif CV_SSE2
if (cn == 4)
{
int inc_i = 32/sizeof(T);
int inc_j = 4 * inc_i;
VSplit4<T> vsplit;
if (vsplit.support)
{
for( ; i <= len - inc_i; i += inc_i, j += inc_j)
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
}
}
#endif
for( ; i < len; i++, j += cn )
{
dst0[i] = src[j]; dst1[i] = src[j+1];
dst2[i] = src[j+2]; dst3[i] = src[j+3];
}
}
for( ; k < cn; k += 4 )
{
T *dst0 = dst[k], *dst1 = dst[k+1], *dst2 = dst[k+2], *dst3 = dst[k+3];
for( i = 0, j = k; i < len; i++, j += cn )
{
dst0[i] = src[j]; dst1[i] = src[j+1];
dst2[i] = src[j+2]; dst3[i] = src[j+3];
}
}
}
void split8u(const uchar* src, uchar** dst, int len, int cn )
{
split_(src, dst, len, cn);
}
void split16u(const ushort* src, ushort** dst, int len, int cn )
{
split_(src, dst, len, cn);
}
void split32s(const int* src, int** dst, int len, int cn )
{
split_(src, dst, len, cn);
}
void split64s(const int64* src, int64** dst, int len, int cn )
{
split_(src, dst, len, cn);
}
}}
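The corresponding caller sketch for the split entry points (again not part of the diff; only the cv::hal::split8u signature above is assumed):
#include "opencv2/hal.hpp"

// De-interleave a packed BGRA buffer of len*4 bytes into four separate planes.
static void splitBGRA(const uchar* bgra, uchar* b, uchar* g, uchar* r, uchar* a, int len)
{
    uchar* planes[4] = { b, g, r, a };
    cv::hal::split8u(bgra, planes, len, 4);
}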
......@@ -94,4 +94,6 @@ extern const float icv8x32fSqrTab[];
#include "_geom.h"
#include "filterengine.hpp"
#include "opencv2/hal/sse_utils.hpp"
#endif /*__OPENCV_CV_INTERNAL_H_*/