Commit 841cccca authored by Tomoaki Teshima, committed by Tomoaki Teshima

use universal intrinsics in Canny

  * add v_abs to the universal intrinsics
  * add test of v_abs in test_intrin
  * fix compile error on gcc
  * fix bool OR operation
parent 69704692
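The change follows one recurring pattern: each pair of hand-written SSE2 and NEON blocks in canny.cpp collapses into a single block guarded by CV_SIMD128 and a run-time haveSIMD check, written with OpenCV's universal intrinsics from opencv2/core/hal/intrin.hpp. As a compact summary of the shape the canny.cpp hunks below take (the L1 gradient case, |dx| + |dy| for eight 16-bit Sobel values per iteration):

// Pattern used throughout the canny.cpp hunks below:
v_int16x8 v_dx = v_load(_dx + j);            // 8 x int16 gradient values
v_int16x8 v_dy = v_load(_dy + j);
v_dx = v_reinterpret_as_s16(v_abs(v_dx));    // v_abs returns an unsigned vector
v_dy = v_reinterpret_as_s16(v_abs(v_dy));
v_int32x4 dx_lo, dx_hi, dy_lo, dy_hi;
v_expand(v_dx, dx_lo, dx_hi);                // widen 8 x int16 into 2 x (4 x int32)
v_expand(v_dy, dy_lo, dy_hi);
v_store(_norm + j,     dx_lo + dy_lo);       // |dx| + |dy| stored as int32
v_store(_norm + j + 4, dx_hi + dy_hi);

On x86 this is intended to compile down to essentially the same SSE2 sequence as before, and on ARM to the NEON sequence, so only one code path has to be maintained.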
@@ -549,6 +549,13 @@ inline v_float32x4 v_invsqrt(const v_float32x4& x)
}
#endif
#define OPENCV_HAL_IMPL_NEON_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
inline _Tpuvec v_abs(const _Tpsvec& a) { return v_reinterpret_as_##usuffix(_Tpsvec(vabsq_##ssuffix(a.val))); }
OPENCV_HAL_IMPL_NEON_ABS(v_uint8x16, v_int8x16, u8, s8)
OPENCV_HAL_IMPL_NEON_ABS(v_uint16x8, v_int16x8, u16, s16)
OPENCV_HAL_IMPL_NEON_ABS(v_uint32x4, v_int32x4, u32, s32)
inline v_float32x4 v_abs(v_float32x4 x)
{ return v_float32x4(vabsq_f32(x.val)); }
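For reference, one instantiation of the macro above expands by direct substitution to:

// OPENCV_HAL_IMPL_NEON_ABS(v_uint16x8, v_int16x8, u16, s16) expands to:
inline v_uint16x8 v_abs(const v_int16x8& a)
{ return v_reinterpret_as_u16(v_int16x8(vabsq_s16(a.val))); }

i.e. the NEON absolute value is taken in the signed register and the result is then reinterpreted as the corresponding unsigned vector type.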
......
@@ -739,6 +739,18 @@ inline v_float64x2 v_invsqrt(const v_float64x2& x)
return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
}
#define OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(_Tpuvec, _Tpsvec, func, suffix, subWidth) \
inline _Tpuvec v_abs(const _Tpsvec& x) \
{ return _Tpuvec(_mm_##func##_ep##suffix(x.val, _mm_sub_ep##subWidth(_mm_setzero_si128(), x.val))); }
OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint8x16, v_int8x16, min, u8, i8)
OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint16x8, v_int16x8, max, i16, i16)
inline v_uint32x4 v_abs(const v_int32x4& x)
{
__m128i s = _mm_srli_epi32(x.val, 31);
__m128i f = _mm_srai_epi32(x.val, 31);
return v_uint32x4(_mm_add_epi32(_mm_xor_si128(x.val, f), s));
}
inline v_float32x4 v_abs(const v_float32x4& x)
{ return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); }
inline v_float64x2 v_abs(const v_float64x2& x)
......
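The 32-bit overload above cannot reuse the min/max trick of the macro (SSE2 has no unsigned 32-bit min or signed 32-bit max; those arrive with SSE4.1), so it uses the shift/xor/add identity instead. A scalar equivalent of the same sequence, with a hypothetical helper name, shown only to make the bit trick explicit:

// Scalar equivalent of the srli/srai/xor/add sequence in v_abs(v_int32x4):
static inline unsigned abs_s32(int x)
{
    int f = x >> 31;                  // 0 for x >= 0, all ones for x < 0 (arithmetic shift)
    unsigned s = (unsigned)x >> 31;   // 0 or 1: just the sign bit
    return (unsigned)((x ^ f) + s);   // x unchanged, or ~x + 1 == -x when negative
}

For negative x this computes ~x + 1, i.e. two's-complement negation; for non-negative x it is the identity.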
@@ -277,6 +277,24 @@ template<typename R> struct TheTest
return *this;
}
TheTest & test_abs()
{
typedef typename V_RegTrait128<LaneType>::u_reg Ru;
typedef typename Ru::lane_type u_type;
Data<R> dataA, dataB(10);
R a = dataA, b = dataB;
a = a - b;
Data<Ru> resC = v_abs(a);
for (int i = 0; i < Ru::nlanes; ++i)
{
EXPECT_EQ((u_type)std::abs(dataA[i] - dataB[i]), resC[i]);
}
return *this;
}
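The assertion compares each lane against std::abs cast to the unsigned lane type; returning an unsigned vector lets even the most negative input map to its true magnitude. A minimal standalone sketch of the same intrinsic (illustrative values, intrin.hpp assumed):

// v_abs returns an unsigned vector, so |INT16_MIN| = 32768 is representable:
short src[8] = { -1, 2, -3, 4, -32768, 6, -7, 8 };
v_int16x8 a = v_load(src);
v_uint16x8 r = v_abs(a);          // lane-wise absolute value
unsigned short dst[8];
v_store(dst, r);                  // dst = { 1, 2, 3, 4, 32768, 6, 7, 8 }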
template <int s>
TheTest & test_shift()
{
@@ -799,6 +817,7 @@ TEST(hal_intrin, int8x16) {
.test_logic()
.test_min_max()
.test_absdiff()
.test_abs()
.test_mask()
.test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
.test_unpack()
@@ -847,6 +866,7 @@ TEST(hal_intrin, int16x8) {
.test_logic()
.test_min_max()
.test_absdiff()
.test_abs()
.test_mask()
.test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
.test_unpack()
@@ -886,6 +906,7 @@ TEST(hal_intrin, int32x4) {
.test_expand()
.test_addsub()
.test_mul()
.test_abs()
.test_cmp()
.test_shift<1>().test_shift<8>()
.test_logic()
......
@@ -42,6 +42,7 @@
#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include <queue>
#ifdef _MSC_VER
@@ -299,8 +300,8 @@ public:
void operator()(const Range &boundaries) const
{
#if CV_SSE2
bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
#if CV_SIMD128
bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
#endif
const int type = src.type(), cn = CV_MAT_CN(type);
@@ -409,39 +410,28 @@ public:
if (!L2gradient)
{
int j = 0, width = src.cols * cn;
#if CV_SSE2
if (haveSSE2)
#if CV_SIMD128
if (haveSIMD)
{
__m128i v_zero = _mm_setzero_si128();
for ( ; j <= width - 8; j += 8)
{
__m128i v_dx = _mm_loadu_si128((const __m128i *)(_dx + j));
__m128i v_dy = _mm_loadu_si128((const __m128i *)(_dy + j));
v_int16x8 v_dx = v_load((const short *)(_dx + j));
v_int16x8 v_dy = v_load((const short *)(_dy + j));
__m128i v_dx_abs = _mm_max_epi16(v_dx, _mm_sub_epi16(v_zero, v_dx));
__m128i v_dy_abs = _mm_max_epi16(v_dy, _mm_sub_epi16(v_zero, v_dy));
v_dx = v_reinterpret_as_s16(v_abs(v_dx));
v_dy = v_reinterpret_as_s16(v_abs(v_dy));
__m128i v_dx_ml = _mm_unpacklo_epi16(v_dx_abs, v_zero);
__m128i v_dy_ml = _mm_unpacklo_epi16(v_dy_abs, v_zero);
__m128i v_dx_mh = _mm_unpackhi_epi16(v_dx_abs, v_zero);
__m128i v_dy_mh = _mm_unpackhi_epi16(v_dy_abs, v_zero);
v_int32x4 v_dx_ml;
v_int32x4 v_dy_ml;
v_int32x4 v_dx_mh;
v_int32x4 v_dy_mh;
v_expand(v_dx, v_dx_ml, v_dx_mh);
v_expand(v_dy, v_dy_ml, v_dy_mh);
__m128i v_norm_ml = _mm_add_epi32(v_dx_ml, v_dy_ml);
__m128i v_norm_mh = _mm_add_epi32(v_dx_mh, v_dy_mh);
_mm_storeu_si128((__m128i *)(_norm + j), v_norm_ml);
_mm_storeu_si128((__m128i *)(_norm + j + 4), v_norm_mh);
v_store((int *)(_norm + j), v_dx_ml + v_dy_ml);
v_store((int *)(_norm + j + 4), v_dx_mh + v_dy_mh);
}
}
#elif CV_NEON
for ( ; j <= width - 8; j += 8)
{
int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j);
vst1q_s32(_norm + j, vaddq_s32(vabsq_s32(vmovl_s16(vget_low_s16(v_dx))),
vabsq_s32(vmovl_s16(vget_low_s16(v_dy)))));
vst1q_s32(_norm + j + 4, vaddq_s32(vabsq_s32(vmovl_s16(vget_high_s16(v_dx))),
vabsq_s32(vmovl_s16(vget_high_s16(v_dy)))));
}
#endif
for ( ; j < width; ++j)
_norm[j] = std::abs(int(_dx[j])) + std::abs(int(_dy[j]));
@@ -449,36 +439,23 @@ public:
else
{
int j = 0, width = src.cols * cn;
#if CV_SSE2
if (haveSSE2)
#if CV_SIMD128
if (haveSIMD)
{
for ( ; j <= width - 8; j += 8)
{
__m128i v_dx = _mm_loadu_si128((const __m128i *)(_dx + j));
__m128i v_dy = _mm_loadu_si128((const __m128i *)(_dy + j));
__m128i v_dx_dy_ml = _mm_unpacklo_epi16(v_dx, v_dy);
__m128i v_dx_dy_mh = _mm_unpackhi_epi16(v_dx, v_dy);
v_int16x8 v_dx = v_load((const short*)(_dx + j));
v_int16x8 v_dy = v_load((const short*)(_dy + j));
__m128i v_norm_ml = _mm_madd_epi16(v_dx_dy_ml, v_dx_dy_ml);
__m128i v_norm_mh = _mm_madd_epi16(v_dx_dy_mh, v_dx_dy_mh);
v_int32x4 v_dxp_low, v_dxp_high;
v_int32x4 v_dyp_low, v_dyp_high;
v_expand(v_dx, v_dxp_low, v_dxp_high);
v_expand(v_dy, v_dyp_low, v_dyp_high);
_mm_storeu_si128((__m128i *)(_norm + j), v_norm_ml);
_mm_storeu_si128((__m128i *)(_norm + j + 4), v_norm_mh);
v_store((int *)(_norm + j), v_dxp_low*v_dxp_low+v_dyp_low*v_dyp_low);
v_store((int *)(_norm + j + 4), v_dxp_high*v_dxp_high+v_dyp_high*v_dyp_high);
}
}
#elif CV_NEON
for ( ; j <= width - 8; j += 8)
{
int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j);
int16x4_t v_dxp = vget_low_s16(v_dx), v_dyp = vget_low_s16(v_dy);
int32x4_t v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp);
vst1q_s32(_norm + j, v_dst);
v_dxp = vget_high_s16(v_dx), v_dyp = vget_high_s16(v_dy);
v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp);
vst1q_s32(_norm + j + 4, v_dst);
}
#endif
for ( ; j < width; ++j)
_norm[j] = int(_dx[j])*_dx[j] + int(_dy[j])*_dy[j];
@@ -529,30 +506,31 @@ public:
const int TG22 = (int)(0.4142135623730950488016887242097*(1 << CANNY_SHIFT) + 0.5);
int prev_flag = 0, j = 0;
#if CV_SSE2
if (checkHardwareSupport(CPU_SSE2))
#if CV_SIMD128
if (haveSIMD)
{
__m128i v_low = _mm_set1_epi32(low), v_one = _mm_set1_epi8(1);
v_int32x4 v_low = v_setall_s32(low);
v_int8x16 v_one = v_setall_s8(1);
for (; j <= src.cols - 16; j += 16)
{
__m128i v_m1 = _mm_loadu_si128((const __m128i*)(_mag + j));
__m128i v_m2 = _mm_loadu_si128((const __m128i*)(_mag + j + 4));
__m128i v_m3 = _mm_loadu_si128((const __m128i*)(_mag + j + 8));
__m128i v_m4 = _mm_loadu_si128((const __m128i*)(_mag + j + 12));
v_int32x4 v_m1 = v_load((const int*)(_mag + j));
v_int32x4 v_m2 = v_load((const int*)(_mag + j + 4));
v_int32x4 v_m3 = v_load((const int*)(_mag + j + 8));
v_int32x4 v_m4 = v_load((const int*)(_mag + j + 12));
_mm_storeu_si128((__m128i*)(_map + j), v_one);
v_store((signed char*)(_map + j), v_one);
__m128i v_cmp1 = _mm_cmpgt_epi32(v_m1, v_low);
__m128i v_cmp2 = _mm_cmpgt_epi32(v_m2, v_low);
__m128i v_cmp3 = _mm_cmpgt_epi32(v_m3, v_low);
__m128i v_cmp4 = _mm_cmpgt_epi32(v_m4, v_low);
v_int32x4 v_cmp1 = v_m1 > v_low;
v_int32x4 v_cmp2 = v_m2 > v_low;
v_int32x4 v_cmp3 = v_m3 > v_low;
v_int32x4 v_cmp4 = v_m4 > v_low;
v_cmp1 = _mm_packs_epi32(v_cmp1, v_cmp2);
v_cmp2 = _mm_packs_epi32(v_cmp3, v_cmp4);
v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2);
v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4);
v_cmp1 = _mm_packs_epi16(v_cmp1, v_cmp2);
unsigned int mask = _mm_movemask_epi8(v_cmp1);
v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81);
unsigned int mask = v_signmask(v_cmp);
if (mask)
{
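The block above replaces the raw SSE2 compare/pack/movemask sequence with its universal-intrinsic equivalent: four 32-bit comparison results are narrowed twice with v_pack (32-bit to 16-bit to 8-bit lanes) and v_signmask then gathers the sign bit of each 8-bit lane, so bit i of mask is set exactly when _mag[j + i] exceeds the low threshold. A self-contained sketch of the idiom (values are illustrative):

// Pack-and-signmask idiom: one bit per pixel telling whether it passed the low threshold.
int mag[16];
for (int k = 0; k < 16; ++k) mag[k] = (k % 3 == 0) ? 100 : 1;   // fake magnitudes
v_int32x4 lowv = v_setall_s32(50);
v_int32x4 c0 = v_load(mag)      > lowv;   // per lane: 0 or -1 (all bits set)
v_int32x4 c1 = v_load(mag + 4)  > lowv;
v_int32x4 c2 = v_load(mag + 8)  > lowv;
v_int32x4 c3 = v_load(mag + 12) > lowv;
v_int8x16 c  = v_pack(v_pack(c0, c1), v_pack(c2, c3));          // narrow 32 -> 16 -> 8 bits
unsigned mask = v_signmask(c);            // here: bits 0, 3, 6, 9, 12, 15 are set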
@@ -730,54 +708,57 @@ public:
const uchar* pmap = map + mapstep + 1 + (ptrdiff_t)(mapstep * boundaries.start);
uchar* pdst = dst.ptr() + (ptrdiff_t)(dst.step * boundaries.start);
#if CV_SSE2
bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
#if CV_SIMD128
bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
#endif
for (int i = boundaries.start; i < boundaries.end; i++, pmap += mapstep, pdst += dst.step)
{
int j = 0;
#if CV_SSE2
if(haveSSE2) {
const __m128i v_zero = _mm_setzero_si128();
#if CV_SIMD128
if(haveSIMD) {
const v_int8x16 v_zero = v_setzero_s8();
for(; j <= dst.cols - 32; j += 32) {
__m128i v_pmap1 = _mm_loadu_si128((const __m128i*)(pmap + j));
__m128i v_pmap2 = _mm_loadu_si128((const __m128i*)(pmap + j + 16));
v_uint8x16 v_pmap1 = v_load((const unsigned char*)(pmap + j));
v_uint8x16 v_pmap2 = v_load((const unsigned char*)(pmap + j + 16));
__m128i v_pmaplo1 = _mm_unpacklo_epi8(v_pmap1, v_zero);
__m128i v_pmaphi1 = _mm_unpackhi_epi8(v_pmap1, v_zero);
__m128i v_pmaplo2 = _mm_unpacklo_epi8(v_pmap2, v_zero);
__m128i v_pmaphi2 = _mm_unpackhi_epi8(v_pmap2, v_zero);
v_uint16x8 v_pmaplo1;
v_uint16x8 v_pmaphi1;
v_uint16x8 v_pmaplo2;
v_uint16x8 v_pmaphi2;
v_expand(v_pmap1, v_pmaplo1, v_pmaphi1);
v_expand(v_pmap2, v_pmaplo2, v_pmaphi2);
v_pmaplo1 = _mm_srli_epi16(v_pmaplo1, 1);
v_pmaphi1 = _mm_srli_epi16(v_pmaphi1, 1);
v_pmaplo2 = _mm_srli_epi16(v_pmaplo2, 1);
v_pmaphi2 = _mm_srli_epi16(v_pmaphi2, 1);
v_pmaplo1 = v_pmaplo1 >> 1;
v_pmaphi1 = v_pmaphi1 >> 1;
v_pmaplo2 = v_pmaplo2 >> 1;
v_pmaphi2 = v_pmaphi2 >> 1;
v_pmap1 = _mm_packus_epi16(v_pmaplo1, v_pmaphi1);
v_pmap2 = _mm_packus_epi16(v_pmaplo2, v_pmaphi2);
v_pmap1 = v_pack(v_pmaplo1, v_pmaphi1);
v_pmap2 = v_pack(v_pmaplo2, v_pmaphi2);
v_pmap1 = _mm_sub_epi8(v_zero, v_pmap1);
v_pmap2 = _mm_sub_epi8(v_zero, v_pmap2);
v_pmap1 = v_reinterpret_as_u8(v_zero - v_reinterpret_as_s8(v_pmap1));
v_pmap2 = v_reinterpret_as_u8(v_zero - v_reinterpret_as_s8(v_pmap2));
_mm_storeu_si128((__m128i*)(pdst + j), v_pmap1);
_mm_storeu_si128((__m128i*)(pdst + j + 16), v_pmap2);
v_store((pdst + j), v_pmap1);
v_store((pdst + j + 16), v_pmap2);
}
for(; j <= dst.cols - 16; j += 16) {
__m128i v_pmap = _mm_loadu_si128((const __m128i*)(pmap + j));
v_uint8x16 v_pmap = v_load((const unsigned char*)(pmap + j));
__m128i v_pmaplo = _mm_unpacklo_epi8(v_pmap, v_zero);
__m128i v_pmaphi = _mm_unpackhi_epi8(v_pmap, v_zero);
v_uint16x8 v_pmaplo;
v_uint16x8 v_pmaphi;
v_expand(v_pmap, v_pmaplo, v_pmaphi);
v_pmaplo = _mm_srli_epi16(v_pmaplo, 1);
v_pmaphi = _mm_srli_epi16(v_pmaphi, 1);
v_pmaplo = v_pmaplo >> 1;
v_pmaphi = v_pmaphi >> 1;
v_pmap = _mm_packus_epi16(v_pmaplo, v_pmaphi);
v_pmap = _mm_sub_epi8(v_zero, v_pmap);
v_pmap = v_pack(v_pmaplo, v_pmaphi);
v_pmap = v_reinterpret_as_u8(v_zero - v_reinterpret_as_s8(v_pmap));
_mm_storeu_si128((__m128i*)(pdst + j), v_pmap);
v_store((pdst + j), v_pmap);
}
}
#endif
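The shift-and-negate arithmetic above converts the edge map, in which only the value 2 marks a confirmed edge pixel, into the final 0/255 image without branching: a right shift by one turns {0, 1, 2} into {0, 0, 1}, and negating in 8 bits turns {0, 1} into {0x00, 0xFF}. The per-pixel scalar equivalent:

// Scalar equivalent of the vector map -> output conversion above:
uchar m = pmap[j];                // 0, 1 or 2
pdst[j] = (uchar)(-(m >> 1));     // 2 -> 1 -> 0xFF (255); 0 and 1 -> 0 -> 0x00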
@@ -980,8 +961,8 @@ static void CannyImpl(Mat& dx, Mat& dy, Mat& dst,
#define CANNY_PUSH(d) *(d) = uchar(2), *stack_top++ = (d)
#define CANNY_POP(d) (d) = *--stack_top
#if CV_SSE2
bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
#if CV_SIMD128
bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
#endif
// calculate magnitude and angle of gradient, perform non-maxima suppression.
@@ -1000,32 +981,26 @@ static void CannyImpl(Mat& dx, Mat& dy, Mat& dst,
if (!L2gradient)
{
int j = 0, width = cols * cn;
#if CV_SSE2
if (haveSSE2)
#if CV_SIMD128
if (haveSIMD)
{
__m128i v_zero = _mm_setzero_si128();
for ( ; j <= width - 8; j += 8)
{
__m128i v_dx = _mm_loadu_si128((const __m128i *)(_dx + j));
__m128i v_dy = _mm_loadu_si128((const __m128i *)(_dy + j));
v_dx = _mm_max_epi16(v_dx, _mm_sub_epi16(v_zero, v_dx));
v_dy = _mm_max_epi16(v_dy, _mm_sub_epi16(v_zero, v_dy));
v_int16x8 v_dx = v_load((const short*)(_dx + j));
v_int16x8 v_dy = v_load((const short*)(_dy + j));
__m128i v_norm = _mm_add_epi32(_mm_unpacklo_epi16(v_dx, v_zero), _mm_unpacklo_epi16(v_dy, v_zero));
_mm_storeu_si128((__m128i *)(_norm + j), v_norm);
v_int32x4 v_dx0, v_dx1, v_dy0, v_dy1;
v_expand(v_dx, v_dx0, v_dx1);
v_expand(v_dy, v_dy0, v_dy1);
v_norm = _mm_add_epi32(_mm_unpackhi_epi16(v_dx, v_zero), _mm_unpackhi_epi16(v_dy, v_zero));
_mm_storeu_si128((__m128i *)(_norm + j + 4), v_norm);
}
v_dx0 = v_reinterpret_as_s32(v_abs(v_dx0));
v_dx1 = v_reinterpret_as_s32(v_abs(v_dx1));
v_dy0 = v_reinterpret_as_s32(v_abs(v_dy0));
v_dy1 = v_reinterpret_as_s32(v_abs(v_dy1));
v_store(_norm + j, v_dx0 + v_dy0);
v_store(_norm + j + 4, v_dx1 + v_dy1);
}
#elif CV_NEON
for ( ; j <= width - 8; j += 8)
{
int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j);
vst1q_s32(_norm + j, vaddq_s32(vabsq_s32(vmovl_s16(vget_low_s16(v_dx))),
vabsq_s32(vmovl_s16(vget_low_s16(v_dy)))));
vst1q_s32(_norm + j + 4, vaddq_s32(vabsq_s32(vmovl_s16(vget_high_s16(v_dx))),
vabsq_s32(vmovl_s16(vget_high_s16(v_dy)))));
}
#endif
for ( ; j < width; ++j)
@@ -1034,33 +1009,23 @@ static void CannyImpl(Mat& dx, Mat& dy, Mat& dst,
else
{
int j = 0, width = cols * cn;
#if CV_SSE2
if (haveSSE2)
#if CV_SIMD128
if (haveSIMD)
{
for ( ; j <= width - 8; j += 8)
{
__m128i v_dx = _mm_loadu_si128((const __m128i *)(_dx + j));
__m128i v_dy = _mm_loadu_si128((const __m128i *)(_dy + j));
v_int16x8 v_dx = v_load((const short*)(_dx + j));
v_int16x8 v_dy = v_load((const short*)(_dy + j));
__m128i v_dx_dy_ml = _mm_unpacklo_epi16(v_dx, v_dy);
__m128i v_dx_dy_mh = _mm_unpackhi_epi16(v_dx, v_dy);
__m128i v_norm_ml = _mm_madd_epi16(v_dx_dy_ml, v_dx_dy_ml);
__m128i v_norm_mh = _mm_madd_epi16(v_dx_dy_mh, v_dx_dy_mh);
_mm_storeu_si128((__m128i *)(_norm + j), v_norm_ml);
_mm_storeu_si128((__m128i *)(_norm + j + 4), v_norm_mh);
}
}
#elif CV_NEON
for ( ; j <= width - 8; j += 8)
{
int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j);
int16x4_t v_dxp = vget_low_s16(v_dx), v_dyp = vget_low_s16(v_dy);
int32x4_t v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp);
vst1q_s32(_norm + j, v_dst);
v_int16x8 v_dx_dy0, v_dx_dy1;
v_zip(v_dx, v_dy, v_dx_dy0, v_dx_dy1);
v_dxp = vget_high_s16(v_dx), v_dyp = vget_high_s16(v_dy);
v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp);
vst1q_s32(_norm + j + 4, v_dst);
v_int32x4 v_dst0 = v_dotprod(v_dx_dy0, v_dx_dy0);
v_int32x4 v_dst1 = v_dotprod(v_dx_dy1, v_dx_dy1);
v_store(_norm + j, v_dst0);
v_store(_norm + j + 4, v_dst1);
}
}
#endif
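The v_zip + v_dotprod pair above is the universal-intrinsic counterpart of the _mm_unpack*/_mm_madd_epi16 sequence it replaces: v_zip interleaves the dx and dy lanes, and v_dotprod multiplies each adjacent 16-bit pair and sums the two products, so every 32-bit output lane holds dx*dx + dy*dy. A small worked illustration (made-up values):

// v_zip + v_dotprod computing squared gradient magnitudes:
short dx[8] = { 3, 1, 0, 2, 5, 4, 7, 6 };
short dy[8] = { 4, 2, 1, 0, 12, 3, 24, 8 };
v_int16x8 vx = v_load(dx), vy = v_load(dy);
v_int16x8 lo, hi;
v_zip(vx, vy, lo, hi);              // lo = {3,4, 1,2, 0,1, 2,0}; hi = {5,12, 4,3, 7,24, 6,8}
v_int32x4 n0 = v_dotprod(lo, lo);   // {25, 5, 1, 4}       = {9+16, 1+4, 0+1, 4+0}
v_int32x4 n1 = v_dotprod(hi, hi);   // {169, 25, 625, 100} = {25+144, 16+9, 49+576, 36+64}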
for ( ; j < width; ++j)
@@ -1112,30 +1077,31 @@ static void CannyImpl(Mat& dx, Mat& dy, Mat& dst,
const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
int prev_flag = 0, j = 0;
#if CV_SSE2
if (checkHardwareSupport(CPU_SSE2))
#if CV_SIMD128
if (haveSIMD)
{
__m128i v_low = _mm_set1_epi32(low), v_one = _mm_set1_epi8(1);
v_int32x4 v_low = v_setall_s32(low);
v_int8x16 v_one = v_setall_s8(1);
for (; j <= cols - 16; j += 16)
{
__m128i v_m1 = _mm_loadu_si128((const __m128i*)(_mag + j));
__m128i v_m2 = _mm_loadu_si128((const __m128i*)(_mag + j + 4));
__m128i v_m3 = _mm_loadu_si128((const __m128i*)(_mag + j + 8));
__m128i v_m4 = _mm_loadu_si128((const __m128i*)(_mag + j + 12));
v_int32x4 v_m1 = v_load((const int*)(_mag + j));
v_int32x4 v_m2 = v_load((const int*)(_mag + j + 4));
v_int32x4 v_m3 = v_load((const int*)(_mag + j + 8));
v_int32x4 v_m4 = v_load((const int*)(_mag + j + 12));
_mm_storeu_si128((__m128i*)(_map + j), v_one);
v_store((signed char*)(_map + j), v_one);
__m128i v_cmp1 = _mm_cmpgt_epi32(v_m1, v_low);
__m128i v_cmp2 = _mm_cmpgt_epi32(v_m2, v_low);
__m128i v_cmp3 = _mm_cmpgt_epi32(v_m3, v_low);
__m128i v_cmp4 = _mm_cmpgt_epi32(v_m4, v_low);
v_int32x4 v_cmp1 = v_m1 > v_low;
v_int32x4 v_cmp2 = v_m2 > v_low;
v_int32x4 v_cmp3 = v_m3 > v_low;
v_int32x4 v_cmp4 = v_m4 > v_low;
v_cmp1 = _mm_packs_epi32(v_cmp1, v_cmp2);
v_cmp2 = _mm_packs_epi32(v_cmp3, v_cmp4);
v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2);
v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4);
v_cmp1 = _mm_packs_epi16(v_cmp1, v_cmp2);
unsigned int mask = _mm_movemask_epi8(v_cmp1);
v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81);
unsigned int mask = v_signmask(v_cmp);
if (mask)
{
......