Commit 37a5af36 authored by winice, committed by Alexander Alekhin

Merge pull request #13737 from winice-test:master

* Optical Flow rework to use wide universal intrinsics

* remove if (useSIMD) check as review requested
parent b19bb95d
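The patch replaces raw SSE2 intrinsics in the Farneback Gaussian-blur pass with OpenCV's wide universal intrinsics (opencv2/core/hal/intrin.hpp), so the same source compiles to SSE, NEON, VSX, etc., and the runtime checkHardwareSupport/useSIMD guard can go away because CV_SIMD128 is resolved at compile time. Below is a minimal sketch of the same weighted-accumulation pattern, written outside this patch; the function name weightedRows and its signature are illustrative, not part of the commit, and it assumes an OpenCV build where intrin.hpp is available.

// Illustrative sketch (not from the patch): dst[x] = sum_i kernel[i] * src[i][x],
// vectorized with OpenCV wide universal intrinsics where available.
#include "opencv2/core.hpp"
#include "opencv2/core/hal/intrin.hpp"

static void weightedRows(const float* const* src, const float* kernel,
                         int n, float* dst, int len)
{
    int x = 0;
#if CV_SIMD128   // decided at compile time, so no runtime useSIMD flag is needed
    for( ; x <= len - 4; x += 4 )
    {
        // start the accumulator with the first row times its weight
        cv::v_float32x4 s = cv::v_load(src[0] + x) * cv::v_setall_f32(kernel[0]);
        for( int i = 1; i < n; i++ )
            // fused multiply-add, the same v_muladd(x0, g4, s0) shape used in the patch
            s = cv::v_muladd(cv::v_load(src[i] + x), cv::v_setall_f32(kernel[i]), s);
        cv::v_store(dst + x, s);
    }
#endif
    for( ; x < len; x++ )   // scalar tail for the remaining elements
    {
        float s = src[0][x] * kernel[0];
        for( int i = 1; i < n; i++ )
            s += src[i][x] * kernel[i];
        dst[x] = s;
    }
}

The scalar loop here plays the same role as the non-SIMD remainder loops that follow the vectorized blocks in FarnebackUpdateFlow_GaussianBlur.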
@@ -42,6 +42,7 @@
 #include "precomp.hpp"
 #include "opencl_kernels_video.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 #if defined __APPLE__ || defined __ANDROID__
 #define SMALL_LOCALSIZE
@@ -433,13 +434,11 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1,
     for( i = 0; i <= m; i++ )
         kernel[i] = (float)(kernel[i]*s);
-#if CV_SSE2
+#if CV_SIMD128
     float* simd_kernel = alignPtr(kernel + m+1, 16);
-    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE);
-    if( useSIMD )
     {
         for( i = 0; i <= m; i++ )
-            _mm_store_ps(simd_kernel + i*4, _mm_set1_ps(kernel[i]));
+            v_store(simd_kernel + i*4, v_setall_f32(kernel[i]));
     }
 #endif
@@ -457,54 +456,53 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1,
         }
         x = 0;
-#if CV_SSE2
-        if( useSIMD )
+#if CV_SIMD128
         {
             for( ; x <= width*5 - 16; x += 16 )
             {
                 const float *sptr0 = srow[m], *sptr1;
-                __m128 g4 = _mm_load_ps(simd_kernel);
-                __m128 s0, s1, s2, s3;
-                s0 = _mm_mul_ps(_mm_loadu_ps(sptr0 + x), g4);
-                s1 = _mm_mul_ps(_mm_loadu_ps(sptr0 + x + 4), g4);
-                s2 = _mm_mul_ps(_mm_loadu_ps(sptr0 + x + 8), g4);
-                s3 = _mm_mul_ps(_mm_loadu_ps(sptr0 + x + 12), g4);
+                v_float32x4 g4 = v_load(simd_kernel);
+                v_float32x4 s0, s1, s2, s3;
+                s0 = v_load(sptr0 + x) * g4;
+                s1 = v_load(sptr0 + x + 4) * g4;
+                s2 = v_load(sptr0 + x + 8) * g4;
+                s3 = v_load(sptr0 + x + 12) * g4;
                 for( i = 1; i <= m; i++ )
                 {
-                    __m128 x0, x1;
+                    v_float32x4 x0, x1;
                     sptr0 = srow[m+i], sptr1 = srow[m-i];
-                    g4 = _mm_load_ps(simd_kernel + i*4);
-                    x0 = _mm_add_ps(_mm_loadu_ps(sptr0 + x), _mm_loadu_ps(sptr1 + x));
-                    x1 = _mm_add_ps(_mm_loadu_ps(sptr0 + x + 4), _mm_loadu_ps(sptr1 + x + 4));
-                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, g4));
-                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, g4));
-                    x0 = _mm_add_ps(_mm_loadu_ps(sptr0 + x + 8), _mm_loadu_ps(sptr1 + x + 8));
-                    x1 = _mm_add_ps(_mm_loadu_ps(sptr0 + x + 12), _mm_loadu_ps(sptr1 + x + 12));
-                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, g4));
-                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, g4));
+                    g4 = v_load(simd_kernel + i*4);
+                    x0 = v_load(sptr0 + x) + v_load(sptr1 + x);
+                    x1 = v_load(sptr0 + x + 4) + v_load(sptr1 + x + 4);
+                    s0 = v_muladd(x0, g4, s0);
+                    s1 = v_muladd(x1, g4, s1);
+                    x0 = v_load(sptr0 + x + 8) + v_load(sptr1 + x + 8);
+                    x1 = v_load(sptr0 + x + 12) + v_load(sptr1 + x + 12);
+                    s2 = v_muladd(x0, g4, s2);
+                    s3 = v_muladd(x1, g4, s3);
                 }
-                _mm_store_ps(vsum + x, s0);
-                _mm_store_ps(vsum + x + 4, s1);
-                _mm_store_ps(vsum + x + 8, s2);
-                _mm_store_ps(vsum + x + 12, s3);
+                v_store(vsum + x, s0);
+                v_store(vsum + x + 4, s1);
+                v_store(vsum + x + 8, s2);
+                v_store(vsum + x + 12, s3);
             }
             for( ; x <= width*5 - 4; x += 4 )
             {
                 const float *sptr0 = srow[m], *sptr1;
-                __m128 g4 = _mm_load_ps(simd_kernel);
-                __m128 s0 = _mm_mul_ps(_mm_loadu_ps(sptr0 + x), g4);
+                v_float32x4 g4 = v_load(simd_kernel);
+                v_float32x4 s0 = v_load(sptr0 + x) * g4;
                 for( i = 1; i <= m; i++ )
                 {
                     sptr0 = srow[m+i], sptr1 = srow[m-i];
-                    g4 = _mm_load_ps(simd_kernel + i*4);
-                    __m128 x0 = _mm_add_ps(_mm_loadu_ps(sptr0 + x), _mm_loadu_ps(sptr1 + x));
-                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, g4));
+                    g4 = v_load(simd_kernel + i*4);
+                    v_float32x4 x0 = v_load(sptr0 + x) + v_load(sptr1 + x);
+                    s0 = v_muladd(x0, g4, s0);
                 }
-                _mm_store_ps(vsum + x, s0);
+                v_store(vsum + x, s0);
             }
         }
 #endif
@@ -525,28 +523,25 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1,
         // horizontal blur
         x = 0;
-#if CV_SSE2
-        if( useSIMD )
+#if CV_SIMD128
         {
             for( ; x <= width*5 - 8; x += 8 )
             {
-                __m128 g4 = _mm_load_ps(simd_kernel);
-                __m128 s0 = _mm_mul_ps(_mm_loadu_ps(vsum + x), g4);
-                __m128 s1 = _mm_mul_ps(_mm_loadu_ps(vsum + x + 4), g4);
+                v_float32x4 g4 = v_load(simd_kernel);
+                v_float32x4 s0 = v_load(vsum + x) * g4;
+                v_float32x4 s1 = v_load(vsum + x + 4) * g4;
                 for( i = 1; i <= m; i++ )
                 {
-                    g4 = _mm_load_ps(simd_kernel + i*4);
-                    __m128 x0 = _mm_add_ps(_mm_loadu_ps(vsum + x - i*5),
-                                           _mm_loadu_ps(vsum + x + i*5));
-                    __m128 x1 = _mm_add_ps(_mm_loadu_ps(vsum + x - i*5 + 4),
-                                           _mm_loadu_ps(vsum + x + i*5 + 4));
-                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, g4));
-                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, g4));
+                    g4 = v_load(simd_kernel + i*4);
+                    v_float32x4 x0 = v_load(vsum + x - i*5) + v_load(vsum + x + i*5);
+                    v_float32x4 x1 = v_load(vsum + x - i*5 + 4) + v_load(vsum + x + i*5 + 4);
+                    s0 = v_muladd(x0, g4, s0);
+                    s1 = v_muladd(x1, g4, s1);
                 }
-                _mm_store_ps(hsum + x, s0);
-                _mm_store_ps(hsum + x + 4, s1);
+                v_store(hsum + x, s0);
+                v_store(hsum + x + 4, s1);
            }
        }
 #endif
...