Commit 19464a3e authored by Vadim Pisarevsky

Merge pull request #8780 from vpisarev:fix_boxfilter

parents 246f47fe 883d925f
...@@ -42,6 +42,7 @@ ...@@ -42,6 +42,7 @@
//M*/ //M*/
#include "precomp.hpp" #include "precomp.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "opencl_kernels_imgproc.hpp" #include "opencl_kernels_imgproc.hpp"
#include "opencv2/core/openvx/ovx_defs.hpp" #include "opencv2/core/openvx/ovx_defs.hpp"
...@@ -467,6 +468,8 @@ template<> ...@@ -467,6 +468,8 @@ template<>
struct ColumnSum<ushort, uchar> : struct ColumnSum<ushort, uchar> :
public BaseColumnFilter public BaseColumnFilter
{ {
enum { SHIFT = 23 };
ColumnSum( int _ksize, int _anchor, double _scale ) : ColumnSum( int _ksize, int _anchor, double _scale ) :
BaseColumnFilter() BaseColumnFilter()
{ {
...@@ -479,7 +482,7 @@ public BaseColumnFilter ...@@ -479,7 +482,7 @@ public BaseColumnFilter
if( scale != 1 ) if( scale != 1 )
{ {
int d = cvRound(1./scale); int d = cvRound(1./scale);
double scalef = ((double)(1 << 16))/d; double scalef = ((double)(1 << SHIFT))/d;
divScale = cvFloor(scalef); divScale = cvFloor(scalef);
scalef -= divScale; scalef -= divScale;
divDelta = d/2; divDelta = d/2;
...@@ -554,35 +557,43 @@ public BaseColumnFilter ...@@ -554,35 +557,43 @@ public BaseColumnFilter
if( haveScale ) if( haveScale )
{ {
int i = 0; int i = 0;
#if CV_SSE2 #if CV_SIMD128
if(haveSSE2) v_uint32x4 ds4 = v_setall_u32((unsigned)ds);
{ v_uint16x8 dd8 = v_setall_u16((ushort)dd);
__m128i ds8 = _mm_set1_epi16((short)ds);
__m128i dd8 = _mm_set1_epi16((short)dd);
for( ; i <= width-16; i+=16 ) for( ; i <= width-16; i+=16 )
{ {
__m128i _sm0 = _mm_loadu_si128((const __m128i*)(Sm+i)); v_uint16x8 _sm0 = v_load(Sm + i);
__m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+8)); v_uint16x8 _sm1 = v_load(Sm + i + 8);
__m128i _s0 = _mm_add_epi16(_mm_loadu_si128((const __m128i*)(SUM+i)), v_uint16x8 _s0 = v_add_wrap(v_load(SUM + i), v_load(Sp + i));
_mm_loadu_si128((const __m128i*)(Sp+i))); v_uint16x8 _s1 = v_add_wrap(v_load(SUM + i + 8), v_load(Sp + i + 8));
__m128i _s1 = _mm_add_epi16(_mm_loadu_si128((const __m128i*)(SUM+i+8)),
_mm_loadu_si128((const __m128i*)(Sp+i+8))); v_uint32x4 _s00, _s01, _s10, _s11;
__m128i _s2 = _mm_mulhi_epu16(_mm_adds_epu16(_s0, dd8), ds8);
__m128i _s3 = _mm_mulhi_epu16(_mm_adds_epu16(_s1, dd8), ds8); v_expand(_s0 + dd8, _s00, _s01);
_s0 = _mm_sub_epi16(_s0, _sm0); v_expand(_s1 + dd8, _s10, _s11);
_s1 = _mm_sub_epi16(_s1, _sm1);
_mm_storeu_si128((__m128i*)(D+i), _mm_packus_epi16(_s2, _s3)); _s00 = v_shr<SHIFT>(_s00*ds4);
_mm_storeu_si128((__m128i*)(SUM+i), _s0); _s01 = v_shr<SHIFT>(_s01*ds4);
_mm_storeu_si128((__m128i*)(SUM+i+8), _s1); _s10 = v_shr<SHIFT>(_s10*ds4);
} _s11 = v_shr<SHIFT>(_s11*ds4);
v_int16x8 r0 = v_pack(v_reinterpret_as_s32(_s00), v_reinterpret_as_s32(_s01));
v_int16x8 r1 = v_pack(v_reinterpret_as_s32(_s10), v_reinterpret_as_s32(_s11));
_s0 = v_sub_wrap(_s0, _sm0);
_s1 = v_sub_wrap(_s1, _sm1);
v_store(D + i, v_pack_u(r0, r1));
v_store(SUM + i, _s0);
v_store(SUM + i + 8, _s1);
} }
#endif #endif
for( ; i < width; i++ ) for( ; i < width; i++ )
{ {
int s0 = SUM[i] + Sp[i]; int s0 = SUM[i] + Sp[i];
D[i] = (uchar)((s0 + dd)*ds >> 16); D[i] = (uchar)((s0 + dd)*ds >> SHIFT);
SUM[i] = (ushort)(s0 - Sm[i]); SUM[i] = (ushort)(s0 - Sm[i]);
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment