Commit d620ef0d authored by Andrey Kamaev, committed by OpenCV Buildbot

Merge pull request #577 from ilya-lavrenov:BilateralFilter

parents 7b79eaf5 3eed5d8c
...@@ -1787,6 +1787,7 @@ public: ...@@ -1787,6 +1787,7 @@ public:
#if CV_SSE3 #if CV_SSE3
if( haveSSE3 ) if( haveSSE3 )
{ {
const __m128i izero = _mm_setzero_si128();
const __m128 _b0 = _mm_set1_ps(static_cast<float>(b0)); const __m128 _b0 = _mm_set1_ps(static_cast<float>(b0));
const __m128 _g0 = _mm_set1_ps(static_cast<float>(g0)); const __m128 _g0 = _mm_set1_ps(static_cast<float>(g0));
const __m128 _r0 = _mm_set1_ps(static_cast<float>(r0)); const __m128 _r0 = _mm_set1_ps(static_cast<float>(r0));
...@@ -1794,14 +1795,17 @@ public: ...@@ -1794,14 +1795,17 @@ public:
for( ; k <= maxk - 4; k += 4 ) for( ; k <= maxk - 4; k += 4 )
{ {
const uchar* sptr_k = sptr + j + space_ofs[k]; const int* const sptr_k0 = reinterpret_cast<const int*>(sptr + j + space_ofs[k]);
const uchar* sptr_k1 = sptr + j + space_ofs[k+1]; const int* const sptr_k1 = reinterpret_cast<const int*>(sptr + j + space_ofs[k+1]);
const uchar* sptr_k2 = sptr + j + space_ofs[k+2]; const int* const sptr_k2 = reinterpret_cast<const int*>(sptr + j + space_ofs[k+2]);
const uchar* sptr_k3 = sptr + j + space_ofs[k+3]; const int* const sptr_k3 = reinterpret_cast<const int*>(sptr + j + space_ofs[k+3]);
__m128 _b = _mm_set_ps(sptr_k3[0],sptr_k2[0],sptr_k1[0],sptr_k[0]); __m128 _b = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k0[0]), izero), izero));
__m128 _g = _mm_set_ps(sptr_k3[1],sptr_k2[1],sptr_k1[1],sptr_k[1]); __m128 _g = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k1[0]), izero), izero));
__m128 _r = _mm_set_ps(sptr_k3[2],sptr_k2[2],sptr_k1[2],sptr_k[2]); __m128 _r = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k2[0]), izero), izero));
__m128 _z = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(sptr_k3[0]), izero), izero));
_MM_TRANSPOSE4_PS(_b, _g, _r, _z);
__m128 bt = _mm_andnot_ps(_signMask, _mm_sub_ps(_b,_b0)); __m128 bt = _mm_andnot_ps(_signMask, _mm_sub_ps(_b,_b0));
__m128 gt = _mm_andnot_ps(_signMask, _mm_sub_ps(_g,_g0)); __m128 gt = _mm_andnot_ps(_signMask, _mm_sub_ps(_g,_g0));
...@@ -1961,6 +1965,7 @@ public: ...@@ -1961,6 +1965,7 @@ public:
#if CV_SSE3 #if CV_SSE3
if( haveSSE3 ) if( haveSSE3 )
{ {
__m128 psum = _mm_setzero_ps();
const __m128 _val0 = _mm_set1_ps(sptr[j]); const __m128 _val0 = _mm_set1_ps(sptr[j]);
const __m128 _scale_index = _mm_set1_ps(scale_index); const __m128 _scale_index = _mm_set1_ps(scale_index);
const __m128 _signMask = _mm_load_ps((const float*)bufSignMask); const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
...@@ -1986,11 +1991,12 @@ public: ...@@ -1986,11 +1991,12 @@ public:
_sw = _mm_hadd_ps(_w, _val); _sw = _mm_hadd_ps(_w, _val);
_sw = _mm_hadd_ps(_sw, _sw); _sw = _mm_hadd_ps(_sw, _sw);
_mm_storel_pi((__m64*)bufSum32, _sw); psum = _mm_add_ps(_sw, psum);
sum += bufSum32[1];
wsum += bufSum32[0];
} }
_mm_storel_pi((__m64*)bufSum32, psum);
sum = bufSum32[1];
wsum = bufSum32[0];
} }
#endif #endif
...@@ -2009,7 +2015,7 @@ public: ...@@ -2009,7 +2015,7 @@ public:
} }
else else
{ {
assert( cn == 3 ); CV_Assert( cn == 3 );
for( j = 0; j < size.width*3; j += 3 ) for( j = 0; j < size.width*3; j += 3 )
{ {
float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0; float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
...@@ -2018,6 +2024,7 @@ public: ...@@ -2018,6 +2024,7 @@ public:
#if CV_SSE3 #if CV_SSE3
if( haveSSE3 ) if( haveSSE3 )
{ {
__m128 sum = _mm_setzero_ps();
const __m128 _b0 = _mm_set1_ps(b0); const __m128 _b0 = _mm_set1_ps(b0);
const __m128 _g0 = _mm_set1_ps(g0); const __m128 _g0 = _mm_set1_ps(g0);
const __m128 _r0 = _mm_set1_ps(r0); const __m128 _r0 = _mm_set1_ps(r0);
...@@ -2028,14 +2035,16 @@ public: ...@@ -2028,14 +2035,16 @@ public:
{ {
__m128 _sw = _mm_loadu_ps(space_weight + k); __m128 _sw = _mm_loadu_ps(space_weight + k);
const float* sptr_k = sptr + j + space_ofs[k]; const float* const sptr_k0 = sptr + j + space_ofs[k];
const float* sptr_k1 = sptr + j + space_ofs[k+1]; const float* const sptr_k1 = sptr + j + space_ofs[k+1];
const float* sptr_k2 = sptr + j + space_ofs[k+2]; const float* const sptr_k2 = sptr + j + space_ofs[k+2];
const float* sptr_k3 = sptr + j + space_ofs[k+3]; const float* const sptr_k3 = sptr + j + space_ofs[k+3];
__m128 _b = _mm_set_ps(sptr_k3[0], sptr_k2[0], sptr_k1[0], sptr_k[0]); __m128 _b = _mm_loadu_ps(sptr_k0);
__m128 _g = _mm_set_ps(sptr_k3[1], sptr_k2[1], sptr_k1[1], sptr_k[1]); __m128 _g = _mm_loadu_ps(sptr_k1);
__m128 _r = _mm_set_ps(sptr_k3[2], sptr_k2[2], sptr_k1[2], sptr_k[2]); __m128 _r = _mm_loadu_ps(sptr_k2);
__m128 _z = _mm_loadu_ps(sptr_k3);
_MM_TRANSPOSE4_PS(_b, _g, _r, _z);
__m128 _bt = _mm_andnot_ps(_signMask,_mm_sub_ps(_b,_b0)); __m128 _bt = _mm_andnot_ps(_signMask,_mm_sub_ps(_b,_b0));
__m128 _gt = _mm_andnot_ps(_signMask,_mm_sub_ps(_g,_g0)); __m128 _gt = _mm_andnot_ps(_signMask,_mm_sub_ps(_g,_g0));
...@@ -2060,14 +2069,13 @@ public: ...@@ -2060,14 +2069,13 @@ public:
_g = _mm_hadd_ps(_g, _r); _g = _mm_hadd_ps(_g, _r);
_w = _mm_hadd_ps(_w, _g); _w = _mm_hadd_ps(_w, _g);
_mm_store_ps(bufSum32, _w); sum = _mm_add_ps(sum, _w);
wsum += bufSum32[0];
sum_b += bufSum32[1];
sum_g += bufSum32[2];
sum_r += bufSum32[3];
} }
_mm_store_ps(bufSum32, sum);
wsum = bufSum32[0];
sum_b = bufSum32[1];
sum_g = bufSum32[2];
sum_r = bufSum32[3];
} }
#endif #endif
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment