Commit 4ad938af authored by Vadim Pisarevsky

fixed incorrect sign of the result of the convolution with normalized asymmetric…

fixed incorrect sign of the result of the convolution with normalized asymmetric kernels (ticket #779)
parent 6b8b42bb
...@@ -1596,8 +1596,6 @@ struct SymmColumnVec_32f ...@@ -1596,8 +1596,6 @@ struct SymmColumnVec_32f
{ {
f = _mm_load_ss(ky+k); f = _mm_load_ss(ky+k);
f = _mm_shuffle_ps(f, f, 0); f = _mm_shuffle_ps(f, f, 0);
S = src[k] + i;
S2 = src[-k] + i;
x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i)); x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
} }
...@@ -1724,8 +1722,8 @@ struct SymmColumnSmallVec_32f ...@@ -1724,8 +1722,8 @@ struct SymmColumnSmallVec_32f
for( ; i <= width - 8; i += 8 ) for( ; i <= width - 8; i += 8 )
{ {
__m128 s0 = d4, s1 = d4, x0, x1; __m128 s0 = d4, s1 = d4, x0, x1;
x0 = _mm_sub_ps(_mm_load_ps(S0 + i), _mm_load_ps(S2 + i)); x0 = _mm_sub_ps(_mm_load_ps(S2 + i), _mm_load_ps(S0 + i));
x1 = _mm_sub_ps(_mm_load_ps(S0 + i + 4), _mm_load_ps(S2 + i + 4)); x1 = _mm_sub_ps(_mm_load_ps(S2 + i + 4), _mm_load_ps(S0 + i + 4));
s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1)); s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1)); s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
_mm_storeu_ps(dst + i, s0); _mm_storeu_ps(dst + i, s0);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment