let the test of AccSqr_SIMD<ushort, float> pass

* The difference becomes too large when the multiplication is done in int16, because `_mm_mullo_epi16` keeps only the low 16 bits of each product.
* To reproduce the test failure, IPP has to be switched off.

let the test of AccSqr_SIMD<ushort, float> pass
* The difference becomes too large when the multiplication is done in int16, because `_mm_mullo_epi16` keeps only the low 16 bits of each product.
* To reproduce the test failure, IPP has to be switched off.
ba73249d · Tomoaki Teshima · 32c23908 · ba73249d
Commit ba73249d authored Aug 10, 2016 by Tomoaki Teshima
Hide whitespace changes
Inline Side-by-side

Showing with 8 additions and 6 deletions

accum.cpp modules/imgproc/src/accum.cpp +8 -6

No files found.
--- a/modules/imgproc/src/accum.cpp
+++ b/modules/imgproc/src/accum.cpp
@@ -964,13 +964,15 @@ struct AccSqr_SIMD<ushort, float>
            for ( ; x <= len - 8; x += 8)
            {
                __m128i v_src = _mm_loadu_si128((const __m128i*)(src + x));
-                __m128i v_src0 = _mm_unpacklo_epi16(v_src, v_0);
+                __m128i v_int0 = _mm_unpacklo_epi16(v_src, v_0);
-                __m128i v_src1 = _mm_unpackhi_epi16(v_src, v_0);
+                __m128i v_int1 = _mm_unpackhi_epi16(v_src, v_0);
-                v_src0 = _mm_mullo_epi16(v_src0, v_src0);
+                __m128 v_src0 = _mm_cvtepi32_ps(v_int0);
-                v_src1 = _mm_mullo_epi16(v_src1, v_src1);
+                __m128 v_src1 = _mm_cvtepi32_ps(v_int1);
+                v_src0 = _mm_mul_ps(v_src0, v_src0);
+                v_src1 = _mm_mul_ps(v_src1, v_src1);
-                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), _mm_cvtepi32_ps(v_src0)));
+                _mm_storeu_ps(dst + x, _mm_add_ps(_mm_loadu_ps(dst + x), v_src0));
-                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), _mm_cvtepi32_ps(v_src1)));
+                _mm_storeu_ps(dst + x + 4, _mm_add_ps(_mm_loadu_ps(dst + x + 4), v_src1));
            }
        }