Merge pull request #15339 from pmur:dotprod-32s-vsx

048ddbf9 · Alexander Alekhin · 2a6527e7 · 33fb253a · 048ddbf9 · 048ddbf9
Commit 048ddbf9 authored Aug 31, 2019 by Alexander Alekhin
Hide whitespace changes
Inline Side-by-side

Showing with 38 additions and 0 deletions

intrin_vsx.hpp modules/core/include/opencv2/core/hal/intrin_vsx.hpp +9 -0

matmul.simd.hpp modules/core/src/matmul.simd.hpp +29 -0

No files found.
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -1039,6 +1039,15 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a)
 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
 { return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); }
+// The altivec intrinsic is missing for this 2.06 insn
+inline v_float64x2 v_cvt_f64(const v_int64x2& a)
+{
+vec_double2 out;
+__asm__ ("xvcvsxddp %x0,%x1" : "=wa"(out) : "wa"(a.val));
+return v_float64x2(out);
+}
 ////////////// Lookup table access ////////////////////
 inline v_int8x16 v_lut(const schar* tab, const int* idx)

--- a/modules/core/src/matmul.simd.hpp
+++ b/modules/core/src/matmul.simd.hpp
@@ -2493,7 +2493,36 @@ double dotProd_16s(const short* src1, const short* src2, int len)
 double dotProd_32s(const int* src1, const int* src2, int len)
 {
+#if CV_SIMD128_64F
+    double r = 0.0;
+    int i = 0;
+    int lenAligned = len & -v_int32x4::nlanes;
+    v_float64x2 a(0.0, 0.0);
+    v_float64x2 b(0.0, 0.0);
+    for( i = 0; i < lenAligned; i += v_int32x4::nlanes )
+        {
+        v_int32x4 s1 = v_load(src1);
+        v_int32x4 s2 = v_load(src2);
+#if CV_VSX
+        // Do 32x32->64 multiplies, convert/round to double, accumulate
+        // Potentially less precise than FMA, but 1.5x faster than fma below.
+        a += v_cvt_f64(v_int64(vec_mule(s1.val, s2.val)));
+        b += v_cvt_f64(v_int64(vec_mulo(s1.val, s2.val)));
+#else
+        a = v_fma(v_cvt_f64(s1), v_cvt_f64(s2), a);
+        b = v_fma(v_cvt_f64_high(s1), v_cvt_f64_high(s2), b);
+#endif
+        src1 += v_int32x4::nlanes;
+        src2 += v_int32x4::nlanes;
+        }
+    a += b;
+    r = v_reduce_sum(a);
+    return r + dotProd_(src1, src2, len - i);
+#else
    return dotProd_(src1, src2, len);
+#endif
 }
 double dotProd_32f(const float* src1, const float* src2, int len)