core(mathfuncs_core): cpu optimization dispatched code

1e6ce1d2 · Alexander Alekhin · 17e5e4cd · 1e6ce1d2 · 1e6ce1d2 · 1e6ce1d2
Commit 1e6ce1d2 authored Mar 23, 2017 by Alexander Alekhin
Showing with 252 additions and 157 deletions

CMakeLists.txt modules/core/CMakeLists.txt +3 -0

mathfuncs_core.dispatch.cpp modules/core/src/mathfuncs_core.dispatch.cpp +215 -0

mathfuncs_core.simd.hpp modules/core/src/mathfuncs_core.simd.hpp +34 -157

No files found.
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
 set(the_description "The Core Functionality")
+ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2)
 ocv_add_module(core
               "${OPENCV_HAL_LINKER_LIBS}"
               OPTIONAL opencv_cudev

--- a/modules/core/src/mathfuncs_core.dispatch.cpp
+++ b/modules/core/src/mathfuncs_core.dispatch.cpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#include "precomp.hpp"
+#include "mathfuncs_core.simd.hpp"
+#include "mathfuncs_core.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
+namespace cv { namespace hal {
+///////////////////////////////////// ATAN2 ////////////////////////////////////
+void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees ) // hal entry point for float arrays: HAL hook first, then the CPU-dispatched fastAtan32f
+{
+    CV_INSTRUMENT_REGION()
+    CALL_HAL(fastAtan32f, cv_hal_fastAtan32f, Y, X, angle, len, angleInDegrees); // NOTE(review): presumably returns from this function when cv_hal_fastAtan32f handles the call - confirm against the CALL_HAL definition
+    CV_CPU_DISPATCH(fastAtan32f, (Y, X, angle, len, angleInDegrees),
+        CV_CPU_DISPATCH_MODES_ALL); // mode list is provided by mathfuncs_core.simd_declarations.hpp; forwards the same five arguments
+}
+void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees) // double-precision counterpart of fastAtan32f: HAL hook first, then the CPU-dispatched fastAtan64f
+{
+    CV_INSTRUMENT_REGION()
+    CALL_HAL(fastAtan64f, cv_hal_fastAtan64f, Y, X, angle, len, angleInDegrees); // NOTE(review): presumably returns from this function when cv_hal_fastAtan64f handles the call - confirm against the CALL_HAL definition
+    CV_CPU_DISPATCH(fastAtan64f, (Y, X, angle, len, angleInDegrees),
+        CV_CPU_DISPATCH_MODES_ALL); // mode list is provided by mathfuncs_core.simd_declarations.hpp; forwards the same five arguments
+}
+// deprecated: retained name that simply forwards to fastAtan32f
+void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
+{
+    CV_INSTRUMENT_REGION()
+    fastAtan32f(Y, X, angle, len, angleInDegrees); // arguments passed through unchanged; the HAL hook and CPU dispatch happen inside fastAtan32f
+}
+void magnitude32f(const float* x, const float* y, float* mag, int len) // hal entry point for float arrays; in order: HAL hook, IPP, CPU-dispatched magnitude32f
+{
+    CV_INSTRUMENT_REGION()
+    CALL_HAL(magnitude32f, cv_hal_magnitude32f, x, y, mag, len); // NOTE(review): presumably returns from this function when cv_hal_magnitude32f handles the call - confirm against the CALL_HAL definition
+    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsMagnitude_32f, x, y, mag, len) >= 0); // NOTE(review): presumably returns early when the IPP status is >= 0 - confirm against CV_IPP_RUN_FAST
+    CV_CPU_DISPATCH(magnitude32f, (x, y, mag, len),
+        CV_CPU_DISPATCH_MODES_ALL); // mode list is provided by mathfuncs_core.simd_declarations.hpp
+}
+void magnitude64f(const double* x, const double* y, double* mag, int len) // double-precision counterpart of magnitude32f; in order: HAL hook, IPP, CPU-dispatched magnitude64f
+{
+    CV_INSTRUMENT_REGION()
+    CALL_HAL(magnitude64f, cv_hal_magnitude64f, x, y, mag, len); // NOTE(review): presumably returns from this function when cv_hal_magnitude64f handles the call - confirm against the CALL_HAL definition
+    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsMagnitude_64f, x, y, mag, len) >= 0); // NOTE(review): presumably returns early when the IPP status is >= 0 - confirm against CV_IPP_RUN_FAST
+    CV_CPU_DISPATCH(magnitude64f, (x, y, mag, len),
+        CV_CPU_DISPATCH_MODES_ALL); // mode list is provided by mathfuncs_core.simd_declarations.hpp
+}
+void invSqrt32f(const float* src, float* dst, int len) // hal entry point for float arrays; in order: HAL hook, IPP, CPU-dispatched invSqrt32f
+{
+    CV_INSTRUMENT_REGION()
+    CALL_HAL(invSqrt32f, cv_hal_invSqrt32f, src, dst, len); // NOTE(review): presumably returns from this function when cv_hal_invSqrt32f handles the call - confirm against the CALL_HAL definition
+    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsInvSqrt_32f_A21, src, dst, len) >= 0); // NOTE(review): presumably returns early when the IPP status is >= 0 - confirm against CV_IPP_RUN_FAST
+    CV_CPU_DISPATCH(invSqrt32f, (src, dst, len),
+        CV_CPU_DISPATCH_MODES_ALL); // mode list is provided by mathfuncs_core.simd_declarations.hpp
+}
+void invSqrt64f(const double* src, double* dst, int len) // double-precision counterpart of invSqrt32f; in order: HAL hook, IPP, CPU-dispatched invSqrt64f
+{
+    CV_INSTRUMENT_REGION()
+    CALL_HAL(invSqrt64f, cv_hal_invSqrt64f, src, dst, len); // NOTE(review): presumably returns from this function when cv_hal_invSqrt64f handles the call - confirm against the CALL_HAL definition
+    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsInvSqrt_64f_A50, src, dst, len) >= 0); // NOTE(review): presumably returns early when the IPP status is >= 0 - confirm against CV_IPP_RUN_FAST
+    CV_CPU_DISPATCH(invSqrt64f, (src, dst, len),
+        CV_CPU_DISPATCH_MODES_ALL); // mode list is provided by mathfuncs_core.simd_declarations.hpp
+}
+void sqrt32f(const float* src, float* dst, int len) // hal entry point for float arrays; in order: HAL hook, IPP, CPU-dispatched sqrt32f
+{
+    CV_INSTRUMENT_REGION()
+    CALL_HAL(sqrt32f, cv_hal_sqrt32f, src, dst, len); // NOTE(review): presumably returns from this function when cv_hal_sqrt32f handles the call - confirm against the CALL_HAL definition
+    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsSqrt_32f_A21, src, dst, len) >= 0); // NOTE(review): presumably returns early when the IPP status is >= 0 - confirm against CV_IPP_RUN_FAST
+    CV_CPU_DISPATCH(sqrt32f, (src, dst, len),
+        CV_CPU_DISPATCH_MODES_ALL); // mode list is provided by mathfuncs_core.simd_declarations.hpp
+}
+void sqrt64f(const double* src, double* dst, int len) // double-precision counterpart of sqrt32f; in order: HAL hook, IPP, CPU-dispatched sqrt64f
+{
+    CV_INSTRUMENT_REGION()
+    CALL_HAL(sqrt64f, cv_hal_sqrt64f, src, dst, len); // NOTE(review): presumably returns from this function when cv_hal_sqrt64f handles the call - confirm against the CALL_HAL definition
+    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsSqrt_64f_A50, src, dst, len) >= 0); // NOTE(review): presumably returns early when the IPP status is >= 0 - confirm against CV_IPP_RUN_FAST
+    CV_CPU_DISPATCH(sqrt64f, (src, dst, len),
+        CV_CPU_DISPATCH_MODES_ALL); // mode list is provided by mathfuncs_core.simd_declarations.hpp
+}
+void exp32f(const float *src, float *dst, int n) // hal entry point for float arrays; in order: HAL hook, IPP, CPU-dispatched exp32f
+{
+    CV_INSTRUMENT_REGION()
+    CALL_HAL(exp32f, cv_hal_exp32f, src, dst, n); // NOTE(review): presumably returns from this function when cv_hal_exp32f handles the call - confirm against the CALL_HAL definition
+    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsExp_32f_A21, src, dst, n) >= 0); // NOTE(review): presumably returns early when the IPP status is >= 0 - confirm against CV_IPP_RUN_FAST
+    CV_CPU_DISPATCH(exp32f, (src, dst, n),
+        CV_CPU_DISPATCH_MODES_ALL); // mode list is provided by mathfuncs_core.simd_declarations.hpp
+}
+void exp64f(const double *src, double *dst, int n) // double-precision counterpart of exp32f; in order: HAL hook, IPP, CPU-dispatched exp64f
+{
+    CV_INSTRUMENT_REGION()
+    CALL_HAL(exp64f, cv_hal_exp64f, src, dst, n); // NOTE(review): presumably returns from this function when cv_hal_exp64f handles the call - confirm against the CALL_HAL definition
+    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsExp_64f_A50, src, dst, n) >= 0); // NOTE(review): presumably returns early when the IPP status is >= 0 - confirm against CV_IPP_RUN_FAST
+    CV_CPU_DISPATCH(exp64f, (src, dst, n),
+        CV_CPU_DISPATCH_MODES_ALL); // mode list is provided by mathfuncs_core.simd_declarations.hpp
+}
+void log32f(const float *src, float *dst, int n) // hal entry point for float arrays; in order: HAL hook, IPP (ippsLn), CPU-dispatched log32f
+{
+    CV_INSTRUMENT_REGION()
+    CALL_HAL(log32f, cv_hal_log32f, src, dst, n); // NOTE(review): presumably returns from this function when cv_hal_log32f handles the call - confirm against the CALL_HAL definition
+    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsLn_32f_A21, src, dst, n) >= 0); // NOTE(review): presumably returns early when the IPP status is >= 0 - confirm against CV_IPP_RUN_FAST
+    CV_CPU_DISPATCH(log32f, (src, dst, n),
+        CV_CPU_DISPATCH_MODES_ALL); // mode list is provided by mathfuncs_core.simd_declarations.hpp
+}
+void log64f(const double *src, double *dst, int n) // double-precision counterpart of log32f; in order: HAL hook, IPP (ippsLn), CPU-dispatched log64f
+{
+    CV_INSTRUMENT_REGION()
+    CALL_HAL(log64f, cv_hal_log64f, src, dst, n); // NOTE(review): presumably returns from this function when cv_hal_log64f handles the call - confirm against the CALL_HAL definition
+    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsLn_64f_A50, src, dst, n) >= 0); // NOTE(review): presumably returns early when the IPP status is >= 0 - confirm against CV_IPP_RUN_FAST
+    CV_CPU_DISPATCH(log64f, (src, dst, n),
+        CV_CPU_DISPATCH_MODES_ALL); // mode list is provided by mathfuncs_core.simd_declarations.hpp
+}
+//=============================================================================
+// for compatibility with 3.0
+void exp(const float* src, float* dst, int n) // 3.0-compatibility overload for float: forwards unchanged to exp32f
+{
+    exp32f(src, dst, n);
+}
+void exp(const double* src, double* dst, int n) // 3.0-compatibility overload for double: forwards unchanged to exp64f
+{
+    exp64f(src, dst, n);
+}
+void log(const float* src, float* dst, int n) // 3.0-compatibility overload for float: forwards unchanged to log32f
+{
+    log32f(src, dst, n);
+}
+void log(const double* src, double* dst, int n) // 3.0-compatibility overload for double: forwards unchanged to log64f
+{
+    log64f(src, dst, n);
+}
+void magnitude(const float* x, const float* y, float* dst, int n) // 3.0-compatibility overload for float: forwards unchanged to magnitude32f
+{
+    magnitude32f(x, y, dst, n);
+}
+void magnitude(const double* x, const double* y, double* dst, int n) // 3.0-compatibility overload for double: forwards unchanged to magnitude64f
+{
+    magnitude64f(x, y, dst, n);
+}
+void sqrt(const float* src, float* dst, int len) // 3.0-compatibility overload for float: forwards unchanged to sqrt32f
+{
+    sqrt32f(src, dst, len);
+}
+void sqrt(const double* src, double* dst, int len) // 3.0-compatibility overload for double: forwards unchanged to sqrt64f
+{
+    sqrt64f(src, dst, len);
+}
+void invSqrt(const float* src, float* dst, int len) // 3.0-compatibility overload for float: forwards unchanged to invSqrt32f
+{
+    invSqrt32f(src, dst, len);
+}
+void invSqrt(const double* src, double* dst, int len) // 3.0-compatibility overload for double: forwards unchanged to invSqrt64f
+{
+    invSqrt64f(src, dst, len);
+}
+}} // namespace cv::hal
+float cv::fastAtan2( float y, float x ) // scalar overload defined in cv:: (outside cv::hal); delegates through CV_CPU_CALL_BASELINE
+{
+    using namespace cv::hal; // makes the cv::hal declarations visible to the macro expansion below
+    CV_CPU_CALL_BASELINE(fastAtan2, (y, x)); // NOTE(review): no explicit return in this body, so the macro is expected to expand to a `return` of the baseline fastAtan2(y, x) - confirm against its definition
+}
--- a/modules/core/src/mathfuncs_core.cpp
+++ b/modules/core/src/mathfuncs_core.simd.hpp
-/*M///////////////////////////////////////////////////////////////////////////////////////
+// This file is part of OpenCV project.
-//
+// It is subject to the license terms in the LICENSE file found in the top-level directory
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+// of this distribution and at http://opencv.org/license.html.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
+namespace cv { namespace hal {
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
+CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
-//
-//
+// forward declarations
-//                           License Agreement
+void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees);
-//                For Open Source Computer Vision Library
+void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees);
-//
+void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees);
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+void magnitude32f(const float* x, const float* y, float* mag, int len);
-// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+void magnitude64f(const double* x, const double* y, double* mag, int len);
-// Third party copyrights are property of their respective owners.
+void invSqrt32f(const float* src, float* dst, int len);
-//
+void invSqrt64f(const double* src, double* dst, int len);
-// Redistribution and use in source and binary forms, with or without modification,
+void sqrt32f(const float* src, float* dst, int len);
-// are permitted provided that the following conditions are met:
+void sqrt64f(const double* src, double* dst, int len);
-//
+void exp32f(const float *src, float *dst, int n);
-//   * Redistribution's of source code must retain the above copyright notice,
+void exp64f(const double *src, double *dst, int n);
-//     this list of conditions and the following disclaimer.
+void log32f(const float *src, float *dst, int n);
-//
+void log64f(const double *src, double *dst, int n);
-//   * Redistribution's in binary form must reproduce the above copyright notice,
+float fastAtan2(float y, float x);
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
+#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include "precomp.hpp"
 using namespace std;
@@ -197,23 +180,17 @@ static inline void atanImpl(const T *Y, const T *X, T *angle, int len, bool angl
 } // anonymous::
-namespace cv { namespace hal {
 ///////////////////////////////////// ATAN2 ////////////////////////////////////
 void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
 {
    CV_INSTRUMENT_REGION()
-    CALL_HAL(fastAtan32f, cv_hal_fastAtan32f, Y, X, angle, len, angleInDegrees);
    atanImpl<float>(Y, X, angle, len, angleInDegrees);
 }
 void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees)
 {
    CV_INSTRUMENT_REGION()
-    CALL_HAL(fastAtan64f, cv_hal_fastAtan64f, Y, X, angle, len, angleInDegrees);
    atanImpl<double>(Y, X, angle, len, angleInDegrees);
 }
@@ -221,7 +198,6 @@ void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool
 void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees )
 {
    CV_INSTRUMENT_REGION()
    fastAtan32f(Y, X, angle, len, angleInDegrees);
 }
@@ -229,9 +205,6 @@ void magnitude32f(const float* x, const float* y, float* mag, int len)
 {
    CV_INSTRUMENT_REGION()
-    CALL_HAL(magnitude32f, cv_hal_magnitude32f, x, y, mag, len);
-    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsMagnitude_32f, x, y, mag, len) >= 0);
    int i = 0;
 #if CV_SIMD128
@@ -257,9 +230,6 @@ void magnitude64f(const double* x, const double* y, double* mag, int len)
 {
    CV_INSTRUMENT_REGION()
-    CALL_HAL(magnitude64f, cv_hal_magnitude64f, x, y, mag, len);
-    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsMagnitude_64f, x, y, mag, len) >= 0);
    int i = 0;
 #if CV_SIMD128_64F
@@ -286,9 +256,6 @@ void invSqrt32f(const float* src, float* dst, int len)
 {
    CV_INSTRUMENT_REGION()
-    CALL_HAL(invSqrt32f, cv_hal_invSqrt32f, src, dst, len);
-    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsInvSqrt_32f_A21, src, dst, len) >= 0);
    int i = 0;
 #if CV_SIMD128
@@ -310,9 +277,6 @@ void invSqrt64f(const double* src, double* dst, int len)
 {
    CV_INSTRUMENT_REGION()
-    CALL_HAL(invSqrt64f, cv_hal_invSqrt64f, src, dst, len);
-    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsInvSqrt_64f_A50, src, dst, len) >= 0);
    int i = 0;
 #if CV_SSE2
@@ -330,9 +294,6 @@ void sqrt32f(const float* src, float* dst, int len)
 {
    CV_INSTRUMENT_REGION()
-    CALL_HAL(sqrt32f, cv_hal_sqrt32f, src, dst, len);
-    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsSqrt_32f_A21, src, dst, len) >= 0);
    int i = 0;
 #if CV_SIMD128
@@ -354,9 +315,6 @@ void sqrt64f(const double* src, double* dst, int len)
 {
    CV_INSTRUMENT_REGION()
-    CALL_HAL(sqrt64f, cv_hal_sqrt64f, src, dst, len);
-    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsSqrt_64f_A50, src, dst, len) >= 0);
    int i = 0;
 #if CV_SIMD128_64F
@@ -381,9 +339,6 @@ void exp32f(const float *src, float *dst, int n)
 {
    CV_INSTRUMENT_REGION()
-    CALL_HAL(exp32f, cv_hal_exp32f, src, dst, n);
-    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsExp_32f_A21, src, dst, n) >= 0);
    for (int i = 0; i < n; i++)
    {
        dst[i] = std::exp(src[i]);
@@ -394,9 +349,6 @@ void exp64f(const double *src, double *dst, int n)
 {
    CV_INSTRUMENT_REGION()
-    CALL_HAL(exp64f, cv_hal_exp64f, src, dst, n);
-    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsExp_64f_A50, src, dst, n) >= 0);
    for (int i = 0; i < n; i++)
    {
        dst[i] = std::exp(src[i]);
@@ -407,9 +359,6 @@ void log32f(const float *src, float *dst, int n)
 {
    CV_INSTRUMENT_REGION()
-    CALL_HAL(log32f, cv_hal_log32f, src, dst, n);
-    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsLn_32f_A21, src, dst, n) >= 0);
    for (int i = 0; i < n; i++)
    {
        dst[i] = std::log(src[i]);
@@ -419,9 +368,6 @@ void log64f(const double *src, double *dst, int n)
 {
    CV_INSTRUMENT_REGION()
-    CALL_HAL(log64f, cv_hal_log64f, src, dst, n);
-    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsLn_64f_A50, src, dst, n) >= 0);
    for (int i = 0; i < n; i++)
    {
        dst[i] = std::log(src[i]);
@@ -534,9 +480,6 @@ void exp32f( const float *_x, float *y, int n )
 {
    CV_INSTRUMENT_REGION()
-    CALL_HAL(exp32f, cv_hal_exp32f, _x, y, n);
-    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsExp_32f_A21, _x, y, n) >= 0);
    static const float
    A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0),
    A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0),
@@ -569,7 +512,6 @@ void exp32f( const float *_x, float *y, int n )
        for( ; i <= n - 8; i += 8 )
        {
-            __m256 xf;
            __m128i xi0, xi1;
            __m256d xd0 = _mm256_cvtps_pd(_mm_min_ps(_mm_max_ps(_mm_loadu_ps(&x[i].f), minval4), maxval4));
@@ -586,8 +528,7 @@ void exp32f( const float *_x, float *y, int n )
            // gcc does not support _mm256_set_m128
            //xf = _mm256_set_m128(_mm256_cvtpd_ps(xd1), _mm256_cvtpd_ps(xd0));
-            xf = _mm256_insertf128_ps(xf, _mm256_cvtpd_ps(xd0), 0);
+            __m256 xf = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(xd0)), _mm256_cvtpd_ps(xd1), 1);
-            xf = _mm256_insertf128_ps(xf, _mm256_cvtpd_ps(xd1), 1);
            xf = _mm256_mul_ps(xf, postscale8);
@@ -606,14 +547,10 @@ void exp32f( const float *_x, float *y, int n )
            // gcc does not support _mm256_set_m128
            //__m256 yf = _mm256_set_m128(_mm256_cvtpd_ps(yd1), _mm256_cvtpd_ps(yd0));
-            __m256 yf;
+            __m256 yf = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(yd0)), _mm256_cvtpd_ps(yd1), 1);
-            yf = _mm256_insertf128_ps(yf, _mm256_cvtpd_ps(yd0), 0);
-            yf = _mm256_insertf128_ps(yf, _mm256_cvtpd_ps(yd1), 1);
            //_mm256_set_m128i(xi1, xi0)
-            __m256i temp;
+            __m256i temp = (__m256i)_mm256_insertf128_ps(_mm256_castps128_ps256((__m128)xi0), (__m128)xi1, 1);
-            temp = _mm256_inserti128_si256(temp, xi0, 0);
-            temp = _mm256_inserti128_si256(temp, xi1, 1);
            yf = _mm256_mul_ps(yf, _mm256_castsi256_ps(_mm256_slli_epi32(temp, 23)));
@@ -827,9 +764,6 @@ void exp64f( const double *_x, double *y, int n )
 {
    CV_INSTRUMENT_REGION()
-    CALL_HAL(exp64f, cv_hal_exp64f, _x, y, n);
-    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsExp_64f_A50, _x, y, n) >= 0);
    static const double
    A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0,
    A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0,
@@ -1276,9 +1210,6 @@ void log32f( const float *_x, float *y, int n )
 {
    CV_INSTRUMENT_REGION()
-    CALL_HAL(log32f, cv_hal_log32f, _x, y, n);
-    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsLn_32f_A21, _x, y, n) >= 0);
    static const float shift[] = { 0, -1.f/512 };
    static const float
    A0 = 0.3333333333333333333333333f,
@@ -1425,9 +1356,6 @@ void log64f( const double *x, double *y, int n )
 {
    CV_INSTRUMENT_REGION()
-    CALL_HAL(log64f, cv_hal_log64f, x, y, n);
-    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsLn_64f_A50, x, y, n) >= 0);
    static const double shift[] = { 0, -1./512 };
    static const double
    A7 = 1.0,
@@ -1613,64 +1541,13 @@ void log64f( const double *x, double *y, int n )
 #endif // issue 7795
-//=============================================================================
+float fastAtan2( float y, float x )
-// for compatibility with 3.0
-void exp(const float* src, float* dst, int n)
-{
-    exp32f(src, dst, n);
-}
-void exp(const double* src, double* dst, int n)
 {
-    exp64f(src, dst, n);
+    return atanImpl<float>(y, x);
-}
-void log(const float* src, float* dst, int n)
-{
-    log32f(src, dst, n);
-}
-void log(const double* src, double* dst, int n)
-{
-    log64f(src, dst, n);
-}
-void magnitude(const float* x, const float* y, float* dst, int n)
-{
-    magnitude32f(x, y, dst, n);
-}
-void magnitude(const double* x, const double* y, double* dst, int n)
-{
-    magnitude64f(x, y, dst, n);
-}
-void sqrt(const float* src, float* dst, int len)
-{
-    sqrt32f(src, dst, len);
-}
-void sqrt(const double* src, double* dst, int len)
-{
-    sqrt64f(src, dst, len);
-}
-void invSqrt(const float* src, float* dst, int len)
-{
-    invSqrt32f(src, dst, len);
-}
-void invSqrt(const double* src, double* dst, int len)
-{
-    invSqrt64f(src, dst, len);
 }
+#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
-} // cv::hal::
+CV_CPU_OPTIMIZATION_NAMESPACE_END
-} // cv::
-float cv::fastAtan2( float y, float x )
+}} // namespace cv::hal
-{
-    return atanImpl<float>(y, x);
-}