Merge pull request #13162 from elatkin:el/gapi_perf_rgb2gray

GAPI (fluid): RGB/BGR to gray: optimization (#13162)

* GAPI (fluid): RGB/BGR to Gray: add performance tests
* GAPI (fluid): RGB/BGR to Gray: speedup 8-12x with manual CV_SIMD
* GAPI (fluid): RGB/BGR to Gray: fix compiler warning
* GAPI (fluid): RGB/BGR to Gray: dynamic dispatching to AVX2
* GAPI (fluid): RGB/BGR to Gray: check R/G/B coefficients
* GAPI (fluid): RGB/BGR to Gray: fixed compilation error (caused by change in master)

Merge pull request #13162 from elatkin:el/gapi_perf_rgb2gray
GAPI (fluid): RGB/BGR to gray: optimization (#13162)

* GAPI (fluid): RGB/BGR to Gray: add performance tests
* GAPI (fluid): RGB/BGR to Gray: speedup 8-12x with manual CV_SIMD
* GAPI (fluid): RGB/BGR to Gray: fix compiler warning
* GAPI (fluid): RGB/BGR to Gray: dynamic dispatching to AVX2
* GAPI (fluid): RGB/BGR to Gray: check R/G/B coefficients
* GAPI (fluid): RGB/BGR to Gray: fixed compilation error (caused by change in master)
f8137023 · Evgeny Latkin · Alexander Alekhin · 85fad150 · f8137023 · f8137023
Commit f8137023 authored Nov 15, 2018 by Evgeny Latkin Committed by Alexander Alekhin Nov 15, 2018
6 changed files
--- a/modules/gapi/perf/common/gapi_imgproc_perf_tests_inl.hpp
+++ b/modules/gapi/perf/common/gapi_imgproc_perf_tests_inl.hpp
@@ -602,7 +602,7 @@ PERF_TEST_P_(RGB2GrayPerfTest, TestPerformance)
    TEST_CYCLE()
    {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
    }
    // Comparison //////////////////////////////////////////////////////////////
@@ -640,7 +640,7 @@ PERF_TEST_P_(BGR2GrayPerfTest, TestPerformance)
    TEST_CYCLE()
    {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
    }
    // Comparison //////////////////////////////////////////////////////////////

--- a/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_fluid.cpp
+++ b/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_fluid.cpp
@@ -33,4 +33,14 @@ namespace opencv_test
            Values(1, 2),
            Values(cv::compile_args(IMGPROC_FLUID))));
+    INSTANTIATE_TEST_CASE_P(RGB2GrayPerfTestFluid, RGB2GrayPerfTest,  // RGB -> gray perf test, instantiated for the Fluid kernels
+        Combine(Values(ToleranceColor(1e-3).to_compare_f()),  // comparator built from ToleranceColor(1e-3)
+                Values(szVGA, sz720p, sz1080p),  // frame sizes to run (VGA, 720p, 1080p by their names)
+                Values(cv::compile_args(IMGPROC_FLUID))));  // compile arguments carrying IMGPROC_FLUID
+    INSTANTIATE_TEST_CASE_P(BGR2GrayPerfTestFluid, BGR2GrayPerfTest,  // BGR -> gray perf test, same parameter set as the RGB one
+        Combine(Values(ToleranceColor(1e-3).to_compare_f()),  // comparator built from ToleranceColor(1e-3)
+                Values(szVGA, sz720p, sz1080p),  // frame sizes to run (VGA, 720p, 1080p by their names)
+                Values(cv::compile_args(IMGPROC_FLUID))));  // compile arguments carrying IMGPROC_FLUID
 }
--- a/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
@@ -60,20 +60,15 @@ static void run_rgb2gray(Buffer &dst, const View &src, float coef_r, float coef_
    GAPI_Assert(dst.meta().chan == 1);
    GAPI_Assert(src.length() == dst.length());
+    GAPI_Assert(coef_r < 1 && coef_g < 1 && coef_b < 1);
+    GAPI_Assert(std::abs(coef_r + coef_g + coef_b - 1) < 0.001);
    const auto *in  = src.InLine<uchar>(0);
          auto *out = dst.OutLine<uchar>();
    int width = dst.length();
-    // TODO: Vectorize for SIMD
+    run_rgb2gray_impl(out, in, width, coef_r, coef_g, coef_b);
-    for (int w=0; w < width; w++)
-    {
-        uchar r = in[3*w    ];
-        uchar g = in[3*w + 1];
-        uchar b = in[3*w + 2];
-        float result = coef_r*r + coef_g*g + coef_b*b;
-        out[w] = saturate<uchar>(result, roundf);
-    }
 }
 GAPI_FLUID_KERNEL(GFluidRGB2GrayCustom, cv::gapi::imgproc::GRGB2GrayCustom, false)

--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
@@ -8,13 +8,7 @@
 #include "gfluidimgproc_func.hpp"
 #include "gfluidimgproc_func.simd.hpp"
-#if 1
+#include "backends/fluid/gfluidimgproc_func.simd_declarations.hpp"
-  // NB: workaround for CV_SIMD bug (or feature?):
-  // - dynamic dispatcher assumes *.simd.hpp is directly in src dir
-  #include "backends/fluid/gfluidimgproc_func.simd_declarations.hpp"
-#else
-  #include                "gfluidimgproc_func.simd_declarations.hpp"
-#endif
 #include "gfluidutils.hpp"
@@ -33,6 +27,26 @@ namespace cv {
 namespace gapi {
 namespace fluid {
+//----------------------------------
+// Fluid kernels: RGB2Gray, BGR2Gray
+//
+// Entry point of the row kernel: passes its arguments unchanged to CV_CPU_DISPATCH.
+//----------------------------------
+void run_rgb2gray_impl(uchar out[], const uchar in[], int width,
+                       float coef_r, float coef_g, float coef_b)
+{
+    CV_CPU_DISPATCH(run_rgb2gray_impl,  // name of the implementation to invoke
+        (out, in, width, coef_r, coef_g, coef_b),  // argument list, forwarded as is
+        CV_CPU_DISPATCH_MODES_ALL);  // candidate builds; presumably picked by run-time CPU features - confirm
+}
+//---------------------
+//
+// Fluid kernels: Sobel
+//
+//---------------------
 #define RUN_SOBEL_ROW(DST, SRC)                                          \
 void run_sobel_row(DST out[], const SRC *in[], int width, int chan,      \
                   const float kx[], const float ky[], int border,       \

--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
@@ -14,6 +14,15 @@ namespace cv {
 namespace gapi {
 namespace fluid {
+//----------------------------------
+// Fluid kernels: RGB2Gray, BGR2Gray
+//
+// Row kernel: turns `width` interleaved 3-byte pixels of `in` into `width` gray bytes of `out`.
+//----------------------------------
+void run_rgb2gray_impl(uchar out[], const uchar in[], int width,  // out: gray row, in: 3-channel row, width: pixel count
+                       float coef_r, float coef_g, float coef_b);  // weights of the 1st, 2nd and 3rd byte of each pixel
 //---------------------
 //
 // Fluid kernels: Sobel

--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
@@ -14,6 +14,8 @@
 #include "opencv2/core.hpp"
 #include "opencv2/core/hal/intrin.hpp"
+#include <cstdint>
 #ifdef __GNUC__
 #  pragma GCC diagnostic push
 #  pragma GCC diagnostic ignored "-Wstrict-overflow"
@@ -25,7 +27,20 @@ namespace fluid {
 CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
-//----------------------------------------------------------------------
+//----------------------------------
+// Fluid kernels: RGB2Gray, BGR2Gray
+//
+// Declaration of the row kernel inside the CPU-optimization namespace opened above.
+//----------------------------------
+void run_rgb2gray_impl(uchar out[], const uchar in[], int width,  // out: gray row, in: 3-channel row, width: pixel count
+                       float coef_r, float coef_g, float coef_b);  // weights of the 1st, 2nd and 3rd byte of each pixel
+//---------------------
+//
+// Fluid kernels: Sobel
+//
+//---------------------
 #define RUN_SOBEL_ROW(DST, SRC)                                     \
 void run_sobel_row(DST out[], const SRC *in[], int width, int chan, \
@@ -46,8 +61,93 @@ RUN_SOBEL_ROW( float,  float)
 #undef RUN_SOBEL_ROW
 //----------------------------------------------------------------------
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+//----------------------------------
+//
+// Fluid kernels: RGB2Gray, BGR2Gray
+//
+//----------------------------------
+// Converts one row of interleaved 3-channel 8-bit pixels to 8-bit gray,
+// using unsigned 16-bit fixed-point weights instead of float arithmetic.
+//
+//   out    - destination row, `width` gray pixels
+//   in     - source row, `width` pixels of 3 bytes each; within a pixel the
+//            bytes are weighted by coef_r, coef_g, coef_b in that order
+//   width  - number of pixels in the row
+//   coef_* - channel weights: each must be in [0, 1) and, after rounding to
+//            1/65536 units, they must sum to 65535 or 65536
+//
+// Violating the requirements on the weights fails a GAPI_Assert.
+void run_rgb2gray_impl(uchar out[], const uchar in[], int width,
+                       float coef_r, float coef_g, float coef_b)
+{
+    constexpr int unity = 1 << 16;  // 1.0 in 0.16 fixed point
+    // Check the range while still in floating point: converting a negative,
+    // NaN or too large float to an integer type is undefined behavior
+    GAPI_Assert(coef_r >= 0 && coef_g >= 0 && coef_b >= 0);
+    GAPI_Assert(coef_r <  1 && coef_g <  1 && coef_b <  1);
+    // Round to nearest in int, which can hold every value of [0, unity]
+    const int rq = static_cast<int>(coef_r * unity + 0.5f);
+    const int gq = static_cast<int>(coef_g * unity + 0.5f);
+    const int bq = static_cast<int>(coef_b * unity + 0.5f);
+    // A weight just below 1 may round up to `unity`, which does not fit ushort
+    GAPI_Assert(rq < unity && gq < unity && bq < unity);
+    // The rounded weights must still add up to 1, give or take one unit;
+    // the upper bound also keeps the 16-bit sums below from overflowing
+    GAPI_Assert(rq + gq + bq <= unity);
+    GAPI_Assert(rq + gq + bq >= unity - 1);
+    const ushort rc = static_cast<ushort>(rq);
+    const ushort gc = static_cast<ushort>(gq);
+    const ushort bc = static_cast<ushort>(bq);
+#if CV_SIMD
+    constexpr int nlanes = v_uint8::nlanes;
+    if (width >= nlanes)
+    {
+        // Loop invariants: broadcast the weights and the rounding term once
+        constexpr ushort half = 1 << 7;  // 0.5 in 8.8 fixed point
+        const v_uint16 v_rc   = vx_setall_u16(rc);
+        const v_uint16 v_gc   = vx_setall_u16(gc);
+        const v_uint16 v_bc   = vx_setall_u16(bc);
+        const v_uint16 v_half = vx_setall_u16(half);
+        for (int w=0; w < width; )
+        {
+            // process main part of pixels row
+            for ( ; w <= width - nlanes; w += nlanes)
+            {
+                v_uint8 r, g, b;
+                v_load_deinterleave(&in[3*w], r, g, b);
+                v_uint16 r0, r1, g0, g1, b0, b1;
+                v_expand(r, r0, r1);
+                v_expand(g, g0, g1);
+                v_expand(b, b0, b1);
+                // (x << 8) is x in 8.8 format; v_mul_hi keeps the upper half
+                // of the product, i.e. x*weight in 8.8; adding 0.5 and
+                // dropping the fraction rounds to nearest
+                const v_uint16 y0 = (v_mul_hi(r0 << 8, v_rc) +
+                                     v_mul_hi(g0 << 8, v_gc) +
+                                     v_mul_hi(b0 << 8, v_bc) +
+                                                       v_half) >> 8;
+                const v_uint16 y1 = (v_mul_hi(r1 << 8, v_rc) +
+                                     v_mul_hi(g1 << 8, v_gc) +
+                                     v_mul_hi(b1 << 8, v_bc) +
+                                                       v_half) >> 8;
+                v_store(&out[w], v_pack(y0, y1));
+            }
+            // process tail (if any): step back so that one more full vector
+            // ends exactly at the end of the row; the pixels it overlaps
+            // with are simply computed again
+            if (w < width)
+            {
+                GAPI_DbgAssert(width - nlanes >= 0);
+                w = width - nlanes;
+            }
+        }
+        return;
+    }
+#endif
+    // Scalar path: rows shorter than one vector, or builds without CV_SIMD
+    for (int w=0; w < width; w++)
+    {
+        const uchar r = in[3*w    ];
+        const uchar g = in[3*w + 1];
+        const uchar b = in[3*w + 2];
+        constexpr int half = 1 << 15;  // 0.5 in 0.16 fixed point
+        // weights sum to at most `unity`, so the result never exceeds 255
+        out[w] = static_cast<uchar>((r*rc + g*gc + b*bc + half) >> 16);
+    }
+}
+//---------------------
+//
+// Fluid kernels: Sobel
+//
+//---------------------
 // Sobel 3x3: vertical pass
 template<bool noscale, typename DST>
 static void run_sobel3x3_vert(DST out[], int length, const float ky[],
@@ -285,7 +385,6 @@ RUN_SOBEL_ROW( float,  float)
 #undef RUN_SOBEL_ROW
 #endif  // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
-//----------------------------------------------------------------------
 CV_CPU_OPTIMIZATION_NAMESPACE_END