Commit f8137023, authored by Evgeny Latkin, committed by Alexander Alekhin

Merge pull request #13162 from elatkin:el/gapi_perf_rgb2gray

GAPI (fluid): RGB/BGR to gray: optimization (#13162)

* GAPI (fluid): RGB/BGR to Gray: add performance tests

* GAPI (fluid): RGB/BGR to Gray: speedup 8-12x with manual CV_SIMD

* GAPI (fluid): RGB/BGR to Gray: fix compiler warning

* GAPI (fluid): RGB/BGR to Gray: dynamic dispatching to AVX2

* GAPI (fluid): RGB/BGR to Gray: check R/G/B coefficients

* GAPI (fluid): RGB/BGR to Gray: fixed compilation error (caused by change in master)
parent 85fad150
......@@ -602,7 +602,7 @@ PERF_TEST_P_(RGB2GrayPerfTest, TestPerformance)
TEST_CYCLE()
{
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
c.apply(in_mat1, out_mat_gapi);
}
// Comparison //////////////////////////////////////////////////////////////
......@@ -640,7 +640,7 @@ PERF_TEST_P_(BGR2GrayPerfTest, TestPerformance)
TEST_CYCLE()
{
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
c.apply(in_mat1, out_mat_gapi);
}
// Comparison //////////////////////////////////////////////////////////////
......
......@@ -33,4 +33,14 @@ namespace opencv_test
Values(1, 2),
Values(cv::compile_args(IMGPROC_FLUID))));
// Instantiates RGB2GrayPerfTest under the name RGB2GrayPerfTestFluid.
// The parameter sets, in positional order, are:
//   1) a comparison functor built from ToleranceColor(1e-3),
//   2) szVGA, sz720p and sz1080p (presumably the frame sizes to measure -- confirm),
//   3) the compile arguments wrapping IMGPROC_FLUID.
// NOTE(review): IMGPROC_FLUID presumably names the Fluid imgproc kernel package;
// it is defined outside this view -- confirm.
INSTANTIATE_TEST_CASE_P(RGB2GrayPerfTestFluid, RGB2GrayPerfTest,
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
Values(szVGA, sz720p, sz1080p),
Values(cv::compile_args(IMGPROC_FLUID))));
// Instantiates BGR2GrayPerfTest under the name BGR2GrayPerfTestFluid.
// The parameter sets, in positional order, are:
//   1) a comparison functor built from ToleranceColor(1e-3),
//   2) szVGA, sz720p and sz1080p (presumably the frame sizes to measure -- confirm),
//   3) the compile arguments wrapping IMGPROC_FLUID.
// NOTE(review): IMGPROC_FLUID presumably names the Fluid imgproc kernel package;
// it is defined outside this view -- confirm.
INSTANTIATE_TEST_CASE_P(BGR2GrayPerfTestFluid, BGR2GrayPerfTest,
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
Values(szVGA, sz720p, sz1080p),
Values(cv::compile_args(IMGPROC_FLUID))));
}
......@@ -60,20 +60,15 @@ static void run_rgb2gray(Buffer &dst, const View &src, float coef_r, float coef_
GAPI_Assert(dst.meta().chan == 1);
GAPI_Assert(src.length() == dst.length());
GAPI_Assert(coef_r < 1 && coef_g < 1 && coef_b < 1);
GAPI_Assert(std::abs(coef_r + coef_g + coef_b - 1) < 0.001);
const auto *in = src.InLine<uchar>(0);
auto *out = dst.OutLine<uchar>();
int width = dst.length();
// TODO: Vectorize for SIMD
for (int w=0; w < width; w++)
{
uchar r = in[3*w ];
uchar g = in[3*w + 1];
uchar b = in[3*w + 2];
float result = coef_r*r + coef_g*g + coef_b*b;
out[w] = saturate<uchar>(result, roundf);
}
run_rgb2gray_impl(out, in, width, coef_r, coef_g, coef_b);
}
GAPI_FLUID_KERNEL(GFluidRGB2GrayCustom, cv::gapi::imgproc::GRGB2GrayCustom, false)
......
......@@ -8,13 +8,7 @@
#include "gfluidimgproc_func.hpp"
#include "gfluidimgproc_func.simd.hpp"
#if 1
// NB: workaround for CV_SIMD bug (or feature?):
// - dynamic dispatcher assumes *.simd.hpp is directly in src dir
#include "backends/fluid/gfluidimgproc_func.simd_declarations.hpp"
#else
#include "gfluidimgproc_func.simd_declarations.hpp"
#endif
#include "backends/fluid/gfluidimgproc_func.simd_declarations.hpp"
#include "gfluidutils.hpp"
......@@ -33,6 +27,26 @@ namespace cv {
namespace gapi {
namespace fluid {
//----------------------------------
//
// Fluid kernels: RGB2Gray, BGR2Gray
//
//----------------------------------
// Dispatching entry point for the RGB/BGR-to-gray row kernel.
//
// Forwards all six arguments, unchanged and in the same order, to a variant of
// run_rgb2gray_impl selected by the CV_CPU_DISPATCH macro from the targets
// listed in CV_CPU_DISPATCH_MODES_ALL.
// NOTE(review): the selection is presumably made at run time from the detected
// CPU features (the commit message mentions dynamic dispatching to AVX2) --
// confirm against the CV_CPU_DISPATCH definition.
void run_rgb2gray_impl(uchar out[], const uchar in[], int width,
float coef_r, float coef_g, float coef_b)
{
CV_CPU_DISPATCH(run_rgb2gray_impl,
(out, in, width, coef_r, coef_g, coef_b),
CV_CPU_DISPATCH_MODES_ALL);
}
//---------------------
//
// Fluid kernels: Sobel
//
//---------------------
#define RUN_SOBEL_ROW(DST, SRC) \
void run_sobel_row(DST out[], const SRC *in[], int width, int chan, \
const float kx[], const float ky[], int border, \
......
......@@ -14,6 +14,15 @@ namespace cv {
namespace gapi {
namespace fluid {
//----------------------------------
//
// Fluid kernels: RGB2Gray, BGR2Gray
//
//----------------------------------
// Converts one row of interleaved 3-channel 8-bit pixels to 8-bit gray.
//
//   out    - destination row; `width` values are written
//   in     - source row; pixel w occupies in[3*w], in[3*w + 1], in[3*w + 2]
//   width  - number of pixels in the row
//   coef_r - weight applied to channel 0 of every pixel
//   coef_g - weight applied to channel 1 of every pixel
//   coef_b - weight applied to channel 2 of every pixel
//
// The implementation expects every weight to be below 1 and the weights to
// sum to 1; it asserts on the quantized values.
void run_rgb2gray_impl(uchar out[], const uchar in[], int width,
float coef_r, float coef_g, float coef_b);
//---------------------
//
// Fluid kernels: Sobel
......
......@@ -14,6 +14,8 @@
#include "opencv2/core.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include <cstdint>
#ifdef __GNUC__
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wstrict-overflow"
......@@ -25,7 +27,20 @@ namespace fluid {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
//----------------------------------------------------------------------
//----------------------------------
//
// Fluid kernels: RGB2Gray, BGR2Gray
//
//----------------------------------
// Per-CPU-target variant of the RGB/BGR-to-gray row kernel, declared inside
// the CV_CPU_OPTIMIZATION namespace. Its definition is compiled only when
// CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY is not defined.
//
//   out    - destination row; `width` gray values are written
//   in     - source row of `width` interleaved 3-channel pixels
//   width  - number of pixels in the row
//   coef_r, coef_g, coef_b - weights of channels 0, 1 and 2 respectively
void run_rgb2gray_impl(uchar out[], const uchar in[], int width,
float coef_r, float coef_g, float coef_b);
//---------------------
//
// Fluid kernels: Sobel
//
//---------------------
#define RUN_SOBEL_ROW(DST, SRC) \
void run_sobel_row(DST out[], const SRC *in[], int width, int chan, \
......@@ -46,8 +61,93 @@ RUN_SOBEL_ROW( float, float)
#undef RUN_SOBEL_ROW
//----------------------------------------------------------------------
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
//----------------------------------
//
// Fluid kernels: RGB2Gray, BGR2Gray
//
//----------------------------------
// Converts one row of interleaved 3-channel 8-bit pixels to 8-bit gray using
// fixed-point arithmetic.
//
//   out    - destination row; `width` gray values are written
//   in     - source row; pixel w occupies in[3*w], in[3*w + 1], in[3*w + 2]
//   width  - number of pixels in the row
//   coef_r - weight of channel 0
//   coef_g - weight of channel 1
//   coef_b - weight of channel 2
//
// Preconditions (reported through GAPI_Assert when violated):
//   - every weight, scaled by 2^16 and rounded, fits an unsigned 16-bit value;
//   - the three quantized weights sum to 65535 or 65536, i.e. to 1.0 within
//     one unit of the last place.
//
// Rows of at least v_uint8::nlanes pixels take the CV_SIMD path when it is
// compiled in; shorter rows (and non-SIMD builds) take the scalar path.
// NOTE(review): the vector path narrows every product to Q8.8 before summing,
// whereas the scalar path rounds the full-precision sum once, so the two paths
// may differ in the least significant bit -- confirm this is within the
// tolerance the callers expect.
void run_rgb2gray_impl(uchar out[], const uchar in[], int width,
                       float coef_r, float coef_g, float coef_b)
{
    constexpr int unity = 1 << 16;  // 1.0 in Q0.16 (one past the ushort range)

    // Scale to Q0.16 with round-half-up, still in floating point.
    const float r_scaled = coef_r * unity + 0.5f;
    const float g_scaled = coef_g * unity + 0.5f;
    const float b_scaled = coef_b * unity + 0.5f;

    // A float -> ushort conversion truncates toward zero and is undefined when
    // the truncated value does not fit, so validate the range (this also
    // rejects NaN) *before* casting instead of relying on wrap-around.
    GAPI_Assert(r_scaled > -1.0f && r_scaled < unity);
    GAPI_Assert(g_scaled > -1.0f && g_scaled < unity);
    GAPI_Assert(b_scaled > -1.0f && b_scaled < unity);

    const ushort rc = static_cast<ushort>(r_scaled);
    const ushort gc = static_cast<ushort>(g_scaled);
    const ushort bc = static_cast<ushort>(b_scaled);

    // The quantized weights must still add up to one (within rounding);
    // this bound also keeps the 16-bit accumulators below from overflowing.
    GAPI_Assert(rc + gc + bc <= unity);
    GAPI_Assert(rc + gc + bc >= USHRT_MAX);

#if CV_SIMD
    constexpr int nlanes = v_uint8::nlanes;

    if (width >= nlanes)
    {
        constexpr ushort half_q8 = 1 << 7;  // 0.5 in Q8.8

        // Loop-invariant broadcasts, built once per row.
        const v_uint16 v_rc   = vx_setall_u16(rc);
        const v_uint16 v_gc   = vx_setall_u16(gc);
        const v_uint16 v_bc   = vx_setall_u16(bc);
        const v_uint16 v_half = vx_setall_u16(half_q8);

        for (int w = 0; w < width; )
        {
            // Main part of the row: whole vectors of nlanes pixels.
            for ( ; w <= width - nlanes; w += nlanes)
            {
                v_uint8 r, g, b;
                v_load_deinterleave(&in[3*w], r, g, b);

                v_uint16 r0, r1, g0, g1, b0, b1;
                v_expand(r, r0, r1);
                v_expand(g, g0, g1);
                v_expand(b, b0, b1);

                // (pixel << 8) is the pixel in Q8.8; multiplying by a Q0.16
                // weight and keeping the high 16 bits yields Q8.8 again.
                // Adding 0.5 and shifting right by 8 rounds to an integer.
                const v_uint16 y0 = (v_mul_hi(r0 << 8, v_rc) +
                                     v_mul_hi(g0 << 8, v_gc) +
                                     v_mul_hi(b0 << 8, v_bc) +
                                     v_half) >> 8;
                const v_uint16 y1 = (v_mul_hi(r1 << 8, v_rc) +
                                     v_mul_hi(g1 << 8, v_gc) +
                                     v_mul_hi(b1 << 8, v_bc) +
                                     v_half) >> 8;

                v_store(&out[w], v_pack(y0, y1));
            }

            // Tail (if any): step back so that the last vector ends exactly at
            // the end of the row; a few pixels are simply recomputed.
            if (w < width)
            {
                GAPI_DbgAssert(width - nlanes >= 0);
                w = width - nlanes;
            }
        }

        return;
    }
#endif

    // Scalar path: full-precision Q0.16 accumulation with a single rounding.
    constexpr int half_q16 = 1 << 15;  // 0.5 in Q0.16
    for (int w = 0; w < width; w++)
    {
        const uchar r = in[3*w    ];
        const uchar g = in[3*w + 1];
        const uchar b = in[3*w + 2];

        const ushort y = (r*rc + b*bc + g*gc + half_q16) >> 16;
        out[w] = static_cast<uchar>(y);
    }
}
//---------------------
//
// Fluid kernels: Sobel
//
//---------------------
// Sobel 3x3: vertical pass
template<bool noscale, typename DST>
static void run_sobel3x3_vert(DST out[], int length, const float ky[],
......@@ -285,7 +385,6 @@ RUN_SOBEL_ROW( float, float)
#undef RUN_SOBEL_ROW
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
//----------------------------------------------------------------------
CV_CPU_OPTIMIZATION_NAMESPACE_END
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment