Commit 0e1dd63f authored by Alexander Alekhin's avatar Alexander Alekhin

video(DIS): use OpenCL shared mem

- fix perf test iterations
parent 076ee65c
...@@ -37,10 +37,11 @@ OCL_PERF_TEST_P(DenseOpticalFlow_DIS, perf, ...@@ -37,10 +37,11 @@ OCL_PERF_TEST_P(DenseOpticalFlow_DIS, perf,
Ptr<DenseOpticalFlow> algo = DISOpticalFlow::create(preset); Ptr<DenseOpticalFlow> algo = DISOpticalFlow::create(preset);
OCL_TEST_CYCLE_N(10) PERF_SAMPLE_BEGIN()
{ {
algo->calc(frame1, frame2, flow); algo->calc(frame1, frame2, flow);
} }
PERF_SAMPLE_END()
SANITY_CHECK_NOTHING(); SANITY_CHECK_NOTHING();
} }
......
...@@ -1055,11 +1055,16 @@ bool DISOpticalFlowImpl::ocl_PatchInverseSearch(UMat &src_Ux, UMat &src_Uy, ...@@ -1055,11 +1055,16 @@ bool DISOpticalFlowImpl::ocl_PatchInverseSearch(UMat &src_Ux, UMat &src_Uy,
int idx; int idx;
int num_inner_iter = (int)floor(grad_descent_iter / (float)num_iter); int num_inner_iter = (int)floor(grad_descent_iter / (float)num_iter);
String subgroups_build_options;
if (ocl::Device::getDefault().isExtensionSupported("cl_khr_subgroups"))
subgroups_build_options = "-DCV_USE_SUBGROUPS=1";
for (int iter = 0; iter < num_iter; iter++) for (int iter = 0; iter < num_iter; iter++)
{ {
if (iter == 0) if (iter == 0)
{ {
ocl::Kernel k1("dis_patch_inverse_search_fwd_1", ocl::video::dis_flow_oclsrc); ocl::Kernel k1("dis_patch_inverse_search_fwd_1", ocl::video::dis_flow_oclsrc, subgroups_build_options);
size_t global_sz[] = {(size_t)hs * 8}; size_t global_sz[] = {(size_t)hs * 8};
size_t local_sz[] = {8}; size_t local_sz[] = {8};
idx = 0; idx = 0;
...@@ -1111,7 +1116,7 @@ bool DISOpticalFlowImpl::ocl_PatchInverseSearch(UMat &src_Ux, UMat &src_Uy, ...@@ -1111,7 +1116,7 @@ bool DISOpticalFlowImpl::ocl_PatchInverseSearch(UMat &src_Ux, UMat &src_Uy,
} }
else else
{ {
ocl::Kernel k3("dis_patch_inverse_search_bwd_1", ocl::video::dis_flow_oclsrc); ocl::Kernel k3("dis_patch_inverse_search_bwd_1", ocl::video::dis_flow_oclsrc, subgroups_build_options);
size_t global_sz[] = {(size_t)hs * 8}; size_t global_sz[] = {(size_t)hs * 8};
size_t local_sz[] = {8}; size_t local_sz[] = {8};
idx = 0; idx = 0;
...@@ -1368,7 +1373,7 @@ void DISOpticalFlowImpl::calc(InputArray I0, InputArray I1, InputOutputArray flo ...@@ -1368,7 +1373,7 @@ void DISOpticalFlowImpl::calc(InputArray I0, InputArray I1, InputOutputArray flo
CV_Assert(I0.isContinuous()); CV_Assert(I0.isContinuous());
CV_Assert(I1.isContinuous()); CV_Assert(I1.isContinuous());
CV_OCL_RUN(ocl::Device::getDefault().isIntel() && flow.isUMat() && CV_OCL_RUN(flow.isUMat() &&
(patch_size == 8) && (use_spatial_propagation == true), (patch_size == 8) && (use_spatial_propagation == true),
ocl_calc(I0, I1, flow)); ocl_calc(I0, I1, flow));
......
...@@ -2,6 +2,8 @@ ...@@ -2,6 +2,8 @@
// It is subject to the license terms in the LICENSE file found in the top-level directory // It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html. // of this distribution and at http://opencv.org/license.html.
//#define CV_USE_SUBGROUPS
#define EPS 0.001f #define EPS 0.001f
#define INF 1E+10F #define INF 1E+10F
...@@ -193,7 +195,11 @@ __kernel void dis_densification(__global const float *sx, __global const float * ...@@ -193,7 +195,11 @@ __kernel void dis_densification(__global const float *sx, __global const float *
float computeSSDMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr, float computeSSDMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr,
int I0_stride, int I1_stride, int I0_stride, int I1_stride,
float w00, float w01, float w10, float w11, int patch_sz, int i) float w00, float w01, float w10, float w11, int patch_sz, int i
#ifndef CV_USE_SUBGROUPS
, __local float2 *smem /*[8]*/
#endif
)
{ {
float sum_diff = 0.0f, sum_diff_sq = 0.0f; float sum_diff = 0.0f, sum_diff_sq = 0.0f;
int n = patch_sz * patch_sz; int n = patch_sz * patch_sz;
...@@ -214,12 +220,31 @@ float computeSSDMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ ...@@ -214,12 +220,31 @@ float computeSSDMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_
sum_diff = (dot(vec.lo, 1.0) + dot(vec.hi, 1.0)); sum_diff = (dot(vec.lo, 1.0) + dot(vec.hi, 1.0));
sum_diff_sq = (dot(vec.lo, vec.lo) + dot(vec.hi, vec.hi)); sum_diff_sq = (dot(vec.lo, vec.lo) + dot(vec.hi, vec.hi));
#ifdef CV_USE_SUBGROUPS
sum_diff = sub_group_reduce_add(sum_diff); sum_diff = sub_group_reduce_add(sum_diff);
sum_diff_sq = sub_group_reduce_add(sum_diff_sq); sum_diff_sq = sub_group_reduce_add(sum_diff_sq);
#else
barrier(CLK_LOCAL_MEM_FENCE);
smem[i] = (float2)(sum_diff, sum_diff_sq);
barrier(CLK_LOCAL_MEM_FENCE);
if (i < 4)
smem[i] += smem[i + 4];
barrier(CLK_LOCAL_MEM_FENCE);
if (i < 2)
smem[i] += smem[i + 2];
barrier(CLK_LOCAL_MEM_FENCE);
if (i == 0)
smem[0] += smem[1];
barrier(CLK_LOCAL_MEM_FENCE);
float2 reduce_add_result = smem[0];
sum_diff = reduce_add_result.x;
sum_diff_sq = reduce_add_result.y;
#endif
return sum_diff_sq - sum_diff * sum_diff / n; return sum_diff_sq - sum_diff * sum_diff / n;
} }
__attribute__((reqd_work_group_size(8, 1, 1)))
__kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __global const float *Uy_ptr, __kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __global const float *Uy_ptr,
__global const uchar *I0_ptr, __global const uchar *I1_ptr, __global const uchar *I0_ptr, __global const uchar *I1_ptr,
int border_size, int patch_size, int patch_stride, int border_size, int patch_size, int patch_stride,
...@@ -227,8 +252,7 @@ __kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __glo ...@@ -227,8 +252,7 @@ __kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __glo
__global float *Sx_ptr, __global float *Sy_ptr) __global float *Sx_ptr, __global float *Sy_ptr)
{ {
int id = get_global_id(0); int id = get_global_id(0);
int is = id / 8; int is = get_group_id(0);
if (id >= (hs * 8)) return;
int i = is * patch_stride; int i = is * patch_stride;
int j = 0; int j = 0;
...@@ -249,7 +273,14 @@ __kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __glo ...@@ -249,7 +273,14 @@ __kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __glo
Sy_ptr[is * ws] = prev_Uy; Sy_ptr[is * ws] = prev_Uy;
j += patch_stride; j += patch_stride;
#ifdef CV_USE_SUBGROUPS
int sid = get_sub_group_local_id(); int sid = get_sub_group_local_id();
#define EXTRA_ARGS_computeSSDMeanNorm sid
#else
__local float2 smem[8];
int sid = get_local_id(0);
#define EXTRA_ARGS_computeSSDMeanNorm sid, smem
#endif
for (int js = 1; js < ws; js++, j += patch_stride) for (int js = 1; js < ws; js++, j += patch_stride)
{ {
float min_SSD, cur_SSD; float min_SSD, cur_SSD;
...@@ -258,11 +289,11 @@ __kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __glo ...@@ -258,11 +289,11 @@ __kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __glo
INIT_BILINEAR_WEIGHTS(Ux, Uy); INIT_BILINEAR_WEIGHTS(Ux, Uy);
min_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1, min_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
w, w_ext, w00, w01, w10, w11, psz, sid); w, w_ext, w00, w01, w10, w11, psz, EXTRA_ARGS_computeSSDMeanNorm);
INIT_BILINEAR_WEIGHTS(prev_Ux, prev_Uy); INIT_BILINEAR_WEIGHTS(prev_Ux, prev_Uy);
cur_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1, cur_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
w, w_ext, w00, w01, w10, w11, psz, sid); w, w_ext, w00, w01, w10, w11, psz, EXTRA_ARGS_computeSSDMeanNorm);
if (cur_SSD < min_SSD) if (cur_SSD < min_SSD)
{ {
Ux = prev_Ux; Ux = prev_Ux;
...@@ -274,6 +305,7 @@ __kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __glo ...@@ -274,6 +305,7 @@ __kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __glo
Sx_ptr[is * ws + js] = Ux; Sx_ptr[is * ws + js] = Ux;
Sy_ptr[is * ws + js] = Uy; Sy_ptr[is * ws + js] = Uy;
} }
#undef EXTRA_ARGS_computeSSDMeanNorm
} }
float3 processPatchMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr, float3 processPatchMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr,
...@@ -396,14 +428,14 @@ __kernel void dis_patch_inverse_search_fwd_2(__global const float *Ux_ptr, __glo ...@@ -396,14 +428,14 @@ __kernel void dis_patch_inverse_search_fwd_2(__global const float *Ux_ptr, __glo
} }
} }
__attribute__((reqd_work_group_size(8, 1, 1)))
__kernel void dis_patch_inverse_search_bwd_1(__global const uchar *I0_ptr, __global const uchar *I1_ptr, __kernel void dis_patch_inverse_search_bwd_1(__global const uchar *I0_ptr, __global const uchar *I1_ptr,
int border_size, int patch_size, int patch_stride, int border_size, int patch_size, int patch_stride,
int w, int h, int ws, int hs, int pyr_level, int w, int h, int ws, int hs, int pyr_level,
__global float *Sx_ptr, __global float *Sy_ptr) __global float *Sx_ptr, __global float *Sy_ptr)
{ {
int id = get_global_id(0); int id = get_global_id(0);
int is = id / 8; int is = get_group_id(0);
if (id >= (hs * 8)) return;
is = (hs - 1 - is); is = (hs - 1 - is);
int i = is * patch_stride; int i = is * patch_stride;
...@@ -419,7 +451,14 @@ __kernel void dis_patch_inverse_search_bwd_1(__global const uchar *I0_ptr, __glo ...@@ -419,7 +451,14 @@ __kernel void dis_patch_inverse_search_bwd_1(__global const uchar *I0_ptr, __glo
float j_upper_limit = bsz + w - 1.0f; float j_upper_limit = bsz + w - 1.0f;
float i_I1, j_I1, w00, w01, w10, w11; float i_I1, j_I1, w00, w01, w10, w11;
#ifdef CV_USE_SUBGROUPS
int sid = get_sub_group_local_id(); int sid = get_sub_group_local_id();
#define EXTRA_ARGS_computeSSDMeanNorm sid
#else
__local float2 smem[8];
int sid = get_local_id(0);
#define EXTRA_ARGS_computeSSDMeanNorm sid, smem
#endif
for (int js = (ws - 2); js > -1; js--, j -= patch_stride) for (int js = (ws - 2); js > -1; js--, j -= patch_stride)
{ {
float min_SSD, cur_SSD; float min_SSD, cur_SSD;
...@@ -428,17 +467,18 @@ __kernel void dis_patch_inverse_search_bwd_1(__global const uchar *I0_ptr, __glo ...@@ -428,17 +467,18 @@ __kernel void dis_patch_inverse_search_bwd_1(__global const uchar *I0_ptr, __glo
INIT_BILINEAR_WEIGHTS(Ux.x, Uy.x); INIT_BILINEAR_WEIGHTS(Ux.x, Uy.x);
min_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1, min_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
w, w_ext, w00, w01, w10, w11, psz, sid); w, w_ext, w00, w01, w10, w11, psz, EXTRA_ARGS_computeSSDMeanNorm);
INIT_BILINEAR_WEIGHTS(Ux.y, Uy.y); INIT_BILINEAR_WEIGHTS(Ux.y, Uy.y);
cur_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1, cur_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
w, w_ext, w00, w01, w10, w11, psz, sid); w, w_ext, w00, w01, w10, w11, psz, EXTRA_ARGS_computeSSDMeanNorm);
if (cur_SSD < min_SSD) if (cur_SSD < min_SSD)
{ {
Sx_ptr[is * ws + js] = Ux.y; Sx_ptr[is * ws + js] = Ux.y;
Sy_ptr[is * ws + js] = Uy.y; Sy_ptr[is * ws + js] = Uy.y;
} }
} }
#undef EXTRA_ARGS_computeSSDMeanNorm
} }
__kernel void dis_patch_inverse_search_bwd_2(__global const uchar *I0_ptr, __global const uchar *I1_ptr, __kernel void dis_patch_inverse_search_bwd_2(__global const uchar *I0_ptr, __global const uchar *I1_ptr,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment