Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
0e1dd63f
Commit
0e1dd63f
authored
Apr 22, 2019
by
Alexander Alekhin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
video(DIS): use OpenCL shared mem
- fix perf test iterations
parent
076ee65c
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
59 additions
and
13 deletions
+59
-13
perf_dis_optflow.cpp
modules/video/perf/opencl/perf_dis_optflow.cpp
+2
-1
dis_flow.cpp
modules/video/src/dis_flow.cpp
+8
-3
dis_flow.cl
modules/video/src/opencl/dis_flow.cl
+49
-9
No files found.
modules/video/perf/opencl/perf_dis_optflow.cpp
View file @
0e1dd63f
...
@@ -37,10 +37,11 @@ OCL_PERF_TEST_P(DenseOpticalFlow_DIS, perf,
...
@@ -37,10 +37,11 @@ OCL_PERF_TEST_P(DenseOpticalFlow_DIS, perf,
Ptr
<
DenseOpticalFlow
>
algo
=
DISOpticalFlow
::
create
(
preset
);
Ptr
<
DenseOpticalFlow
>
algo
=
DISOpticalFlow
::
create
(
preset
);
OCL_TEST_CYCLE_N
(
10
)
PERF_SAMPLE_BEGIN
(
)
{
{
algo
->
calc
(
frame1
,
frame2
,
flow
);
algo
->
calc
(
frame1
,
frame2
,
flow
);
}
}
PERF_SAMPLE_END
()
SANITY_CHECK_NOTHING
();
SANITY_CHECK_NOTHING
();
}
}
...
...
modules/video/src/dis_flow.cpp
View file @
0e1dd63f
...
@@ -1055,11 +1055,16 @@ bool DISOpticalFlowImpl::ocl_PatchInverseSearch(UMat &src_Ux, UMat &src_Uy,
...
@@ -1055,11 +1055,16 @@ bool DISOpticalFlowImpl::ocl_PatchInverseSearch(UMat &src_Ux, UMat &src_Uy,
int
idx
;
int
idx
;
int
num_inner_iter
=
(
int
)
floor
(
grad_descent_iter
/
(
float
)
num_iter
);
int
num_inner_iter
=
(
int
)
floor
(
grad_descent_iter
/
(
float
)
num_iter
);
String
subgroups_build_options
;
if
(
ocl
::
Device
::
getDefault
().
isExtensionSupported
(
"cl_khr_subgroups"
))
subgroups_build_options
=
"-DCV_USE_SUBGROUPS=1"
;
for
(
int
iter
=
0
;
iter
<
num_iter
;
iter
++
)
for
(
int
iter
=
0
;
iter
<
num_iter
;
iter
++
)
{
{
if
(
iter
==
0
)
if
(
iter
==
0
)
{
{
ocl
::
Kernel
k1
(
"dis_patch_inverse_search_fwd_1"
,
ocl
::
video
::
dis_flow_oclsrc
);
ocl
::
Kernel
k1
(
"dis_patch_inverse_search_fwd_1"
,
ocl
::
video
::
dis_flow_oclsrc
,
subgroups_build_options
);
size_t
global_sz
[]
=
{(
size_t
)
hs
*
8
};
size_t
global_sz
[]
=
{(
size_t
)
hs
*
8
};
size_t
local_sz
[]
=
{
8
};
size_t
local_sz
[]
=
{
8
};
idx
=
0
;
idx
=
0
;
...
@@ -1111,7 +1116,7 @@ bool DISOpticalFlowImpl::ocl_PatchInverseSearch(UMat &src_Ux, UMat &src_Uy,
...
@@ -1111,7 +1116,7 @@ bool DISOpticalFlowImpl::ocl_PatchInverseSearch(UMat &src_Ux, UMat &src_Uy,
}
}
else
else
{
{
ocl
::
Kernel
k3
(
"dis_patch_inverse_search_bwd_1"
,
ocl
::
video
::
dis_flow_oclsrc
);
ocl
::
Kernel
k3
(
"dis_patch_inverse_search_bwd_1"
,
ocl
::
video
::
dis_flow_oclsrc
,
subgroups_build_options
);
size_t
global_sz
[]
=
{(
size_t
)
hs
*
8
};
size_t
global_sz
[]
=
{(
size_t
)
hs
*
8
};
size_t
local_sz
[]
=
{
8
};
size_t
local_sz
[]
=
{
8
};
idx
=
0
;
idx
=
0
;
...
@@ -1368,7 +1373,7 @@ void DISOpticalFlowImpl::calc(InputArray I0, InputArray I1, InputOutputArray flo
...
@@ -1368,7 +1373,7 @@ void DISOpticalFlowImpl::calc(InputArray I0, InputArray I1, InputOutputArray flo
CV_Assert
(
I0
.
isContinuous
());
CV_Assert
(
I0
.
isContinuous
());
CV_Assert
(
I1
.
isContinuous
());
CV_Assert
(
I1
.
isContinuous
());
CV_OCL_RUN
(
ocl
::
Device
::
getDefault
().
isIntel
()
&&
flow
.
isUMat
()
&&
CV_OCL_RUN
(
flow
.
isUMat
()
&&
(
patch_size
==
8
)
&&
(
use_spatial_propagation
==
true
),
(
patch_size
==
8
)
&&
(
use_spatial_propagation
==
true
),
ocl_calc
(
I0
,
I1
,
flow
));
ocl_calc
(
I0
,
I1
,
flow
));
...
...
modules/video/src/opencl/dis_flow.cl
View file @
0e1dd63f
...
@@ -2,6 +2,8 @@
...
@@ -2,6 +2,8 @@
//
It
is
subject
to
the
license
terms
in
the
LICENSE
file
found
in
the
top-level
directory
//
It
is
subject
to
the
license
terms
in
the
LICENSE
file
found
in
the
top-level
directory
//
of
this
distribution
and
at
http://opencv.org/license.html.
//
of
this
distribution
and
at
http://opencv.org/license.html.
//#define
CV_USE_SUBGROUPS
#
define
EPS
0.001f
#
define
EPS
0.001f
#
define
INF
1E+10F
#
define
INF
1E+10F
...
@@ -193,7 +195,11 @@ __kernel void dis_densification(__global const float *sx, __global const float *
...
@@ -193,7 +195,11 @@ __kernel void dis_densification(__global const float *sx, __global const float *
float computeSSDMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr,
float computeSSDMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr,
int I0_stride, int I1_stride,
int I0_stride, int I1_stride,
float w00, float w01, float w10, float w11, int patch_sz, int i)
float w00, float w01, float w10, float w11, int patch_sz, int i
#ifndef CV_USE_SUBGROUPS
, __local float2 *smem /*[8]*/
#endif
)
{
{
float sum_diff = 0.0f, sum_diff_sq = 0.0f;
float sum_diff = 0.0f, sum_diff_sq = 0.0f;
int n = patch_sz * patch_sz;
int n = patch_sz * patch_sz;
...
@@ -214,12 +220,31 @@ float computeSSDMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_
...
@@ -214,12 +220,31 @@ float computeSSDMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_
sum_diff = (dot(vec.lo, 1.0) + dot(vec.hi, 1.0));
sum_diff = (dot(vec.lo, 1.0) + dot(vec.hi, 1.0));
sum_diff_sq = (dot(vec.lo, vec.lo) + dot(vec.hi, vec.hi));
sum_diff_sq = (dot(vec.lo, vec.lo) + dot(vec.hi, vec.hi));
#ifdef CV_USE_SUBGROUPS
sum_diff = sub_group_reduce_add(sum_diff);
sum_diff = sub_group_reduce_add(sum_diff);
sum_diff_sq = sub_group_reduce_add(sum_diff_sq);
sum_diff_sq = sub_group_reduce_add(sum_diff_sq);
#else
barrier(CLK_LOCAL_MEM_FENCE);
smem[i] = (float2)(sum_diff, sum_diff_sq);
barrier(CLK_LOCAL_MEM_FENCE);
if (i < 4)
smem[i] += smem[i + 4];
barrier(CLK_LOCAL_MEM_FENCE);
if (i < 2)
smem[i] += smem[i + 2];
barrier(CLK_LOCAL_MEM_FENCE);
if (i == 0)
smem[0] += smem[1];
barrier(CLK_LOCAL_MEM_FENCE);
float2 reduce_add_result = smem[0];
sum_diff = reduce_add_result.x;
sum_diff_sq = reduce_add_result.y;
#endif
return sum_diff_sq - sum_diff * sum_diff / n;
return sum_diff_sq - sum_diff * sum_diff / n;
}
}
__attribute__((reqd_work_group_size(8, 1, 1)))
__kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __global const float *Uy_ptr,
__kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __global const float *Uy_ptr,
__global const uchar *I0_ptr, __global const uchar *I1_ptr,
__global const uchar *I0_ptr, __global const uchar *I1_ptr,
int border_size, int patch_size, int patch_stride,
int border_size, int patch_size, int patch_stride,
...
@@ -227,8 +252,7 @@ __kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __glo
...
@@ -227,8 +252,7 @@ __kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __glo
__global float *Sx_ptr, __global float *Sy_ptr)
__global float *Sx_ptr, __global float *Sy_ptr)
{
{
int id = get_global_id(0);
int id = get_global_id(0);
int is = id / 8;
int is = get_group_id(0);
if (id >= (hs * 8)) return;
int i = is * patch_stride;
int i = is * patch_stride;
int j = 0;
int j = 0;
...
@@ -249,7 +273,14 @@ __kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __glo
...
@@ -249,7 +273,14 @@ __kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __glo
Sy_ptr[is * ws] = prev_Uy;
Sy_ptr[is * ws] = prev_Uy;
j += patch_stride;
j += patch_stride;
#ifdef CV_USE_SUBGROUPS
int sid = get_sub_group_local_id();
int sid = get_sub_group_local_id();
#define EXTRA_ARGS_computeSSDMeanNorm sid
#else
__local float2 smem[8];
int sid = get_local_id(0);
#define EXTRA_ARGS_computeSSDMeanNorm sid, smem
#endif
for (int js = 1; js < ws; js++, j += patch_stride)
for (int js = 1; js < ws; js++, j += patch_stride)
{
{
float min_SSD, cur_SSD;
float min_SSD, cur_SSD;
...
@@ -258,11 +289,11 @@ __kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __glo
...
@@ -258,11 +289,11 @@ __kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __glo
INIT_BILINEAR_WEIGHTS(Ux, Uy);
INIT_BILINEAR_WEIGHTS(Ux, Uy);
min_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
min_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
w, w_ext, w00, w01, w10, w11, psz,
sid
);
w, w_ext, w00, w01, w10, w11, psz,
EXTRA_ARGS_computeSSDMeanNorm
);
INIT_BILINEAR_WEIGHTS(prev_Ux, prev_Uy);
INIT_BILINEAR_WEIGHTS(prev_Ux, prev_Uy);
cur_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
cur_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
w, w_ext, w00, w01, w10, w11, psz,
sid
);
w, w_ext, w00, w01, w10, w11, psz,
EXTRA_ARGS_computeSSDMeanNorm
);
if (cur_SSD < min_SSD)
if (cur_SSD < min_SSD)
{
{
Ux = prev_Ux;
Ux = prev_Ux;
...
@@ -274,6 +305,7 @@ __kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __glo
...
@@ -274,6 +305,7 @@ __kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __glo
Sx_ptr[is * ws + js] = Ux;
Sx_ptr[is * ws + js] = Ux;
Sy_ptr[is * ws + js] = Uy;
Sy_ptr[is * ws + js] = Uy;
}
}
#undef EXTRA_ARGS_computeSSDMeanNorm
}
}
float3 processPatchMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr,
float3 processPatchMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr,
...
@@ -396,14 +428,14 @@ __kernel void dis_patch_inverse_search_fwd_2(__global const float *Ux_ptr, __glo
...
@@ -396,14 +428,14 @@ __kernel void dis_patch_inverse_search_fwd_2(__global const float *Ux_ptr, __glo
}
}
}
}
__attribute__((reqd_work_group_size(8, 1, 1)))
__kernel void dis_patch_inverse_search_bwd_1(__global const uchar *I0_ptr, __global const uchar *I1_ptr,
__kernel void dis_patch_inverse_search_bwd_1(__global const uchar *I0_ptr, __global const uchar *I1_ptr,
int border_size, int patch_size, int patch_stride,
int border_size, int patch_size, int patch_stride,
int w, int h, int ws, int hs, int pyr_level,
int w, int h, int ws, int hs, int pyr_level,
__global float *Sx_ptr, __global float *Sy_ptr)
__global float *Sx_ptr, __global float *Sy_ptr)
{
{
int id = get_global_id(0);
int id = get_global_id(0);
int is = id / 8;
int is = get_group_id(0);
if (id >= (hs * 8)) return;
is = (hs - 1 - is);
is = (hs - 1 - is);
int i = is * patch_stride;
int i = is * patch_stride;
...
@@ -419,7 +451,14 @@ __kernel void dis_patch_inverse_search_bwd_1(__global const uchar *I0_ptr, __glo
...
@@ -419,7 +451,14 @@ __kernel void dis_patch_inverse_search_bwd_1(__global const uchar *I0_ptr, __glo
float j_upper_limit = bsz + w - 1.0f;
float j_upper_limit = bsz + w - 1.0f;
float i_I1, j_I1, w00, w01, w10, w11;
float i_I1, j_I1, w00, w01, w10, w11;
#ifdef CV_USE_SUBGROUPS
int sid = get_sub_group_local_id();
int sid = get_sub_group_local_id();
#define EXTRA_ARGS_computeSSDMeanNorm sid
#else
__local float2 smem[8];
int sid = get_local_id(0);
#define EXTRA_ARGS_computeSSDMeanNorm sid, smem
#endif
for (int js = (ws - 2); js > -1; js--, j -= patch_stride)
for (int js = (ws - 2); js > -1; js--, j -= patch_stride)
{
{
float min_SSD, cur_SSD;
float min_SSD, cur_SSD;
...
@@ -428,17 +467,18 @@ __kernel void dis_patch_inverse_search_bwd_1(__global const uchar *I0_ptr, __glo
...
@@ -428,17 +467,18 @@ __kernel void dis_patch_inverse_search_bwd_1(__global const uchar *I0_ptr, __glo
INIT_BILINEAR_WEIGHTS(Ux.x, Uy.x);
INIT_BILINEAR_WEIGHTS(Ux.x, Uy.x);
min_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
min_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
w, w_ext, w00, w01, w10, w11, psz,
sid
);
w, w_ext, w00, w01, w10, w11, psz,
EXTRA_ARGS_computeSSDMeanNorm
);
INIT_BILINEAR_WEIGHTS(Ux.y, Uy.y);
INIT_BILINEAR_WEIGHTS(Ux.y, Uy.y);
cur_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
cur_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
w, w_ext, w00, w01, w10, w11, psz,
sid
);
w, w_ext, w00, w01, w10, w11, psz,
EXTRA_ARGS_computeSSDMeanNorm
);
if (cur_SSD < min_SSD)
if (cur_SSD < min_SSD)
{
{
Sx_ptr[is * ws + js] = Ux.y;
Sx_ptr[is * ws + js] = Ux.y;
Sy_ptr[is * ws + js] = Uy.y;
Sy_ptr[is * ws + js] = Uy.y;
}
}
}
}
#undef EXTRA_ARGS_computeSSDMeanNorm
}
}
__kernel void dis_patch_inverse_search_bwd_2(__global const uchar *I0_ptr, __global const uchar *I1_ptr,
__kernel void dis_patch_inverse_search_bwd_2(__global const uchar *I0_ptr, __global const uchar *I1_ptr,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment