Unverified Commit 53c77155 authored by Alexander Alekhin's avatar Alexander Alekhin Committed by GitHub

Merge pull request #14473 from alalek:video_dis_update_opencl

video(DISOpticalFlow): update OpenCL implementation (#14473)

* video(DIS): add code for profiling

* video(DIS): fix test parameters

* video(DIS): simplify OpenCL kernels

- parameters -> defines
- avoid float3
- const / local scope
- improve readability, replace Kernel::set() -> args()

* video(DIS): use CV_32FC2 buffers
parent 43467a2a
...@@ -48,8 +48,7 @@ using namespace std; ...@@ -48,8 +48,7 @@ using namespace std;
#define EPS 0.001F #define EPS 0.001F
#define INF 1E+10F #define INF 1E+10F
namespace cv namespace cv {
{
class DISOpticalFlowImpl CV_FINAL : public DISOpticalFlow class DISOpticalFlowImpl CV_FINAL : public DISOpticalFlow
{ {
...@@ -177,16 +176,10 @@ class DISOpticalFlowImpl CV_FINAL : public DISOpticalFlow ...@@ -177,16 +176,10 @@ class DISOpticalFlowImpl CV_FINAL : public DISOpticalFlow
vector<UMat> u_I0xs; //!< Gaussian pyramid for the x gradient of the current frame vector<UMat> u_I0xs; //!< Gaussian pyramid for the x gradient of the current frame
vector<UMat> u_I0ys; //!< Gaussian pyramid for the y gradient of the current frame vector<UMat> u_I0ys; //!< Gaussian pyramid for the y gradient of the current frame
vector<UMat> u_Ux; //!< x component of the flow vectors vector<UMat> u_U; //!< (x,y) component of the flow vectors (CV_32FC2)
vector<UMat> u_Uy; //!< y component of the flow vectors vector<UMat> u_initial_U; //!< (x, y) components of the initial flow field, if one was passed as an input (CV_32FC2)
vector<UMat> u_initial_Ux; //!< x component of the initial flow field, if one was passed as an input
vector<UMat> u_initial_Uy; //!< y component of the initial flow field, if one was passed as an input
UMat u_U; //!< a buffer for the merged flow UMat u_S; //!< intermediate sparse flow representation (x,y components - CV_32FC2)
UMat u_Sx; //!< intermediate sparse flow representation (x component)
UMat u_Sy; //!< intermediate sparse flow representation (y component)
/* Structure tensor components: */ /* Structure tensor components: */
UMat u_I0xx_buf; //!< sum of squares of x gradient values UMat u_I0xx_buf; //!< sum of squares of x gradient values
...@@ -206,16 +199,18 @@ class DISOpticalFlowImpl CV_FINAL : public DISOpticalFlow ...@@ -206,16 +199,18 @@ class DISOpticalFlowImpl CV_FINAL : public DISOpticalFlow
bool ocl_precomputeStructureTensor(UMat &dst_I0xx, UMat &dst_I0yy, UMat &dst_I0xy, bool ocl_precomputeStructureTensor(UMat &dst_I0xx, UMat &dst_I0yy, UMat &dst_I0xy,
UMat &dst_I0x, UMat &dst_I0y, UMat &I0x, UMat &I0y); UMat &dst_I0x, UMat &dst_I0y, UMat &I0x, UMat &I0y);
void ocl_prepareBuffers(UMat &I0, UMat &I1, UMat &flow, bool use_flow); void ocl_prepareBuffers(UMat &I0, UMat &I1, InputArray flow, bool use_flow);
bool ocl_calc(InputArray I0, InputArray I1, InputOutputArray flow); bool ocl_calc(InputArray I0, InputArray I1, InputOutputArray flow);
bool ocl_Densification(UMat &dst_Ux, UMat &dst_Uy, UMat &src_Sx, UMat &src_Sy, UMat &_I0, UMat &_I1); bool ocl_Densification(UMat &dst_U, UMat &src_S, UMat &_I0, UMat &_I1);
bool ocl_PatchInverseSearch(UMat &src_Ux, UMat &src_Uy, bool ocl_PatchInverseSearch(UMat &src_U,
UMat &I0, UMat &I1, UMat &I0x, UMat &I0y, int num_iter, int pyr_level); UMat &I0, UMat &I1, UMat &I0x, UMat &I0y, int num_iter, int pyr_level);
#endif #endif
}; };
DISOpticalFlowImpl::DISOpticalFlowImpl() DISOpticalFlowImpl::DISOpticalFlowImpl()
{ {
CV_INSTRUMENT_REGION();
finest_scale = 2; finest_scale = 2;
patch_size = 8; patch_size = 8;
patch_stride = 4; patch_stride = 4;
...@@ -239,6 +234,8 @@ DISOpticalFlowImpl::DISOpticalFlowImpl() ...@@ -239,6 +234,8 @@ DISOpticalFlowImpl::DISOpticalFlowImpl()
void DISOpticalFlowImpl::prepareBuffers(Mat &I0, Mat &I1, Mat &flow, bool use_flow) void DISOpticalFlowImpl::prepareBuffers(Mat &I0, Mat &I1, Mat &flow, bool use_flow)
{ {
CV_INSTRUMENT_REGION();
I0s.resize(coarsest_scale + 1); I0s.resize(coarsest_scale + 1);
I1s.resize(coarsest_scale + 1); I1s.resize(coarsest_scale + 1);
I1s_ext.resize(coarsest_scale + 1); I1s_ext.resize(coarsest_scale + 1);
...@@ -332,6 +329,8 @@ void DISOpticalFlowImpl::prepareBuffers(Mat &I0, Mat &I1, Mat &flow, bool use_fl ...@@ -332,6 +329,8 @@ void DISOpticalFlowImpl::prepareBuffers(Mat &I0, Mat &I1, Mat &flow, bool use_fl
void DISOpticalFlowImpl::precomputeStructureTensor(Mat &dst_I0xx, Mat &dst_I0yy, Mat &dst_I0xy, Mat &dst_I0x, void DISOpticalFlowImpl::precomputeStructureTensor(Mat &dst_I0xx, Mat &dst_I0yy, Mat &dst_I0xy, Mat &dst_I0x,
Mat &dst_I0y, Mat &I0x, Mat &I0y) Mat &dst_I0y, Mat &I0x, Mat &I0y)
{ {
CV_INSTRUMENT_REGION();
float *I0xx_ptr = dst_I0xx.ptr<float>(); float *I0xx_ptr = dst_I0xx.ptr<float>();
float *I0yy_ptr = dst_I0yy.ptr<float>(); float *I0yy_ptr = dst_I0yy.ptr<float>();
float *I0xy_ptr = dst_I0xy.ptr<float>(); float *I0xy_ptr = dst_I0xy.ptr<float>();
...@@ -596,8 +595,8 @@ inline float processPatch(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, uchar * ...@@ -596,8 +595,8 @@ inline float processPatch(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, uchar *
SSD = v_reduce_sum(SSD_vec); SSD = v_reduce_sum(SSD_vec);
} }
else else
{
#endif #endif
{
dst_dUx = 0.0f; dst_dUx = 0.0f;
dst_dUy = 0.0f; dst_dUy = 0.0f;
float diff; float diff;
...@@ -612,9 +611,7 @@ inline float processPatch(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, uchar * ...@@ -612,9 +611,7 @@ inline float processPatch(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, uchar *
dst_dUx += diff * I0x_ptr[i * I0_stride + j]; dst_dUx += diff * I0x_ptr[i * I0_stride + j];
dst_dUy += diff * I0y_ptr[i * I0_stride + j]; dst_dUy += diff * I0y_ptr[i * I0_stride + j];
} }
#if CV_SIMD128
} }
#endif
return SSD; return SSD;
} }
...@@ -668,8 +665,8 @@ inline float processPatchMeanNorm(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, ...@@ -668,8 +665,8 @@ inline float processPatchMeanNorm(float &dst_dUx, float &dst_dUy, uchar *I0_ptr,
sum_diff_sq = v_reduce_sum(sum_diff_sq_vec); sum_diff_sq = v_reduce_sum(sum_diff_sq_vec);
} }
else else
{
#endif #endif
{
float diff; float diff;
for (int i = 0; i < patch_sz; i++) for (int i = 0; i < patch_sz; i++)
for (int j = 0; j < patch_sz; j++) for (int j = 0; j < patch_sz; j++)
...@@ -684,9 +681,7 @@ inline float processPatchMeanNorm(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, ...@@ -684,9 +681,7 @@ inline float processPatchMeanNorm(float &dst_dUx, float &dst_dUy, uchar *I0_ptr,
sum_I0x_mul += diff * I0x_ptr[i * I0_stride + j]; sum_I0x_mul += diff * I0x_ptr[i * I0_stride + j];
sum_I0y_mul += diff * I0y_ptr[i * I0_stride + j]; sum_I0y_mul += diff * I0y_ptr[i * I0_stride + j];
} }
#if CV_SIMD128
} }
#endif
dst_dUx = sum_I0x_mul - sum_diff * x_grad_sum / n; dst_dUx = sum_I0x_mul - sum_diff * x_grad_sum / n;
dst_dUy = sum_I0y_mul - sum_diff * y_grad_sum / n; dst_dUy = sum_I0y_mul - sum_diff * y_grad_sum / n;
return sum_diff_sq - sum_diff * sum_diff / n; return sum_diff_sq - sum_diff * sum_diff / n;
...@@ -711,8 +706,8 @@ inline float computeSSD(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int I1_stri ...@@ -711,8 +706,8 @@ inline float computeSSD(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int I1_stri
SSD = v_reduce_sum(SSD_vec); SSD = v_reduce_sum(SSD_vec);
} }
else else
{
#endif #endif
{
float diff; float diff;
for (int i = 0; i < patch_sz; i++) for (int i = 0; i < patch_sz; i++)
for (int j = 0; j < patch_sz; j++) for (int j = 0; j < patch_sz; j++)
...@@ -722,9 +717,7 @@ inline float computeSSD(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int I1_stri ...@@ -722,9 +717,7 @@ inline float computeSSD(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int I1_stri
I0_ptr[i * I0_stride + j]; I0_ptr[i * I0_stride + j];
SSD += diff * diff; SSD += diff * diff;
} }
#if CV_SIMD128
} }
#endif
return SSD; return SSD;
} }
...@@ -777,6 +770,8 @@ inline float computeSSDMeanNorm(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int ...@@ -777,6 +770,8 @@ inline float computeSSDMeanNorm(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int
void DISOpticalFlowImpl::PatchInverseSearch_ParBody::operator()(const Range &range) const void DISOpticalFlowImpl::PatchInverseSearch_ParBody::operator()(const Range &range) const
{ {
CV_INSTRUMENT_REGION();
// force separate processing of stripes if we are using spatial propagation: // force separate processing of stripes if we are using spatial propagation:
if (dis->use_spatial_propagation && range.end > range.start + 1) if (dis->use_spatial_propagation && range.end > range.start + 1)
{ {
...@@ -831,11 +826,14 @@ void DISOpticalFlowImpl::PatchInverseSearch_ParBody::operator()(const Range &ran ...@@ -831,11 +826,14 @@ void DISOpticalFlowImpl::PatchInverseSearch_ParBody::operator()(const Range &ran
#define INIT_BILINEAR_WEIGHTS(Ux, Uy) \ #define INIT_BILINEAR_WEIGHTS(Ux, Uy) \
i_I1 = min(max(i + Uy + bsz, i_lower_limit), i_upper_limit); \ i_I1 = min(max(i + Uy + bsz, i_lower_limit), i_upper_limit); \
j_I1 = min(max(j + Ux + bsz, j_lower_limit), j_upper_limit); \ j_I1 = min(max(j + Ux + bsz, j_lower_limit), j_upper_limit); \
\ { \
w11 = (i_I1 - floor(i_I1)) * (j_I1 - floor(j_I1)); \ float di = i_I1 - floor(i_I1); \
w10 = (i_I1 - floor(i_I1)) * (floor(j_I1) + 1 - j_I1); \ float dj = j_I1 - floor(j_I1); \
w01 = (floor(i_I1) + 1 - i_I1) * (j_I1 - floor(j_I1)); \ w11 = di * dj; \
w00 = (floor(i_I1) + 1 - i_I1) * (floor(j_I1) + 1 - j_I1); w10 = di * (1 - dj); \
w01 = (1 - di) * dj; \
w00 = (1 - di) * (1 - dj); \
}
#define COMPUTE_SSD(dst, Ux, Uy) \ #define COMPUTE_SSD(dst, Ux, Uy) \
INIT_BILINEAR_WEIGHTS(Ux, Uy); \ INIT_BILINEAR_WEIGHTS(Ux, Uy); \
...@@ -951,14 +949,16 @@ void DISOpticalFlowImpl::PatchInverseSearch_ParBody::operator()(const Range &ran ...@@ -951,14 +949,16 @@ void DISOpticalFlowImpl::PatchInverseSearch_ParBody::operator()(const Range &ran
{ {
INIT_BILINEAR_WEIGHTS(cur_Ux, cur_Uy); INIT_BILINEAR_WEIGHTS(cur_Ux, cur_Uy);
if (dis->use_mean_normalization) if (dis->use_mean_normalization)
SSD = processPatchMeanNorm(dUx, dUy, I0_ptr + i * dis->w + j, SSD = processPatchMeanNorm(dUx, dUy,
I1_ptr + (int)i_I1 * w_ext + (int)j_I1, I0x_ptr + i * dis->w + j, I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
I0y_ptr + i * dis->w + j, dis->w, w_ext, w00, w01, w10, w11, psz, I0x_ptr + i * dis->w + j, I0y_ptr + i * dis->w + j,
dis->w, w_ext, w00, w01, w10, w11, psz,
x_grad_sum, y_grad_sum); x_grad_sum, y_grad_sum);
else else
SSD = processPatch(dUx, dUy, I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1, SSD = processPatch(dUx, dUy,
I0x_ptr + i * dis->w + j, I0y_ptr + i * dis->w + j, dis->w, w_ext, w00, w01, I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
w10, w11, psz); I0x_ptr + i * dis->w + j, I0y_ptr + i * dis->w + j,
dis->w, w_ext, w00, w01, w10, w11, psz);
dx = invH11 * dUx + invH12 * dUy; dx = invH11 * dUx + invH12 * dUy;
dy = invH12 * dUx + invH22 * dUy; dy = invH12 * dUx + invH22 * dUy;
...@@ -1002,6 +1002,8 @@ DISOpticalFlowImpl::Densification_ParBody::Densification_ParBody(DISOpticalFlowI ...@@ -1002,6 +1002,8 @@ DISOpticalFlowImpl::Densification_ParBody::Densification_ParBody(DISOpticalFlowI
*/ */
void DISOpticalFlowImpl::Densification_ParBody::operator()(const Range &range) const void DISOpticalFlowImpl::Densification_ParBody::operator()(const Range &range) const
{ {
CV_INSTRUMENT_REGION();
int start_i = min(range.start * stripe_sz, h); int start_i = min(range.start * stripe_sz, h);
int end_i = min(range.end * stripe_sz, h); int end_i = min(range.end * stripe_sz, h);
...@@ -1087,117 +1089,100 @@ void DISOpticalFlowImpl::Densification_ParBody::operator()(const Range &range) c ...@@ -1087,117 +1089,100 @@ void DISOpticalFlowImpl::Densification_ParBody::operator()(const Range &range) c
} }
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
bool DISOpticalFlowImpl::ocl_PatchInverseSearch(UMat &src_Ux, UMat &src_Uy, bool DISOpticalFlowImpl::ocl_PatchInverseSearch(UMat &src_U,
UMat &I0, UMat &I1, UMat &I0x, UMat &I0y, int num_iter, int pyr_level) UMat &I0, UMat &I1, UMat &I0x, UMat &I0y, int num_iter, int /*pyr_level*/)
{ {
CV_INSTRUMENT_REGION();
CV_INSTRUMENT_REGION_OPENCL();
size_t globalSize[] = {(size_t)ws, (size_t)hs}; size_t globalSize[] = {(size_t)ws, (size_t)hs};
size_t localSize[] = {16, 16}; size_t localSize[] = {16, 16};
int idx;
int num_inner_iter = (int)floor(grad_descent_iter / (float)num_iter); int num_inner_iter = (int)floor(grad_descent_iter / (float)num_iter);
String subgroups_build_options; String subgroups_build_options;
if (ocl::Device::getDefault().isExtensionSupported("cl_khr_subgroups")) if (ocl::Device::getDefault().isExtensionSupported("cl_khr_subgroups"))
subgroups_build_options = "-DCV_USE_SUBGROUPS=1"; subgroups_build_options = " -DCV_USE_SUBGROUPS=1";
String build_options = cv::format(
"-DDIS_BORDER_SIZE=%d -DDIS_PATCH_SIZE=%d -DDIS_PATCH_STRIDE=%d",
border_size, patch_size, patch_stride
) + subgroups_build_options;
#if 0 // OpenCL debug
u_Sx = Scalar::all(0);
u_Sy = Scalar::all(0);
#endif
CV_Assert(num_iter == 2);
for (int iter = 0; iter < num_iter; iter++) for (int iter = 0; iter < num_iter; iter++)
{ {
if (iter == 0) if (iter == 0)
{ {
ocl::Kernel k1("dis_patch_inverse_search_fwd_1", ocl::video::dis_flow_oclsrc, subgroups_build_options); ocl::Kernel k1("dis_patch_inverse_search_fwd_1", ocl::video::dis_flow_oclsrc, build_options);
size_t global_sz[] = {(size_t)hs * 8}; size_t global_sz[] = {(size_t)hs * 8};
size_t local_sz[] = {8}; size_t local_sz[] = {8};
idx = 0;
k1.args(
idx = k1.set(idx, ocl::KernelArg::PtrReadOnly(src_Ux)); ocl::KernelArg::PtrReadOnly(src_U),
idx = k1.set(idx, ocl::KernelArg::PtrReadOnly(src_Uy)); ocl::KernelArg::PtrReadOnly(I0),
idx = k1.set(idx, ocl::KernelArg::PtrReadOnly(I0)); ocl::KernelArg::PtrReadOnly(I1),
idx = k1.set(idx, ocl::KernelArg::PtrReadOnly(I1)); (int)w, (int)h, (int)ws, (int)hs,
idx = k1.set(idx, (int)border_size); ocl::KernelArg::PtrWriteOnly(u_S)
idx = k1.set(idx, (int)patch_size); );
idx = k1.set(idx, (int)patch_stride);
idx = k1.set(idx, (int)w);
idx = k1.set(idx, (int)h);
idx = k1.set(idx, (int)ws);
idx = k1.set(idx, (int)hs);
idx = k1.set(idx, (int)pyr_level);
idx = k1.set(idx, ocl::KernelArg::PtrWriteOnly(u_Sx));
idx = k1.set(idx, ocl::KernelArg::PtrWriteOnly(u_Sy));
if (!k1.run(1, global_sz, local_sz, false)) if (!k1.run(1, global_sz, local_sz, false))
return false; return false;
ocl::Kernel k2("dis_patch_inverse_search_fwd_2", ocl::video::dis_flow_oclsrc); ocl::Kernel k2("dis_patch_inverse_search_fwd_2", ocl::video::dis_flow_oclsrc, build_options);
idx = 0;
k2.args(
idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(src_Ux)); ocl::KernelArg::PtrReadOnly(src_U),
idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(src_Uy)); ocl::KernelArg::PtrReadOnly(I0),
idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(I0)); ocl::KernelArg::PtrReadOnly(I1),
idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(I1)); ocl::KernelArg::PtrReadOnly(I0x),
idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(I0x)); ocl::KernelArg::PtrReadOnly(I0y),
idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(I0y)); ocl::KernelArg::PtrReadOnly(u_I0xx_buf),
idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(u_I0xx_buf)); ocl::KernelArg::PtrReadOnly(u_I0yy_buf),
idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(u_I0yy_buf)); ocl::KernelArg::PtrReadOnly(u_I0xy_buf),
idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(u_I0xy_buf)); ocl::KernelArg::PtrReadOnly(u_I0x_buf),
idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(u_I0x_buf)); ocl::KernelArg::PtrReadOnly(u_I0y_buf),
idx = k2.set(idx, ocl::KernelArg::PtrReadOnly(u_I0y_buf)); (int)w, (int)h, (int)ws, (int)hs,
idx = k2.set(idx, (int)border_size); (int)num_inner_iter,
idx = k2.set(idx, (int)patch_size); ocl::KernelArg::PtrReadWrite(u_S)
idx = k2.set(idx, (int)patch_stride); );
idx = k2.set(idx, (int)w);
idx = k2.set(idx, (int)h);
idx = k2.set(idx, (int)ws);
idx = k2.set(idx, (int)hs);
idx = k2.set(idx, (int)num_inner_iter);
idx = k2.set(idx, (int)pyr_level);
idx = k2.set(idx, ocl::KernelArg::PtrReadWrite(u_Sx));
idx = k2.set(idx, ocl::KernelArg::PtrReadWrite(u_Sy));
if (!k2.run(2, globalSize, localSize, false)) if (!k2.run(2, globalSize, localSize, false))
return false; return false;
} }
else else
{ {
ocl::Kernel k3("dis_patch_inverse_search_bwd_1", ocl::video::dis_flow_oclsrc, subgroups_build_options); ocl::Kernel k3("dis_patch_inverse_search_bwd_1", ocl::video::dis_flow_oclsrc, build_options);
size_t global_sz[] = {(size_t)hs * 8}; size_t global_sz[] = {(size_t)hs * 8};
size_t local_sz[] = {8}; size_t local_sz[] = {8};
idx = 0;
k3.args(
idx = k3.set(idx, ocl::KernelArg::PtrReadOnly(I0)); ocl::KernelArg::PtrReadOnly(I0),
idx = k3.set(idx, ocl::KernelArg::PtrReadOnly(I1)); ocl::KernelArg::PtrReadOnly(I1),
idx = k3.set(idx, (int)border_size); (int)w, (int)h, (int)ws, (int)hs,
idx = k3.set(idx, (int)patch_size); ocl::KernelArg::PtrReadWrite(u_S)
idx = k3.set(idx, (int)patch_stride); );
idx = k3.set(idx, (int)w);
idx = k3.set(idx, (int)h);
idx = k3.set(idx, (int)ws);
idx = k3.set(idx, (int)hs);
idx = k3.set(idx, (int)pyr_level);
idx = k3.set(idx, ocl::KernelArg::PtrReadWrite(u_Sx));
idx = k3.set(idx, ocl::KernelArg::PtrReadWrite(u_Sy));
if (!k3.run(1, global_sz, local_sz, false)) if (!k3.run(1, global_sz, local_sz, false))
return false; return false;
ocl::Kernel k4("dis_patch_inverse_search_bwd_2", ocl::video::dis_flow_oclsrc); ocl::Kernel k4("dis_patch_inverse_search_bwd_2", ocl::video::dis_flow_oclsrc, build_options);
idx = 0;
k4.args(
idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(I0)); ocl::KernelArg::PtrReadOnly(I0),
idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(I1)); ocl::KernelArg::PtrReadOnly(I1),
idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(I0x)); ocl::KernelArg::PtrReadOnly(I0x),
idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(I0y)); ocl::KernelArg::PtrReadOnly(I0y),
idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(u_I0xx_buf)); ocl::KernelArg::PtrReadOnly(u_I0xx_buf),
idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(u_I0yy_buf)); ocl::KernelArg::PtrReadOnly(u_I0yy_buf),
idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(u_I0xy_buf)); ocl::KernelArg::PtrReadOnly(u_I0xy_buf),
idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(u_I0x_buf)); ocl::KernelArg::PtrReadOnly(u_I0x_buf),
idx = k4.set(idx, ocl::KernelArg::PtrReadOnly(u_I0y_buf)); ocl::KernelArg::PtrReadOnly(u_I0y_buf),
idx = k4.set(idx, (int)border_size); (int)w, (int)h,(int)ws, (int)hs,
idx = k4.set(idx, (int)patch_size); (int)num_inner_iter,
idx = k4.set(idx, (int)patch_stride); ocl::KernelArg::PtrReadWrite(u_S)
idx = k4.set(idx, (int)w); );
idx = k4.set(idx, (int)h);
idx = k4.set(idx, (int)ws);
idx = k4.set(idx, (int)hs);
idx = k4.set(idx, (int)num_inner_iter);
idx = k4.set(idx, ocl::KernelArg::PtrReadWrite(u_Sx));
idx = k4.set(idx, ocl::KernelArg::PtrReadWrite(u_Sy));
if (!k4.run(2, globalSize, localSize, false)) if (!k4.run(2, globalSize, localSize, false))
return false; return false;
} }
...@@ -1205,39 +1190,45 @@ bool DISOpticalFlowImpl::ocl_PatchInverseSearch(UMat &src_Ux, UMat &src_Uy, ...@@ -1205,39 +1190,45 @@ bool DISOpticalFlowImpl::ocl_PatchInverseSearch(UMat &src_Ux, UMat &src_Uy,
return true; return true;
} }
bool DISOpticalFlowImpl::ocl_Densification(UMat &dst_Ux, UMat &dst_Uy, UMat &src_Sx, UMat &src_Sy, UMat &_I0, UMat &_I1) bool DISOpticalFlowImpl::ocl_Densification(UMat &dst_U, UMat &src_S, UMat &_I0, UMat &_I1)
{ {
CV_INSTRUMENT_REGION();
CV_INSTRUMENT_REGION_OPENCL();
size_t globalSize[] = {(size_t)w, (size_t)h}; size_t globalSize[] = {(size_t)w, (size_t)h};
size_t localSize[] = {16, 16}; size_t localSize[] = {16, 16};
ocl::Kernel kernel("dis_densification", ocl::video::dis_flow_oclsrc); String build_options = cv::format(
kernel.args(ocl::KernelArg::PtrReadOnly(src_Sx), "-DDIS_PATCH_SIZE=%d -DDIS_PATCH_STRIDE=%d",
ocl::KernelArg::PtrReadOnly(src_Sy), patch_size, patch_stride
);
ocl::Kernel kernel("dis_densification", ocl::video::dis_flow_oclsrc, build_options);
kernel.args(
ocl::KernelArg::PtrReadOnly(src_S),
ocl::KernelArg::PtrReadOnly(_I0), ocl::KernelArg::PtrReadOnly(_I0),
ocl::KernelArg::PtrReadOnly(_I1), ocl::KernelArg::PtrReadOnly(_I1),
(int)patch_size, (int)patch_stride,
(int)w, (int)h, (int)ws, (int)w, (int)h, (int)ws,
ocl::KernelArg::PtrWriteOnly(dst_Ux), ocl::KernelArg::PtrWriteOnly(dst_U)
ocl::KernelArg::PtrWriteOnly(dst_Uy)); );
return kernel.run(2, globalSize, localSize, false); return kernel.run(2, globalSize, localSize, false);
} }
void DISOpticalFlowImpl::ocl_prepareBuffers(UMat &I0, UMat &I1, UMat &flow, bool use_flow) void DISOpticalFlowImpl::ocl_prepareBuffers(UMat &I0, UMat &I1, InputArray flow, bool use_flow)
{ {
CV_INSTRUMENT_REGION();
// not pure OpenCV code: CV_INSTRUMENT_REGION_OPENCL();
u_I0s.resize(coarsest_scale + 1); u_I0s.resize(coarsest_scale + 1);
u_I1s.resize(coarsest_scale + 1); u_I1s.resize(coarsest_scale + 1);
u_I1s_ext.resize(coarsest_scale + 1); u_I1s_ext.resize(coarsest_scale + 1);
u_I0xs.resize(coarsest_scale + 1); u_I0xs.resize(coarsest_scale + 1);
u_I0ys.resize(coarsest_scale + 1); u_I0ys.resize(coarsest_scale + 1);
u_Ux.resize(coarsest_scale + 1); u_U.resize(coarsest_scale + 1);
u_Uy.resize(coarsest_scale + 1);
vector<UMat> flow_uv(2);
if (use_flow) if (use_flow)
{ {
split(flow, flow_uv); u_initial_U.resize(coarsest_scale + 1);
u_initial_Ux.resize(coarsest_scale + 1);
u_initial_Uy.resize(coarsest_scale + 1);
} }
int fraction = 1; int fraction = 1;
...@@ -1245,6 +1236,7 @@ void DISOpticalFlowImpl::ocl_prepareBuffers(UMat &I0, UMat &I1, UMat &flow, bool ...@@ -1245,6 +1236,7 @@ void DISOpticalFlowImpl::ocl_prepareBuffers(UMat &I0, UMat &I1, UMat &flow, bool
for (int i = 0; i <= coarsest_scale; i++) for (int i = 0; i <= coarsest_scale; i++)
{ {
CV_TRACE_REGION("coarsest_scale_iteration");
/* Avoid initializing the pyramid levels above the finest scale, as they won't be used anyway */ /* Avoid initializing the pyramid levels above the finest scale, as they won't be used anyway */
if (i == finest_scale) if (i == finest_scale)
{ {
...@@ -1256,8 +1248,7 @@ void DISOpticalFlowImpl::ocl_prepareBuffers(UMat &I0, UMat &I1, UMat &flow, bool ...@@ -1256,8 +1248,7 @@ void DISOpticalFlowImpl::ocl_prepareBuffers(UMat &I0, UMat &I1, UMat &flow, bool
resize(I1, u_I1s[i], u_I1s[i].size(), 0.0, 0.0, INTER_AREA); resize(I1, u_I1s[i], u_I1s[i].size(), 0.0, 0.0, INTER_AREA);
/* These buffers are reused in each scale so we initialize them once on the finest scale: */ /* These buffers are reused in each scale so we initialize them once on the finest scale: */
u_Sx.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1); u_S.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC2);
u_Sy.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
u_I0xx_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1); u_I0xx_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
u_I0yy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1); u_I0yy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
u_I0xy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1); u_I0xy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
...@@ -1269,8 +1260,6 @@ void DISOpticalFlowImpl::ocl_prepareBuffers(UMat &I0, UMat &I1, UMat &flow, bool ...@@ -1269,8 +1260,6 @@ void DISOpticalFlowImpl::ocl_prepareBuffers(UMat &I0, UMat &I1, UMat &flow, bool
u_I0xy_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1); u_I0xy_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
u_I0x_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1); u_I0x_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
u_I0y_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1); u_I0y_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
u_U.create(cur_rows, cur_cols, CV_32FC2);
} }
else if (i > finest_scale) else if (i > finest_scale)
{ {
...@@ -1289,8 +1278,7 @@ void DISOpticalFlowImpl::ocl_prepareBuffers(UMat &I0, UMat &I1, UMat &flow, bool ...@@ -1289,8 +1278,7 @@ void DISOpticalFlowImpl::ocl_prepareBuffers(UMat &I0, UMat &I1, UMat &flow, bool
u_I0xs[i].create(cur_rows, cur_cols, CV_16SC1); u_I0xs[i].create(cur_rows, cur_cols, CV_16SC1);
u_I0ys[i].create(cur_rows, cur_cols, CV_16SC1); u_I0ys[i].create(cur_rows, cur_cols, CV_16SC1);
spatialGradient(u_I0s[i], u_I0xs[i], u_I0ys[i]); spatialGradient(u_I0s[i], u_I0xs[i], u_I0ys[i]);
u_Ux[i].create(cur_rows, cur_cols, CV_32FC1); u_U[i].create(cur_rows, cur_cols, CV_32FC2);
u_Uy[i].create(cur_rows, cur_cols, CV_32FC1);
variational_refinement_processors[i]->setAlpha(variational_refinement_alpha); variational_refinement_processors[i]->setAlpha(variational_refinement_alpha);
variational_refinement_processors[i]->setDelta(variational_refinement_delta); variational_refinement_processors[i]->setDelta(variational_refinement_delta);
variational_refinement_processors[i]->setGamma(variational_refinement_gamma); variational_refinement_processors[i]->setGamma(variational_refinement_gamma);
...@@ -1299,10 +1287,10 @@ void DISOpticalFlowImpl::ocl_prepareBuffers(UMat &I0, UMat &I1, UMat &flow, bool ...@@ -1299,10 +1287,10 @@ void DISOpticalFlowImpl::ocl_prepareBuffers(UMat &I0, UMat &I1, UMat &flow, bool
if (use_flow) if (use_flow)
{ {
resize(flow_uv[0], u_initial_Ux[i], Size(cur_cols, cur_rows)); UMat resized_flow;
divide(u_initial_Ux[i], static_cast<float>(fraction), u_initial_Ux[i]); resize(flow, resized_flow, Size(cur_cols, cur_rows));
resize(flow_uv[1], u_initial_Uy[i], Size(cur_cols, cur_rows)); float scale = 1.0f / fraction;
divide(u_initial_Uy[i], static_cast<float>(fraction), u_initial_Uy[i]); resized_flow.convertTo(u_initial_U[i], CV_32FC2, scale, 0.0f);
} }
} }
...@@ -1313,51 +1301,74 @@ void DISOpticalFlowImpl::ocl_prepareBuffers(UMat &I0, UMat &I1, UMat &flow, bool ...@@ -1313,51 +1301,74 @@ void DISOpticalFlowImpl::ocl_prepareBuffers(UMat &I0, UMat &I1, UMat &flow, bool
bool DISOpticalFlowImpl::ocl_precomputeStructureTensor(UMat &dst_I0xx, UMat &dst_I0yy, UMat &dst_I0xy, bool DISOpticalFlowImpl::ocl_precomputeStructureTensor(UMat &dst_I0xx, UMat &dst_I0yy, UMat &dst_I0xy,
UMat &dst_I0x, UMat &dst_I0y, UMat &I0x, UMat &I0y) UMat &dst_I0x, UMat &dst_I0y, UMat &I0x, UMat &I0y)
{ {
CV_INSTRUMENT_REGION();
CV_INSTRUMENT_REGION_OPENCL();
size_t globalSizeX[] = {(size_t)h}; size_t globalSizeX[] = {(size_t)h};
size_t localSizeX[] = {16}; size_t localSizeX[] = {16};
ocl::Kernel kernelX("dis_precomputeStructureTensor_hor", ocl::video::dis_flow_oclsrc); #if 0 // OpenCL debug
kernelX.args(ocl::KernelArg::PtrReadOnly(I0x), u_I0xx_buf_aux = Scalar::all(0);
u_I0yy_buf_aux = Scalar::all(0);
u_I0xy_buf_aux = Scalar::all(0);
u_I0x_buf_aux = Scalar::all(0);
u_I0y_buf_aux = Scalar::all(0);
dst_I0xx = Scalar::all(0);
dst_I0yy = Scalar::all(0);
dst_I0xy = Scalar::all(0);
dst_I0x = Scalar::all(0);
dst_I0y = Scalar::all(0);
#endif
String build_options = cv::format(
"-DDIS_PATCH_SIZE=%d -DDIS_PATCH_STRIDE=%d",
patch_size, patch_stride
);
ocl::Kernel kernelX("dis_precomputeStructureTensor_hor", ocl::video::dis_flow_oclsrc, build_options);
kernelX.args(
ocl::KernelArg::PtrReadOnly(I0x),
ocl::KernelArg::PtrReadOnly(I0y), ocl::KernelArg::PtrReadOnly(I0y),
(int)patch_size, (int)patch_stride,
(int)w, (int)h, (int)ws, (int)w, (int)h, (int)ws,
ocl::KernelArg::PtrWriteOnly(u_I0xx_buf_aux), ocl::KernelArg::PtrWriteOnly(u_I0xx_buf_aux),
ocl::KernelArg::PtrWriteOnly(u_I0yy_buf_aux), ocl::KernelArg::PtrWriteOnly(u_I0yy_buf_aux),
ocl::KernelArg::PtrWriteOnly(u_I0xy_buf_aux), ocl::KernelArg::PtrWriteOnly(u_I0xy_buf_aux),
ocl::KernelArg::PtrWriteOnly(u_I0x_buf_aux), ocl::KernelArg::PtrWriteOnly(u_I0x_buf_aux),
ocl::KernelArg::PtrWriteOnly(u_I0y_buf_aux)); ocl::KernelArg::PtrWriteOnly(u_I0y_buf_aux)
);
if (!kernelX.run(1, globalSizeX, localSizeX, false)) if (!kernelX.run(1, globalSizeX, localSizeX, false))
return false; return false;
size_t globalSizeY[] = {(size_t)ws}; size_t globalSizeY[] = {(size_t)ws};
size_t localSizeY[] = {16}; size_t localSizeY[] = {16};
ocl::Kernel kernelY("dis_precomputeStructureTensor_ver", ocl::video::dis_flow_oclsrc); ocl::Kernel kernelY("dis_precomputeStructureTensor_ver", ocl::video::dis_flow_oclsrc, build_options);
kernelY.args(ocl::KernelArg::PtrReadOnly(u_I0xx_buf_aux), kernelY.args(
ocl::KernelArg::PtrReadOnly(u_I0xx_buf_aux),
ocl::KernelArg::PtrReadOnly(u_I0yy_buf_aux), ocl::KernelArg::PtrReadOnly(u_I0yy_buf_aux),
ocl::KernelArg::PtrReadOnly(u_I0xy_buf_aux), ocl::KernelArg::PtrReadOnly(u_I0xy_buf_aux),
ocl::KernelArg::PtrReadOnly(u_I0x_buf_aux), ocl::KernelArg::PtrReadOnly(u_I0x_buf_aux),
ocl::KernelArg::PtrReadOnly(u_I0y_buf_aux), ocl::KernelArg::PtrReadOnly(u_I0y_buf_aux),
(int)patch_size, (int)patch_stride,
(int)w, (int)h, (int)ws, (int)w, (int)h, (int)ws,
ocl::KernelArg::PtrWriteOnly(dst_I0xx), ocl::KernelArg::PtrWriteOnly(dst_I0xx),
ocl::KernelArg::PtrWriteOnly(dst_I0yy), ocl::KernelArg::PtrWriteOnly(dst_I0yy),
ocl::KernelArg::PtrWriteOnly(dst_I0xy), ocl::KernelArg::PtrWriteOnly(dst_I0xy),
ocl::KernelArg::PtrWriteOnly(dst_I0x), ocl::KernelArg::PtrWriteOnly(dst_I0x),
ocl::KernelArg::PtrWriteOnly(dst_I0y)); ocl::KernelArg::PtrWriteOnly(dst_I0y)
);
return kernelY.run(1, globalSizeY, localSizeY, false); return kernelY.run(1, globalSizeY, localSizeY, false);
} }
bool DISOpticalFlowImpl::ocl_calc(InputArray I0, InputArray I1, InputOutputArray flow) bool DISOpticalFlowImpl::ocl_calc(InputArray I0, InputArray I1, InputOutputArray flow)
{ {
CV_INSTRUMENT_REGION();
// not pure OpenCV code: CV_INSTRUMENT_REGION_OPENCL();
UMat I0Mat = I0.getUMat(); UMat I0Mat = I0.getUMat();
UMat I1Mat = I1.getUMat(); UMat I1Mat = I1.getUMat();
bool use_input_flow = false; bool use_input_flow = false;
if (flow.sameSize(I0) && flow.depth() == CV_32F && flow.channels() == 2) if (flow.sameSize(I0) && flow.depth() == CV_32F && flow.channels() == 2)
use_input_flow = true; use_input_flow = true;
else
flow.create(I1Mat.size(), CV_32FC2);
UMat &u_flowMat = flow.getUMatRef();
coarsest_scale = min((int)(log(max(I0Mat.cols, I0Mat.rows) / (4.0 * patch_size)) / log(2.0) + 0.5), /* Original code search for maximal movement of width/4 */ coarsest_scale = min((int)(log(max(I0Mat.cols, I0Mat.rows) / (4.0 * patch_size)) / log(2.0) + 0.5), /* Original code search for maximal movement of width/4 */
(int)(log(min(I0Mat.cols, I0Mat.rows) / patch_size) / log(2.0))); /* Deepest pyramid level greater or equal than patch*/ (int)(log(min(I0Mat.cols, I0Mat.rows) / patch_size) / log(2.0))); /* Deepest pyramid level greater or equal than patch*/
...@@ -1372,12 +1383,12 @@ bool DISOpticalFlowImpl::ocl_calc(InputArray I0, InputArray I1, InputOutputArray ...@@ -1372,12 +1383,12 @@ bool DISOpticalFlowImpl::ocl_calc(InputArray I0, InputArray I1, InputOutputArray
autoSelectPatchSizeAndScales(original_img_width); autoSelectPatchSizeAndScales(original_img_width);
} }
ocl_prepareBuffers(I0Mat, I1Mat, u_flowMat, use_input_flow); ocl_prepareBuffers(I0Mat, I1Mat, flow, use_input_flow);
u_Ux[coarsest_scale].setTo(0.0f); u_U[coarsest_scale].setTo(0.0f);
u_Uy[coarsest_scale].setTo(0.0f);
for (int i = coarsest_scale; i >= finest_scale; i--) for (int i = coarsest_scale; i >= finest_scale; i--)
{ {
CV_TRACE_REGION("coarsest_scale_iteration");
w = u_I0s[i].cols; w = u_I0s[i].cols;
h = u_I0s[i].rows; h = u_I0s[i].rows;
ws = 1 + (w - patch_size) / patch_stride; ws = 1 + (w - patch_size) / patch_stride;
...@@ -1387,30 +1398,32 @@ bool DISOpticalFlowImpl::ocl_calc(InputArray I0, InputArray I1, InputOutputArray ...@@ -1387,30 +1398,32 @@ bool DISOpticalFlowImpl::ocl_calc(InputArray I0, InputArray I1, InputOutputArray
u_I0x_buf, u_I0y_buf, u_I0xs[i], u_I0ys[i])) u_I0x_buf, u_I0y_buf, u_I0xs[i], u_I0ys[i]))
return false; return false;
if (!ocl_PatchInverseSearch(u_Ux[i], u_Uy[i], u_I0s[i], u_I1s_ext[i], u_I0xs[i], u_I0ys[i], 2, i)) if (!ocl_PatchInverseSearch(u_U[i], u_I0s[i], u_I1s_ext[i], u_I0xs[i], u_I0ys[i], 2, i))
return false; return false;
if (!ocl_Densification(u_Ux[i], u_Uy[i], u_Sx, u_Sy, u_I0s[i], u_I1s[i])) if (!ocl_Densification(u_U[i], u_S, u_I0s[i], u_I1s[i]))
return false; return false;
if (variational_refinement_iter > 0) if (variational_refinement_iter > 0)
{
std::vector<Mat> U_channels;
split(u_U[i], U_channels); CV_Assert(U_channels.size() == 2);
variational_refinement_processors[i]->calcUV(u_I0s[i], u_I1s[i], variational_refinement_processors[i]->calcUV(u_I0s[i], u_I1s[i],
u_Ux[i].getMat(ACCESS_WRITE), u_Uy[i].getMat(ACCESS_WRITE)); U_channels[0], U_channels[1]);
merge(U_channels, u_U[i]);
}
if (i > finest_scale) if (i > finest_scale)
{ {
resize(u_Ux[i], u_Ux[i - 1], u_Ux[i - 1].size()); UMat resized;
resize(u_Uy[i], u_Uy[i - 1], u_Uy[i - 1].size()); resize(u_U[i], resized, u_U[i - 1].size());
multiply(u_Ux[i - 1], 2, u_Ux[i - 1]); multiply(resized, 2, u_U[i - 1]);
multiply(u_Uy[i - 1], 2, u_Uy[i - 1]);
} }
} }
vector<UMat> uxy(2);
uxy[0] = u_Ux[finest_scale]; UMat resized_flow;
uxy[1] = u_Uy[finest_scale]; resize(u_U[finest_scale], resized_flow, I1Mat.size());
merge(uxy, u_U); multiply(resized_flow, 1 << finest_scale, flow);
resize(u_U, u_flowMat, u_flowMat.size());
multiply(u_flowMat, 1 << finest_scale, u_flowMat);
return true; return true;
} }
...@@ -1418,6 +1431,8 @@ bool DISOpticalFlowImpl::ocl_calc(InputArray I0, InputArray I1, InputOutputArray ...@@ -1418,6 +1431,8 @@ bool DISOpticalFlowImpl::ocl_calc(InputArray I0, InputArray I1, InputOutputArray
void DISOpticalFlowImpl::calc(InputArray I0, InputArray I1, InputOutputArray flow) void DISOpticalFlowImpl::calc(InputArray I0, InputArray I1, InputOutputArray flow)
{ {
CV_INSTRUMENT_REGION();
CV_Assert(!I0.empty() && I0.depth() == CV_8U && I0.channels() == 1); CV_Assert(!I0.empty() && I0.depth() == CV_8U && I0.channels() == 1);
CV_Assert(!I1.empty() && I1.depth() == CV_8U && I1.channels() == 1); CV_Assert(!I1.empty() && I1.depth() == CV_8U && I1.channels() == 1);
CV_Assert(I0.sameSize(I1)); CV_Assert(I0.sameSize(I1));
...@@ -1458,6 +1473,7 @@ void DISOpticalFlowImpl::calc(InputArray I0, InputArray I1, InputOutputArray flo ...@@ -1458,6 +1473,7 @@ void DISOpticalFlowImpl::calc(InputArray I0, InputArray I1, InputOutputArray flo
for (int i = coarsest_scale; i >= finest_scale; i--) for (int i = coarsest_scale; i >= finest_scale; i--)
{ {
CV_TRACE_REGION("coarsest_scale_iteration");
w = I0s[i].cols; w = I0s[i].cols;
h = I0s[i].rows; h = I0s[i].rows;
ws = 1 + (w - patch_size) / patch_stride; ws = 1 + (w - patch_size) / patch_stride;
...@@ -1500,6 +1516,8 @@ void DISOpticalFlowImpl::calc(InputArray I0, InputArray I1, InputOutputArray flo ...@@ -1500,6 +1516,8 @@ void DISOpticalFlowImpl::calc(InputArray I0, InputArray I1, InputOutputArray flo
void DISOpticalFlowImpl::collectGarbage() void DISOpticalFlowImpl::collectGarbage()
{ {
CV_INSTRUMENT_REGION();
I0s.clear(); I0s.clear();
I1s.clear(); I1s.clear();
I1s_ext.clear(); I1s_ext.clear();
...@@ -1523,11 +1541,8 @@ void DISOpticalFlowImpl::collectGarbage() ...@@ -1523,11 +1541,8 @@ void DISOpticalFlowImpl::collectGarbage()
u_I1s_ext.clear(); u_I1s_ext.clear();
u_I0xs.clear(); u_I0xs.clear();
u_I0ys.clear(); u_I0ys.clear();
u_Ux.clear(); u_U.clear();
u_Uy.clear(); u_S.release();
u_U.release();
u_Sx.release();
u_Sy.release();
u_I0xx_buf.release(); u_I0xx_buf.release();
u_I0yy_buf.release(); u_I0yy_buf.release();
u_I0xy_buf.release(); u_I0xy_buf.release();
...@@ -1543,6 +1558,8 @@ void DISOpticalFlowImpl::collectGarbage() ...@@ -1543,6 +1558,8 @@ void DISOpticalFlowImpl::collectGarbage()
Ptr<DISOpticalFlow> DISOpticalFlow::create(int preset) Ptr<DISOpticalFlow> DISOpticalFlow::create(int preset)
{ {
CV_INSTRUMENT_REGION();
Ptr<DISOpticalFlow> dis = makePtr<DISOpticalFlowImpl>(); Ptr<DISOpticalFlow> dis = makePtr<DISOpticalFlowImpl>();
dis->setPatchSize(8); dis->setPatchSize(8);
if (preset == DISOpticalFlow::PRESET_ULTRAFAST) if (preset == DISOpticalFlow::PRESET_ULTRAFAST)
...@@ -1569,4 +1586,6 @@ Ptr<DISOpticalFlow> DISOpticalFlow::create(int preset) ...@@ -1569,4 +1586,6 @@ Ptr<DISOpticalFlow> DISOpticalFlow::create(int preset)
return dis; return dis;
} }
}
} // namespace
...@@ -7,9 +7,16 @@ ...@@ -7,9 +7,16 @@
#define EPS 0.001f #define EPS 0.001f
#define INF 1E+10F #define INF 1E+10F
//#define DIS_BORDER_SIZE xxx
//#define DIS_PATCH_SIZE xxx
//#define DIS_PATCH_STRIDE xxx
#define DIS_PATCH_SIZE_HALF (DIS_PATCH_SIZE / 2)
#ifndef DIS_BORDER_SIZE
__kernel void dis_precomputeStructureTensor_hor(__global const short *I0x, __kernel void dis_precomputeStructureTensor_hor(__global const short *I0x,
__global const short *I0y, __global const short *I0y,
int patch_size, int patch_stride,
int w, int h, int ws, int w, int h, int ws,
__global float *I0xx_aux_ptr, __global float *I0xx_aux_ptr,
__global float *I0yy_aux_ptr, __global float *I0yy_aux_ptr,
...@@ -41,18 +48,18 @@ __kernel void dis_precomputeStructureTensor_hor(__global const short *I0x, ...@@ -41,18 +48,18 @@ __kernel void dis_precomputeStructureTensor_hor(__global const short *I0x,
I0y_aux_ptr[i * ws] = sum_y; I0y_aux_ptr[i * ws] = sum_y;
int js = 1; int js = 1;
for (int j = patch_size; j < w; j++) for (int j = DIS_PATCH_SIZE; j < w; j++)
{ {
short x_val1 = x_row[j]; short x_val1 = x_row[j];
short x_val2 = x_row[j - patch_size]; short x_val2 = x_row[j - DIS_PATCH_SIZE];
short y_val1 = y_row[j]; short y_val1 = y_row[j];
short y_val2 = y_row[j - patch_size]; short y_val2 = y_row[j - DIS_PATCH_SIZE];
sum_xx += (x_val1 * x_val1 - x_val2 * x_val2); sum_xx += (x_val1 * x_val1 - x_val2 * x_val2);
sum_yy += (y_val1 * y_val1 - y_val2 * y_val2); sum_yy += (y_val1 * y_val1 - y_val2 * y_val2);
sum_xy += (x_val1 * y_val1 - x_val2 * y_val2); sum_xy += (x_val1 * y_val1 - x_val2 * y_val2);
sum_x += (x_val1 - x_val2); sum_x += (x_val1 - x_val2);
sum_y += (y_val1 - y_val2); sum_y += (y_val1 - y_val2);
if ((j - patch_size + 1) % patch_stride == 0) if ((j - DIS_PATCH_SIZE + 1) % DIS_PATCH_STRIDE == 0)
{ {
int index = i * ws + js; int index = i * ws + js;
I0xx_aux_ptr[index] = sum_xx; I0xx_aux_ptr[index] = sum_xx;
...@@ -70,7 +77,6 @@ __kernel void dis_precomputeStructureTensor_ver(__global const float *I0xx_aux_p ...@@ -70,7 +77,6 @@ __kernel void dis_precomputeStructureTensor_ver(__global const float *I0xx_aux_p
__global const float *I0xy_aux_ptr, __global const float *I0xy_aux_ptr,
__global const float *I0x_aux_ptr, __global const float *I0x_aux_ptr,
__global const float *I0y_aux_ptr, __global const float *I0y_aux_ptr,
int patch_size, int patch_stride,
int w, int h, int ws, int w, int h, int ws,
__global float *I0xx_ptr, __global float *I0xx_ptr,
__global float *I0yy_ptr, __global float *I0yy_ptr,
...@@ -85,7 +91,7 @@ __kernel void dis_precomputeStructureTensor_ver(__global const float *I0xx_aux_p ...@@ -85,7 +91,7 @@ __kernel void dis_precomputeStructureTensor_ver(__global const float *I0xx_aux_p
float sum_xx, sum_yy, sum_xy, sum_x, sum_y; float sum_xx, sum_yy, sum_xy, sum_x, sum_y;
sum_xx = sum_yy = sum_xy = sum_x = sum_y = 0.0f; sum_xx = sum_yy = sum_xy = sum_x = sum_y = 0.0f;
for (int i = 0; i < patch_size; i++) for (int i = 0; i < DIS_PATCH_SIZE; i++)
{ {
sum_xx += I0xx_aux_ptr[i * ws + j]; sum_xx += I0xx_aux_ptr[i * ws + j];
sum_yy += I0yy_aux_ptr[i * ws + j]; sum_yy += I0yy_aux_ptr[i * ws + j];
...@@ -100,15 +106,15 @@ __kernel void dis_precomputeStructureTensor_ver(__global const float *I0xx_aux_p ...@@ -100,15 +106,15 @@ __kernel void dis_precomputeStructureTensor_ver(__global const float *I0xx_aux_p
I0y_ptr[j] = sum_y; I0y_ptr[j] = sum_y;
int is = 1; int is = 1;
for (int i = patch_size; i < h; i++) for (int i = DIS_PATCH_SIZE; i < h; i++)
{ {
sum_xx += (I0xx_aux_ptr[i * ws + j] - I0xx_aux_ptr[(i - patch_size) * ws + j]); sum_xx += (I0xx_aux_ptr[i * ws + j] - I0xx_aux_ptr[(i - DIS_PATCH_SIZE) * ws + j]);
sum_yy += (I0yy_aux_ptr[i * ws + j] - I0yy_aux_ptr[(i - patch_size) * ws + j]); sum_yy += (I0yy_aux_ptr[i * ws + j] - I0yy_aux_ptr[(i - DIS_PATCH_SIZE) * ws + j]);
sum_xy += (I0xy_aux_ptr[i * ws + j] - I0xy_aux_ptr[(i - patch_size) * ws + j]); sum_xy += (I0xy_aux_ptr[i * ws + j] - I0xy_aux_ptr[(i - DIS_PATCH_SIZE) * ws + j]);
sum_x += (I0x_aux_ptr[i * ws + j] - I0x_aux_ptr[(i - patch_size) * ws + j]); sum_x += (I0x_aux_ptr[i * ws + j] - I0x_aux_ptr[(i - DIS_PATCH_SIZE) * ws + j]);
sum_y += (I0y_aux_ptr[i * ws + j] - I0y_aux_ptr[(i - patch_size) * ws + j]); sum_y += (I0y_aux_ptr[i * ws + j] - I0y_aux_ptr[(i - DIS_PATCH_SIZE) * ws + j]);
if ((i - patch_size + 1) % patch_stride == 0) if ((i - DIS_PATCH_SIZE + 1) % DIS_PATCH_STRIDE == 0)
{ {
I0xx_ptr[is * ws + j] = sum_xx; I0xx_ptr[is * ws + j] = sum_xx;
I0yy_ptr[is * ws + j] = sum_yy; I0yy_ptr[is * ws + j] = sum_yy;
...@@ -120,11 +126,10 @@ __kernel void dis_precomputeStructureTensor_ver(__global const float *I0xx_aux_p ...@@ -120,11 +126,10 @@ __kernel void dis_precomputeStructureTensor_ver(__global const float *I0xx_aux_p
} }
} }
__kernel void dis_densification(__global const float *sx, __global const float *sy, __kernel void dis_densification(__global const float2 *S_ptr,
__global const uchar *i0, __global const uchar *i1, __global const uchar *i0, __global const uchar *i1,
int psz, int pstr,
int w, int h, int ws, int w, int h, int ws,
__global float *ux, __global float *uy) __global float2 *U_ptr)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
...@@ -135,17 +140,16 @@ __kernel void dis_densification(__global const float *sx, __global const float * ...@@ -135,17 +140,16 @@ __kernel void dis_densification(__global const float *sx, __global const float *
int start_is, end_is; int start_is, end_is;
int start_js, end_js; int start_js, end_js;
end_is = min(y / pstr, (h - psz) / pstr); end_is = min(y / DIS_PATCH_STRIDE, (h - DIS_PATCH_SIZE) / DIS_PATCH_STRIDE);
start_is = max(0, y - psz + pstr) / pstr; start_is = max(0, y - DIS_PATCH_SIZE + DIS_PATCH_STRIDE) / DIS_PATCH_STRIDE;
start_is = min(start_is, end_is); start_is = min(start_is, end_is);
end_js = min(x / pstr, (w - psz) / pstr); end_js = min(x / DIS_PATCH_STRIDE, (w - DIS_PATCH_SIZE) / DIS_PATCH_STRIDE);
start_js = max(0, x - psz + pstr) / pstr; start_js = max(0, x - DIS_PATCH_SIZE + DIS_PATCH_STRIDE) / DIS_PATCH_STRIDE;
start_js = min(start_js, end_js); start_js = min(start_js, end_js);
float coef, sum_coef = 0.0f; float sum_coef = 0.0f;
float sum_Ux = 0.0f; float2 sum_U = (float2)(0.0f, 0.0f);
float sum_Uy = 0.0f;
int i_l, i_u; int i_l, i_u;
int j_l, j_u; int j_l, j_u;
...@@ -158,12 +162,11 @@ __kernel void dis_densification(__global const float *sx, __global const float * ...@@ -158,12 +162,11 @@ __kernel void dis_densification(__global const float *sx, __global const float *
for (int is = start_is; is <= end_is; is++) for (int is = start_is; is <= end_is; is++)
for (int js = start_js; js <= end_js; js++) for (int js = start_js; js <= end_js; js++)
{ {
float sx_val = sx[is * ws + js]; float2 s_val = S_ptr[is * ws + js];
float sy_val = sy[is * ws + js];
uchar2 i1_vec1, i1_vec2; uchar2 i1_vec1, i1_vec2;
j_m = min(max(j + sx_val, 0.0f), w - 1.0f - EPS); j_m = min(max(j + s_val.x, 0.0f), w - 1.0f - EPS);
i_m = min(max(i + sy_val, 0.0f), h - 1.0f - EPS); i_m = min(max(i + s_val.y, 0.0f), h - 1.0f - EPS);
j_l = (int)j_m; j_l = (int)j_m;
j_u = j_l + 1; j_u = j_l + 1;
i_l = (int)i_m; i_l = (int)i_m;
...@@ -174,35 +177,39 @@ __kernel void dis_densification(__global const float *sx, __global const float * ...@@ -174,35 +177,39 @@ __kernel void dis_densification(__global const float *sx, __global const float *
(j_u - j_m) * (i_m - i_l) * i1_vec1.x + (j_u - j_m) * (i_m - i_l) * i1_vec1.x +
(j_m - j_l) * (i_u - i_m) * i1_vec2.y + (j_m - j_l) * (i_u - i_m) * i1_vec2.y +
(j_u - j_m) * (i_u - i_m) * i1_vec2.x - i0[i * w + j]; (j_u - j_m) * (i_u - i_m) * i1_vec2.x - i0[i * w + j];
coef = 1 / max(1.0f, fabs(diff)); float coef = 1.0f / max(1.0f, fabs(diff));
sum_Ux += coef * sx_val; sum_U += coef * s_val;
sum_Uy += coef * sy_val;
sum_coef += coef; sum_coef += coef;
} }
ux[i * w + j] = sum_Ux / sum_coef; float inv_sum_coef = 1.0 / sum_coef;
uy[i * w + j] = sum_Uy / sum_coef; U_ptr[i * w + j] = sum_U * inv_sum_coef;
} }
#else // DIS_BORDER_SIZE
#define INIT_BILINEAR_WEIGHTS(Ux, Uy) \ #define INIT_BILINEAR_WEIGHTS(Ux, Uy) \
i_I1 = min(max(i + Uy + bsz, i_lower_limit), i_upper_limit); \ i_I1 = clamp(i + Uy + DIS_BORDER_SIZE, i_lower_limit, i_upper_limit); \
j_I1 = min(max(j + Ux + bsz, j_lower_limit), j_upper_limit); \ j_I1 = clamp(j + Ux + DIS_BORDER_SIZE, j_lower_limit, j_upper_limit); \
\ { \
w11 = (i_I1 - floor(i_I1)) * (j_I1 - floor(j_I1)); \ float di = i_I1 - floor(i_I1); \
w10 = (i_I1 - floor(i_I1)) * (floor(j_I1) + 1 - j_I1); \ float dj = j_I1 - floor(j_I1); \
w01 = (floor(i_I1) + 1 - i_I1) * (j_I1 - floor(j_I1)); \ w11 = di * dj; \
w00 = (floor(i_I1) + 1 - i_I1) * (floor(j_I1) + 1 - j_I1); w10 = di * (1 - dj); \
w01 = (1 - di) * dj; \
w00 = (1 - di) * (1 - dj); \
}
float computeSSDMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr, float computeSSDMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr,
int I0_stride, int I1_stride, int I0_stride, int I1_stride,
float w00, float w01, float w10, float w11, int patch_sz, int i float w00, float w01, float w10, float w11, int i
#ifndef CV_USE_SUBGROUPS #ifndef CV_USE_SUBGROUPS
, __local float2 *smem /*[8]*/ , __local float2 *smem /*[8]*/
#endif #endif
) )
{ {
float sum_diff = 0.0f, sum_diff_sq = 0.0f; float sum_diff = 0.0f, sum_diff_sq = 0.0f;
int n = patch_sz * patch_sz; int n = DIS_PATCH_SIZE * DIS_PATCH_SIZE;
uchar8 I1_vec1, I1_vec2, I0_vec; uchar8 I1_vec1, I1_vec2, I0_vec;
uchar I1_val1, I1_val2; uchar I1_val1, I1_val2;
...@@ -245,33 +252,26 @@ float computeSSDMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ ...@@ -245,33 +252,26 @@ float computeSSDMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_
} }
__attribute__((reqd_work_group_size(8, 1, 1))) __attribute__((reqd_work_group_size(8, 1, 1)))
__kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __global const float *Uy_ptr, __kernel void dis_patch_inverse_search_fwd_1(__global const float2 *U_ptr,
__global const uchar *I0_ptr, __global const uchar *I1_ptr, __global const uchar *I0_ptr, __global const uchar *I1_ptr,
int border_size, int patch_size, int patch_stride, int w, int h, int ws, int hs,
int w, int h, int ws, int hs, int pyr_level, __global float2 *S_ptr)
__global float *Sx_ptr, __global float *Sy_ptr)
{ {
int id = get_global_id(0); int id = get_global_id(0);
int is = get_group_id(0); int is = get_group_id(0);
int i = is * patch_stride; int i = is * DIS_PATCH_STRIDE;
int j = 0; int j = 0;
int psz = patch_size; int w_ext = w + 2 * DIS_BORDER_SIZE;
int psz2 = psz / 2;
int w_ext = w + 2 * border_size; float i_lower_limit = DIS_BORDER_SIZE - DIS_PATCH_SIZE + 1.0f;
int bsz = border_size; float i_upper_limit = DIS_BORDER_SIZE + h - 1.0f;
float j_lower_limit = DIS_BORDER_SIZE - DIS_PATCH_SIZE + 1.0f;
float i_lower_limit = bsz - psz + 1.0f; float j_upper_limit = DIS_BORDER_SIZE + w - 1.0f;
float i_upper_limit = bsz + h - 1.0f;
float j_lower_limit = bsz - psz + 1.0f;
float j_upper_limit = bsz + w - 1.0f;
float i_I1, j_I1, w00, w01, w10, w11;
float prev_Ux = Ux_ptr[(i + psz2) * w + j + psz2]; float2 prev_U = U_ptr[(i + DIS_PATCH_SIZE_HALF) * w + j + DIS_PATCH_SIZE_HALF];
float prev_Uy = Uy_ptr[(i + psz2) * w + j + psz2]; S_ptr[is * ws] = prev_U;
Sx_ptr[is * ws] = prev_Ux; j += DIS_PATCH_STRIDE;
Sy_ptr[is * ws] = prev_Uy;
j += patch_stride;
#ifdef CV_USE_SUBGROUPS #ifdef CV_USE_SUBGROUPS
int sid = get_sub_group_local_id(); int sid = get_sub_group_local_id();
...@@ -281,45 +281,44 @@ __kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __glo ...@@ -281,45 +281,44 @@ __kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __glo
int sid = get_local_id(0); int sid = get_local_id(0);
#define EXTRA_ARGS_computeSSDMeanNorm sid, smem #define EXTRA_ARGS_computeSSDMeanNorm sid, smem
#endif #endif
for (int js = 1; js < ws; js++, j += patch_stride) for (int js = 1; js < ws; js++, j += DIS_PATCH_STRIDE)
{ {
float min_SSD, cur_SSD; float2 U = U_ptr[(i + DIS_PATCH_SIZE_HALF) * w + j + DIS_PATCH_SIZE_HALF];
float Ux = Ux_ptr[(i + psz2) * w + j + psz2];
float Uy = Uy_ptr[(i + psz2) * w + j + psz2];
INIT_BILINEAR_WEIGHTS(Ux, Uy);
min_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
w, w_ext, w00, w01, w10, w11, psz, EXTRA_ARGS_computeSSDMeanNorm);
INIT_BILINEAR_WEIGHTS(prev_Ux, prev_Uy);
cur_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
w, w_ext, w00, w01, w10, w11, psz, EXTRA_ARGS_computeSSDMeanNorm);
if (cur_SSD < min_SSD)
{
Ux = prev_Ux;
Uy = prev_Uy;
}
prev_Ux = Ux; float i_I1, j_I1, w00, w01, w10, w11;
prev_Uy = Uy;
Sx_ptr[is * ws + js] = Ux; INIT_BILINEAR_WEIGHTS(U.x, U.y);
Sy_ptr[is * ws + js] = Uy; float min_SSD = computeSSDMeanNorm(
I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
w, w_ext, w00, w01, w10, w11, EXTRA_ARGS_computeSSDMeanNorm);
INIT_BILINEAR_WEIGHTS(prev_U.x, prev_U.y);
float cur_SSD = computeSSDMeanNorm(
I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
w, w_ext, w00, w01, w10, w11, EXTRA_ARGS_computeSSDMeanNorm);
prev_U = (cur_SSD < min_SSD) ? prev_U : U;
S_ptr[is * ws + js] = prev_U;
} }
#undef EXTRA_ARGS_computeSSDMeanNorm #undef EXTRA_ARGS_computeSSDMeanNorm
} }
#endif // DIS_BORDER_SIZE
float3 processPatchMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr, float4 processPatchMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr,
const __global short *I0x_ptr, const __global short *I0y_ptr, const __global short *I0x_ptr, const __global short *I0y_ptr,
int I0_stride, int I1_stride, float w00, float w01, float w10, int I0_stride, int I1_stride, float w00, float w01, float w10,
float w11, int patch_sz, float x_grad_sum, float y_grad_sum) float w11, float x_grad_sum, float y_grad_sum)
{ {
const float inv_n = 1.0f / (float)(DIS_PATCH_SIZE * DIS_PATCH_SIZE);
float sum_diff = 0.0, sum_diff_sq = 0.0; float sum_diff = 0.0, sum_diff_sq = 0.0;
float sum_I0x_mul = 0.0, sum_I0y_mul = 0.0; float sum_I0x_mul = 0.0, sum_I0y_mul = 0.0;
int n = patch_sz * patch_sz;
uchar8 I1_vec1; uchar8 I1_vec1;
uchar8 I1_vec2 = vload8(0, I1_ptr); uchar8 I1_vec2 = vload8(0, I1_ptr);
uchar I1_val1; uchar I1_val1;
uchar I1_val2 = I1_ptr[patch_sz]; uchar I1_val2 = I1_ptr[DIS_PATCH_SIZE];
for (int i = 0; i < 8; i++) for (int i = 0; i < 8; i++)
{ {
...@@ -328,7 +327,7 @@ float3 processPatchMeanNorm(const __global uchar *I0_ptr, const __global uchar * ...@@ -328,7 +327,7 @@ float3 processPatchMeanNorm(const __global uchar *I0_ptr, const __global uchar *
I1_vec1 = I1_vec2; I1_vec1 = I1_vec2;
I1_vec2 = vload8(0, I1_ptr + (i + 1) * I1_stride); I1_vec2 = vload8(0, I1_ptr + (i + 1) * I1_stride);
I1_val1 = I1_val2; I1_val1 = I1_val2;
I1_val2 = I1_ptr[(i + 1) * I1_stride + patch_sz]; I1_val2 = I1_ptr[(i + 1) * I1_stride + DIS_PATCH_SIZE];
float8 vec = w00 * convert_float8(I1_vec1) + w01 * convert_float8((uchar8)(I1_vec1.s123, I1_vec1.s4567, I1_val1)) + float8 vec = w00 * convert_float8(I1_vec1) + w01 * convert_float8((uchar8)(I1_vec1.s123, I1_vec1.s4567, I1_val1)) +
w10 * convert_float8(I1_vec2) + w11 * convert_float8((uchar8)(I1_vec2.s123, I1_vec2.s4567, I1_val2)) - w10 * convert_float8(I1_vec2) + w11 * convert_float8((uchar8)(I1_vec2.s123, I1_vec2.s4567, I1_val2)) -
...@@ -346,112 +345,98 @@ float3 processPatchMeanNorm(const __global uchar *I0_ptr, const __global uchar * ...@@ -346,112 +345,98 @@ float3 processPatchMeanNorm(const __global uchar *I0_ptr, const __global uchar *
sum_I0y_mul += dot(vec.hi, convert_float4(I0y_vec.hi)); sum_I0y_mul += dot(vec.hi, convert_float4(I0y_vec.hi));
} }
float dst_dUx = sum_I0x_mul - sum_diff * x_grad_sum / n; float dst_dUx = sum_I0x_mul - sum_diff * x_grad_sum * inv_n;
float dst_dUy = sum_I0y_mul - sum_diff * y_grad_sum / n; float dst_dUy = sum_I0y_mul - sum_diff * y_grad_sum * inv_n;
float SSD = sum_diff_sq - sum_diff * sum_diff / n; float SSD = sum_diff_sq - sum_diff * sum_diff * inv_n;
return (float3)(SSD, dst_dUx, dst_dUy); return (float4)(SSD, dst_dUx, dst_dUy, 0);
} }
__kernel void dis_patch_inverse_search_fwd_2(__global const float *Ux_ptr, __global const float *Uy_ptr, #ifdef DIS_BORDER_SIZE
__kernel void dis_patch_inverse_search_fwd_2(__global const float2 *U_ptr,
__global const uchar *I0_ptr, __global const uchar *I1_ptr, __global const uchar *I0_ptr, __global const uchar *I1_ptr,
__global const short *I0x_ptr, __global const short *I0y_ptr, __global const short *I0x_ptr, __global const short *I0y_ptr,
__global const float *xx_ptr, __global const float *yy_ptr, __global const float *xx_ptr, __global const float *yy_ptr,
__global const float *xy_ptr, __global const float *xy_ptr,
__global const float *x_ptr, __global const float *y_ptr, __global const float *x_ptr, __global const float *y_ptr,
int border_size, int patch_size, int patch_stride, int w, int h, int ws, int hs, int num_inner_iter,
int w, int h, int ws, int hs, int num_inner_iter, int pyr_level, __global float2 *S_ptr)
__global float *Sx_ptr, __global float *Sy_ptr)
{ {
int js = get_global_id(0); int js = get_global_id(0);
int is = get_global_id(1); int is = get_global_id(1);
int i = is * patch_stride; int i = is * DIS_PATCH_STRIDE;
int j = js * patch_stride; int j = js * DIS_PATCH_STRIDE;
int psz = patch_size; const int psz = DIS_PATCH_SIZE;
int psz2 = psz / 2; int w_ext = w + 2 * DIS_BORDER_SIZE;
int w_ext = w + 2 * border_size;
int bsz = border_size;
int index = is * ws + js; int index = is * ws + js;
if (js >= ws || is >= hs) return; if (js >= ws || is >= hs) return;
float Ux = Sx_ptr[index]; float2 U0 = S_ptr[index];
float Uy = Sy_ptr[index]; float2 cur_U = U0;
float cur_Ux = Ux;
float cur_Uy = Uy;
float cur_xx = xx_ptr[index]; float cur_xx = xx_ptr[index];
float cur_yy = yy_ptr[index]; float cur_yy = yy_ptr[index];
float cur_xy = xy_ptr[index]; float cur_xy = xy_ptr[index];
float detH = cur_xx * cur_yy - cur_xy * cur_xy; float detH = cur_xx * cur_yy - cur_xy * cur_xy;
if (fabs(detH) < EPS) detH = EPS; float inv_detH = (fabs(detH) < EPS) ? 1.0 / EPS : 1.0 / detH;
float invH11 = cur_yy * inv_detH;
float invH12 = -cur_xy * inv_detH;
float invH22 = cur_xx * inv_detH;
float invH11 = cur_yy / detH; float prev_SSD = INF;
float invH12 = -cur_xy / detH;
float invH22 = cur_xx / detH;
float prev_SSD = INF, SSD;
float x_grad_sum = x_ptr[index]; float x_grad_sum = x_ptr[index];
float y_grad_sum = y_ptr[index]; float y_grad_sum = y_ptr[index];
float i_lower_limit = bsz - psz + 1.0f; const float i_lower_limit = DIS_BORDER_SIZE - DIS_PATCH_SIZE + 1.0f;
float i_upper_limit = bsz + h - 1.0f; const float i_upper_limit = DIS_BORDER_SIZE + h - 1.0f;
float j_lower_limit = bsz - psz + 1.0f; const float j_lower_limit = DIS_BORDER_SIZE - DIS_PATCH_SIZE + 1.0f;
float j_upper_limit = bsz + w - 1.0f; const float j_upper_limit = DIS_BORDER_SIZE + w - 1.0f;
float dUx, dUy, i_I1, j_I1, w00, w01, w10, w11, dx, dy;
float3 res;
for (int t = 0; t < num_inner_iter; t++) for (int t = 0; t < num_inner_iter; t++)
{ {
INIT_BILINEAR_WEIGHTS(cur_Ux, cur_Uy); float i_I1, j_I1, w00, w01, w10, w11;
res = processPatchMeanNorm(I0_ptr + i * w + j, INIT_BILINEAR_WEIGHTS(cur_U.x, cur_U.y);
I1_ptr + (int)i_I1 * w_ext + (int)j_I1, I0x_ptr + i * w + j, float4 res = processPatchMeanNorm(
I0y_ptr + i * w + j, w, w_ext, w00, w01, w10, w11, psz, I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
I0x_ptr + i * w + j, I0y_ptr + i * w + j,
w, w_ext, w00, w01, w10, w11,
x_grad_sum, y_grad_sum); x_grad_sum, y_grad_sum);
SSD = res.x; float SSD = res.x;
dUx = res.y; float dUx = res.y;
dUy = res.z; float dUy = res.z;
dx = invH11 * dUx + invH12 * dUy; float dx = invH11 * dUx + invH12 * dUy;
dy = invH12 * dUx + invH22 * dUy; float dy = invH12 * dUx + invH22 * dUy;
cur_Ux -= dx; cur_U -= (float2)(dx, dy);
cur_Uy -= dy;
if (SSD >= prev_SSD) if (SSD >= prev_SSD)
break; break;
prev_SSD = SSD; prev_SSD = SSD;
} }
float2 vec = (float2)(cur_Ux - Ux, cur_Uy - Uy); float2 vec = cur_U - U0;
if (dot(vec, vec) <= (float)(psz * psz)) S_ptr[index] = (dot(vec, vec) <= (float)(DIS_PATCH_SIZE * DIS_PATCH_SIZE)) ? cur_U : U0;
{
Sx_ptr[index] = cur_Ux;
Sy_ptr[index] = cur_Uy;
}
} }
__attribute__((reqd_work_group_size(8, 1, 1))) __attribute__((reqd_work_group_size(8, 1, 1)))
__kernel void dis_patch_inverse_search_bwd_1(__global const uchar *I0_ptr, __global const uchar *I1_ptr, __kernel void dis_patch_inverse_search_bwd_1(__global const uchar *I0_ptr, __global const uchar *I1_ptr,
int border_size, int patch_size, int patch_stride, int w, int h, int ws, int hs,
int w, int h, int ws, int hs, int pyr_level, __global float2 *S_ptr)
__global float *Sx_ptr, __global float *Sy_ptr)
{ {
int id = get_global_id(0); int id = get_global_id(0);
int is = get_group_id(0); int is = get_group_id(0);
is = (hs - 1 - is); is = (hs - 1 - is);
int i = is * patch_stride; int i = is * DIS_PATCH_STRIDE;
int j = (ws - 2) * patch_stride; int j = (ws - 2) * DIS_PATCH_STRIDE;
int psz = patch_size; const int w_ext = w + 2 * DIS_BORDER_SIZE;
int psz2 = psz / 2;
int w_ext = w + 2 * border_size; const float i_lower_limit = DIS_BORDER_SIZE - DIS_PATCH_SIZE + 1.0f;
int bsz = border_size; const float i_upper_limit = DIS_BORDER_SIZE + h - 1.0f;
const float j_lower_limit = DIS_BORDER_SIZE - DIS_PATCH_SIZE + 1.0f;
float i_lower_limit = bsz - psz + 1.0f; const float j_upper_limit = DIS_BORDER_SIZE + w - 1.0f;
float i_upper_limit = bsz + h - 1.0f;
float j_lower_limit = bsz - psz + 1.0f;
float j_upper_limit = bsz + w - 1.0f;
float i_I1, j_I1, w00, w01, w10, w11;
#ifdef CV_USE_SUBGROUPS #ifdef CV_USE_SUBGROUPS
int sid = get_sub_group_local_id(); int sid = get_sub_group_local_id();
...@@ -461,25 +446,27 @@ __kernel void dis_patch_inverse_search_bwd_1(__global const uchar *I0_ptr, __glo ...@@ -461,25 +446,27 @@ __kernel void dis_patch_inverse_search_bwd_1(__global const uchar *I0_ptr, __glo
int sid = get_local_id(0); int sid = get_local_id(0);
#define EXTRA_ARGS_computeSSDMeanNorm sid, smem #define EXTRA_ARGS_computeSSDMeanNorm sid, smem
#endif #endif
for (int js = (ws - 2); js > -1; js--, j -= patch_stride)
{ for (int js = (ws - 2); js > -1; js--, j -= DIS_PATCH_STRIDE)
float min_SSD, cur_SSD;
float2 Ux = vload2(0, Sx_ptr + is * ws + js);
float2 Uy = vload2(0, Sy_ptr + is * ws + js);
INIT_BILINEAR_WEIGHTS(Ux.x, Uy.x);
min_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
w, w_ext, w00, w01, w10, w11, psz, EXTRA_ARGS_computeSSDMeanNorm);
INIT_BILINEAR_WEIGHTS(Ux.y, Uy.y);
cur_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
w, w_ext, w00, w01, w10, w11, psz, EXTRA_ARGS_computeSSDMeanNorm);
if (cur_SSD < min_SSD)
{ {
Sx_ptr[is * ws + js] = Ux.y; float2 U0 = S_ptr[is * ws + js];
Sy_ptr[is * ws + js] = Uy.y; float2 U1 = S_ptr[is * ws + js + 1];
}
float i_I1, j_I1, w00, w01, w10, w11;
INIT_BILINEAR_WEIGHTS(U0.x, U0.y);
float min_SSD = computeSSDMeanNorm(
I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
w, w_ext, w00, w01, w10, w11, EXTRA_ARGS_computeSSDMeanNorm);
INIT_BILINEAR_WEIGHTS(U1.x, U1.y);
float cur_SSD = computeSSDMeanNorm(
I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
w, w_ext, w00, w01, w10, w11, EXTRA_ARGS_computeSSDMeanNorm);
S_ptr[is * ws + js] = (cur_SSD < min_SSD) ? U1 : U0;
} }
#undef EXTRA_ARGS_computeSSDMeanNorm #undef EXTRA_ARGS_computeSSDMeanNorm
} }
...@@ -488,9 +475,8 @@ __kernel void dis_patch_inverse_search_bwd_2(__global const uchar *I0_ptr, __glo ...@@ -488,9 +475,8 @@ __kernel void dis_patch_inverse_search_bwd_2(__global const uchar *I0_ptr, __glo
__global const float *xx_ptr, __global const float *yy_ptr, __global const float *xx_ptr, __global const float *yy_ptr,
__global const float *xy_ptr, __global const float *xy_ptr,
__global const float *x_ptr, __global const float *y_ptr, __global const float *x_ptr, __global const float *y_ptr,
int border_size, int patch_size, int patch_stride,
int w, int h, int ws, int hs, int num_inner_iter, int w, int h, int ws, int hs, int num_inner_iter,
__global float *Sx_ptr, __global float *Sy_ptr) __global float2 *S_ptr)
{ {
int js = get_global_id(0); int js = get_global_id(0);
int is = get_global_id(1); int is = get_global_id(1);
...@@ -499,65 +485,56 @@ __kernel void dis_patch_inverse_search_bwd_2(__global const uchar *I0_ptr, __glo ...@@ -499,65 +485,56 @@ __kernel void dis_patch_inverse_search_bwd_2(__global const uchar *I0_ptr, __glo
js = (ws - 1 - js); js = (ws - 1 - js);
is = (hs - 1 - is); is = (hs - 1 - is);
int j = js * patch_stride; int j = js * DIS_PATCH_STRIDE;
int i = is * patch_stride; int i = is * DIS_PATCH_STRIDE;
int psz = patch_size; int w_ext = w + 2 * DIS_BORDER_SIZE;
int psz2 = psz / 2;
int w_ext = w + 2 * border_size;
int bsz = border_size;
int index = is * ws + js; int index = is * ws + js;
float Ux = Sx_ptr[index]; float2 U0 = S_ptr[index];
float Uy = Sy_ptr[index]; float2 cur_U = U0;
float cur_Ux = Ux;
float cur_Uy = Uy;
float cur_xx = xx_ptr[index]; float cur_xx = xx_ptr[index];
float cur_yy = yy_ptr[index]; float cur_yy = yy_ptr[index];
float cur_xy = xy_ptr[index]; float cur_xy = xy_ptr[index];
float detH = cur_xx * cur_yy - cur_xy * cur_xy; float detH = cur_xx * cur_yy - cur_xy * cur_xy;
if (fabs(detH) < EPS) detH = EPS; float inv_detH = (fabs(detH) < EPS) ? 1.0 / EPS : 1.0 / detH;
float invH11 = cur_yy * inv_detH;
float invH12 = -cur_xy * inv_detH;
float invH22 = cur_xx * inv_detH;
float invH11 = cur_yy / detH; float prev_SSD = INF;
float invH12 = -cur_xy / detH;
float invH22 = cur_xx / detH;
float prev_SSD = INF, SSD;
float x_grad_sum = x_ptr[index]; float x_grad_sum = x_ptr[index];
float y_grad_sum = y_ptr[index]; float y_grad_sum = y_ptr[index];
float i_lower_limit = bsz - psz + 1.0f; const float i_lower_limit = DIS_BORDER_SIZE - DIS_PATCH_SIZE + 1.0f;
float i_upper_limit = bsz + h - 1.0f; const float i_upper_limit = DIS_BORDER_SIZE + h - 1.0f;
float j_lower_limit = bsz - psz + 1.0f; const float j_lower_limit = DIS_BORDER_SIZE - DIS_PATCH_SIZE + 1.0f;
float j_upper_limit = bsz + w - 1.0f; const float j_upper_limit = DIS_BORDER_SIZE + w - 1.0f;
float dUx, dUy, i_I1, j_I1, w00, w01, w10, w11, dx, dy;
float3 res;
for (int t = 0; t < num_inner_iter; t++) for (int t = 0; t < num_inner_iter; t++)
{ {
INIT_BILINEAR_WEIGHTS(cur_Ux, cur_Uy); float i_I1, j_I1, w00, w01, w10, w11;
res = processPatchMeanNorm(I0_ptr + i * w + j, INIT_BILINEAR_WEIGHTS(cur_U.x, cur_U.y);
I1_ptr + (int)i_I1 * w_ext + (int)j_I1, I0x_ptr + i * w + j, float4 res = processPatchMeanNorm(
I0y_ptr + i * w + j, w, w_ext, w00, w01, w10, w11, psz, I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
I0x_ptr + i * w + j, I0y_ptr + i * w + j,
w, w_ext, w00, w01, w10, w11,
x_grad_sum, y_grad_sum); x_grad_sum, y_grad_sum);
SSD = res.x; float SSD = res.x;
dUx = res.y; float dUx = res.y;
dUy = res.z; float dUy = res.z;
dx = invH11 * dUx + invH12 * dUy; float dx = invH11 * dUx + invH12 * dUy;
dy = invH12 * dUx + invH22 * dUy; float dy = invH12 * dUx + invH22 * dUy;
cur_Ux -= dx; cur_U -= (float2)(dx, dy);
cur_Uy -= dy;
if (SSD >= prev_SSD) if (SSD >= prev_SSD)
break; break;
prev_SSD = SSD; prev_SSD = SSD;
} }
float2 vec = (float2)(cur_Ux - Ux, cur_Uy - Uy); float2 vec = cur_U - U0;
if ((dot(vec, vec)) <= (float)(psz * psz)) S_ptr[index] = ((dot(vec, vec)) <= (float)(DIS_PATCH_SIZE * DIS_PATCH_SIZE)) ? cur_U : U0;
{
Sx_ptr[index] = cur_Ux;
Sy_ptr[index] = cur_Uy;
}
} }
#endif // DIS_BORDER_SIZE
...@@ -133,20 +133,28 @@ class VariationalRefinementImpl CV_FINAL : public VariationalRefinement ...@@ -133,20 +133,28 @@ class VariationalRefinementImpl CV_FINAL : public VariationalRefinement
}; };
void gradHorizAndSplitOp(void *src, void *dst, void *dst_split) void gradHorizAndSplitOp(void *src, void *dst, void *dst_split)
{ {
CV_INSTRUMENT_REGION();
Sobel(*(Mat *)src, *(Mat *)dst, -1, 1, 0, 1, 1, 0.00, BORDER_REPLICATE); Sobel(*(Mat *)src, *(Mat *)dst, -1, 1, 0, 1, 1, 0.00, BORDER_REPLICATE);
splitCheckerboard(*(RedBlackBuffer *)dst_split, *(Mat *)dst); splitCheckerboard(*(RedBlackBuffer *)dst_split, *(Mat *)dst);
} }
void gradVertAndSplitOp(void *src, void *dst, void *dst_split) void gradVertAndSplitOp(void *src, void *dst, void *dst_split)
{ {
CV_INSTRUMENT_REGION();
Sobel(*(Mat *)src, *(Mat *)dst, -1, 0, 1, 1, 1, 0.00, BORDER_REPLICATE); Sobel(*(Mat *)src, *(Mat *)dst, -1, 0, 1, 1, 1, 0.00, BORDER_REPLICATE);
splitCheckerboard(*(RedBlackBuffer *)dst_split, *(Mat *)dst); splitCheckerboard(*(RedBlackBuffer *)dst_split, *(Mat *)dst);
} }
void averageOp(void *src1, void *src2, void *dst) void averageOp(void *src1, void *src2, void *dst)
{ {
CV_INSTRUMENT_REGION();
addWeighted(*(Mat *)src1, 0.5, *(Mat *)src2, 0.5, 0.0, *(Mat *)dst, CV_32F); addWeighted(*(Mat *)src1, 0.5, *(Mat *)src2, 0.5, 0.0, *(Mat *)dst, CV_32F);
} }
void subtractOp(void *src1, void *src2, void *dst) void subtractOp(void *src1, void *src2, void *dst)
{ {
CV_INSTRUMENT_REGION();
subtract(*(Mat *)src1, *(Mat *)src2, *(Mat *)dst, noArray(), CV_32F); subtract(*(Mat *)src1, *(Mat *)src2, *(Mat *)dst, noArray(), CV_32F);
} }
...@@ -206,6 +214,8 @@ class VariationalRefinementImpl CV_FINAL : public VariationalRefinement ...@@ -206,6 +214,8 @@ class VariationalRefinementImpl CV_FINAL : public VariationalRefinement
VariationalRefinementImpl::VariationalRefinementImpl() VariationalRefinementImpl::VariationalRefinementImpl()
{ {
CV_INSTRUMENT_REGION();
fixedPointIterations = 5; fixedPointIterations = 5;
sorIterations = 5; sorIterations = 5;
alpha = 20.0f; alpha = 20.0f;
...@@ -222,6 +232,8 @@ VariationalRefinementImpl::VariationalRefinementImpl() ...@@ -222,6 +232,8 @@ VariationalRefinementImpl::VariationalRefinementImpl()
*/ */
void VariationalRefinementImpl::splitCheckerboard(RedBlackBuffer &dst, Mat &src) void VariationalRefinementImpl::splitCheckerboard(RedBlackBuffer &dst, Mat &src)
{ {
CV_INSTRUMENT_REGION();
int buf_j, j; int buf_j, j;
int buf_w = (int)ceil(src.cols / 2.0) + 2; //!< max width of red/black buffers with borders int buf_w = (int)ceil(src.cols / 2.0) + 2; //!< max width of red/black buffers with borders
...@@ -288,6 +300,8 @@ void VariationalRefinementImpl::splitCheckerboard(RedBlackBuffer &dst, Mat &src) ...@@ -288,6 +300,8 @@ void VariationalRefinementImpl::splitCheckerboard(RedBlackBuffer &dst, Mat &src)
*/ */
void VariationalRefinementImpl::mergeCheckerboard(Mat &dst, RedBlackBuffer &src) void VariationalRefinementImpl::mergeCheckerboard(Mat &dst, RedBlackBuffer &src)
{ {
CV_INSTRUMENT_REGION();
int buf_j, j; int buf_j, j;
for (int i = 0; i < dst.rows; i++) for (int i = 0; i < dst.rows; i++)
{ {
...@@ -326,6 +340,8 @@ void VariationalRefinementImpl::mergeCheckerboard(Mat &dst, RedBlackBuffer &src) ...@@ -326,6 +340,8 @@ void VariationalRefinementImpl::mergeCheckerboard(Mat &dst, RedBlackBuffer &src)
*/ */
void VariationalRefinementImpl::updateRepeatedBorders(RedBlackBuffer &dst) void VariationalRefinementImpl::updateRepeatedBorders(RedBlackBuffer &dst)
{ {
CV_INSTRUMENT_REGION();
int buf_w = dst.red.cols; int buf_w = dst.red.cols;
for (int i = 0; i < dst.red.rows - 2; i++) for (int i = 0; i < dst.red.rows - 2; i++)
{ {
...@@ -369,10 +385,14 @@ void VariationalRefinementImpl::updateRepeatedBorders(RedBlackBuffer &dst) ...@@ -369,10 +385,14 @@ void VariationalRefinementImpl::updateRepeatedBorders(RedBlackBuffer &dst)
VariationalRefinementImpl::RedBlackBuffer::RedBlackBuffer() VariationalRefinementImpl::RedBlackBuffer::RedBlackBuffer()
{ {
CV_INSTRUMENT_REGION();
release(); release();
} }
void VariationalRefinementImpl::RedBlackBuffer::create(Size s) void VariationalRefinementImpl::RedBlackBuffer::create(Size s)
{ {
CV_INSTRUMENT_REGION();
/* Allocate enough memory to include borders */ /* Allocate enough memory to include borders */
int w = (int)ceil(s.width / 2.0) + 2; int w = (int)ceil(s.width / 2.0) + 2;
red.create(s.height + 2, w); red.create(s.height + 2, w);
...@@ -389,6 +409,8 @@ void VariationalRefinementImpl::RedBlackBuffer::create(Size s) ...@@ -389,6 +409,8 @@ void VariationalRefinementImpl::RedBlackBuffer::create(Size s)
void VariationalRefinementImpl::RedBlackBuffer::release() void VariationalRefinementImpl::RedBlackBuffer::release()
{ {
CV_INSTRUMENT_REGION();
red.release(); red.release();
black.release(); black.release();
red_even_len = red_odd_len = black_even_len = black_odd_len = 0; red_even_len = red_odd_len = black_even_len = black_odd_len = 0;
...@@ -403,12 +425,16 @@ VariationalRefinementImpl::ParallelOp_ParBody::ParallelOp_ParBody(VariationalRef ...@@ -403,12 +425,16 @@ VariationalRefinementImpl::ParallelOp_ParBody::ParallelOp_ParBody(VariationalRef
void VariationalRefinementImpl::ParallelOp_ParBody::operator()(const Range &range) const void VariationalRefinementImpl::ParallelOp_ParBody::operator()(const Range &range) const
{ {
CV_INSTRUMENT_REGION();
for (int i = range.start; i < range.end; i++) for (int i = range.start; i < range.end; i++)
(var->*ops[i])(op1s[i], op2s[i], op3s[i]); (var->*ops[i])(op1s[i], op2s[i], op3s[i]);
} }
void VariationalRefinementImpl::warpImage(Mat &dst, Mat &src, Mat &flow_u, Mat &flow_v) void VariationalRefinementImpl::warpImage(Mat &dst, Mat &src, Mat &flow_u, Mat &flow_v)
{ {
CV_INSTRUMENT_REGION();
for (int i = 0; i < flow_u.rows; i++) for (int i = 0; i < flow_u.rows; i++)
{ {
float *pFlowU = flow_u.ptr<float>(i); float *pFlowU = flow_u.ptr<float>(i);
...@@ -426,6 +452,8 @@ void VariationalRefinementImpl::warpImage(Mat &dst, Mat &src, Mat &flow_u, Mat & ...@@ -426,6 +452,8 @@ void VariationalRefinementImpl::warpImage(Mat &dst, Mat &src, Mat &flow_u, Mat &
void VariationalRefinementImpl::prepareBuffers(Mat &I0, Mat &I1, Mat &W_u, Mat &W_v) void VariationalRefinementImpl::prepareBuffers(Mat &I0, Mat &I1, Mat &W_u, Mat &W_v)
{ {
CV_INSTRUMENT_REGION();
Size s = I0.size(); Size s = I0.size();
A11.create(s); A11.create(s);
A12.create(s); A12.create(s);
...@@ -550,6 +578,8 @@ VariationalRefinementImpl::ComputeDataTerm_ParBody::ComputeDataTerm_ParBody(Vari ...@@ -550,6 +578,8 @@ VariationalRefinementImpl::ComputeDataTerm_ParBody::ComputeDataTerm_ParBody(Vari
*/ */
void VariationalRefinementImpl::ComputeDataTerm_ParBody::operator()(const Range &range) const void VariationalRefinementImpl::ComputeDataTerm_ParBody::operator()(const Range &range) const
{ {
CV_INSTRUMENT_REGION();
int start_i = min(range.start * stripe_sz, h); int start_i = min(range.start * stripe_sz, h);
int end_i = min(range.end * stripe_sz, h); int end_i = min(range.end * stripe_sz, h);
...@@ -709,6 +739,8 @@ VariationalRefinementImpl::ComputeSmoothnessTermHorPass_ParBody::ComputeSmoothne ...@@ -709,6 +739,8 @@ VariationalRefinementImpl::ComputeSmoothnessTermHorPass_ParBody::ComputeSmoothne
*/ */
void VariationalRefinementImpl::ComputeSmoothnessTermHorPass_ParBody::operator()(const Range &range) const void VariationalRefinementImpl::ComputeSmoothnessTermHorPass_ParBody::operator()(const Range &range) const
{ {
CV_INSTRUMENT_REGION();
int start_i = min(range.start * stripe_sz, h); int start_i = min(range.start * stripe_sz, h);
int end_i = min(range.end * stripe_sz, h); int end_i = min(range.end * stripe_sz, h);
...@@ -873,6 +905,8 @@ VariationalRefinementImpl::ComputeSmoothnessTermVertPass_ParBody::ComputeSmoothn ...@@ -873,6 +905,8 @@ VariationalRefinementImpl::ComputeSmoothnessTermVertPass_ParBody::ComputeSmoothn
/* This function adds the last remaining terms to the linear system coefficients A11,A22,b1,b1. */ /* This function adds the last remaining terms to the linear system coefficients A11,A22,b1,b1. */
void VariationalRefinementImpl::ComputeSmoothnessTermVertPass_ParBody::operator()(const Range &range) const void VariationalRefinementImpl::ComputeSmoothnessTermVertPass_ParBody::operator()(const Range &range) const
{ {
CV_INSTRUMENT_REGION();
int start_i = min(range.start * stripe_sz, h); int start_i = min(range.start * stripe_sz, h);
int end_i = min(range.end * stripe_sz, h); int end_i = min(range.end * stripe_sz, h);
...@@ -965,6 +999,8 @@ VariationalRefinementImpl::RedBlackSOR_ParBody::RedBlackSOR_ParBody(VariationalR ...@@ -965,6 +999,8 @@ VariationalRefinementImpl::RedBlackSOR_ParBody::RedBlackSOR_ParBody(VariationalR
*/ */
void VariationalRefinementImpl::RedBlackSOR_ParBody::operator()(const Range &range) const void VariationalRefinementImpl::RedBlackSOR_ParBody::operator()(const Range &range) const
{ {
CV_INSTRUMENT_REGION();
int start = min(range.start * stripe_sz, h); int start = min(range.start * stripe_sz, h);
int end = min(range.end * stripe_sz, h); int end = min(range.end * stripe_sz, h);
...@@ -1079,6 +1115,8 @@ void VariationalRefinementImpl::RedBlackSOR_ParBody::operator()(const Range &ran ...@@ -1079,6 +1115,8 @@ void VariationalRefinementImpl::RedBlackSOR_ParBody::operator()(const Range &ran
void VariationalRefinementImpl::calc(InputArray I0, InputArray I1, InputOutputArray flow) void VariationalRefinementImpl::calc(InputArray I0, InputArray I1, InputOutputArray flow)
{ {
CV_INSTRUMENT_REGION();
CV_Assert(!I0.empty() && I0.channels() == 1); CV_Assert(!I0.empty() && I0.channels() == 1);
CV_Assert(!I1.empty() && I1.channels() == 1); CV_Assert(!I1.empty() && I1.channels() == 1);
CV_Assert(I0.sameSize(I1)); CV_Assert(I0.sameSize(I1));
...@@ -1095,6 +1133,8 @@ void VariationalRefinementImpl::calc(InputArray I0, InputArray I1, InputOutputAr ...@@ -1095,6 +1133,8 @@ void VariationalRefinementImpl::calc(InputArray I0, InputArray I1, InputOutputAr
void VariationalRefinementImpl::calcUV(InputArray I0, InputArray I1, InputOutputArray flow_u, InputOutputArray flow_v) void VariationalRefinementImpl::calcUV(InputArray I0, InputArray I1, InputOutputArray flow_u, InputOutputArray flow_v)
{ {
CV_INSTRUMENT_REGION();
CV_Assert(!I0.empty() && I0.channels() == 1); CV_Assert(!I0.empty() && I0.channels() == 1);
CV_Assert(!I1.empty() && I1.channels() == 1); CV_Assert(!I1.empty() && I1.channels() == 1);
CV_Assert(I0.sameSize(I1)); CV_Assert(I0.sameSize(I1));
...@@ -1124,6 +1164,8 @@ void VariationalRefinementImpl::calcUV(InputArray I0, InputArray I1, InputOutput ...@@ -1124,6 +1164,8 @@ void VariationalRefinementImpl::calcUV(InputArray I0, InputArray I1, InputOutput
for (int i = 0; i < fixedPointIterations; i++) for (int i = 0; i < fixedPointIterations; i++)
{ {
CV_TRACE_REGION("fixedPoint_iteration");
parallel_for_(Range(0, num_stripes), ComputeDataTerm_ParBody(*this, num_stripes, I0Mat.rows, dW_u, dW_v, true)); parallel_for_(Range(0, num_stripes), ComputeDataTerm_ParBody(*this, num_stripes, I0Mat.rows, dW_u, dW_v, true));
parallel_for_(Range(0, num_stripes), ComputeDataTerm_ParBody(*this, num_stripes, I0Mat.rows, dW_u, dW_v, false)); parallel_for_(Range(0, num_stripes), ComputeDataTerm_ParBody(*this, num_stripes, I0Mat.rows, dW_u, dW_v, false));
...@@ -1139,6 +1181,7 @@ void VariationalRefinementImpl::calcUV(InputArray I0, InputArray I1, InputOutput ...@@ -1139,6 +1181,7 @@ void VariationalRefinementImpl::calcUV(InputArray I0, InputArray I1, InputOutput
for (int j = 0; j < sorIterations; j++) for (int j = 0; j < sorIterations; j++)
{ {
CV_TRACE_REGION("SOR_iteration");
parallel_for_(Range(0, num_stripes), RedBlackSOR_ParBody(*this, num_stripes, I0Mat.rows, dW_u, dW_v, true)); parallel_for_(Range(0, num_stripes), RedBlackSOR_ParBody(*this, num_stripes, I0Mat.rows, dW_u, dW_v, true));
parallel_for_(Range(0, num_stripes), RedBlackSOR_ParBody(*this, num_stripes, I0Mat.rows, dW_u, dW_v, false)); parallel_for_(Range(0, num_stripes), RedBlackSOR_ParBody(*this, num_stripes, I0Mat.rows, dW_u, dW_v, false));
} }
...@@ -1155,6 +1198,8 @@ void VariationalRefinementImpl::calcUV(InputArray I0, InputArray I1, InputOutput ...@@ -1155,6 +1198,8 @@ void VariationalRefinementImpl::calcUV(InputArray I0, InputArray I1, InputOutput
} }
void VariationalRefinementImpl::collectGarbage() void VariationalRefinementImpl::collectGarbage()
{ {
CV_INSTRUMENT_REGION();
Ix.release(); Ix.release();
Iy.release(); Iy.release();
Iz.release(); Iz.release();
......
...@@ -46,18 +46,13 @@ ...@@ -46,18 +46,13 @@
namespace opencv_test { namespace { namespace opencv_test { namespace {
PARAM_TEST_CASE(OCL_DenseOpticalFlow_DIS, int) CV_ENUM(DIS_TestPresets, DISOpticalFlow::PRESET_ULTRAFAST, DISOpticalFlow::PRESET_FAST, DISOpticalFlow::PRESET_MEDIUM);
{
int preset;
virtual void SetUp() typedef ocl::TSTestWithParam<DIS_TestPresets> OCL_DenseOpticalFlow_DIS;
{
preset = GET_PARAM(0);
}
};
OCL_TEST_P(OCL_DenseOpticalFlow_DIS, Mat) OCL_TEST_P(OCL_DenseOpticalFlow_DIS, Mat)
{ {
int preset = (int)GetParam();
Mat frame1, frame2, GT; Mat frame1, frame2, GT;
frame1 = imread(TS::ptr()->get_data_path() + "optflow/RubberWhale1.png"); frame1 = imread(TS::ptr()->get_data_path() + "optflow/RubberWhale1.png");
...@@ -68,15 +63,11 @@ OCL_TEST_P(OCL_DenseOpticalFlow_DIS, Mat) ...@@ -68,15 +63,11 @@ OCL_TEST_P(OCL_DenseOpticalFlow_DIS, Mat)
cvtColor(frame1, frame1, COLOR_BGR2GRAY); cvtColor(frame1, frame1, COLOR_BGR2GRAY);
cvtColor(frame2, frame2, COLOR_BGR2GRAY); cvtColor(frame2, frame2, COLOR_BGR2GRAY);
Ptr<DenseOpticalFlow> algo;
// iterate over presets:
for (int i = 0; i < cvtest::ocl::test_loop_times; i++)
{ {
Mat flow; Mat flow;
UMat ocl_flow; UMat ocl_flow;
algo = DISOpticalFlow::create(preset); Ptr<DenseOpticalFlow> algo = DISOpticalFlow::create(preset);
OCL_OFF(algo->calc(frame1, frame2, flow)); OCL_OFF(algo->calc(frame1, frame2, flow));
OCL_ON(algo->calc(frame1, frame2, ocl_flow)); OCL_ON(algo->calc(frame1, frame2, ocl_flow));
ASSERT_EQ(flow.rows, ocl_flow.rows); ASSERT_EQ(flow.rows, ocl_flow.rows);
...@@ -87,9 +78,7 @@ OCL_TEST_P(OCL_DenseOpticalFlow_DIS, Mat) ...@@ -87,9 +78,7 @@ OCL_TEST_P(OCL_DenseOpticalFlow_DIS, Mat)
} }
OCL_INSTANTIATE_TEST_CASE_P(Video, OCL_DenseOpticalFlow_DIS, OCL_INSTANTIATE_TEST_CASE_P(Video, OCL_DenseOpticalFlow_DIS,
Values(DISOpticalFlow::PRESET_ULTRAFAST, DIS_TestPresets::all());
DISOpticalFlow::PRESET_FAST,
DISOpticalFlow::PRESET_MEDIUM));
}} // namespace }} // namespace
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment