Commit 168c0b03 authored by peng xiao

Optimize ocl::stereobm.

1. Use macro definitions for some parameters (radius).
2. Reduce local memory usage.
3. Fix an accuracy problem on Intel GPUs.
parent a9a26950
This diff is collapsed.
...@@ -74,28 +74,21 @@ namespace stereoBM ...@@ -74,28 +74,21 @@ namespace stereoBM
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
static void prefilter_xsobel(const oclMat &input, oclMat &output, int prefilterCap) static void prefilter_xsobel(const oclMat &input, oclMat &output, int prefilterCap)
{ {
Context *clCxt = input.clCxt;
string kernelName = "prefilter_xsobel"; string kernelName = "prefilter_xsobel";
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereobm, kernelName);
size_t blockSize = 1; size_t blockSize = 1;
size_t globalThreads[3] = { input.cols, input.rows, 1 }; size_t globalThreads[3] = { input.cols, input.rows, 1 };
size_t localThreads[3] = { blockSize, blockSize, 1 }; size_t localThreads[3] = { blockSize, blockSize, 1 };
openCLVerifyKernel(clCxt, kernel, localThreads); std::vector<std::pair<size_t, const void *>> args;
openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&input.data)); args.push_back(std::make_pair(sizeof(cl_mem), (void *)&input.data));
openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&output.data)); args.push_back(std::make_pair(sizeof(cl_mem), (void *)&output.data));
openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&input.rows)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&input.rows));
openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&input.cols)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&input.cols));
openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&prefilterCap)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&prefilterCap));
openCLSafeCall(clEnqueueNDRangeKernel((cl_command_queue)clCxt->oclCommandQueue(), kernel, 3, NULL,
globalThreads, localThreads, 0, NULL, NULL));
clFinish((cl_command_queue)clCxt->oclCommandQueue());
openCLSafeCall(clReleaseKernel(kernel));
openCLExecuteKernel(Context::getContext(), &stereobm, kernelName,
globalThreads, localThreads, args, -1, -1);
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
//////////////////////////////common//////////////////////////////////// //////////////////////////////common////////////////////////////////////
...@@ -115,19 +108,13 @@ static void stereo_bm(const oclMat &left, const oclMat &right, oclMat &disp, ...@@ -115,19 +108,13 @@ static void stereo_bm(const oclMat &left, const oclMat &right, oclMat &disp,
{ {
int winsz2 = winSize >> 1; int winsz2 = winSize >> 1;
//if(winsz2 == 0 || winsz2 >= calles_num)
//cv::ocl:error("Unsupported window size", __FILE__, __LINE__, __FUNCTION__);
Context *clCxt = left.clCxt;
string kernelName = "stereoKernel"; string kernelName = "stereoKernel";
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereobm, kernelName);
disp.setTo(Scalar_<unsigned char>::all(0)); disp.setTo(Scalar_<unsigned char>::all(0));
minSSD_buf.setTo(Scalar_<unsigned int>::all(0xFFFFFFFF)); minSSD_buf.setTo(Scalar_<unsigned int>::all(0xFFFFFFFF));
size_t minssd_step = minSSD_buf.step / minSSD_buf.elemSize(); size_t minssd_step = minSSD_buf.step / minSSD_buf.elemSize();
size_t local_mem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * winsz2)) * size_t local_mem_size = (N_DISPARITIES * (BLOCK_W + 2 * winsz2)) *
sizeof(cl_uint); sizeof(cl_uint);
//size_t blockSize = 1; //size_t blockSize = 1;
size_t localThreads[] = { BLOCK_W, 1,1}; size_t localThreads[] = { BLOCK_W, 1,1};
...@@ -136,26 +123,23 @@ static void stereo_bm(const oclMat &left, const oclMat &right, oclMat &disp, ...@@ -136,26 +123,23 @@ static void stereo_bm(const oclMat &left, const oclMat &right, oclMat &disp,
1 1
}; };
openCLVerifyKernel(clCxt, kernel, localThreads); std::vector<std::pair<size_t, const void *>> args;
openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&left.data)); args.push_back(std::make_pair(sizeof(cl_mem), (void *)&left.data));
openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&right.data)); args.push_back(std::make_pair(sizeof(cl_mem), (void *)&right.data));
openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&minSSD_buf.data)); args.push_back(std::make_pair(sizeof(cl_mem), (void *)&minSSD_buf.data));
openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&minssd_step)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&minssd_step));
openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&disp.data)); args.push_back(std::make_pair(sizeof(cl_mem), (void *)&disp.data));
openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&disp.step)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&disp.step));
openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&left.cols)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.cols));
openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&left.rows)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.rows));
openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&left.step)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.step));
openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&maxdisp)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&maxdisp));
openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&winsz2)); args.push_back(std::make_pair(local_mem_size, (void *)NULL));
openCLSafeCall(clSetKernelArg(kernel, 11, local_mem_size, (void *)NULL));
char opt [128];
openCLSafeCall(clEnqueueNDRangeKernel((cl_command_queue)clCxt->oclCommandQueue(), kernel, 2, NULL, sprintf(opt, "-D radius=%d", winsz2);
globalThreads, localThreads, 0, NULL, NULL)); openCLExecuteKernel(Context::getContext(), &stereobm, kernelName,
globalThreads, localThreads, args, -1, -1, opt);
clFinish((cl_command_queue)clCxt->oclCommandQueue());
openCLSafeCall(clReleaseKernel(kernel));
} }
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
///////////////////////////////postfilter_textureness/////////////////////// ///////////////////////////////postfilter_textureness///////////////////////
...@@ -163,10 +147,7 @@ static void stereo_bm(const oclMat &left, const oclMat &right, oclMat &disp, ...@@ -163,10 +147,7 @@ static void stereo_bm(const oclMat &left, const oclMat &right, oclMat &disp,
static void postfilter_textureness(oclMat &left, int winSize, static void postfilter_textureness(oclMat &left, int winSize,
float avergeTexThreshold, oclMat &disparity) float avergeTexThreshold, oclMat &disparity)
{ {
Context *clCxt = left.clCxt;
string kernelName = "textureness_kernel"; string kernelName = "textureness_kernel";
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereobm, kernelName);
size_t blockSize = 1; size_t blockSize = 1;
size_t localThreads[] = { BLOCK_W, blockSize ,1}; size_t localThreads[] = { BLOCK_W, blockSize ,1};
...@@ -177,22 +158,19 @@ static void postfilter_textureness(oclMat &left, int winSize, ...@@ -177,22 +158,19 @@ static void postfilter_textureness(oclMat &left, int winSize,
size_t local_mem_size = (localThreads[0] + localThreads[0] + (winSize / 2) * 2) * sizeof(float); size_t local_mem_size = (localThreads[0] + localThreads[0] + (winSize / 2) * 2) * sizeof(float);
openCLVerifyKernel(clCxt, kernel, localThreads); std::vector<std::pair<size_t, const void *>> args;
openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&disparity.data)); args.push_back(std::make_pair(sizeof(cl_mem), (void *)&disparity.data));
openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&disparity.rows)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&disparity.rows));
openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&disparity.cols)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&disparity.cols));
openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&disparity.step)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&disparity.step));
openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&left.data)); args.push_back(std::make_pair(sizeof(cl_mem), (void *)&left.data));
openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&left.rows)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.rows));
openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&left.cols)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.cols));
openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&winSize)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&winSize));
openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_float), (void *)&avergeTexThreshold)); args.push_back(std::make_pair(sizeof(cl_float), (void *)&avergeTexThreshold));
openCLSafeCall(clSetKernelArg(kernel, 9, local_mem_size, NULL)); args.push_back(std::make_pair(local_mem_size, (void*)NULL));
openCLSafeCall(clEnqueueNDRangeKernel((cl_command_queue)clCxt->oclCommandQueue(), kernel, 2, NULL, openCLExecuteKernel(Context::getContext(), &stereobm, kernelName,
globalThreads, localThreads, 0, NULL, NULL)); globalThreads, localThreads, args, -1, -1);
clFinish((cl_command_queue)clCxt->oclCommandQueue());
openCLSafeCall(clReleaseKernel(kernel));
} }
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////operator///////////////////////////////// /////////////////////////////////////operator/////////////////////////////////
......
...@@ -59,7 +59,7 @@ PARAM_TEST_CASE(StereoMatchBM, int, int) ...@@ -59,7 +59,7 @@ PARAM_TEST_CASE(StereoMatchBM, int, int)
virtual void SetUp() virtual void SetUp()
{ {
n_disp = GET_PARAM(0); n_disp = GET_PARAM(0);
winSize = GET_PARAM(1); winSize = GET_PARAM(1);
} }
}; };
...@@ -69,27 +69,27 @@ TEST_P(StereoMatchBM, Regression) ...@@ -69,27 +69,27 @@ TEST_P(StereoMatchBM, Regression)
Mat left_image = readImage("stereobm/aloe-L.png", IMREAD_GRAYSCALE); Mat left_image = readImage("stereobm/aloe-L.png", IMREAD_GRAYSCALE);
Mat right_image = readImage("stereobm/aloe-R.png", IMREAD_GRAYSCALE); Mat right_image = readImage("stereobm/aloe-R.png", IMREAD_GRAYSCALE);
Mat disp_gold = readImage("stereobm/aloe-disp.png", IMREAD_GRAYSCALE); Mat disp_gold = readImage("stereobm/aloe-disp.png", IMREAD_GRAYSCALE);
ocl::oclMat d_left, d_right; ocl::oclMat d_left, d_right;
ocl::oclMat d_disp(left_image.size(), CV_8U); ocl::oclMat d_disp(left_image.size(), CV_8U);
Mat disp; Mat disp;
ASSERT_FALSE(left_image.empty()); ASSERT_FALSE(left_image.empty());
ASSERT_FALSE(right_image.empty()); ASSERT_FALSE(right_image.empty());
ASSERT_FALSE(disp_gold.empty()); ASSERT_FALSE(disp_gold.empty());
d_left.upload(left_image); d_left.upload(left_image);
d_right.upload(right_image); d_right.upload(right_image);
ocl::StereoBM_OCL bm(0, n_disp, winSize); ocl::StereoBM_OCL bm(0, n_disp, winSize);
bm(d_left, d_right, d_disp); bm(d_left, d_right, d_disp);
d_disp.download(disp); d_disp.download(disp);
EXPECT_MAT_SIMILAR(disp_gold, disp, 1e-3); EXPECT_MAT_SIMILAR(disp_gold, disp, 1e-3);
} }
INSTANTIATE_TEST_CASE_P(OCL_Calib3D, StereoMatchBM, testing::Combine(testing::Values(128), INSTANTIATE_TEST_CASE_P(OCL_Calib3D, StereoMatchBM, testing::Combine(testing::Values(128),
testing::Values(19))); testing::Values(19)));
PARAM_TEST_CASE(StereoMatchBP, int, int, int, float, float, float, float) PARAM_TEST_CASE(StereoMatchBP, int, int, int, float, float, float, float)
{ {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment