Commit 58b84c2f authored by Ilya Lavrenov

removed needless divUp

parent f9c61234
...@@ -82,12 +82,6 @@ namespace cv ...@@ -82,12 +82,6 @@ namespace cv
} }
} }
static inline size_t divUp(size_t total, size_t grain)
{
return (total + grain - 1) / grain;
}
static inline int calcSize(int octave, int layer) static inline int calcSize(int octave, int layer)
{ {
/* Wavelet size at first layer of first octave. */ /* Wavelet size at first layer of first octave. */
......
...@@ -1887,6 +1887,11 @@ namespace cv ...@@ -1887,6 +1887,11 @@ namespace cv
oclMat temp4; oclMat temp4;
oclMat temp5; oclMat temp5;
}; };
static inline size_t divUp(size_t total, size_t grain)
{
return (total + grain - 1) / grain;
}
} }
} }
#if defined _MSC_VER && _MSC_VER >= 1200 #if defined _MSC_VER && _MSC_VER >= 1200
......
This diff is collapsed.
...@@ -360,14 +360,13 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, voi ...@@ -360,14 +360,13 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, voi
vector< pair<size_t, const void *> > args; vector< pair<size_t, const void *> > args;
size_t localThreads[3] = {128, 1, 1}; size_t localThreads[3] = {128, 1, 1};
#define DIVUP(a, b) ((a)+(b)-1)/(b)
int count_i[1] = {0}; int count_i[1] = {0};
while(count > 0) while(count > 0)
{ {
openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count_i, 0, NULL, NULL)); openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count_i, 0, NULL, NULL));
args.clear(); args.clear();
size_t globalThreads[3] = {std::min(count, 65535u) * 128, DIVUP(count, 65535), 1}; size_t globalThreads[3] = {std::min(count, 65535u) * 128, divUp(count, 65535), 1};
args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data));
args.push_back( make_pair( sizeof(cl_mem), (void *)&st1.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&st1.data));
args.push_back( make_pair( sizeof(cl_mem), (void *)&st2.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&st2.data));
...@@ -382,7 +381,6 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, voi ...@@ -382,7 +381,6 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, voi
openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL)); openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL));
std::swap(st1, st2); std::swap(st1, st2);
} }
#undef DIVUP
} }
void canny::getEdges_gpu(oclMat &map, oclMat &dst, int rows, int cols) void canny::getEdges_gpu(oclMat &map, oclMat &dst, int rows, int cols)
......
...@@ -68,22 +68,12 @@ extern const char *filtering_adaptive_bilateral; ...@@ -68,22 +68,12 @@ extern const char *filtering_adaptive_bilateral;
} }
} }
namespace
{
inline int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
}
namespace namespace
{ {
inline void normalizeAnchor(int &anchor, int ksize) inline void normalizeAnchor(int &anchor, int ksize)
{ {
if (anchor < 0) if (anchor < 0)
{
anchor = ksize >> 1; anchor = ksize >> 1;
}
CV_Assert(0 <= anchor && anchor < ksize); CV_Assert(0 <= anchor && anchor < ksize);
} }
...@@ -97,9 +87,7 @@ inline void normalizeAnchor(Point &anchor, const Size &ksize) ...@@ -97,9 +87,7 @@ inline void normalizeAnchor(Point &anchor, const Size &ksize)
inline void normalizeROI(Rect &roi, const Size &ksize, const Point &anchor, const Size &src_size) inline void normalizeROI(Rect &roi, const Size &ksize, const Point &anchor, const Size &src_size)
{ {
if (roi == Rect(0, 0, -1, -1)) if (roi == Rect(0, 0, -1, -1))
{
roi = Rect(0, 0, src_size.width, src_size.height); roi = Rect(0, 0, src_size.width, src_size.height);
}
CV_Assert(ksize.height > 0 && ksize.width > 0 && ((ksize.height & 1) == 1) && ((ksize.width & 1) == 1)); CV_Assert(ksize.height > 0 && ksize.width > 0 && ((ksize.height & 1) == 1) && ((ksize.width & 1) == 1));
CV_Assert((anchor.x == -1 && anchor.y == -1) || (anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1)); CV_Assert((anchor.x == -1 && anchor.y == -1) || (anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1));
...@@ -112,10 +100,7 @@ inline void normalizeKernel(const Mat &kernel, oclMat &gpu_krnl, int type = CV_8 ...@@ -112,10 +100,7 @@ inline void normalizeKernel(const Mat &kernel, oclMat &gpu_krnl, int type = CV_8
int scale = nDivisor && (kernel.depth() == CV_32F || kernel.depth() == CV_64F) ? 256 : 1; int scale = nDivisor && (kernel.depth() == CV_32F || kernel.depth() == CV_64F) ? 256 : 1;
if (nDivisor) if (nDivisor)
{
*nDivisor = scale; *nDivisor = scale;
}
Mat temp(kernel.size(), type); Mat temp(kernel.size(), type);
kernel.convertTo(temp, type, scale); kernel.convertTo(temp, type, scale);
Mat cont_krnl = temp.reshape(1, 1); Mat cont_krnl = temp.reshape(1, 1);
...@@ -125,9 +110,7 @@ inline void normalizeKernel(const Mat &kernel, oclMat &gpu_krnl, int type = CV_8 ...@@ -125,9 +110,7 @@ inline void normalizeKernel(const Mat &kernel, oclMat &gpu_krnl, int type = CV_8
int count = cont_krnl.cols >> 1; int count = cont_krnl.cols >> 1;
for (int i = 0; i < count; ++i) for (int i = 0; i < count; ++i)
{
std::swap(cont_krnl.at<int>(0, i), cont_krnl.at<int>(0, cont_krnl.cols - 1 - i)); std::swap(cont_krnl.at<int>(0, i), cont_krnl.at<int>(0, cont_krnl.cols - 1 - i));
}
} }
gpu_krnl.upload(cont_krnl); gpu_krnl.upload(cont_krnl);
...@@ -627,8 +610,6 @@ static void GPUFilter2D(const oclMat &src, oclMat &dst, const oclMat &mat_kernel ...@@ -627,8 +610,6 @@ static void GPUFilter2D(const oclMat &src, oclMat &dst, const oclMat &mat_kernel
int localWidth = localThreads[0] + paddingPixels; int localWidth = localThreads[0] + paddingPixels;
int localHeight = localThreads[1] + paddingPixels; int localHeight = localThreads[1] + paddingPixels;
// 260 = divup((localThreads[0] + filterWidth * 2), 4) * 4
// 6 = (ROWS_PER_GROUP_WHICH_IS_4 + filterWidth * 2)
size_t localMemSize = ksize_3x3 ? 260 * 6 * src.elemSize() : (localWidth * localHeight) * src.elemSize(); size_t localMemSize = ksize_3x3 ? 260 * 6 * src.elemSize() : (localWidth * localHeight) * src.elemSize();
int vector_lengths[4][7] = {{4, 4, 4, 4, 4, 4, 4}, int vector_lengths[4][7] = {{4, 4, 4, 4, 4, 4, 4},
...@@ -1713,4 +1694,4 @@ void cv::ocl::adaptiveBilateralFilter(const oclMat& src, oclMat& dst, Size ksize ...@@ -1713,4 +1694,4 @@ void cv::ocl::adaptiveBilateralFilter(const oclMat& src, oclMat& dst, Size ksize
openCLExecuteKernel(Context::getContext(), &filtering_adaptive_bilateral, kernelName, openCLExecuteKernel(Context::getContext(), &filtering_adaptive_bilateral, kernelName,
globalThreads, localThreads, args, cn, depth, build_options); globalThreads, localThreads, args, cn, depth, build_options);
} }
\ No newline at end of file
...@@ -124,11 +124,6 @@ namespace cv ...@@ -124,11 +124,6 @@ namespace cv
using namespace ::cv::ocl::device; using namespace ::cv::ocl::device;
static inline int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size block_stride_, cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size block_stride_,
Size cell_size_, int nbins_, double win_sigma_, Size cell_size_, int nbins_, double win_sigma_,
double threshold_L2hys_, bool gamma_correction_, int nlevels_) double threshold_L2hys_, bool gamma_correction_, int nlevels_)
...@@ -1671,7 +1666,8 @@ void cv::ocl::device::hog::compute_hists(int nbins, ...@@ -1671,7 +1666,8 @@ void cv::ocl::device::hog::compute_hists(int nbins,
{ {
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
localThreads, args, -1, -1, "-D CPU"); localThreads, args, -1, -1, "-D CPU");
}else }
else
{ {
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &objdetect_hog, kernelName); cl_kernel kernel = openCLGetKernelFromSource(clCxt, &objdetect_hog, kernelName);
int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel); int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
......
...@@ -1518,11 +1518,6 @@ namespace cv ...@@ -1518,11 +1518,6 @@ namespace cv
// CLAHE // CLAHE
namespace clahe namespace clahe
{ {
inline int divUp(int total, int grain)
{
return (total + grain - 1) / grain * grain;
}
static void calcLut(const oclMat &src, oclMat &dst, static void calcLut(const oclMat &src, oclMat &dst,
const int tilesX, const int tilesY, const cv::Size tileSize, const int tilesX, const int tilesY, const cv::Size tileSize,
const int clipLimit, const float lutScale) const int clipLimit, const float lutScale)
...@@ -1546,9 +1541,7 @@ namespace cv ...@@ -1546,9 +1541,7 @@ namespace cv
size_t globalThreads[3] = { tilesX * localThreads[0], tilesY * localThreads[1], 1 }; size_t globalThreads[3] = { tilesX * localThreads[0], tilesY * localThreads[1], 1 };
bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>(); bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
if (is_cpu) if (is_cpu)
{
openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, (char*)" -D CPU"); openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, (char*)" -D CPU");
}
else else
{ {
cl_kernel kernel = openCLGetKernelFromSource(Context::getContext(), &imgproc_clahe, kernelName); cl_kernel kernel = openCLGetKernelFromSource(Context::getContext(), &imgproc_clahe, kernelName);
...@@ -1583,7 +1576,7 @@ namespace cv ...@@ -1583,7 +1576,7 @@ namespace cv
String kernelName = "transform"; String kernelName = "transform";
size_t localThreads[3] = { 32, 8, 1 }; size_t localThreads[3] = { 32, 8, 1 };
size_t globalThreads[3] = { divUp(src.cols, localThreads[0]), divUp(src.rows, localThreads[1]), 1 }; size_t globalThreads[3] = { src.cols, src.rows, 1 };
openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1); openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1);
} }
...@@ -1801,10 +1794,7 @@ namespace cv ...@@ -1801,10 +1794,7 @@ namespace cv
} }
} }
//////////////////////////////////convolve//////////////////////////////////////////////////// //////////////////////////////////convolve////////////////////////////////////////////////////
inline int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
static void convolve_run(const oclMat &src, const oclMat &temp1, oclMat &dst, string kernelName, const char **kernelString) static void convolve_run(const oclMat &src, const oclMat &temp1, oclMat &dst, string kernelName, const char **kernelString)
{ {
CV_Assert(src.depth() == CV_32FC1); CV_Assert(src.depth() == CV_32FC1);
...@@ -1826,10 +1816,7 @@ static void convolve_run(const oclMat &src, const oclMat &temp1, oclMat &dst, st ...@@ -1826,10 +1816,7 @@ static void convolve_run(const oclMat &src, const oclMat &temp1, oclMat &dst, st
int rows = dst.rows; int rows = dst.rows;
size_t localThreads[3] = { 16, 16, 1 }; size_t localThreads[3] = { 16, 16, 1 };
size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], size_t globalThreads[3] = { cols, rows, 1 };
divUp(rows, localThreads[1]) *localThreads[1],
1
};
vector<pair<size_t , const void *> > args; vector<pair<size_t , const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data ));
......
...@@ -285,11 +285,6 @@ namespace cv ...@@ -285,11 +285,6 @@ namespace cv
return 0; return 0;
} }
inline int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
int getDevice(std::vector<Info> &oclinfo, int devicetype) int getDevice(std::vector<Info> &oclinfo, int devicetype)
{ {
//TODO: cache oclinfo vector //TODO: cache oclinfo vector
...@@ -707,11 +702,10 @@ namespace cv ...@@ -707,11 +702,10 @@ namespace cv
if ( localThreads != NULL) if ( localThreads != NULL)
{ {
globalThreads[0] = divUp(globalThreads[0], localThreads[0]) * localThreads[0]; globalThreads[0] = alignSize(globalThreads[0], localThreads[0]);
globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1]; globalThreads[1] = alignSize(globalThreads[1], localThreads[1]);
globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2]; globalThreads[2] = alignSize(globalThreads[2], localThreads[2]);
//size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
cv::ocl::openCLVerifyKernel(clCxt, kernel, localThreads); cv::ocl::openCLVerifyKernel(clCxt, kernel, localThreads);
} }
for(size_t i = 0; i < args.size(); i ++) for(size_t i = 0; i < args.size(); i ++)
...@@ -742,10 +736,6 @@ namespace cv ...@@ -742,10 +736,6 @@ namespace cv
execute_time = (double)(end_time - start_time) / (1000 * 1000); execute_time = (double)(end_time - start_time) / (1000 * 1000);
total_time = (double)(end_time - queue_time) / (1000 * 1000); total_time = (double)(end_time - queue_time) / (1000 * 1000);
// cout << setiosflags(ios::left) << setw(15) << execute_time;
// cout << setiosflags(ios::left) << setw(15) << total_time - execute_time;
// cout << setiosflags(ios::left) << setw(15) << total_time << endl;
total_execute_time += execute_time; total_execute_time += execute_time;
total_kernel_time += total_time; total_kernel_time += total_time;
clReleaseEvent(event); clReleaseEvent(event);
......
...@@ -307,11 +307,6 @@ void cv::ocl::oclMat::download(cv::Mat &m) const ...@@ -307,11 +307,6 @@ void cv::ocl::oclMat::download(cv::Mat &m) const
m.adjustROI(-ofs.y, ofs.y + rows - wholerows, -ofs.x, ofs.x + cols - wholecols); m.adjustROI(-ofs.y, ofs.y + rows - wholerows, -ofs.x, ofs.x + cols - wholecols);
} }
/////////////////////common//////////////////////////////////////
inline int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
////////////////////////////////// CopyTo ///////////////////////////////// ////////////////////////////////// CopyTo /////////////////////////////////
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
...@@ -331,11 +326,7 @@ static void copy_to_with_mask(const oclMat &src, oclMat &dst, const oclMat &mask ...@@ -331,11 +326,7 @@ static void copy_to_with_mask(const oclMat &src, oclMat &dst, const oclMat &mask
char compile_option[32]; char compile_option[32];
sprintf(compile_option, "-D GENTYPE=%s", string_types[dst.oclchannels() - 1][dst.depth()].c_str()); sprintf(compile_option, "-D GENTYPE=%s", string_types[dst.oclchannels() - 1][dst.depth()].c_str());
size_t localThreads[3] = {16, 16, 1}; size_t localThreads[3] = {16, 16, 1};
size_t globalThreads[3]; size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
globalThreads[0] = divUp(dst.cols, localThreads[0]) * localThreads[0];
globalThreads[1] = divUp(dst.rows, localThreads[1]) * localThreads[1];
globalThreads[2] = 1;
int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize(); int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize();
int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize(); int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize();
......
...@@ -71,12 +71,6 @@ namespace cv ...@@ -71,12 +71,6 @@ namespace cv
{ {
namespace ocl namespace ocl
{ {
inline int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
// provide additional methods for the user to interact with the command queue after a task is fired // provide additional methods for the user to interact with the command queue after a task is fired
static void openCLExecuteKernel_2(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3], static void openCLExecuteKernel_2(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels, size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels,
......
...@@ -73,11 +73,6 @@ oclMat gKer; ...@@ -73,11 +73,6 @@ oclMat gKer;
float ig[4]; float ig[4];
inline int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
inline void setGaussianBlurKernel(const float *c_gKer, int ksizeHalf) inline void setGaussianBlurKernel(const float *c_gKer, int ksizeHalf)
{ {
cv::Mat t_gKer(1, ksizeHalf + 1, CV_32FC1, const_cast<float *>(c_gKer)); cv::Mat t_gKer(1, ksizeHalf + 1, CV_32FC1, const_cast<float *>(c_gKer));
...@@ -88,7 +83,7 @@ static void gaussianBlurOcl(const oclMat &src, int ksizeHalf, oclMat &dst) ...@@ -88,7 +83,7 @@ static void gaussianBlurOcl(const oclMat &src, int ksizeHalf, oclMat &dst)
{ {
string kernelName("gaussianBlur"); string kernelName("gaussianBlur");
size_t localThreads[3] = { 256, 1, 1 }; size_t localThreads[3] = { 256, 1, 1 };
size_t globalThreads[3] = { divUp(src.cols, localThreads[0]) * localThreads[0], src.rows, 1 }; size_t globalThreads[3] = { src.cols, src.rows, 1 };
int smem_size = (localThreads[0] + 2*ksizeHalf) * sizeof(float); int smem_size = (localThreads[0] + 2*ksizeHalf) * sizeof(float);
CV_Assert(dst.size() == src.size()); CV_Assert(dst.size() == src.size());
...@@ -138,10 +133,7 @@ static void updateMatricesOcl(const oclMat &flowx, const oclMat &flowy, const oc ...@@ -138,10 +133,7 @@ static void updateMatricesOcl(const oclMat &flowx, const oclMat &flowy, const oc
{ {
string kernelName("updateMatrices"); string kernelName("updateMatrices");
size_t localThreads[3] = { 32, 8, 1 }; size_t localThreads[3] = { 32, 8, 1 };
size_t globalThreads[3] = { divUp(flowx.cols, localThreads[0]) * localThreads[0], size_t globalThreads[3] = { flowx.cols, flowx.rows, 1 };
divUp(flowx.rows, localThreads[1]) * localThreads[1],
1
};
std::vector< std::pair<size_t, const void *> > args; std::vector< std::pair<size_t, const void *> > args;
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&M.data)); args.push_back(std::make_pair(sizeof(cl_mem), (void *)&M.data));
...@@ -166,7 +158,7 @@ static void boxFilter5Ocl(const oclMat &src, int ksizeHalf, oclMat &dst) ...@@ -166,7 +158,7 @@ static void boxFilter5Ocl(const oclMat &src, int ksizeHalf, oclMat &dst)
string kernelName("boxFilter5"); string kernelName("boxFilter5");
int height = src.rows / 5; int height = src.rows / 5;
size_t localThreads[3] = { 256, 1, 1 }; size_t localThreads[3] = { 256, 1, 1 };
size_t globalThreads[3] = { divUp(src.cols, localThreads[0]) * localThreads[0], height, 1 }; size_t globalThreads[3] = { src.cols, height, 1 };
int smem_size = (localThreads[0] + 2*ksizeHalf) * 5 * sizeof(float); int smem_size = (localThreads[0] + 2*ksizeHalf) * 5 * sizeof(float);
std::vector< std::pair<size_t, const void *> > args; std::vector< std::pair<size_t, const void *> > args;
...@@ -188,10 +180,7 @@ static void updateFlowOcl(const oclMat &M, oclMat &flowx, oclMat &flowy) ...@@ -188,10 +180,7 @@ static void updateFlowOcl(const oclMat &M, oclMat &flowx, oclMat &flowy)
string kernelName("updateFlow"); string kernelName("updateFlow");
int cols = divUp(flowx.cols, 4); int cols = divUp(flowx.cols, 4);
size_t localThreads[3] = { 32, 8, 1 }; size_t localThreads[3] = { 32, 8, 1 };
size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0], size_t globalThreads[3] = { cols, flowx.rows, 1 };
divUp(flowx.rows, localThreads[1]) * localThreads[0],
1
};
std::vector< std::pair<size_t, const void *> > args; std::vector< std::pair<size_t, const void *> > args;
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&flowx.data)); args.push_back(std::make_pair(sizeof(cl_mem), (void *)&flowx.data));
...@@ -211,9 +200,8 @@ static void gaussianBlur5Ocl(const oclMat &src, int ksizeHalf, oclMat &dst) ...@@ -211,9 +200,8 @@ static void gaussianBlur5Ocl(const oclMat &src, int ksizeHalf, oclMat &dst)
{ {
string kernelName("gaussianBlur5"); string kernelName("gaussianBlur5");
int height = src.rows / 5; int height = src.rows / 5;
int width = src.cols;
size_t localThreads[3] = { 256, 1, 1 }; size_t localThreads[3] = { 256, 1, 1 };
size_t globalThreads[3] = { divUp(width, localThreads[0]) * localThreads[0], height, 1 }; size_t globalThreads[3] = { src.cols, height, 1 };
int smem_size = (localThreads[0] + 2*ksizeHalf) * 5 * sizeof(float); int smem_size = (localThreads[0] + 2*ksizeHalf) * 5 * sizeof(float);
std::vector< std::pair<size_t, const void *> > args; std::vector< std::pair<size_t, const void *> > args;
...@@ -222,7 +210,7 @@ static void gaussianBlur5Ocl(const oclMat &src, int ksizeHalf, oclMat &dst) ...@@ -222,7 +210,7 @@ static void gaussianBlur5Ocl(const oclMat &src, int ksizeHalf, oclMat &dst)
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&gKer.data)); args.push_back(std::make_pair(sizeof(cl_mem), (void *)&gKer.data));
args.push_back(std::make_pair(smem_size, (void *)NULL)); args.push_back(std::make_pair(smem_size, (void *)NULL));
args.push_back(std::make_pair(sizeof(cl_int), (void *)&height)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&height));
args.push_back(std::make_pair(sizeof(cl_int), (void *)&width)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.cols));
args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step));
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step));
args.push_back(std::make_pair(sizeof(cl_int), (void *)&ksizeHalf)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&ksizeHalf));
......
...@@ -73,61 +73,6 @@ namespace cv ...@@ -73,61 +73,6 @@ namespace cv
{ {
namespace split_merge namespace split_merge
{ {
///////////////////////////////////////////////////////////
///////////////common/////////////////////////////////////
/////////////////////////////////////////////////////////
inline int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
////////////////////////////////////////////////////////////////////////////
////////////////////merge//////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
// static void merge_vector_run_no_roi(const oclMat *mat_src, size_t n, oclMat &mat_dst)
// {
// Context *clCxt = mat_dst.clCxt;
// int channels = mat_dst.oclchannels();
// int depth = mat_dst.depth();
// string kernelName = "merge_vector";
// int indexes[4][7] = {{0, 0, 0, 0, 0, 0, 0},
// {4, 4, 2, 2, 1, 1, 1},
// {4, 4, 2, 2 , 1, 1, 1},
// {4, 4, 2, 2, 1, 1, 1}
// };
// size_t index = indexes[channels - 1][mat_dst.depth()];
// int cols = divUp(mat_dst.cols, index);
// size_t localThreads[3] = { 64, 4, 1 };
// size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
// divUp(mat_dst.rows, localThreads[1]) *localThreads[1],
// 1
// };
// vector<pair<size_t , const void *> > args;
// args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst.rows));
// args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
// args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_dst.data));
// args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst.step));
// args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[0].data));
// args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[0].step));
// args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[1].data));
// args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[1].step));
// if(n >= 3)
// {
// args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[2].data));
// args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[2].step));
// }
// if(n >= 4)
// {
// args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[3].data));
// args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[3].step));
// }
// openCLExecuteKernel(clCxt, &merge_mat, kernelName, globalThreads, localThreads, args, channels, depth);
// }
static void merge_vector_run(const oclMat *mat_src, size_t n, oclMat &mat_dst) static void merge_vector_run(const oclMat *mat_src, size_t n, oclMat &mat_dst)
{ {
if(!mat_dst.clCxt->supportsFeature(Context::CL_DOUBLE) && mat_dst.type() == CV_64F) if(!mat_dst.clCxt->supportsFeature(Context::CL_DOUBLE) && mat_dst.type() == CV_64F)
...@@ -153,10 +98,7 @@ namespace cv ...@@ -153,10 +98,7 @@ namespace cv
int cols = divUp(mat_dst.cols + offset_cols, vector_length); int cols = divUp(mat_dst.cols + offset_cols, vector_length);
size_t localThreads[3] = { 64, 4, 1 }; size_t localThreads[3] = { 64, 4, 1 };
size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], size_t globalThreads[3] = { cols, mat_dst.rows, 1 };
divUp(mat_dst.rows, localThreads[1]) *localThreads[1],
1
};
int dst_step1 = mat_dst.cols * mat_dst.elemSize(); int dst_step1 = mat_dst.cols * mat_dst.elemSize();
vector<pair<size_t , const void *> > args; vector<pair<size_t , const void *> > args;
...@@ -176,10 +118,6 @@ namespace cv ...@@ -176,10 +118,6 @@ namespace cv
args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[2].step)); args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[2].step));
args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[2].offset)); args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[2].offset));
// if channel == 3, then the matrix will convert to channel =4
//if(n == 3)
// args.push_back( make_pair( sizeof(cl_int), (void *)&offset_cols));
if(n == 3) if(n == 3)
{ {
args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[2].data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[2].data));
...@@ -229,53 +167,6 @@ namespace cv ...@@ -229,53 +167,6 @@ namespace cv
mat_dst.create(size, CV_MAKETYPE(depth, total_channels)); mat_dst.create(size, CV_MAKETYPE(depth, total_channels));
merge_vector_run(mat_src, n, mat_dst); merge_vector_run(mat_src, n, mat_dst);
} }
////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////split/////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
// static void split_vector_run_no_roi(const oclMat &mat_src, oclMat *mat_dst)
// {
// Context *clCxt = mat_src.clCxt;
// int channels = mat_src.oclchannels();
// int depth = mat_src.depth();
// string kernelName = "split_vector";
// int indexes[4][7] = {{0, 0, 0, 0, 0, 0, 0},
// {8, 8, 8, 8, 4, 4, 2},
// {8, 8, 8, 8 , 4, 4, 4},
// {4, 4, 2, 2, 1, 1, 1}
// };
// size_t index = indexes[channels - 1][mat_dst[0].depth()];
// int cols = divUp(mat_src.cols, index);
// size_t localThreads[3] = { 64, 4, 1 };
// size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
// divUp(mat_src.rows, localThreads[1]) *localThreads[1],
// 1
// };
// vector<pair<size_t , const void *> > args;
// args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src.data));
// args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src.step));
// args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src.rows));
// args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
// args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_dst[0].data));
// args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst[0].step));
// args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_dst[1].data));
// args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst[1].step));
// if(channels >= 3)
// {
// args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_dst[2].data));
// args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst[2].step));
// }
// if(channels >= 4)
// {
// args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_dst[3].data));
// args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst[3].step));
// }
// openCLExecuteKernel(clCxt, &split_mat, kernelName, globalThreads, localThreads, args, channels, depth);
// }
static void split_vector_run(const oclMat &mat_src, oclMat *mat_dst) static void split_vector_run(const oclMat &mat_src, oclMat *mat_dst)
{ {
...@@ -311,9 +202,7 @@ namespace cv ...@@ -311,9 +202,7 @@ namespace cv
: divUp(mat_src.cols + max_offset_cols, vector_length); : divUp(mat_src.cols + max_offset_cols, vector_length);
size_t localThreads[3] = { 64, 4, 1 }; size_t localThreads[3] = { 64, 4, 1 };
size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], size_t globalThreads[3] = { cols, mat_src.rows, 1 };
divUp(mat_src.rows, localThreads[1]) *localThreads[1], 1
};
int dst_step1 = mat_dst[0].cols * mat_dst[0].elemSize(); int dst_step1 = mat_dst[0].cols * mat_dst[0].elemSize();
vector<pair<size_t , const void *> > args; vector<pair<size_t , const void *> > args;
......
...@@ -96,13 +96,6 @@ namespace cv ...@@ -96,13 +96,6 @@ namespace cv
{ {
namespace stereoCSBP namespace stereoCSBP
{ {
//////////////////////////////////////////////////////////////////////////
//////////////////////////////common////////////////////////////////////
////////////////////////////////////////////////////////////////////////
static inline int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
static string get_kernel_name(string kernel_name, int data_type) static string get_kernel_name(string kernel_name, int data_type)
{ {
stringstream idxStr; stringstream idxStr;
...@@ -132,10 +125,7 @@ namespace cv ...@@ -132,10 +125,7 @@ namespace cv
//size_t blockSize = 256; //size_t blockSize = 256;
size_t localThreads[] = {32, 8 ,1}; size_t localThreads[] = {32, 8 ,1};
size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0], size_t globalThreads[] = { w, h, 1 };
divUp(h, localThreads[1]) *localThreads[1],
1
};
int cdisp_step1 = msg_step * h; int cdisp_step1 = msg_step * h;
openCLVerifyKernel(clCxt, kernel, localThreads); openCLVerifyKernel(clCxt, kernel, localThreads);
...@@ -177,7 +167,7 @@ namespace cv ...@@ -177,7 +167,7 @@ namespace cv
const int threadsNum = 256; const int threadsNum = 256;
//size_t blockSize = threadsNum; //size_t blockSize = threadsNum;
size_t localThreads[3] = {win_size, 1, threadsNum / win_size}; size_t localThreads[3] = {win_size, 1, threadsNum / win_size};
size_t globalThreads[3] = {w *localThreads[0], size_t globalThreads[3] = { w *localThreads[0],
h * divUp(rthis.ndisp, localThreads[2]) *localThreads[1], 1 * localThreads[2] h * divUp(rthis.ndisp, localThreads[2]) *localThreads[1], 1 * localThreads[2]
}; };
...@@ -222,10 +212,7 @@ namespace cv ...@@ -222,10 +212,7 @@ namespace cv
//size_t blockSize = 256; //size_t blockSize = 256;
size_t localThreads[] = {32, 8 ,1}; size_t localThreads[] = {32, 8 ,1};
size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0], size_t globalThreads[] = { w, h, 1 };
divUp(h, localThreads[1]) *localThreads[1],
1
};
int disp_step = msg_step * h; int disp_step = msg_step * h;
openCLVerifyKernel(clCxt, kernel, localThreads); openCLVerifyKernel(clCxt, kernel, localThreads);
...@@ -257,10 +244,7 @@ namespace cv ...@@ -257,10 +244,7 @@ namespace cv
//size_t blockSize = 256; //size_t blockSize = 256;
size_t localThreads[] = {32, 8, 1}; size_t localThreads[] = {32, 8, 1};
size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0], size_t globalThreads[] = { w, h, 1 };
divUp(h, localThreads[1]) *localThreads[1],
1
};
int disp_step = msg_step * h; int disp_step = msg_step * h;
openCLVerifyKernel(clCxt, kernel, localThreads); openCLVerifyKernel(clCxt, kernel, localThreads);
...@@ -291,14 +275,10 @@ namespace cv ...@@ -291,14 +275,10 @@ namespace cv
init_data_cost_reduce_caller(left, right, temp, rthis, msg_step, h, w, level); init_data_cost_reduce_caller(left, right, temp, rthis, msg_step, h, w, level);
if(rthis.use_local_init_data_cost == true) if(rthis.use_local_init_data_cost == true)
{
get_first_initial_local_caller(data_cost_selected, disp_selected_pyr, temp, rthis, h, w, nr_plane, msg_step); get_first_initial_local_caller(data_cost_selected, disp_selected_pyr, temp, rthis, h, w, nr_plane, msg_step);
}
else else
{
get_first_initial_global_caller(data_cost_selected, disp_selected_pyr, temp, rthis, h, w, get_first_initial_global_caller(data_cost_selected, disp_selected_pyr, temp, rthis, h, w,
nr_plane, msg_step); nr_plane, msg_step);
}
} }
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
...@@ -317,12 +297,8 @@ namespace cv ...@@ -317,12 +297,8 @@ namespace cv
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName); cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
//size_t blockSize = 256; size_t localThreads[] = { 32, 8, 1 };
size_t localThreads[] = {32, 8, 1}; size_t globalThreads[] = { w, h, 1 };
size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0],
divUp(h, localThreads[1]) *localThreads[1],
1
};
int disp_step1 = msg_step1 * h; int disp_step1 = msg_step1 * h;
int disp_step2 = msg_step2 * h2; int disp_step2 = msg_step2 * h2;
...@@ -366,8 +342,8 @@ namespace cv ...@@ -366,8 +342,8 @@ namespace cv
const size_t threadsNum = 256; const size_t threadsNum = 256;
//size_t blockSize = threadsNum; //size_t blockSize = threadsNum;
size_t localThreads[3] = {win_size, 1, threadsNum / win_size}; size_t localThreads[3] = { win_size, 1, threadsNum / win_size };
size_t globalThreads[3] = {w *localThreads[0], size_t globalThreads[3] = { w *localThreads[0],
h * divUp(nr_plane, localThreads[2]) *localThreads[1], 1 * localThreads[2] h * divUp(nr_plane, localThreads[2]) *localThreads[1], 1 * localThreads[2]
}; };
...@@ -431,10 +407,7 @@ namespace cv ...@@ -431,10 +407,7 @@ namespace cv
//size_t blockSize = 256; //size_t blockSize = 256;
size_t localThreads[] = {32, 8, 1}; size_t localThreads[] = {32, 8, 1};
size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0], size_t globalThreads[] = { w, h, 1 };
divUp(h, localThreads[1]) *localThreads[1],
1
};
int disp_step1 = msg_step1 * h; int disp_step1 = msg_step1 * h;
int disp_step2 = msg_step2 * h2; int disp_step2 = msg_step2 * h2;
...@@ -535,10 +508,7 @@ namespace cv ...@@ -535,10 +508,7 @@ namespace cv
//size_t blockSize = 256; //size_t blockSize = 256;
size_t localThreads[] = {32, 8, 1}; size_t localThreads[] = {32, 8, 1};
size_t globalThreads[] = {divUp(disp.cols, localThreads[0]) *localThreads[0], size_t globalThreads[] = { disp.cols, disp.rows, 1 };
divUp(disp.rows, localThreads[1]) *localThreads[1],
1
};
int step_size = disp.step / disp.elemSize(); int step_size = disp.step / disp.elemSize();
int disp_step = disp.rows * msg_step; int disp_step = disp.rows * msg_step;
......
...@@ -96,10 +96,7 @@ static void prefilter_xsobel(const oclMat &input, oclMat &output, int prefilterC ...@@ -96,10 +96,7 @@ static void prefilter_xsobel(const oclMat &input, oclMat &output, int prefilterC
#define N_DISPARITIES 8 #define N_DISPARITIES 8
#define ROWSperTHREAD 21 #define ROWSperTHREAD 21
#define BLOCK_W 128 #define BLOCK_W 128
static inline int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
///////////////////////////////stereoBM_GPU//////////////////////////////// ///////////////////////////////stereoBM_GPU////////////////////////////////
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
...@@ -117,11 +114,10 @@ static void stereo_bm(const oclMat &left, const oclMat &right, oclMat &disp, ...@@ -117,11 +114,10 @@ static void stereo_bm(const oclMat &left, const oclMat &right, oclMat &disp,
size_t local_mem_size = (N_DISPARITIES * (BLOCK_W + 2 * winsz2)) * size_t local_mem_size = (N_DISPARITIES * (BLOCK_W + 2 * winsz2)) *
sizeof(cl_uint); sizeof(cl_uint);
//size_t blockSize = 1; //size_t blockSize = 1;
size_t localThreads[] = { BLOCK_W, 1,1}; size_t localThreads[] = { BLOCK_W, 1, 1 };
size_t globalThreads[] = { divUp(left.cols - maxdisp - 2 * winsz2, BLOCK_W) *BLOCK_W, size_t globalThreads[] = { left.cols - maxdisp - 2 * winsz2,
divUp(left.rows - 2 * winsz2, ROWSperTHREAD), divUp(left.rows - 2 * winsz2, ROWSperTHREAD),
1 1 };
};
std::vector< std::pair<size_t, const void *> > args; std::vector< std::pair<size_t, const void *> > args;
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&left.data)); args.push_back(std::make_pair(sizeof(cl_mem), (void *)&left.data));
...@@ -151,10 +147,9 @@ static void postfilter_textureness(oclMat &left, int winSize, ...@@ -151,10 +147,9 @@ static void postfilter_textureness(oclMat &left, int winSize,
size_t blockSize = 1; size_t blockSize = 1;
size_t localThreads[] = { BLOCK_W, blockSize ,1}; size_t localThreads[] = { BLOCK_W, blockSize ,1};
size_t globalThreads[] = { divUp(left.cols, BLOCK_W) *BLOCK_W, size_t globalThreads[] = { left.cols,
divUp(left.rows, 2 * ROWSperTHREAD), divUp(left.rows, 2 * ROWSperTHREAD),
1 1 };
};
size_t local_mem_size = (localThreads[0] + localThreads[0] + (winSize / 2) * 2) * sizeof(float); size_t local_mem_size = (localThreads[0] + localThreads[0] + (winSize / 2) * 2) * sizeof(float);
......
...@@ -104,10 +104,7 @@ namespace cv ...@@ -104,10 +104,7 @@ namespace cv
{ {
openCLFree(cl_con_struct); openCLFree(cl_con_struct);
} }
static inline int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
///////////////////////////comp data//////////////////////////////////////// ///////////////////////////comp data////////////////////////////////////////
///////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment