Commit a48c1c82 authored by Alexander Karsakov

Added workaround for Nvidia: take into account that 3-channel vector type takes 4*elem_size in local memory

Added workaround for Nvidia: take into account that 3-channel vector type takes 4*elem_size in local memory.
parent 214dab39
...@@ -671,8 +671,11 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst, ...@@ -671,8 +671,11 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst,
size_t wgs = dev.maxWorkGroupSize(); size_t wgs = dev.maxWorkGroupSize();
size_t lmsz = dev.localMemSize(); size_t lmsz = dev.localMemSize();
size_t src_step = _src.step(), src_offset = _src.offset(); size_t src_step = _src.step(), src_offset = _src.offset();
// workaround for Nvidia: 3 channel vector type takes 4*elem_size in local memory
int loc_mem_cn = dev.vendorID() == ocl::Device::VENDOR_NVIDIA && cn == 3 ? 4 : cn;
if (((src_offset % src_step) % esz == 0) && if (((src_offset % src_step) % esz == 0) &&
( (
(borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE) || (borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE) ||
...@@ -680,7 +683,7 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst, ...@@ -680,7 +683,7 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst,
(_src.cols() >= kernelX.cols && _src.rows() >= kernelY.cols)) (_src.cols() >= kernelX.cols && _src.rows() >= kernelY.cols))
) && ) &&
(tileSizeX * tileSizeYmin <= wgs) && (tileSizeX * tileSizeYmin <= wgs) &&
(LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeYmin, kernelX.cols, cn * 4) <= lmsz) (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeYmin, kernelX.cols, loc_mem_cn * 4) <= lmsz)
) )
{ {
Size size = _src.size(), wholeSize; Size size = _src.size(), wholeSize;
...@@ -689,7 +692,7 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst, ...@@ -689,7 +692,7 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst,
int wdepth = CV_32F; int wdepth = CV_32F;
size_t tileSizeY = wgs / tileSizeX; size_t tileSizeY = wgs / tileSizeX;
while ((tileSizeX * tileSizeY > wgs) || (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeY, kernelX.cols, cn * 4) > lmsz)) while ((tileSizeX * tileSizeY > wgs) || (LAPLACIAN_LOCAL_MEM(tileSizeX, tileSizeY, kernelX.cols, loc_mem_cn * 4) > lmsz))
{ {
tileSizeY /= 2; tileSizeY /= 2;
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment