Commit e6c305cb authored by Alexander Alekhin

Merge pull request #2897 from vbystricky:oclopt_sepFilter2D

parents effff27c 1a73aa1f
...@@ -3471,7 +3471,8 @@ static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY ...@@ -3471,7 +3471,8 @@ static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY
return k.run(2, globalsize, localsize, false); return k.run(2, globalsize, localsize, false);
} }
const int optimizedSepFilterLocalSize = 16; const int optimizedSepFilterLocalWidth = 16;
const int optimizedSepFilterLocalHeight = 8;
static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
Mat row_kernel, Mat col_kernel, Mat row_kernel, Mat col_kernel,
...@@ -3491,8 +3492,8 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, ...@@ -3491,8 +3492,8 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
borderType == BORDER_REFLECT_101)) borderType == BORDER_REFLECT_101))
return false; return false;
size_t lt2[2] = { optimizedSepFilterLocalSize, optimizedSepFilterLocalSize }; size_t lt2[2] = { optimizedSepFilterLocalWidth, optimizedSepFilterLocalHeight };
size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1] * (1 + (size.height - 1) / lt2[1]) }; size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1]};
char cvt[2][40]; char cvt[2][40];
const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
...@@ -3584,8 +3585,8 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, ...@@ -3584,8 +3585,8 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
} }
CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 && CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 &&
imgSize.width > optimizedSepFilterLocalSize + anchor.x && imgSize.width > optimizedSepFilterLocalWidth + anchor.x &&
imgSize.height > optimizedSepFilterLocalSize + anchor.y && imgSize.height > optimizedSepFilterLocalHeight + anchor.y &&
(!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) &&
anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) && anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) &&
(d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())), (d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())),
......
...@@ -119,20 +119,17 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int ...@@ -119,20 +119,17 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int
int liy = get_local_id(1); int liy = get_local_id(1);
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1);
// calculate pixel position in source image taking image offset into account // calculate pixel position in source image taking image offset into account
int srcX = x + srcOffsetX - RADIUSX; int srcX = x + srcOffsetX - RADIUSX;
int srcY = y + srcOffsetY - RADIUSY;
// extrapolate coordinates, if needed // extrapolate coordinates, if needed
// and read my own source pixel into local memory // and read my own source pixel into local memory
// with account for extra border pixels, which will be read by starting workitems // with account for extra border pixels, which will be read by starting workitems
int clocY = liy; int clocY = liy;
int cSrcY = srcY;
do do
{ {
int yb = cSrcY; int yb = clocY + srcOffsetY - RADIUSY;
EXTRAPOLATE(yb, (height)); EXTRAPOLATE(yb, (height));
int clocX = lix; int clocX = lix;
...@@ -149,53 +146,80 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int ...@@ -149,53 +146,80 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int
while(clocX < BLK_X+(RADIUSX*2)); while(clocX < BLK_X+(RADIUSX*2));
clocY += BLK_Y; clocY += BLK_Y;
cSrcY += BLK_Y;
} }
while (clocY < BLK_Y+(RADIUSY*2)); while (clocY < BLK_Y+(RADIUSY*2));
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// do vertical filter pass for (int y = 0; y < dst_rows; y+=BLK_Y)
// and store intermediate results to second local memory array
int i, clocX = lix;
WT sum = (WT) 0;
do
{ {
sum = (WT) 0; // do vertical filter pass
for (i=0; i<=2*RADIUSY; i++) // and store intermediate results to second local memory array
int i, clocX = lix;
WT sum = (WT) 0;
do
{
sum = (WT) 0;
for (i=0; i<=2*RADIUSY; i++)
#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE) #if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
sum = mad24(lsmem[liy+i][clocX], mat_kernelY[i], sum); sum = mad24(lsmem[liy + i][clocX], mat_kernelY[i], sum);
#else #else
sum = mad(lsmem[liy+i][clocX], mat_kernelY[i], sum); sum = mad(lsmem[liy + i][clocX], mat_kernelY[i], sum);
#endif #endif
lsmemDy[liy][clocX] = sum; lsmemDy[liy][clocX] = sum;
clocX += BLK_X; clocX += BLK_X;
} }
while(clocX < BLK_X+(RADIUSX*2)); while(clocX < BLK_X+(RADIUSX*2));
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// if this pixel happened to be out of image borders because of global size rounding,
// then just return
if( x >= dst_cols || y >=dst_rows )
return;
// do second horizontal filter pass // if this pixel happened to be out of image borders because of global size rounding,
// and calculate final result // then just return
sum = 0.0f; if ((x < dst_cols) && (y + liy < dst_rows))
for (i=0; i<=2*RADIUSX; i++) {
// do second horizontal filter pass
// and calculate final result
sum = 0.0f;
for (i=0; i<=2*RADIUSX; i++)
#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE) #if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum); sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
#else #else
sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum); sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
#endif #endif
#ifdef INTEGER_ARITHMETIC #ifdef INTEGER_ARITHMETIC
#ifdef INTEL_DEVICE #ifdef INTEL_DEVICE
sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS); sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS);
#else #else
sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS; sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;
#endif #endif
#endif #endif
// store result into destination image
storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y + liy, dst_step, mad24(x, DSTSIZE, dst_offset)));
}
for (int i = liy * BLK_X + lix; i < (RADIUSY*2) * (BLK_X+(RADIUSX*2)); i += BLK_X * BLK_Y)
{
int clocX = i % (BLK_X+(RADIUSX*2));
int clocY = i / (BLK_X+(RADIUSX*2));
lsmem[clocY][clocX] = lsmem[clocY + BLK_Y][clocX];
}
barrier(CLK_LOCAL_MEM_FENCE);
int yb = y + liy + BLK_Y + srcOffsetY + RADIUSY;
EXTRAPOLATE(yb, (height));
clocX = lix;
int cSrcX = x + srcOffsetX - RADIUSX;
do
{
int xb = cSrcX;
EXTRAPOLATE(xb,(width));
lsmem[liy + 2*RADIUSY][clocX] = ELEM(xb, yb, (width), (height), 0 );
clocX += BLK_X;
cSrcX += BLK_X;
}
while(clocX < BLK_X+(RADIUSX*2));
barrier(CLK_LOCAL_MEM_FENCE);
}
// store result into destination image
storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset)));
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment