Commit 910d8f8e authored by Alexander Alekhin

Merge pull request #2888 from ilya-lavrenov:tapi_remap

parents a0816c6d 87f4b47a
...@@ -3582,7 +3582,9 @@ private: ...@@ -3582,7 +3582,9 @@ private:
static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, InputArray _map2, static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, InputArray _map2,
int interpolation, int borderType, const Scalar& borderValue) int interpolation, int borderType, const Scalar& borderValue)
{ {
int cn = _src.channels(), type = _src.type(), depth = _src.depth(); const ocl::Device & dev = ocl::Device::getDefault();
int cn = _src.channels(), type = _src.type(), depth = _src.depth(),
rowsPerWI = dev.isIntel() ? 4 : 1;
if (borderType == BORDER_TRANSPARENT || !(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST) if (borderType == BORDER_TRANSPARENT || !(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST)
|| _map1.type() == CV_16SC1 || _map2.type() == CV_16SC1) || _map1.type() == CV_16SC1 || _map2.type() == CV_16SC1)
...@@ -3619,12 +3621,14 @@ static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, Input ...@@ -3619,12 +3621,14 @@ static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, Input
static const char * const interMap[] = { "INTER_NEAREST", "INTER_LINEAR", "INTER_CUBIC", "INTER_LINEAR", "INTER_LANCZOS" }; static const char * const interMap[] = { "INTER_NEAREST", "INTER_LINEAR", "INTER_CUBIC", "INTER_LINEAR", "INTER_LANCZOS" };
static const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", static const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
"BORDER_REFLECT_101", "BORDER_TRANSPARENT" }; "BORDER_REFLECT_101", "BORDER_TRANSPARENT" };
String buildOptions = format("-D %s -D %s -D T=%s", interMap[interpolation], borderMap[borderType], ocl::typeToStr(type)); String buildOptions = format("-D %s -D %s -D T=%s -D rowsPerWI=%d",
interMap[interpolation], borderMap[borderType],
ocl::typeToStr(type), rowsPerWI);
if (interpolation != INTER_NEAREST) if (interpolation != INTER_NEAREST)
{ {
char cvt[3][40]; char cvt[3][40];
int wdepth = std::max(CV_32F, dst.depth()); int wdepth = std::max(CV_32F, depth);
buildOptions = buildOptions buildOptions = buildOptions
+ format(" -D WT=%s -D convertToT=%s -D convertToWT=%s" + format(" -D WT=%s -D convertToT=%s -D convertToWT=%s"
" -D convertToWT2=%s -D WT2=%s", " -D convertToWT2=%s -D WT2=%s",
...@@ -3636,10 +3640,9 @@ static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, Input ...@@ -3636,10 +3640,9 @@ static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, Input
} }
int scalarcn = cn == 3 ? 4 : cn; int scalarcn = cn == 3 ? 4 : cn;
int sctype = CV_MAKETYPE(depth, scalarcn); int sctype = CV_MAKETYPE(depth, scalarcn);
buildOptions += format(" -D T=%s -D T1=%s" buildOptions += format(" -D T=%s -D T1=%s -D cn=%d -D ST=%s -D depth=%d",
" -D cn=%d -D ST=%s",
ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(type), ocl::typeToStr(depth),
cn, ocl::typeToStr(sctype)); cn, ocl::typeToStr(sctype), depth);
ocl::Kernel k(kernelName.c_str(), ocl::imgproc::remap_oclsrc, buildOptions); ocl::Kernel k(kernelName.c_str(), ocl::imgproc::remap_oclsrc, buildOptions);
...@@ -3653,7 +3656,7 @@ static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, Input ...@@ -3653,7 +3656,7 @@ static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, Input
else else
k.args(srcarg, dstarg, map1arg, ocl::KernelArg::ReadOnlyNoSize(map2), scalararg); k.args(srcarg, dstarg, map1arg, ocl::KernelArg::ReadOnlyNoSize(map2), scalararg);
size_t globalThreads[2] = { dst.cols, dst.rows }; size_t globalThreads[2] = { dst.cols, (dst.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalThreads, NULL, false); return k.run(2, globalThreads, NULL, false);
} }
......
...@@ -147,37 +147,43 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src ...@@ -147,37 +147,43 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
ST nVal) ST nVal)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * rowsPerWI;
T scalar = convertScalar(nVal); if (x < dst_cols)
if (x < dst_cols && y < dst_rows)
{ {
int map1_index = mad24(y, map1_step, x * (int)sizeof(float) + map1_offset); T scalar = convertScalar(nVal);
int map2_index = mad24(y, map2_step, x * (int)sizeof(float) + map2_offset);
int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset); int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(float), map1_offset));
int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(float), map2_offset));
__global const float * map1 = (__global const float *)(map1ptr + map1_index); int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
__global const float * map2 = (__global const float *)(map2ptr + map2_index);
__global T * dst = (__global T *)(dstptr + dst_index); #pragma unroll
for (int i = 0; i < rowsPerWI; ++i, ++y,
int gx = convert_int_sat_rte(map1[0]); map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)
int gy = convert_int_sat_rte(map2[0]); if (y < dst_rows)
{
if (NEED_EXTRAPOLATION(gx, gy)) __global const float * map1 = (__global const float *)(map1ptr + map1_index);
{ __global const float * map2 = (__global const float *)(map2ptr + map2_index);
__global T * dst = (__global T *)(dstptr + dst_index);
int gx = convert_int_sat_rte(map1[0]);
int gy = convert_int_sat_rte(map2[0]);
if (NEED_EXTRAPOLATION(gx, gy))
{
#ifndef BORDER_CONSTANT #ifndef BORDER_CONSTANT
int2 gxy = (int2)(gx, gy); int2 gxy = (int2)(gx, gy);
#endif #endif
T v; T v;
EXTRAPOLATE(gxy, v) EXTRAPOLATE(gxy, v)
storepix(v, dst); storepix(v, dst);
} }
else else
{ {
int src_index = mad24(gy, src_step, gx * TSIZE + src_offset); int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset));
storepix(loadpix((__global const T*)(srcptr + src_index)), dst); storepix(loadpix((__global const T*)(srcptr + src_index)), dst);
} }
}
} }
} }
...@@ -187,31 +193,36 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o ...@@ -187,31 +193,36 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o
ST nVal) ST nVal)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * rowsPerWI;
T scalar = convertScalar(nVal);
if (x < dst_cols && y < dst_rows) if (x < dst_cols)
{ {
int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset); T scalar = convertScalar(nVal);
int map_index = mad24(y, map_step, x * (int)sizeof(float2) + map_offset); int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
int map_index = mad24(y, map_step, mad24(x, (int)sizeof(float2), map_offset));
__global const float2 * map = (__global const float2 *)(mapptr + map_index);
__global T * dst = (__global T *)(dstptr + dst_index); #pragma unroll
for (int i = 0; i < rowsPerWI; ++i, ++y,
int2 gxy = convert_int2_sat_rte(map[0]); map_index += map_step, dst_index += dst_step)
int gx = gxy.x, gy = gxy.y; if (y < dst_rows)
{
if (NEED_EXTRAPOLATION(gx, gy)) __global const float2 * map = (__global const float2 *)(mapptr + map_index);
{ __global T * dst = (__global T *)(dstptr + dst_index);
T v;
EXTRAPOLATE(gxy, v) int2 gxy = convert_int2_sat_rte(map[0]);
storepix(v, dst); int gx = gxy.x, gy = gxy.y;
}
else if (NEED_EXTRAPOLATION(gx, gy))
{ {
int src_index = mad24(gy, src_step, gx * TSIZE + src_offset); T v;
storepix(loadpix((__global const T *)(srcptr + src_index)), dst); EXTRAPOLATE(gxy, v)
storepix(v, dst);
}
else
{
int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset));
storepix(loadpix((__global const T *)(srcptr + src_index)), dst);
}
} }
} }
} }
...@@ -222,32 +233,37 @@ __kernel void remap_16SC2(__global const uchar * srcptr, int src_step, int src_o ...@@ -222,32 +233,37 @@ __kernel void remap_16SC2(__global const uchar * srcptr, int src_step, int src_o
ST nVal) ST nVal)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * rowsPerWI;
T scalar = convertScalar(nVal);
if (x < dst_cols && y < dst_rows) if (x < dst_cols)
{ {
int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset); T scalar = convertScalar(nVal);
int map_index = mad24(y, map_step, x * (int)sizeof(short2) + map_offset); int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
int map_index = mad24(y, map_step, mad24(x, (int)sizeof(short2), map_offset));
__global const short2 * map = (__global const short2 *)(mapptr + map_index);
__global T * dst = (__global T *)(dstptr + dst_index); #pragma unroll
for (int i = 0; i < rowsPerWI; ++i, ++y,
int2 gxy = convert_int2(map[0]); map_index += map_step, dst_index += dst_step)
int gx = gxy.x, gy = gxy.y; if (y < dst_rows)
{
if (NEED_EXTRAPOLATION(gx, gy)) __global const short2 * map = (__global const short2 *)(mapptr + map_index);
{ __global T * dst = (__global T *)(dstptr + dst_index);
T v;
EXTRAPOLATE(gxy, v) int2 gxy = convert_int2(map[0]);
storepix(v, dst); int gx = gxy.x, gy = gxy.y;
}
else if (NEED_EXTRAPOLATION(gx, gy))
{ {
int src_index = mad24(gy, src_step, gx * TSIZE + src_offset); T v;
storepix(loadpix((__global const T *)(srcptr + src_index)), dst); EXTRAPOLATE(gxy, v)
} storepix(v, dst);
}
else
{
int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset));
storepix(loadpix((__global const T *)(srcptr + src_index)), dst);
}
}
} }
} }
...@@ -258,41 +274,54 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int ...@@ -258,41 +274,54 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int
ST nVal) ST nVal)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * rowsPerWI;
T scalar = convertScalar(nVal);
if (x < dst_cols && y < dst_rows) if (x < dst_cols)
{ {
int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset); T scalar = convertScalar(nVal);
int map1_index = mad24(y, map1_step, x * (int)sizeof(short2) + map1_offset); int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
int map2_index = mad24(y, map2_step, x * (int)sizeof(ushort) + map2_offset); int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(short2), map1_offset));
int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(ushort), map2_offset));
__global const short2 * map1 = (__global const short2 *)(map1ptr + map1_index);
__global const ushort * map2 = (__global const ushort *)(map2ptr + map2_index); #pragma unroll
__global T * dst = (__global T *)(dstptr + dst_index); for (int i = 0; i < rowsPerWI; ++i, ++y,
map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)
int map2Value = convert_int(map2[0]) & (INTER_TAB_SIZE2 - 1); if (y < dst_rows)
int dx = (map2Value & (INTER_TAB_SIZE - 1)) < (INTER_TAB_SIZE >> 1) ? 1 : 0; {
int dy = (map2Value >> INTER_BITS) < (INTER_TAB_SIZE >> 1) ? 1 : 0; __global const short2 * map1 = (__global const short2 *)(map1ptr + map1_index);
int2 gxy = convert_int2(map1[0]) + (int2)(dx, dy); __global const ushort * map2 = (__global const ushort *)(map2ptr + map2_index);
int gx = gxy.x, gy = gxy.y; __global T * dst = (__global T *)(dstptr + dst_index);
if (NEED_EXTRAPOLATION(gx, gy)) int map2Value = convert_int(map2[0]) & (INTER_TAB_SIZE2 - 1);
{ int dx = (map2Value & (INTER_TAB_SIZE - 1)) < (INTER_TAB_SIZE >> 1) ? 1 : 0;
T v; int dy = (map2Value >> INTER_BITS) < (INTER_TAB_SIZE >> 1) ? 1 : 0;
EXTRAPOLATE(gxy, v) int2 gxy = convert_int2(map1[0]) + (int2)(dx, dy);
storepix(v, dst); int gx = gxy.x, gy = gxy.y;
}
else if (NEED_EXTRAPOLATION(gx, gy))
{ {
int src_index = mad24(gy, src_step, gx * TSIZE + src_offset); T v;
storepix(loadpix((__global const T *)(srcptr + src_index)), dst); EXTRAPOLATE(gxy, v)
} storepix(v, dst);
}
else
{
int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset));
storepix(loadpix((__global const T *)(srcptr + src_index)), dst);
}
}
} }
} }
#elif INTER_LINEAR #elif defined INTER_LINEAR
__constant float coeffs[64] =
{ 1.000000f, 0.000000f, 0.968750f, 0.031250f, 0.937500f, 0.062500f, 0.906250f, 0.093750f, 0.875000f, 0.125000f, 0.843750f, 0.156250f,
0.812500f, 0.187500f, 0.781250f, 0.218750f, 0.750000f, 0.250000f, 0.718750f, 0.281250f, 0.687500f, 0.312500f, 0.656250f, 0.343750f,
0.625000f, 0.375000f, 0.593750f, 0.406250f, 0.562500f, 0.437500f, 0.531250f, 0.468750f, 0.500000f, 0.500000f, 0.468750f, 0.531250f,
0.437500f, 0.562500f, 0.406250f, 0.593750f, 0.375000f, 0.625000f, 0.343750f, 0.656250f, 0.312500f, 0.687500f, 0.281250f, 0.718750f,
0.250000f, 0.750000f, 0.218750f, 0.781250f, 0.187500f, 0.812500f, 0.156250f, 0.843750f, 0.125000f, 0.875000f, 0.093750f, 0.906250f,
0.062500f, 0.937500f, 0.031250f, 0.968750f };
__kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols, __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
...@@ -301,54 +330,60 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int ...@@ -301,54 +330,60 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int
ST nVal) ST nVal)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * rowsPerWI;
if (x < dst_cols && y < dst_rows) if (x < dst_cols)
{ {
int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset);
int map1_index = mad24(y, map1_step, x * (int)sizeof(short2) + map1_offset);
int map2_index = mad24(y, map2_step, x * (int)sizeof(ushort) + map2_offset);
__global const short2 * map1 = (__global const short2 *)(map1ptr + map1_index);
__global const ushort * map2 = (__global const ushort *)(map2ptr + map2_index);
__global T * dst = (__global T *)(dstptr + dst_index);
int2 map_dataA = convert_int2(map1[0]);
int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);
ushort map2Value = (ushort)(map2[0] & (INTER_TAB_SIZE2 - 1));
WT2 u = (WT2)(map2Value & (INTER_TAB_SIZE - 1), map2Value >> INTER_BITS) / (WT2)(INTER_TAB_SIZE);
WT scalar = convertToWT(convertScalar(nVal)); WT scalar = convertToWT(convertScalar(nVal));
WT a = scalar, b = scalar, c = scalar, d = scalar; int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(short2), map1_offset));
if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y)) int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(ushort), map2_offset));
a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));
else #pragma unroll
EXTRAPOLATE(map_dataA, a); for (int i = 0; i < rowsPerWI; ++i, ++y,
map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)
if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y)) if (y < dst_rows)
b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset)))); {
else __global const short2 * map1 = (__global const short2 *)(map1ptr + map1_index);
EXTRAPOLATE(map_dataB, b); __global const ushort * map2 = (__global const ushort *)(map2ptr + map2_index);
__global T * dst = (__global T *)(dstptr + dst_index);
if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))
c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset)))); int2 map_dataA = convert_int2(map1[0]);
else int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
EXTRAPOLATE(map_dataC, c); int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);
if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))
d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset)))); ushort map2Value = (ushort)(map2[0] & (INTER_TAB_SIZE2 - 1));
else WT2 u = (WT2)(map2Value & (INTER_TAB_SIZE - 1), map2Value >> INTER_BITS) / (WT2)(INTER_TAB_SIZE);
EXTRAPOLATE(map_dataD, d);
WT a = scalar, b = scalar, c = scalar, d = scalar;
WT dst_data = a * (1 - u.x) * (1 - u.y) +
b * (u.x) * (1 - u.y) + if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))
c * (1 - u.x) * (u.y) + a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));
d * (u.x) * (u.y); else
storepix(convertToT(dst_data), dst); EXTRAPOLATE(map_dataA, a);
if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))
b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));
else
EXTRAPOLATE(map_dataB, b);
if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))
c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));
else
EXTRAPOLATE(map_dataC, c);
if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))
d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));
else
EXTRAPOLATE(map_dataD, d);
WT dst_data = a * (1 - u.x) * (1 - u.y) +
b * (u.x) * (1 - u.y) +
c * (1 - u.x) * (u.y) +
d * (u.x) * (u.y);
storepix(convertToT(dst_data), dst);
}
} }
} }
...@@ -359,55 +394,106 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src ...@@ -359,55 +394,106 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
ST nVal) ST nVal)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * rowsPerWI;
if (x < dst_cols && y < dst_rows) if (x < dst_cols)
{ {
int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset);
int map1_index = mad24(y, map1_step, x * (int)sizeof(float) + map1_offset);
int map2_index = mad24(y, map2_step, x * (int)sizeof(float) + map2_offset);
__global const float * map1 = (__global const float *)(map1ptr + map1_index);
__global const float * map2 = (__global const float *)(map2ptr + map2_index);
__global T * dst = (__global T *)(dstptr + dst_index);
float2 map_data = (float2)(map1[0], map2[0]);
int2 map_dataA = convert_int2_sat_rtn(map_data);
int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);
float2 _u = map_data - convert_float2(map_dataA);
WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE;
WT scalar = convertToWT(convertScalar(nVal)); WT scalar = convertToWT(convertScalar(nVal));
WT a = scalar, b = scalar, c = scalar, d = scalar; int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(float), map1_offset));
if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y)) int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(float), map2_offset));
a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));
else #pragma unroll
EXTRAPOLATE(map_dataA, a); for (int i = 0; i < rowsPerWI; ++i, ++y,
map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)
if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y)) if (y < dst_rows)
b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset)))); {
else __global const float * map1 = (__global const float *)(map1ptr + map1_index);
EXTRAPOLATE(map_dataB, b); __global const float * map2 = (__global const float *)(map2ptr + map2_index);
__global T * dst = (__global T *)(dstptr + dst_index);
if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))
c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset)))); #if defined BORDER_CONSTANT
else
EXTRAPOLATE(map_dataC, c); float xf = map1[0], yf = map2[0];
int sx = convert_int_sat_rtn(xf), sy = convert_int_sat_rtn(yf);
if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))
d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset)))); __constant float * coeffs_x = coeffs + ((convert_int_rte(xf * INTER_TAB_SIZE) & (INTER_TAB_SIZE - 1)) << 1);
else __constant float * coeffs_y = coeffs + ((convert_int_rte(yf * INTER_TAB_SIZE) & (INTER_TAB_SIZE - 1)) << 1);
EXTRAPOLATE(map_dataD, d);
WT sum = (WT)(0), xsum;
WT dst_data = a * (1 - u.x) * (1 - u.y) + int src_index = mad24(sy, src_step, mad24(sx, TSIZE, src_offset));
b * (u.x) * (1 - u.y) +
c * (1 - u.x) * (u.y) + #pragma unroll
d * (u.x) * (u.y); for (int yp = 0; yp < 2; ++yp, src_index += src_step)
storepix(convertToT(dst_data), dst); {
if (sy + yp >= 0 && sy + yp < src_rows)
{
xsum = (WT)(0);
if (sx >= 0 && sx + 2 < src_cols)
{
#if depth == 0 && cn == 1
uchar2 value = vload2(0, srcptr + src_index);
xsum = dot(convert_float2(value), (float2)(coeffs_x[0], coeffs_x[1]));
#else
#pragma unroll
for (int xp = 0; xp < 2; ++xp)
xsum = fma(convertToWT(loadpix(srcptr + mad24(xp, TSIZE, src_index))), coeffs_x[xp], xsum);
#endif
}
else
{
#pragma unroll
for (int xp = 0; xp < 2; ++xp)
xsum = fma(sx + xp >= 0 && sx + xp < src_cols ?
convertToWT(loadpix(srcptr + mad24(xp, TSIZE, src_index))) : scalar, coeffs_x[xp], xsum);
}
sum = fma(xsum, coeffs_y[yp], sum);
}
else
sum = fma(scalar, coeffs_y[yp], sum);
}
storepix(convertToT(sum), dst);
#else
float2 map_data = (float2)(map1[0], map2[0]);
int2 map_dataA = convert_int2_sat_rtn(map_data);
int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);
float2 _u = map_data - convert_float2(map_dataA);
WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE;
WT scalar = convertToWT(convertScalar(nVal));
WT a = scalar, b = scalar, c = scalar, d = scalar;
if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))
a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));
else
EXTRAPOLATE(map_dataA, a);
if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))
b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));
else
EXTRAPOLATE(map_dataB, b);
if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))
c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));
else
EXTRAPOLATE(map_dataC, c);
if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))
d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));
else
EXTRAPOLATE(map_dataD, d);
WT dst_data = a * (1 - u.x) * (1 - u.y) +
b * (u.x) * (1 - u.y) +
c * (1 - u.x) * (u.y) +
d * (u.x) * (u.y);
storepix(convertToT(dst_data), dst);
#endif
}
} }
} }
...@@ -417,52 +503,58 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o ...@@ -417,52 +503,58 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o
ST nVal) ST nVal)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1) * rowsPerWI;
if (x < dst_cols && y < dst_rows) if (x < dst_cols)
{ {
int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset);
int map_index = mad24(y, map_step, x * (int)sizeof(float2) + map_offset);
__global const float2 * map = (__global const float2 *)(mapptr + map_index);
__global T * dst = (__global T *)(dstptr + dst_index);
float2 map_data = map[0];
int2 map_dataA = convert_int2_sat_rtn(map_data);
int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);
float2 _u = map_data - convert_float2(map_dataA);
WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE;
WT scalar = convertToWT(convertScalar(nVal)); WT scalar = convertToWT(convertScalar(nVal));
WT a = scalar, b = scalar, c = scalar, d = scalar; int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
int map_index = mad24(y, map_step, mad24(x, (int)sizeof(float2), map_offset));
if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))
a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset)))); #pragma unroll
else for (int i = 0; i < rowsPerWI; ++i, ++y,
EXTRAPOLATE(map_dataA, a); map_index += map_step, dst_index += dst_step)
if (y < dst_rows)
if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y)) {
b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset)))); __global const float2 * map = (__global const float2 *)(mapptr + map_index);
else __global T * dst = (__global T *)(dstptr + dst_index);
EXTRAPOLATE(map_dataB, b);
float2 map_data = map[0];
if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y)) int2 map_dataA = convert_int2_sat_rtn(map_data);
c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset)))); int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
else int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
EXTRAPOLATE(map_dataC, c); int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);
if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y)) float2 _u = map_data - convert_float2(map_dataA);
d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset)))); WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE;
else WT a = scalar, b = scalar, c = scalar, d = scalar;
EXTRAPOLATE(map_dataD, d);
if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))
WT dst_data = a * (1 - u.x) * (1 - u.y) + a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));
b * (u.x) * (1 - u.y) + else
c * (1 - u.x) * (u.y) + EXTRAPOLATE(map_dataA, a);
d * (u.x) * (u.y);
storepix(convertToT(dst_data), dst); if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))
b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));
else
EXTRAPOLATE(map_dataB, b);
if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))
c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));
else
EXTRAPOLATE(map_dataC, c);
if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))
d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));
else
EXTRAPOLATE(map_dataD, d);
WT dst_data = a * (1 - u.x) * (1 - u.y) +
b * (u.x) * (1 - u.y) +
c * (1 - u.x) * (u.y) +
d * (u.x) * (u.y);
storepix(convertToT(dst_data), dst);
}
} }
} }
......
...@@ -267,7 +267,7 @@ PARAM_TEST_CASE(Remap, MatDepth, Channels, std::pair<MatType, MatType>, BorderTy ...@@ -267,7 +267,7 @@ PARAM_TEST_CASE(Remap, MatDepth, Channels, std::pair<MatType, MatType>, BorderTy
Border map1Border = randomBorder(0, useRoi ? MAX_VALUE : 0); Border map1Border = randomBorder(0, useRoi ? MAX_VALUE : 0);
randomSubMat(map1, map1_roi, dstROISize, map1Border, map1Type, -mapMaxValue, mapMaxValue); randomSubMat(map1, map1_roi, dstROISize, map1Border, map1Type, -mapMaxValue, mapMaxValue);
Border map2Border = randomBorder(0, useRoi ? MAX_VALUE : 0); Border map2Border = randomBorder(0, useRoi ? MAX_VALUE + 1 : 0);
if (map2Type != noType) if (map2Type != noType)
{ {
int mapMinValue = -mapMaxValue; int mapMinValue = -mapMaxValue;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment