Commit cf04fed3 authored by niko's avatar niko

fix the compilation bugs

parent c0a41054
...@@ -2,7 +2,7 @@ if(APPLE) ...@@ -2,7 +2,7 @@ if(APPLE)
set(OPENCL_FOUND YES) set(OPENCL_FOUND YES)
set(OPENCL_LIBRARIES "-framework OpenCL") set(OPENCL_LIBRARIES "-framework OpenCL")
else() else()
find_package(OpenCL) find_package(OpenCL QUIET)
# Try AMD/ATI Stream SDK # Try AMD/ATI Stream SDK
if (NOT OPENCL_FOUND) if (NOT OPENCL_FOUND)
......
...@@ -14,7 +14,7 @@ cl_list = glob.glob(os.path.join(indir, "*.cl")) ...@@ -14,7 +14,7 @@ cl_list = glob.glob(os.path.join(indir, "*.cl"))
kfile = open(outname, "wt") kfile = open(outname, "wt")
kfile.write("""// This file is auto-generated. Do not edit! kfile.write("""// This file is auto-generated. Do not edit!
#include "precomp.hpp" //#include "precomp.hpp"
namespace cv namespace cv
{ {
namespace ocl namespace ocl
......
...@@ -260,7 +260,7 @@ namespace cv ...@@ -260,7 +260,7 @@ namespace cv
CV_Assert((!map2.data || map2.size()== map1.size())); CV_Assert((!map2.data || map2.size()== map1.size()));
dst.create(map1.size(), src.type()); dst.create(map1.size(), src.type());
int depth = src.depth(), map_depth = map1.depth();
string kernelName; string kernelName;
...@@ -279,28 +279,107 @@ namespace cv ...@@ -279,28 +279,107 @@ namespace cv
kernelName = "remapNNSConstant"; kernelName = "remapNNSConstant";
} }
int type = src.type(); int channels = dst.channels();
size_t blkSizeX = 16, blkSizeY = 16; int depth = dst.depth();
size_t glbSizeX; int type = src.type();
size_t blkSizeX = 16, blkSizeY = 16;
if(src.type() == CV_8UC1 || src.type() == CV_8UC2 || src.type() == CV_8UC4) size_t glbSizeX;
int cols = dst.cols;
if(src.type() == CV_8UC1)
{
cols = (dst.cols + dst.offset%4 + 3)/4;
glbSizeX = cols %blkSizeX==0 ? cols : (cols/blkSizeX+1)*blkSizeX;
}
else if(src.type() == CV_8UC4 || src.type() == CV_32FC1)
{ {
size_t cols = (dst.cols + dst.offset%4 + 3)/4; cols = (dst.cols + (dst.offset>>2)%4 + 3)/4;
glbSizeX = cols %blkSizeX==0 ? cols : (cols/blkSizeX+1)*blkSizeX; glbSizeX = cols %blkSizeX==0 ? cols : (cols/blkSizeX+1)*blkSizeX;
} }
else else
{ {
glbSizeX = dst.cols%blkSizeX==0 ? dst.cols : (dst.cols/blkSizeX+1)*blkSizeX; glbSizeX = dst.cols%blkSizeX==0 ? dst.cols : (dst.cols/blkSizeX+1)*blkSizeX;
} }
size_t glbSizeY = dst.rows%blkSizeY==0 ? dst.rows : (dst.rows/blkSizeY+1)*blkSizeY; size_t glbSizeY = dst.rows%blkSizeY==0 ? dst.rows : (dst.rows/blkSizeY+1)*blkSizeY;
size_t globalThreads[3] = {glbSizeX,glbSizeY,1}; size_t globalThreads[3] = {glbSizeX,glbSizeY,1};
size_t localThreads[3] = {blkSizeX,blkSizeY,1}; size_t localThreads[3] = {blkSizeX,blkSizeY,1};
/*
/////////////////////////////
//using the image buffer
/////////////////////////////
size_t image_row_pitch = 0;
cl_int err1, err2, err3;
cl_mem_flags flags1 = CL_MEM_READ_ONLY;
cl_image_format format;
if(src.type() == CV_8UC1)
{
format.image_channel_order = CL_R;
format.image_channel_data_type = CL_UNSIGNED_INT8;
}
else if(src.type() == CV_8UC4)
{
format.image_channel_order = CL_RGBA;
format.image_channel_data_type = CL_UNSIGNED_INT8;
}
else if(src.type() == CV_32FC1)
{
format.image_channel_order = CL_R;
format.image_channel_data_type = CL_FLOAT;
}
else if(src.type() == CV_32FC4)
{
format.image_channel_order = CL_RGBA;
format.image_channel_data_type = CL_FLOAT;
}
cl_mem srcImage = clCreateImage2D(clCxt->impl->clContext, flags1, &format, src.cols, src.rows,
image_row_pitch, NULL, &err1);
if(err1 != CL_SUCCESS)
{
printf("Error creating CL image buffer, error code %d\n", err1);
return;
}
const size_t src_origin[3] = {0, 0, 0};
const size_t region[3] = {src.cols, src.rows, 1};
cl_event BtoI_event, ItoB_event;
err3 = clEnqueueCopyBufferToImage(clCxt->impl->clCmdQueue, (cl_mem)src.data, srcImage,
0, src_origin, region, 0, NULL, NULL);
if(err3 != CL_SUCCESS)
{
printf("Error copying buffer to image\n");
printf("Error code %d \n", err3);
return;
}
// clWaitForEvents(1, &BtoI_event);
cl_int ret;
Mat test(src.rows, src.cols, CV_8UC1);
memset(test.data, 0, src.rows*src.cols);
ret = clEnqueueReadImage(clCxt->impl->clCmdQueue, srcImage, CL_TRUE,
src_origin, region, 0, 0, test.data, NULL, NULL, &ItoB_event);
if(ret != CL_SUCCESS)
{
printf("read image error, %d ", ret);
return;
}
clWaitForEvents(1, &ItoB_event);
cout << "src" << endl;
cout << src << endl;
cout<<"image:"<<endl;
cout<< test << endl;
*/
vector< pair<size_t, const void *> > args; vector< pair<size_t, const void *> > args;
if(map1.channels() == 2) if(map1.channels() == 2)
{ {
args.push_back( make_pair(sizeof(cl_mem),(void*)&dst.data)); args.push_back( make_pair(sizeof(cl_mem),(void*)&dst.data));
args.push_back( make_pair(sizeof(cl_mem),(void*)&src.data)); args.push_back( make_pair(sizeof(cl_mem),(void*)&src.data));
// args.push_back( make_pair(sizeof(cl_mem),(void*)&srcImage)); //imageBuffer
args.push_back( make_pair(sizeof(cl_mem),(void*)&map1.data)); args.push_back( make_pair(sizeof(cl_mem),(void*)&map1.data));
args.push_back( make_pair(sizeof(cl_int),(void*)&dst.offset)); args.push_back( make_pair(sizeof(cl_int),(void*)&dst.offset));
args.push_back( make_pair(sizeof(cl_int),(void*)&src.offset)); args.push_back( make_pair(sizeof(cl_int),(void*)&src.offset));
...@@ -314,12 +393,10 @@ namespace cv ...@@ -314,12 +393,10 @@ namespace cv
args.push_back( make_pair(sizeof(cl_int),(void*)&dst.rows)); args.push_back( make_pair(sizeof(cl_int),(void*)&dst.rows));
args.push_back( make_pair(sizeof(cl_int),(void*)&map1.cols)); args.push_back( make_pair(sizeof(cl_int),(void*)&map1.cols));
args.push_back( make_pair(sizeof(cl_int),(void*)&map1.rows)); args.push_back( make_pair(sizeof(cl_int),(void*)&map1.rows));
args.push_back( make_pair(sizeof(cl_int), (void *)&cols));
args.push_back( make_pair(sizeof(cl_double4),(void*)&borderValue)); args.push_back( make_pair(sizeof(cl_double4),(void*)&borderValue));
} }
openCLExecuteKernel(clCxt,&imgproc_remap,kernelName,globalThreads,localThreads,args,src.channels(),src.depth()); openCLExecuteKernel(clCxt,&imgproc_remap,kernelName,globalThreads,localThreads,args,src.channels(),src.depth());
} }
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
......
...@@ -43,513 +43,884 @@ ...@@ -43,513 +43,884 @@
// the use of this software, even if advised of the possibility of such damage. // the use of this software, even if advised of the possibility of such damage.
// //
//M*/ //M*/
#pragma OPENCL EXTENSION cl_amd_printf : enable //#pragma OPENCL EXTENSION cl_amd_printf : enable
#if defined DOUBLE_SUPPORT #if defined DOUBLE_SUPPORT
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
typedef double4 F4 ;
#else
typedef float4 F4;
#endif #endif
/////////////////////////////////////////////////////////
///////////////////////using buffer//////////////////////
/////////////////////////////////////////////////////////
__kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsigned char const * restrict src, __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsigned char const * restrict src,
__global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, __global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , double4 nVal) int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows, int threadCols, F4 nVal)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
/*
if(x < dst_cols && y < dst_rows) if(x < threadCols && y < dst_rows)
{ {
int dstIdx = y * dst_step + x + dst_offset; x = x << 2;
int map1Idx = y * (map1_step>>2) + x + (map1_offset>>2) - (map1_offset & 1); int gx = x - (dst_offset&3);
short2 map1_data = *(map1 + map1Idx); int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);
int srcIdx = map1_data.y*src_step+map1_data.x + src_offset;
uchar src_data = *(src +srcIdx); uchar4 nval =convert_uchar4(nVal);
uchar dst_data = src_data; uchar val = nval.s0;
*(dst +dstIdx)=(map1_data.x >= map1_cols || map1_data.y >= map1_rows) ? val : dst_data;
} int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3);
*/
int map1Start = y * map1_step + (x << 2) + map1_offset - ((dst_offset & 3) << 2);
short8 map1_data;
map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even) + src_offset;
int gx = (x << 2) - (dst_offset&3); uchar4 src_data;
int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);
uchar4 nval =convert_uchar4(nVal); src_data.s0 = *(src + srcIdx.s0);
char val = nval.s0; src_data.s1 = *(src + srcIdx.s1);
src_data.s2 = *(src + srcIdx.s2);
src_data.s3 = *(src + srcIdx.s3);
uchar4 dst_data;
dst_data = convert_uchar4((convert_int4(map1_data.even) >= (int4)(src_cols) || convert_int4(map1_data.odd) >= (int4)(src_rows)))? (uchar4)(val) : src_data;
__global uchar4* d = (__global uchar4 *)(dst + dstStart);
x = x << 2; uchar4 dVal = *d;
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3); int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != (uchar4)0) ? dst_data : dVal;
int map1Start = y * map1_step + (x << 2) + map1_offset - ((dst_offset & 3) << 2); *d = dst_data;
short8 map1_data;
map1_data.s01 = *((__global short2 *)((__global char*)map1 + map1Start)); }
map1_data.s23 = *((__global short2 *)((__global char*)map1 + map1Start + 4));
map1_data.s45 = *((__global short2 *)((__global char*)map1 + map1Start + 8)); }
map1_data.s67 = *((__global short2 *)((__global char*)map1 + map1Start + 12)); __kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsigned char const * restrict src,
__global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int4 srcIdx ; int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows, int threadCols, F4 nVal)
srcIdx.s0 = map1_data.s1 * src_step + map1_data.s0 + src_offset; {
srcIdx.s1 = map1_data.s3 * src_step + map1_data.s2 + src_offset; int x = get_global_id(0);
srcIdx.s2 = map1_data.s5 * src_step + map1_data.s4 + src_offset; int y = get_global_id(1);
srcIdx.s3 = map1_data.s7 * src_step + map1_data.s6 + src_offset;
if(x < threadCols && y < dst_rows)
//uchar4 src_data = *(src + srcIdx); {
uchar4 src_data; x = x << 2;
src_data.s0 = *(src + srcIdx.s0); int gx = x - (dst_offset&3);
src_data.s1 = *(src + srcIdx.s1); int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);
src_data.s2 = *(src + srcIdx.s2);
src_data.s3 = *(src + srcIdx.s3); uchar4 nval =convert_uchar4(nVal);
uchar val = nval.s0;
uchar4 dst_data;
dst_data.s0 = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows)? val : src_data.s0; int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3);
dst_data.s1 = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows)? val : src_data.s1;
dst_data.s2 = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows)? val : src_data.s2; int map1Start = y * map1_step + (x << 3) + map1_offset - ((dst_offset & 3) << 3);
dst_data.s3 = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows)? val : src_data.s3; float8 map1_data;
map1_data = *((__global float8 *)((__global char*)map1 + map1Start));
int8 map1_dataZ = convert_int8_sat_rte(map1_data);
int4 srcIdx = map1_dataZ.odd * src_step + map1_dataZ.even + src_offset;
__global uchar4* d = (__global uchar4 *)(dst + dstStart); uchar4 src_data;
src_data.s0 = *(src + srcIdx.s0);
src_data.s1 = *(src + srcIdx.s1);
src_data.s2 = *(src + srcIdx.s2);
src_data.s3 = *(src + srcIdx.s3);
uchar4 dst_data;
dst_data = convert_uchar4(map1_dataZ.even >= (int4)(src_cols) || map1_dataZ.odd >= (int4)(src_rows)) ? (uchar4)(val) : src_data;
__global uchar4* d = (__global uchar4 *)(dst + dstStart);
uchar4 dVal = *d;
uchar4 dVal = *d; int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); dst_data = (convert_uchar4(con) != (uchar4)0) ? dst_data : dVal;
dst_data = (convert_uchar4(con) != 0) ? dst_data : dVal;
*d = dst_data;
}
*d = dst_data;
} }
__kernel void remapNNSConstant_C2_D0(__global unsigned char* dst, __global unsigned char const * restrict src,
__kernel void remapNNSConstant_C4_D0(__global unsigned char* dst, __global unsigned char const * restrict src,
__global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, __global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , double4 nVal) int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows, int threadCols, F4 nVal)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
int gx = (x << 3) - (dst_offset&7);
int8 Gx = (int8)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7);
uchar4 nval =convert_uchar4(nVal); if(x < threadCols && y < dst_rows)
uchar2 val = nval.s01;//testing {
x = x << 4;
int gx = x - (dst_offset&15);
int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15);
uchar4 nval =convert_uchar4_sat_rte(nVal);
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15);
int map1Start = y * map1_step + x + map1_offset - (dst_offset&15 );
short8 map1_data;
map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even <<(short4)2) + src_offset;
uchar4 src_a, src_b, src_c, src_d;
src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0));
src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1));
src_c = *((__global uchar4 *)((__global char*)src + srcIdx.s2));
src_d = *((__global uchar4 *)((__global char*)src + srcIdx.s3));
uchar16 dst_data;
uchar4 dst_a, dst_b, dst_c, dst_d;
dst_a = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows)? nval : src_a;
dst_b = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows)? nval : src_b;
dst_c = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows)? nval : src_c;
dst_d = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows)? nval : src_d;
dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d);
__global uchar16* d = (__global uchar16 *)(dst + dstStart);
uchar16 dVal = *d;
int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_uchar16(con) != (uchar16)0) ? dst_data : dVal;
*d = dst_data;
}
x = x << 3; }
__kernel void remapNNFConstant_C4_D0(__global unsigned char* dst, __global unsigned char const * restrict src,
__global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows, int threadCols, F4 nVal)
{
int x = get_global_id(0);
int y = get_global_id(1);
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&7); if(x < threadCols && y < dst_rows)
{
x = x << 4;
int gx = x - (dst_offset&15);
int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15);
int map1Start = y * map1_step + (x << 1) + map1_offset - (((dst_offset>>1) & 3) << 2); uchar4 nval =convert_uchar4(nVal);
short8 map1_data;
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15);
int map1Start = y * map1_step +(x << 1) + map1_offset - ((dst_offset&15) << 1);
float8 map1_data;
map1_data = *((__global float8 *)((__global char*)map1 + map1Start));
int8 map1_dataZ = convert_int8_sat_rte(map1_data);
int4 srcIdx = map1_dataZ.odd * src_step + (map1_dataZ.even <<2) + src_offset;
uchar4 src_a, src_b, src_c, src_d;
src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0));
src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1));
src_c = *((__global uchar4 *)((__global char*)src + srcIdx.s2));
src_d = *((__global uchar4 *)((__global char*)src + srcIdx.s3));
uchar16 dst_data;
uchar4 dst_a, dst_b, dst_c, dst_d;
dst_a = (map1_dataZ.s0 >= src_cols || map1_dataZ.s1 >= src_rows)? nval : src_a;
dst_b = (map1_dataZ.s2 >= src_cols || map1_dataZ.s3 >= src_rows)? nval : src_b;
dst_c = (map1_dataZ.s4 >= src_cols || map1_dataZ.s5 >= src_rows)? nval : src_c;
dst_d = (map1_dataZ.s6 >= src_cols || map1_dataZ.s7 >= src_rows)? nval : src_d;
dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d);
__global uchar16* d = (__global uchar16 *)(dst + dstStart);
uchar16 dVal = *d;
int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_uchar16(con) != (uchar16)0) ? dst_data : dVal;
*d = dst_data;
}
map1_data.s01 = *((__global short2 *)((__global char*)map1 + map1Start));
map1_data.s23 = *((__global short2 *)((__global char*)map1 + map1Start + 4));
map1_data.s45 = *((__global short2 *)((__global char*)map1 + map1Start + 8));
map1_data.s67 = *((__global short2 *)((__global char*)map1 + map1Start + 12));
int4 srcIdx ;
srcIdx.s0 = map1_data.s1 * src_step + (map1_data.s0 << 1) + src_offset;
srcIdx.s1 = map1_data.s3 * src_step + (map1_data.s2 << 1) + src_offset;
srcIdx.s2 = map1_data.s5 * src_step + (map1_data.s4 << 1) + src_offset;
srcIdx.s3 = map1_data.s7 * src_step + (map1_data.s6 << 1) + src_offset;
//uchar4 src_data = *(src + srcIdx);
uchar8 src_data;
src_data.s01 = *((__global uchar2 *)((__global char*)src + srcIdx.s0));
src_data.s23 = *((__global uchar2 *)((__global char*)src + srcIdx.s1));
src_data.s45 = *((__global uchar2 *)((__global char*)src + srcIdx.s2));
src_data.s67 = *((__global uchar2 *)((__global char*)src + srcIdx.s3));
uchar8 dst_data;
dst_data.s01 = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows) ? val : (convert_uchar2(src_data.s01));
dst_data.s23 = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows) ? val : (convert_uchar2(src_data.s23));
dst_data.s45 = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows) ? val : (convert_uchar2(src_data.s45));
dst_data.s67 = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows) ? val : (convert_uchar2(src_data.s67));
__global uchar8* d = (__global uchar8 *)(dst + dstStart);
uchar8 dVal = *d;
int8 con = (Gx >= 0 && Gx < (dst_cols << 1) && y >= 0 && y < dst_rows);
dst_data = (convert_uchar8(con) != 0) ? dst_data : dVal;
*d = dst_data;
} }
__kernel void remapNNSConstant_C4_D0(__global unsigned char* dst, __global unsigned char const * restrict src,
__kernel void remapNNSConstant_C1_D5(__global float* dst, __global float const * restrict src,
__global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, __global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , double4 nVal) int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows ,int threadCols, F4 nVal)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
int gx = (x << 4) - (dst_offset&15);
int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15); if(x < threadCols && y < dst_rows)
{
x = x << 4;
uchar4 nval =convert_uchar4(nVal); int gx = x - (dst_offset&15);
int4 Gx = (int4)(gx, gx+4, gx+8, gx+12);
x = x << 4; float4 nval =convert_float4(nVal);
float val = nval.s0;
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15); int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15);
int map1Start = y * map1_step + x + map1_offset - (((dst_offset>>2) & 3) << 2); int map1Start = y * map1_step + x + map1_offset - (dst_offset&15);
short8 map1_data; short8 map1_data;
map1_data.s01 = *((__global short2 *)((__global char*)map1 + map1Start)); map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
map1_data.s23 = *((__global short2 *)((__global char*)map1 + map1Start + 4));
map1_data.s45 = *((__global short2 *)((__global char*)map1 + map1Start + 8));
map1_data.s67 = *((__global short2 *)((__global char*)map1 + map1Start + 12));
int4 srcIdx ; int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even <<(short4)2) + src_offset;
srcIdx.s0 = map1_data.s1 * src_step + (map1_data.s0 << 2) + src_offset;
srcIdx.s1 = map1_data.s3 * src_step + (map1_data.s2 << 2) + src_offset;
srcIdx.s2 = map1_data.s5 * src_step + (map1_data.s4 << 2) + src_offset;
srcIdx.s3 = map1_data.s7 * src_step + (map1_data.s6 << 2) + src_offset;
// uchar16 src_data; float4 src_data;
uchar4 src_a, src_b, src_c, src_d; src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0));
src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0)); src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1));
src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1)); src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2));
src_c = *((__global uchar4 *)((__global char*)src + srcIdx.s2)); src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3));
src_d = *((__global uchar4 *)((__global char*)src + srcIdx.s3)); float4 dst_data;
// src_data = (uchar16)(src_a, src_b, src_c, src_d);
uchar16 dst_data; dst_data.s0 = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows)? val : src_data.s0;
uchar4 dst_a, dst_b, dst_c, dst_d; dst_data.s1 = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows)? val : src_data.s1;
dst_a = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows) ? nval : src_a; dst_data.s2 = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows)? val : src_data.s2;
dst_b = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows) ? nval : src_b; dst_data.s3 = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows)? val : src_data.s3;
dst_c = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows) ? nval : src_c;
dst_d = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows) ? nval : src_d;
dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d); __global float4* d = (__global float4 *)((__global uchar*)dst + dstStart);
__global uchar16* d = (__global uchar16 *)(dst + dstStart);
uchar16 dVal = *d;
int16 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows);
dst_data = (convert_uchar16(con) != 0) ? dst_data : dVal;
*d = dst_data;
}
__kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsigned char const * restrict src, float4 dVal = *d;
int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_float4(con) != 0) ? dst_data : dVal;
*d = dst_data;
}
}
__kernel void remapNNFConstant_C1_D5(__global float* dst, __global float const * restrict src,
__global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, __global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , double4 nVal) int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows ,int threadCols, F4 nVal)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
int gx = (x << 2) - (dst_offset&3); if(x < threadCols && y < dst_rows)
int4 Gx = (int4)(gx, gx+1, gx+2, gx+3); {
x = x << 4;
uchar4 nval =convert_uchar4_sat_rte(nVal); int gx = x - (dst_offset&15);
char val = nval.s0; int4 Gx = (int4)(gx, gx+4, gx+8, gx+12);
x = x << 2; float4 nval =convert_float4(nVal);
float val = nval.s0;
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3); int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15);
int map1Start = y * map1_step + (x << 3) + map1_offset - ((dst_offset & 3) << 3); int map1Start = y * map1_step + (x << 1) + map1_offset - ((dst_offset&15) << 1);
float8 map1_data; float8 map1_data;
map1_data = *((__global float8 *)((__global char*)map1 + map1Start)); map1_data = *((__global float8 *)((__global char*)map1 + map1Start));
/* map1_data.s01 = *((__global float2 *)((__global char*)map1 + map1Start)); int8 map1_dataZ = convert_int8_sat_rte(map1_data);
map1_data.s23 = *((__global float2 *)((__global char*)map1 + map1Start + 8));
map1_data.s45 = *((__global float2 *)((__global char*)map1 + map1Start + 16));
map1_data.s67 = *((__global float2 *)((__global char*)map1 + map1Start + 24));
*/
int8 map1_dataZ;
map1_dataZ = convert_int8_sat_rte(map1_data); int4 srcIdx = convert_int4(map1_dataZ.odd) * src_step + convert_int4(map1_dataZ.even <<2) + src_offset;
int4 srcIdx ;
srcIdx.s0 = map1_dataZ.s1 * src_step + map1_dataZ.s0 + src_offset;
srcIdx.s1 = map1_dataZ.s3 * src_step + map1_dataZ.s2 + src_offset;
srcIdx.s2 = map1_dataZ.s5 * src_step + map1_dataZ.s4 + src_offset;
srcIdx.s3 = map1_dataZ.s7 * src_step + map1_dataZ.s6 + src_offset;
//uchar4 src_data = *(src + srcIdx);
uchar4 src_data;
src_data.s0 = *(src + srcIdx.s0);
src_data.s1 = *(src + srcIdx.s1);
src_data.s2 = *(src + srcIdx.s2);
src_data.s3 = *(src + srcIdx.s3);
uchar4 dst_data;
dst_data.s0 = (map1_dataZ.s0 >= src_cols || map1_dataZ.s1 >= src_rows)? val : src_data.s0;
dst_data.s1 = (map1_dataZ.s2 >= src_cols || map1_dataZ.s3 >= src_rows)? val : src_data.s1;
dst_data.s2 = (map1_dataZ.s4 >= src_cols || map1_dataZ.s5 >= src_rows)? val : src_data.s2;
dst_data.s3 = (map1_dataZ.s6 >= src_cols || map1_dataZ.s7 >= src_rows)? val : src_data.s3;
__global uchar4* d = (__global uchar4 *)(dst + dstStart); float4 src_data;
src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0));
src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1));
src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2));
src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3));
float4 dst_data;
dst_data.s0 = (map1_dataZ.s0 >= src_cols || map1_dataZ.s1 >= src_rows)? val : src_data.s0;
dst_data.s1 = (map1_dataZ.s2 >= src_cols || map1_dataZ.s3 >= src_rows)? val : src_data.s1;
dst_data.s2 = (map1_dataZ.s4 >= src_cols || map1_dataZ.s5 >= src_rows)? val : src_data.s2;
dst_data.s3 = (map1_dataZ.s6 >= src_cols || map1_dataZ.s7 >= src_rows)? val : src_data.s3;
__global float4* d = (__global float4 *)((__global uchar*)dst + dstStart);
float4 dVal = *d;
int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_float4(con) != 0) ? dst_data : dVal;
*d = dst_data;
uchar4 dVal = *d; }
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != 0) ? dst_data : dVal;
*d = dst_data; }
__kernel void remapNNSConstant_C4_D5(__global float * dst, __global float const * restrict src,
__global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , int threadCols, F4 nVal)
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
int dstIdx = y * dst_step + (x << 4) + dst_offset ;
int mapIdx = y * map1_step + (x << 2) + map1_offset ;
short2 map1_data = *((__global short2 *)((__global char*)map1 + mapIdx));
int srcIdx = map1_data.y * src_step + (map1_data.x << 4) + src_offset;
float4 nval = convert_float4(nVal);
float4 src_data = *((__global float4 *)((__global uchar *)src + srcIdx));
*((__global float4 *)((__global uchar*)dst + dstIdx)) = (map1_data.x >= src_cols || map1_data.y >= src_rows) ? nval : src_data;
}
}
__kernel void remapNNFConstant_C4_D5(__global float * dst, __global float const * restrict src,
__global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , int threadCols, F4 nVal)
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
int dstIdx = y * dst_step + (x << 4) + dst_offset ;
int mapIdx = y * map1_step + (x << 3) + map1_offset ;
float2 map1_data = *((__global float2 *)((__global char*)map1 + mapIdx));
int2 map1_dataZ = convert_int2_sat_rte(map1_data);
int srcIdx = map1_dataZ.y * src_step + (map1_dataZ.x << 4) + src_offset;
float4 nval = convert_float4(nVal);
float4 src_data = *((__global float4 *)((__global uchar *)src + srcIdx));
*((__global float4 *)((__global uchar*)dst + dstIdx)) = (map1_dataZ.x >= src_cols || map1_dataZ.y >= src_rows) ? nval : src_data;
}
} }
__kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsigned char const * restrict src, __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsigned char const * restrict src,
__global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, __global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , double4 nVal) int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , int threadCols, F4 nVal)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
int gx = (x << 2) - (dst_offset&3); {
int4 Gx = (int4)(gx, gx+1, gx+2, gx+3); x = x << 2;
int gx = x - (dst_offset&3);
int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);
uchar4 nval =convert_uchar4(nVal); uchar4 nval =convert_uchar4(nVal);
uchar val = nval.s0; uchar val = nval.s0;
x = x << 2;
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3); int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3);
int map1Start = y * map1_step + (x << 3) + map1_offset - ((dst_offset & 3) << 3);
float8 map1_data;
map1_data = *((__global float8 *)((__global char*)map1 + map1Start));
int8 map1_dataD = convert_int8(map1_data);
float8 temp = map1_data - convert_float8(map1_dataD);
float4 u = temp.even;
float4 v = temp.odd;
float4 ud = 1.f - u;
float4 vd = 1.f - v;
//float8 map1_dataU = map1_dataD + 1;
int4 map1_dataDx = map1_dataD.even;
int4 map1_dataDy = map1_dataD.odd;
int4 map1_dataDx1 = map1_dataDx + 1;
int4 map1_dataDy1 = map1_dataDy + 1;
int4 src_StartU = map1_dataDy * src_step + map1_dataDx + src_offset;
int4 src_StartD = src_StartU + src_step;
/*
//not using the vload
int4 src_StartU1 = src_StartU + 1;
int4 src_StartD1 = src_StartD + 1;
uchar4 a, b, c, d;
a.x = *(src_StartU.x + src);
a.y = *(src_StartU.y + src);
a.z = *(src_StartU.z + src);
a.w = *(src_StartU.w + src);
b.x = *(src_StartU1.x + src);
b.y = *(src_StartU1.y + src);
b.z = *(src_StartU1.z + src);
b.w = *(src_StartU1.w + src);
c.x = *(src_StartD.x + src);
c.y = *(src_StartD.y + src);
c.z = *(src_StartD.z + src);
c.w = *(src_StartD.w + src);
d.x = *(src_StartD1.x + src);
d.y = *(src_StartD1.y + src);
d.z = *(src_StartD1.z + src);
d.w = *(src_StartD1.w + src);
*/
uchar2 aU, aD, bU, bD, cU, cD, dU, dD;
aU = vload2(0, src + src_StartU.s0);
bU = vload2(0, src + src_StartU.s1);
cU = vload2(0, src + src_StartU.s2);
dU = vload2(0, src + src_StartU.s3);
aD = vload2(0, src + src_StartD.s0);
bD = vload2(0, src + src_StartD.s1);
cD = vload2(0, src + src_StartD.s2);
dD = vload2(0, src + src_StartD.s3);
uchar4 a, b, c, d;
a = (uchar4)(aU.x, bU.x, cU.x, dU.x);
b = (uchar4)(aU.y, bU.y, cU.y, dU.y);
c = (uchar4)(aD.x, bD.x, cD.x, dD.x);
d = (uchar4)(aD.y, bD.y, cD.y, dD.y);
int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0);
int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0);
int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0);
int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0);
a = (convert_uchar4(ac) == (uchar4)0)? a : val;
b = (convert_uchar4(bc) == (uchar4)0)? b : val;
c = (convert_uchar4(cc) == (uchar4)0)? c : val;
d = (convert_uchar4(dc) == (uchar4)0)? d : val;
uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v );
__global uchar4* D = (__global uchar4 *)(dst + dstStart);
uchar4 dVal = *D;
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != (uchar4)0) ? dst_data : dVal;
int map1Start = y * map1_step + (x << 3) + map1_offset - ((dst_offset & 3) << 3); *D = dst_data;
float8 map1_data; }
}
map1_data = *((__global float8 *)((__global char*)map1 + map1Start)); __kernel void remapLNSConstant_C1_D0(__global unsigned char* dst, __global unsigned char const * restrict src,
int8 map1_dataD = convert_int8(map1_data); __global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
float8 temp = map1_data - convert_float8(map1_dataD); int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , int threadCols, F4 nVal)
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 2;
int gx = x - (dst_offset&3);
int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);
float4 u = temp.even; uchar4 nval =convert_uchar4(nVal);
float4 v = temp.odd; uchar val = nval.s0;
float4 ud = 1.0 - u;
float4 vd = 1.0 - v;
//float8 map1_dataU = map1_dataD + 1;
int4 map1_dataDx = map1_dataD.even; int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3);
int4 map1_dataDy = map1_dataD.odd;
int4 map1_dataDx1 = map1_dataDx + 1;
int4 map1_dataDy1 = map1_dataDy + 1;
int4 src_StartU = map1_dataDy * src_step + map1_dataDx + src_offset; int map1Start = y * map1_step + (x << 2) + map1_offset - ((dst_offset & 3) << 2);
int4 src_StartD = src_StartU + src_step; short8 map1_data;
int4 src_StartU1 = src_StartU + 1;
int4 src_StartD1 = src_StartD + 1;
uchar4 a, b, c, d; map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
a.x = *(src_StartU.x + src); int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even) + src_offset;
a.y = *(src_StartU.y + src);
a.z = *(src_StartU.z + src);
a.w = *(src_StartU.w + src);
b.x = *(src_StartU1.x + src); uchar4 src_data;
b.y = *(src_StartU1.y + src);
b.z = *(src_StartU1.z + src); src_data.s0 = *(src + srcIdx.s0);
b.w = *(src_StartU1.w + src); src_data.s1 = *(src + srcIdx.s1);
src_data.s2 = *(src + srcIdx.s2);
c.x = *(src_StartD.x + src); src_data.s3 = *(src + srcIdx.s3);
c.y = *(src_StartD.y + src); uchar4 dst_data;
c.z = *(src_StartD.z + src); dst_data = convert_uchar4((convert_int4(map1_data.even) >= (int4)(src_cols) || convert_int4(map1_data.odd) >= (int4)(src_rows)))? (uchar4)(val) : src_data;
c.w = *(src_StartD.w + src);
__global uchar4* d = (__global uchar4 *)(dst + dstStart);
d.x = *(src_StartD1.x + src);
d.y = *(src_StartD1.y + src);
d.z = *(src_StartD1.z + src);
d.w = *(src_StartD1.w + src);
int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0);
int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0);
int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0);
int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0);
a = (convert_uchar4(ac) == 0)? a : val;
b = (convert_uchar4(bc) == 0)? b : val;
c = (convert_uchar4(cc) == 0)? c : val;
d = (convert_uchar4(dc) == 0)? d : val;
uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v );
__global uchar4* D = (__global uchar4 *)(dst + dstStart);
uchar4 dVal = *D; uchar4 dVal = *d;
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != 0) ? dst_data : dVal; int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != (uchar4)0) ? dst_data : dVal;
*d = dst_data;
}
*D = dst_data;
} }
__kernel void remapLNFConstant_C2_D0(__global unsigned char* dst, __global unsigned char const * restrict src,
__kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsigned char const * restrict src,
__global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, __global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , double4 nVal) int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , int threadCols, F4 nVal)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
int gx = (x << 3) - (dst_offset&7); {
int8 Gx = (int8)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7); x = x << 4;
int gx = x - (dst_offset&15);
uchar4 nval =convert_uchar4(nVal); int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15);
uchar8 val = (uchar8)(nval.s01, nval.s01, nval.s01, nval.s01);
uchar4 nval =convert_uchar4(nVal);
x = x << 3;
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15);
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&7);
int map1Start = y * map1_step + (x << 1) + map1_offset - ((dst_offset & 15) << 1);
int map1Start = y * map1_step + (x << 2) + map1_offset - (((dst_offset>>1) & 3) << 3); float8 map1_data;
float8 map1_data;
map1_data = *((__global float8 *)((__global char*)map1 + map1Start));
map1_data = *((__global float8 *)((__global char*)map1 + map1Start)); int8 map1_dataD = convert_int8(map1_data);
int8 map1_dataD = convert_int8(map1_data); float8 temp = map1_data - convert_float8(map1_dataD);
float8 temp = map1_data - convert_float8(map1_dataD);
float4 u = temp.even;
float4 U = temp.even; float4 v = temp.odd;
float4 V = temp.odd; float4 ud = 1.f - u;
float4 UD = 1.0 - U; float4 vd = 1.f - v;
float4 VD = 1.0 - V;
//float8 map1_dataU = map1_dataD + 1;
float8 u, v, ud, vd;
u = (float8)(U.x, U.x, U.y, U.y, U.z, U.z, U.w, U.w); int4 map1_dataDx = map1_dataD.even;
v = (float8)(V.x, V.x, V.y, V.y, V.z, V.z, V.w, V.w); int4 map1_dataDy = map1_dataD.odd;
ud = (float8)(UD.x, UD.x, UD.y, UD.y, UD.z, UD.z, UD.w, UD.w); int4 map1_dataDx1 = map1_dataDx + 1;
vd = (float8)(VD.x, VD.x, VD.y, VD.y, VD.z, VD.z, VD.w, VD.w); int4 map1_dataDy1 = map1_dataDy + 1;
//float8 map1_dataU = map1_dataD + 1; int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << 2) + src_offset;
int4 src_StartD = src_StartU + src_step;
int4 map1_dataDx = map1_dataD.even;
int4 map1_dataDy = map1_dataD.odd; uchar8 aU, bU, cU, dU, aD, bD, cD, dD;
int4 map1_dataDx1 = map1_dataDx + 1; aU = vload8(0, src + src_StartU.s0);
int4 map1_dataDy1 = map1_dataDy + 1; bU = vload8(0, src + src_StartU.s1);
cU = vload8(0, src + src_StartU.s2);
int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << 1) + src_offset; dU = vload8(0, src + src_StartU.s3);
int4 src_StartD = src_StartU + src_step; aD = vload8(0, src + src_StartD.s0);
int4 src_StartU1 = src_StartU + 2; bD = vload8(0, src + src_StartD.s1);
int4 src_StartD1 = src_StartD + 2; cD = vload8(0, src + src_StartD.s2);
dD = vload8(0, src + src_StartD.s3);
uchar8 a, b, c, d; uchar16 a, b, c, d;
a.s01 = *((__global uchar2 *)((__global char*)src + src_StartU.x)); a = (uchar16)(aU.s0123, bU.s0123, cU.s0123, dU.s0123);
a.s23 = *((__global uchar2 *)((__global char*)src + src_StartU.y)); b = (uchar16)(aU.s4567, bU.s4567, cU.s4567, dU.s4567);
a.s45 = *((__global uchar2 *)((__global char*)src + src_StartU.z)); c = (uchar16)(aD.s0123, bD.s0123, cD.s0123, dD.s0123);
a.s67 = *((__global uchar2 *)((__global char*)src + src_StartU.w)); d = (uchar16)(aD.s4567, bD.s4567, cD.s4567, dD.s4567);
int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0);
b.s01 = *((__global uchar2 *)((__global char*)src + src_StartU1.x)); int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0);
b.s23 = *((__global uchar2 *)((__global char*)src + src_StartU1.y)); int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0);
b.s45 = *((__global uchar2 *)((__global char*)src + src_StartU1.z)); int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0);
b.s67 = *((__global uchar2 *)((__global char*)src + src_StartU1.w));
int16 acc = (int16)((int4)(ac.x), (int4)(ac.y), (int4)(ac.z), (int4)(ac.w));
c.s01 = *((__global uchar2 *)((__global char*)src + src_StartD.x)); int16 bcc = (int16)((int4)(bc.x), (int4)(bc.y), (int4)(bc.z), (int4)(bc.w));
c.s23 = *((__global uchar2 *)((__global char*)src + src_StartD.y)); int16 ccc = (int16)((int4)(cc.x), (int4)(cc.y), (int4)(cc.z), (int4)(cc.w));
c.s45 = *((__global uchar2 *)((__global char*)src + src_StartD.z)); int16 dcc = (int16)((int4)(dc.x), (int4)(dc.y), (int4)(dc.z), (int4)(dc.w));
c.s67 = *((__global uchar2 *)((__global char*)src + src_StartD.w));
uchar16 val = (uchar16)(nval, nval, nval, nval);
d.s01 = *((__global uchar2 *)((__global char*)src + src_StartD1.x)); a = (convert_uchar16(acc) == (uchar16)0)? a : val;
d.s23 = *((__global uchar2 *)((__global char*)src + src_StartD1.y)); b = (convert_uchar16(bcc) == (uchar16)0)? b : val;
d.s45 = *((__global uchar2 *)((__global char*)src + src_StartD1.z)); c = (convert_uchar16(ccc) == (uchar16)0)? c : val;
d.s67 = *((__global uchar2 *)((__global char*)src + src_StartD1.w)); d = (convert_uchar16(dcc) == (uchar16)0)? d : val;
int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0); float16 U = (float16)((float4)(u.x), (float4)(u.y), (float4)(u.z), (float4)(u.w));
int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0); float16 V = (float16)((float4)(v.x), (float4)(v.y), (float4)(v.z), (float4)(v.w));
int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0); float16 Ud = (float16)((float4)(ud.x), (float4)(ud.y), (float4)(ud.z), (float4)(ud.w));
int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0); float16 Vd = (float16)((float4)(vd.x), (float4)(vd.y), (float4)(vd.z), (float4)(vd.w));
/* a.even = (convert_uchar4(ac) == 0)? a.even : val.even; uchar16 dst_data = convert_uchar16_sat_rte((convert_float16(a))* Ud * Vd +(convert_float16(b))* U * Vd + (convert_float16(c))* Ud * V + (convert_float16(d)) * U * V );
a.odd = (convert_uchar4(ac) == 0)? a.odd : val.odd;
b.even = (convert_uchar4(bc) == 0)? b.even : val.even;
b.odd = (convert_uchar4(bc) == 0)? b.odd : val.odd;
c.even = (convert_uchar4(cc) == 0)? c.even : val.even;
c.odd = (convert_uchar4(cc) == 0)? c.odd : val.odd;
d.even = (convert_uchar4(dc) == 0)? d.even : val.even;
d.odd = (convert_uchar4(dc) == 0)? d.odd : val.odd;
*/
int8 aC = (int8)(ac.x, ac.x, ac.y, ac.y, ac.z, ac.z, ac.w, ac.w);
int8 bC = (int8)(bc.x, bc.x, bc.y, bc.y, bc.z, bc.z, bc.w, bc.w);
int8 cC = (int8)(cc.x, cc.x, cc.y, cc.y, cc.z, cc.z, cc.w, cc.w);
int8 dC = (int8)(dc.x, dc.x, dc.y, dc.y, dc.z, dc.z, dc.w, dc.w);
a = (convert_uchar8(aC) == 0)? a : val;
b = (convert_uchar8(bC) == 0)? b : val;
c = (convert_uchar8(cC) == 0)? c : val;
d = (convert_uchar8(dC) == 0)? d : val;
uchar8 dst_data = convert_uchar8_sat_rte((convert_float8(a))* ud * vd +(convert_float8(b))* u * vd + (convert_float8(c))* ud * v + (convert_float8(d)) * u * v );
__global uchar8* D = (__global uchar8 *)(dst + dstStart); __global uchar16* D = (__global uchar16 *)(dst + dstStart);
uchar8 dVal = *D; uchar16 dVal = *D;
int8 con = (Gx >= 0 && Gx < (dst_cols << 1) && y >= 0 && y < dst_rows); int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_uchar8(con) != 0) ? dst_data : dVal; dst_data = (convert_uchar16(con) != (uchar16)0) ? dst_data : dVal;
*D = dst_data; *D = dst_data;
}
} }
__kernel void remapLNSConstant_C4_D0(__global unsigned char* dst, __global unsigned char const * restrict src,
__global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows, int threadCols, F4 nVal)
{
int x = get_global_id(0);
int y = get_global_id(1);
/* if(x < threadCols && y < dst_rows)
__kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsigned char const * restrict src, {
x = x << 4;
int gx = x - (dst_offset&15);
int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15);
uchar4 nval =convert_uchar4_sat_rte(nVal);
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15);
int map1Start = y * map1_step + x + map1_offset - (dst_offset&15 );
short8 map1_data;
map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even <<(short4)2) + src_offset;
uchar4 src_a, src_b, src_c, src_d;
src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0));
src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1));
src_c = *((__global uchar4 *)((__global char*)src + srcIdx.s2));
src_d = *((__global uchar4 *)((__global char*)src + srcIdx.s3));
uchar16 dst_data;
uchar4 dst_a, dst_b, dst_c, dst_d;
dst_a = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows)? nval : src_a;
dst_b = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows)? nval : src_b;
dst_c = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows)? nval : src_c;
dst_d = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows)? nval : src_d;
dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d);
__global uchar16* d = (__global uchar16 *)(dst + dstStart);
uchar16 dVal = *d;
int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_uchar16(con) != (uchar16)0) ? dst_data : dVal;
*d = dst_data;
}
}
__kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const * restrict src,
__global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, __global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , double4 nVal) int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , int threadCols, F4 nVal)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
int gx = (x << 4) - (dst_offset&15); {
int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15); x = x << 4;
int gx = x - (dst_offset&15);
int4 Gx = (int4)(gx, gx+4, gx+8, gx+12);
uchar4 nval =convert_uchar4(nVal); float4 nval =convert_float4(nVal);
uchar16 val = (uchar16)(nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01); float4 val = (float4)(nval.s0);
x = x << 4; int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15);
int map1Start = y * map1_step + (x << 1) + map1_offset - ((dst_offset & 15) << 1);
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15); float8 map1_data;
int map1Start = y * map1_step + (x << 1) + map1_offset - (((dst_offset>>2) & 3) << 3); map1_data = *((__global float8 *)((__global char*)map1 + map1Start));
float8 map1_data; int8 map1_dataD = convert_int8(map1_data);
float8 temp = map1_data - convert_float8(map1_dataD);
map1_data = *((__global float8 *)((__global char*)map1 + map1Start));
int8 map1_dataD = convert_int8(map1_data); float4 u = temp.even;
float8 temp = map1_data - convert_float8(map1_dataD); float4 v = temp.odd;
float4 ud = 1.f - u;
float4 U = temp.even; float4 vd = 1.f - v;
float4 V = temp.odd; //float8 map1_dataU = map1_dataD + 1;
float4 UD = 1.0 - U;
float4 VD = 1.0 - V; int4 map1_dataDx = map1_dataD.even;
int4 map1_dataDy = map1_dataD.odd;
float16 u, v, ud, vd; int4 map1_dataDx1 = map1_dataDx + 1;
u = (float16)(U.x, U.x, U.x, U.x, U.y, U.y, U.y, U.y, U.z, U.z, U.z, U.z, U.w, U.w, U.w, U.w); int4 map1_dataDy1 = map1_dataDy + 1;
v = (float16)(V.x, V.x, V.x, V.x, V.y, V.y, V.y, V.y, V.z, V.z, V.z, V.z, V.w, V.w, V.w, V.w);
ud = (float16)(UD.x, UD.x, UD.x, UD.x, UD.y, UD.y, UD.y, UD.y, UD.z, UD.z, UD.z, UD.z, UD.w, UD.w, UD.w, UD.w); int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << 2) + src_offset;
vd = (float16)(VD.x, VD.x, VD.y, VD.y, VD.z, VD.z, VD.w, VD.w); int4 src_StartD = src_StartU + src_step;
/*
//float8 map1_dataU = map1_dataD + 1; //not using the vload
int4 src_StartU1 = src_StartU + 1;
int4 map1_dataDx = map1_dataD.even; int4 src_StartD1 = src_StartD + 1;
int4 map1_dataDy = map1_dataD.odd;
int4 map1_dataDx1 = map1_dataDx + 1; float4 a, b, c, d;
int4 map1_dataDy1 = map1_dataDy + 1; a.x = *(src_StartU.x + src);
a.y = *(src_StartU.y + src);
int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << 1) + src_offset; a.z = *(src_StartU.z + src);
int4 src_StartD = src_StartU + src_step; a.w = *(src_StartU.w + src);
int4 src_StartU1 = src_StartU + 2;
int4 src_StartD1 = src_StartD + 2; b.x = *(src_StartU1.x + src);
b.y = *(src_StartU1.y + src);
uchar8 a, b, c, d; b.z = *(src_StartU1.z + src);
a.s01 = *((__global uchar2 *)((__global char*)src + src_StartU.x)); b.w = *(src_StartU1.w + src);
a.s23 = *((__global uchar2 *)((__global char*)src + src_StartU.y));
a.s45 = *((__global uchar2 *)((__global char*)src + src_StartU.z)); c.x = *(src_StartD.x + src);
a.s67 = *((__global uchar2 *)((__global char*)src + src_StartU.w)); c.y = *(src_StartD.y + src);
c.z = *(src_StartD.z + src);
b.s01 = *((__global uchar2 *)((__global char*)src + src_StartU1.x)); c.w = *(src_StartD.w + src);
b.s23 = *((__global uchar2 *)((__global char*)src + src_StartU1.y));
b.s45 = *((__global uchar2 *)((__global char*)src + src_StartU1.z)); d.x = *(src_StartD1.x + src);
b.s67 = *((__global uchar2 *)((__global char*)src + src_StartU1.w)); d.y = *(src_StartD1.y + src);
d.z = *(src_StartD1.z + src);
c.s01 = *((__global uchar2 *)((__global char*)src + src_StartD.x)); d.w = *(src_StartD1.w + src);
c.s23 = *((__global uchar2 *)((__global char*)src + src_StartD.y)); */
c.s45 = *((__global uchar2 *)((__global char*)src + src_StartD.z)); float2 aU, aD, bU, bD, cU, cD, dU, dD;
c.s67 = *((__global uchar2 *)((__global char*)src + src_StartD.w));
aU = vload2(0, (__global float *)((__global char*)src + src_StartU.s0));
d.s01 = *((__global uchar2 *)((__global char*)src + src_StartD1.x)); bU = vload2(0, (__global float *)((__global char*)src + src_StartU.s1));
d.s23 = *((__global uchar2 *)((__global char*)src + src_StartD1.y)); cU = vload2(0, (__global float *)((__global char*)src + src_StartU.s2));
d.s45 = *((__global uchar2 *)((__global char*)src + src_StartD1.z)); dU = vload2(0, (__global float *)((__global char*)src + src_StartU.s3));
d.s67 = *((__global uchar2 *)((__global char*)src + src_StartD1.w)); aD = vload2(0, (__global float *)((__global char*)src + src_StartD.s0));
bD = vload2(0, (__global float *)((__global char*)src + src_StartD.s1));
int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0); cD = vload2(0, (__global float *)((__global char*)src + src_StartD.s2));
int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0); dD = vload2(0, (__global float *)((__global char*)src + src_StartD.s3));
int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0);
int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0); float4 a, b, c, d;
a = (float4)(aU.x, bU.x, cU.x, dU.x);
int8 aC = (int8)(ac.x, ac.x, ac.y, ac.y, ac.z, ac.z, ac.w, ac.w); b = (float4)(aU.y, bU.y, cU.y, dU.y);
int8 bC = (int8)(bc.x, bc.x, bc.y, bc.y, bc.z, bc.z, bc.w, bc.w); c = (float4)(aD.x, bD.x, cD.x, dD.x);
int8 cC = (int8)(cc.x, cc.x, cc.y, cc.y, cc.z, cc.z, cc.w, cc.w); d = (float4)(aD.y, bD.y, cD.y, dD.y);
int8 dC = (int8)(dc.x, dc.x, dc.y, dc.y, dc.z, dc.z, dc.w, dc.w);
int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0);
a = (convert_uchar8(aC) == 0)? a : val; int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0);
b = (convert_uchar8(bC) == 0)? b : val; int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0);
c = (convert_uchar8(cC) == 0)? c : val; int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0);
d = (convert_uchar8(dC) == 0)? d : val; a = (convert_float4(ac) == 0)? a : val;
uchar8 dst_data = convert_uchar8_sat_rte((convert_float8(a))* ud * vd +(convert_float8(b))* u * vd + (convert_float8(c))* ud * v + (convert_float8(d)) * u * v ); b = (convert_float4(bc) == 0)? b : val;
c = (convert_float4(cc) == 0)? c : val;
d = (convert_float4(dc) == 0)? d : val;
float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v ;
__global uchar8* D = (__global uchar8 *)(dst + dstStart); __global float4* D = (__global float4 *)((__global char*)dst + dstStart);
float4 dVal = *D;
int4 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows);
dst_data = (convert_float4(con) != 0) ? dst_data : dVal;
*D = dst_data;
}
}
__kernel void remapLNSConstant_C1_D5(__global float* dst, __global float const * restrict src,
__global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows ,int threadCols, F4 nVal)
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
x = x << 4;
int gx = x - (dst_offset&15);
int4 Gx = (int4)(gx, gx+4, gx+8, gx+12);
uchar8 dVal = *D; float4 nval =convert_float4(nVal);
int8 con = (Gx >= 0 && Gx < (dst_cols << 1) && y >= 0 && y < dst_rows); float val = nval.s0;
dst_data = (convert_uchar8(con) != 0) ? dst_data : dVal;
*D = dst_data; int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15);
int map1Start = y * map1_step + x + map1_offset - (dst_offset&15);
short8 map1_data;
map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even <<(short4)2) + src_offset;
float4 src_data;
src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0));
src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1));
src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2));
src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3));
float4 dst_data;
dst_data.s0 = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows)? val : src_data.s0;
dst_data.s1 = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows)? val : src_data.s1;
dst_data.s2 = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows)? val : src_data.s2;
dst_data.s3 = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows)? val : src_data.s3;
__global float4* d = (__global float4 *)((__global uchar*)dst + dstStart);
float4 dVal = *d;
int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
dst_data = (convert_float4(con) != 0) ? dst_data : dVal;
*d = dst_data;
}
} }
*/
__kernel void remapLNFConstant_C4_D5(__global float * dst, __global float const * restrict src,
__global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , int threadCols, F4 nVal)
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
int dstIdx = y * dst_step + (x << 4) + dst_offset ;
int mapIdx = y * map1_step + (x << 3) + map1_offset ;
float2 map1_data = *((__global float2 *)((__global char*)map1 + mapIdx));
int2 map1_dataZ = convert_int2(map1_data);
int mX = map1_dataZ.x;
int mY = map1_dataZ.y;
int mX1 = map1_dataZ.x + 1;
int mY1 = map1_dataZ.y + 1;
float u = map1_data.x - convert_float(map1_dataZ.x);
float v = map1_data.y - convert_float(map1_dataZ.y);
float ud = 1.0 - u;
float vd = 1.0 - v;
int srcIdx = map1_dataZ.y * src_step + (map1_dataZ.x << 4) + src_offset;
float8 src_dataU = vload8(0,(__global float *)((__global char*)src + srcIdx));
float8 src_dataD = vload8(0,(__global float *)((__global char*)src + srcIdx + src_step));
float4 a = src_dataU.lo;
float4 b = src_dataU.hi;
float4 c = src_dataD.lo;
float4 d = src_dataD.hi;
float4 nval = convert_float4(nVal);
a = (mX >= src_cols || mY >= src_rows ) ? nval : a;
b = (mX1 >= src_cols || mY >= src_rows ) ? nval : b;
c = (mX >= src_cols || mY1 >= src_rows ) ? nval : c;
d = (mX1 >= src_cols || mY1 >= src_rows ) ? nval : d;
float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v;
*((__global float4 *)((__global uchar*)dst + dstIdx)) = a * ud * vd + b * u * vd + c * ud * v + d * u * v ;
}
}
/*
////////////////////////////////////////////////////////////////////////
///////////////////using image buffer///////////////////////////////////
////////////////////////////////////////////////////////////////////////
__kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __read_only image2d_t src,
__global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , int threadCols, F4 nVal)
{
int x = get_global_id(0);
int y = get_global_id(1);
x = x << 2;
if(x < threadCols && y < dst_rows)
{
int gx = x - (dst_offset&3);
int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);
uchar4 nval =convert_uchar4(nVal);
char val = nval.s0;
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3);
int map1Start = y * map1_step + (x << 2) + map1_offset - ((dst_offset & 3) << 2);
short8 map1_data;
map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
int4 src_data;
src_data.x = read_imageui(src, sampler, (int2)((int)map1_data.s0, (int)map1_data.s1)).x;
src_data.y = read_imageui(src, sampler, (int2)((int)map1_data.s2, (int)map1_data.s3)).x;
src_data.z = read_imageui(src, sampler, (int2)((int)map1_data.s4, (int)map1_data.s5)).x;
src_data.w = read_imageui(src, sampler, (int2)((int)map1_data.s6, (int)map1_data.s7)).x;
int4 bcon = (convert_int4(map1_data.even) >= (int4)(src_cols) || convert_int4(map1_data.odd) >= (int4)(src_rows));
uchar4 dst_data = (convert_uchar4(bcon != 0)) ? (uchar4)(val) : convert_uchar4(src_data);
__global uchar4* d = (__global uchar4 *)(dst + dstStart);
uchar4 dVal = *d;
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != (uchar4)0) ? dst_data : dVal;
*d = dst_data;
}
}
*/
...@@ -67,7 +67,7 @@ ...@@ -67,7 +67,7 @@
#include "interpolation.hpp" #include "interpolation.hpp"
//#include "add_test_info.h" //#include "add_test_info.h"
#define OPENCV_DEFAULT_OPENCL_DEVICE CVCL_DEVICE_TYPE_ALL #define OPENCV_DEFAULT_OPENCL_DEVICE CVCL_DEVICE_TYPE_GPU
#endif #endif
...@@ -858,10 +858,10 @@ PARAM_TEST_CASE(Remap, MatType, MatType, MatType, int, int) ...@@ -858,10 +858,10 @@ PARAM_TEST_CASE(Remap, MatType, MatType, MatType, int, int)
cv::RNG& rng = TS::ptr()->get_rng(); cv::RNG& rng = TS::ptr()->get_rng();
//cv::Size size = cv::Size(20, 20); //cv::Size size = cv::Size(20, 20);
cv::Size srcSize = cv::Size(15, 20); cv::Size srcSize = cv::Size(100, 100);
cv::Size dstSize = cv::Size(20, 20); cv::Size dstSize = cv::Size(100, 100);
cv::Size map1Size = cv::Size(20, 20); cv::Size map1Size = cv::Size(100, 100);
double min = 1, max = 20; double min = 5, max = 16;
if(srcType != nulltype) if(srcType != nulltype)
{ {
...@@ -898,14 +898,11 @@ PARAM_TEST_CASE(Remap, MatType, MatType, MatType, int, int) ...@@ -898,14 +898,11 @@ PARAM_TEST_CASE(Remap, MatType, MatType, MatType, int, int)
src_roicols = rng.uniform(1, src.cols); src_roicols = rng.uniform(1, src.cols);
src_roirows = rng.uniform(1, src.rows); src_roirows = rng.uniform(1, src.rows);
cout << "dst_roicols: " << dst_roicols << "dst_roirows: "<< dst_roirows << endl;
cout << "src_roicols: " << src_roicols << "dst_roirows: "<< src_roirows << endl;
srcx = rng.uniform(0, src.cols - src_roicols); srcx = rng.uniform(0, src.cols - src_roicols);
srcy = rng.uniform(0, src.rows - src_roirows); srcy = rng.uniform(0, src.rows - src_roirows);
dstx = rng.uniform(0, dst.cols - dst_roicols); dstx = rng.uniform(0, dst.cols - dst_roicols);
dsty = rng.uniform(0, dst.rows - dst_roirows); dsty = rng.uniform(0, dst.rows - dst_roirows);
cout << "srcx: " << srcx << "srcy: " << srcy << "dstx: " << dstx << "dsty: " << dsty << endl;
map1_roicols = dst_roicols; map1_roicols = dst_roicols;
map1_roirows = dst_roirows; map1_roirows = dst_roirows;
map2_roicols = dst_roicols; map2_roicols = dst_roicols;
...@@ -940,10 +937,6 @@ PARAM_TEST_CASE(Remap, MatType, MatType, MatType, int, int) ...@@ -940,10 +937,6 @@ PARAM_TEST_CASE(Remap, MatType, MatType, MatType, int, int)
{ {
map1_roi = map1(Rect(map1x,map1y,map1_roicols,map1_roirows)); map1_roi = map1(Rect(map1x,map1y,map1_roicols,map1_roirows));
gmap1_roi = map1_roi; gmap1_roi = map1_roi;
// cv::Mat maptest(gmap1_roi);
// cout << "maptest " << endl;
//cout << maptest << endl;
//gmap1_roi = gmap1(Rect(map1x,map1y,map1_roicols,map1_roirows));
} }
else if (map1Type == CV_32FC1 && map2Type == CV_32FC1) else if (map1Type == CV_32FC1 && map2Type == CV_32FC1)
...@@ -962,6 +955,11 @@ PARAM_TEST_CASE(Remap, MatType, MatType, MatType, int, int) ...@@ -962,6 +955,11 @@ PARAM_TEST_CASE(Remap, MatType, MatType, MatType, int, int)
TEST_P(Remap, Mat) TEST_P(Remap, Mat)
{ {
if((interpolation == 1 && map1Type == CV_16SC2) ||(interpolation == 1 && map1Type == CV_16SC1 && map2Type == CV_16SC1))
{
cout << "LINEAR don't support the map1Type and map2Type" << endl;
return;
}
int bordertype[] = {cv::BORDER_CONSTANT,cv::BORDER_REPLICATE/*,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101*/}; int bordertype[] = {cv::BORDER_CONSTANT,cv::BORDER_REPLICATE/*,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101*/};
const char* borderstr[]={"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/}; const char* borderstr[]={"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/};
// for(int i = 0; i < sizeof(bordertype)/sizeof(int); i++) // for(int i = 0; i < sizeof(bordertype)/sizeof(int); i++)
...@@ -1448,7 +1446,7 @@ INSTANTIATE_TEST_CASE_P(Imgproc, meanShiftProc, Combine( ...@@ -1448,7 +1446,7 @@ INSTANTIATE_TEST_CASE_P(Imgproc, meanShiftProc, Combine(
)); ));
INSTANTIATE_TEST_CASE_P(Imgproc, Remap, Combine( INSTANTIATE_TEST_CASE_P(Imgproc, Remap, Combine(
Values(CV_8UC1, CV_8UC2, CV_8UC4), Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
Values(CV_16SC2, CV_32FC2), NULL_TYPE, Values(CV_16SC2, CV_32FC2), NULL_TYPE,
Values((int)cv::INTER_NEAREST, (int)cv::INTER_LINEAR), Values((int)cv::INTER_NEAREST, (int)cv::INTER_LINEAR),
Values((int)cv::BORDER_CONSTANT))); Values((int)cv::BORDER_CONSTANT)));
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment