Commit 8eeacc8c authored by niko's avatar niko

performance and bug fix for addWeighted cartToPolar div exp log resize setTo

add channel 3 support
add fast way Between CPU and GPU for the data which is aligned
parent cf04fed3
......@@ -49,7 +49,7 @@ namespace cv
namespace ocl
{
////////////////////////////////////OpenCL kernel strings//////////////////////////
extern const char *convertC3C4;
//extern const char *convertC3C4;
////////////////////////////////////////////////////////////////////////
//////////////////////////////// oclMat ////////////////////////////////
......
......@@ -49,6 +49,7 @@
#include "opencv2/core/core.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/objdetect/objdetect.hpp"
#include "opencv2/features2d/features2d.hpp"
namespace cv
{
......
......@@ -455,13 +455,12 @@ void cv::ocl::multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, doub
}
void cv::ocl::divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar)
{
if(src1.clCxt -> impl -> double_support ==0)
{
CV_Error(-217,"Selected device don't support double\r\n");
return;
}
arithmetic_run<double>(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar));
if(src1.clCxt -> impl -> double_support !=0)
arithmetic_run<double>(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar));
else
arithmetic_run<float>(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar));
}
template <typename WT ,typename CL_WT>
void arithmetic_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, string kernelName, const char **kernelString, int isMatSubScalar)
......@@ -579,7 +578,14 @@ void arithmetic_scalar_run(const oclMat &src, oclMat &dst, string kernelName, co
args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows ));
args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
args.push_back( make_pair( sizeof(cl_double), (void *)&scalar ));
if(src.clCxt -> impl -> double_support !=0)
args.push_back( make_pair( sizeof(cl_double), (void *)&scalar ));
else
{
float f_scalar = (float)scalar;
args.push_back( make_pair( sizeof(cl_float), (void *)&f_scalar));
}
openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
}
......@@ -670,9 +676,9 @@ void compare_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string ker
int cols = divUp(dst.cols + offset_cols, vector_length);
size_t localThreads[3] = { 64, 4, 1 };
size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
divUp(dst.rows, localThreads[1]) * localThreads[1],
1
};
divUp(dst.rows, localThreads[1]) * localThreads[1],
1
};
int dst_step1 = dst.cols * dst.elemSize();
vector<pair<size_t , const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
......@@ -1253,7 +1259,11 @@ void arithmetic_exp_log_run(const oclMat &src, oclMat &dst, string kernelName, c
CV_Assert( src.type() == CV_32F || src.type() == CV_64F);
Context *clCxt = src.clCxt;
if(clCxt -> impl -> double_support ==0 && src.type() == CV_64F)
{
CV_Error(-217,"Selected device don't support double\r\n");
return;
}
//int channels = dst.channels();
int depth = dst.depth();
......@@ -2193,56 +2203,46 @@ void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2,
size_t localThreads[3] = { 256, 1, 1 };
size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
divUp(dst.rows, localThreads[1]) * localThreads[1],
1
};
divUp(dst.rows, localThreads[1]) * localThreads[1],
1
};
int dst_step1 = dst.cols * dst.elemSize();
vector<pair<size_t , const void *> > args;
if(sizeof(double) == 8)
args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset));
args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset));
if(src1.clCxt -> impl -> double_support != 0)
{
args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
args.push_back( make_pair( sizeof(cl_double), (void *)&alpha ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset));
args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data ));
args.push_back( make_pair( sizeof(cl_double), (void *)&beta ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset));
args.push_back( make_pair( sizeof(cl_double), (void *)&gama ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset));
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows ));
args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
}
else
{
args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
args.push_back( make_pair( sizeof(cl_float), (void *)&alpha ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset));
args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data ));
args.push_back( make_pair( sizeof(cl_float), (void *)&beta ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset));
args.push_back( make_pair( sizeof(cl_float), (void *)&gama ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset));
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows ));
args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
}
}
args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset));
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows ));
args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
openCLExecuteKernel(clCxt, &arithm_addWeighted, "addWeighted", globalThreads, localThreads, args, -1, depth);
}
void cv::ocl::magnitudeSqr(const oclMat &src1, const oclMat &src2, oclMat &dst)
{
CV_Assert(src1.type() == src2.type() && src1.size() == src2.size() &&
(src1.depth() == CV_32F ));
(src1.depth() == CV_32F ));
dst.create(src1.size(), src1.type());
......@@ -2265,9 +2265,9 @@ void cv::ocl::magnitudeSqr(const oclMat &src1, const oclMat &src2, oclMat &dst)
size_t localThreads[3] = { 256, 1, 1 };
size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
divUp(dst.rows, localThreads[1]) * localThreads[1],
1
};
divUp(dst.rows, localThreads[1]) * localThreads[1],
1
};
int dst_step1 = dst.cols * dst.elemSize();
vector<pair<size_t , const void *> > args;
......@@ -2313,9 +2313,9 @@ void cv::ocl::magnitudeSqr(const oclMat &src1, oclMat &dst)
size_t localThreads[3] = { 256, 1, 1 };
size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
divUp(dst.rows, localThreads[1]) * localThreads[1],
1
};
divUp(dst.rows, localThreads[1]) * localThreads[1],
1
};
int dst_step1 = dst.cols * dst.elemSize();
vector<pair<size_t , const void *> > args;
......@@ -2348,9 +2348,9 @@ void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string kernel
size_t localThreads[3] = { 64, 4, 1 };
size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
divUp(rows, localThreads[1]) * localThreads[1],
1
};
divUp(rows, localThreads[1]) * localThreads[1],
1
};
int dst_step1 = dst.cols * dst.elemSize();
vector<pair<size_t , const void *> > args;
......
......@@ -410,7 +410,11 @@ namespace cv
float ify = 1. / fy;
double ifx_d = 1. / fx;
double ify_d = 1. / fy;
int srcStep_in_pixel = src.step1() / src.channels();
int srcoffset_in_pixel = src.offset / src.elemSize();
int dstStep_in_pixel = dst.step1() / dst.channels();
int dstoffset_in_pixel = dst.offset / dst.elemSize();
//printf("%d %d\n",src.step1() , dst.elemSize());
string kernelName;
if(interpolation == INTER_LINEAR)
kernelName = "resizeLN";
......@@ -438,25 +442,33 @@ namespace cv
{
args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step));
args.push_back( make_pair(sizeof(cl_int), (void *)&src.step));
args.push_back( make_pair(sizeof(cl_int), (void *)&dstoffset_in_pixel));
args.push_back( make_pair(sizeof(cl_int), (void *)&srcoffset_in_pixel));
args.push_back( make_pair(sizeof(cl_int), (void *)&dstStep_in_pixel));
args.push_back( make_pair(sizeof(cl_int), (void *)&srcStep_in_pixel));
args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
args.push_back( make_pair(sizeof(cl_double), (void *)&ifx_d));
args.push_back( make_pair(sizeof(cl_double), (void *)&ify_d));
if(src.clCxt -> impl -> double_support != 0)
{
args.push_back( make_pair(sizeof(cl_double), (void *)&ifx_d));
args.push_back( make_pair(sizeof(cl_double), (void *)&ify_d));
}
else
{
args.push_back( make_pair(sizeof(cl_float), (void *)&ifx));
args.push_back( make_pair(sizeof(cl_float), (void *)&ify));
}
}
else
{
args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step));
args.push_back( make_pair(sizeof(cl_int), (void *)&src.step));
args.push_back( make_pair(sizeof(cl_int), (void *)&dstoffset_in_pixel));
args.push_back( make_pair(sizeof(cl_int), (void *)&srcoffset_in_pixel));
args.push_back( make_pair(sizeof(cl_int), (void *)&dstStep_in_pixel));
args.push_back( make_pair(sizeof(cl_int), (void *)&srcStep_in_pixel));
args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
......
......@@ -378,20 +378,36 @@ namespace cv
void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
const void *src, size_t spitch,
size_t width, size_t height, enum openCLMemcpyKind kind)
size_t width, size_t height, enum openCLMemcpyKind kind, int channels)
{
size_t buffer_origin[3] = {0, 0, 0};
size_t host_origin[3] = {0, 0, 0};
size_t region[3] = {width, height, 1};
if(kind == clMemcpyHostToDevice)
{
openCLSafeCall(clEnqueueWriteBufferRect(clCxt->impl->clCmdQueue, (cl_mem)dst, CL_TRUE,
buffer_origin, host_origin, region, dpitch, 0, spitch, 0, src, 0, 0, 0));
if(dpitch == width || channels==3)
{
openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, (cl_mem)dst, CL_TRUE,
0, width*height, src, 0, NULL, NULL));
}
else
{
openCLSafeCall(clEnqueueWriteBufferRect(clCxt->impl->clCmdQueue, (cl_mem)dst, CL_TRUE,
buffer_origin, host_origin, region, dpitch, 0, spitch, 0, src, 0, 0, 0));
}
}
else if(kind == clMemcpyDeviceToHost)
{
openCLSafeCall(clEnqueueReadBufferRect(clCxt->impl->clCmdQueue, (cl_mem)src, CL_TRUE,
buffer_origin, host_origin, region, spitch, 0, dpitch, 0, dst, 0, 0, 0));
if(spitch == width || channels==3)
{
openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)src, CL_TRUE,
0, width*height, dst, 0, NULL, NULL));
}
else
{
openCLSafeCall(clEnqueueReadBufferRect(clCxt->impl->clCmdQueue, (cl_mem)src, CL_TRUE,
buffer_origin, host_origin, region, spitch, 0, dpitch, 0, dst, 0, 0, 0));
}
}
}
......
......@@ -51,9 +51,9 @@ typedef float F;
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////addWeighted//////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void addWeighted_D0 (__global uchar *src1, F alpha,int src1_step,int src1_offset,
__global uchar *src2, F beta, int src2_step,int src2_offset,
F gama,
__kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset,
__global uchar *src2, int src2_step,int src2_offset,
F alpha,F beta,F gama,
__global uchar *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
......@@ -99,9 +99,9 @@ __kernel void addWeighted_D0 (__global uchar *src1, F alpha,int src1_step,int sr
__kernel void addWeighted_D2 (__global ushort *src1, F alpha,int src1_step,int src1_offset,
__global ushort *src2, F beta, int src2_step,int src2_offset,
F gama,
__kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offset,
__global ushort *src2, int src2_step,int src2_offset,
F alpha,F beta,F gama,
__global ushort *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
......@@ -145,9 +145,9 @@ __kernel void addWeighted_D2 (__global ushort *src1, F alpha,int src1_step,int s
}
__kernel void addWeighted_D3 (__global short *src1, F alpha,int src1_step,int src1_offset,
__global short *src2, F beta, int src2_step,int src2_offset,
F gama,
__kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offset,
__global short *src2, int src2_step,int src2_offset,
F alpha,F beta,F gama,
__global short *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
......@@ -190,9 +190,9 @@ __kernel void addWeighted_D3 (__global short *src1, F alpha,int src1_step,int sr
}
__kernel void addWeighted_D4 (__global int *src1, F alpha,int src1_step,int src1_offset,
__global int *src2, F beta, int src2_step,int src2_offset,
F gama,
__kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
__global int *src2, int src2_step,int src2_offset,
F alpha,F beta, F gama,
__global int *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
......@@ -238,9 +238,9 @@ __kernel void addWeighted_D4 (__global int *src1, F alpha,int src1_step,int src1
}
__kernel void addWeighted_D5 (__global float *src1, F alpha,int src1_step,int src1_offset,
__global float *src2, F beta, int src2_step,int src2_offset,
F gama,
__kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset,
__global float *src2, int src2_step,int src2_offset,
F alpha,F beta, F gama,
__global float *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
......@@ -286,9 +286,9 @@ __kernel void addWeighted_D5 (__global float *src1, F alpha,int src1_step,int sr
}
#if defined (DOUBLE_SUPPORT)
__kernel void addWeighted_D6 (__global double *src1, F alpha,int src1_step,int src1_offset,
__global double *src2, F beta, int src2_step,int src2_offset,
F gama,
__kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offset,
__global double *src2, int src2_step,int src2_offset,
F alpha,F beta, F gama,
__global double *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
......
......@@ -49,6 +49,10 @@
#define CV_PI 3.1415926535897932384626433832795
#ifndef DBL_EPSILON
#define DBL_EPSILON 0x1.0p-52
#endif
__kernel void arithm_cartToPolar_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global float *dst1, int dst1_step, int dst1_offset, //magnitude
......
This diff is collapsed.
......@@ -70,6 +70,8 @@ __kernel void arithm_exp_D5(int rows, int cols, int srcStep, int dstStep, int sr
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_exp_D6(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global double *src, __global double *dst)
{
int x = get_global_id(0);
......@@ -87,3 +89,5 @@ __kernel void arithm_exp_D6(int rows, int cols, int srcStep, int dstStep, int sr
// dst[dstIdx] = exp(src[srcIdx]);
}
}
#endif
......@@ -73,7 +73,7 @@ __kernel void arithm_log_D5(int rows, int cols, int srcStep, int dstStep, int sr
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_log_D6(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global double *src, __global double *dst)
{
int x = get_global_id(0);
......@@ -91,4 +91,4 @@ __kernel void arithm_log_D6(int rows, int cols, int srcStep, int dstStep, int sr
}
}
#endif
......@@ -6,7 +6,7 @@
// Third party copyrights are property of their respective owners.
//
// @Authors
// Zero Lin, zero.lin@amd.com
// Niko Li, newlife20080214@gmail.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
......@@ -32,106 +32,107 @@
// the use of this software, even if advised of the possibility of such damage.
//
//
__kernel void convertC3C4_D0(__global const char4 * restrict src, __global char4 *dst, int cols, int rows,
int srcStep, int dstStep)
//#pragma OPENCL EXTENSION cl_amd_printf : enable
__kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows,
int dstStep_in_piexl,int pixel_end)
{
int id = get_global_id(0);
int y = id / cols;
int x = id % cols;
//int pixel_end = mul24(cols -1 , rows -1);
int3 pixelid = (int3)(mul24(id,3),mad24(id,3,1),mad24(id,3,2));
pixelid = clamp(pixelid,0,pixel_end);
GENTYPE4 pixel0, pixel1, pixel2, outpix0,outpix1,outpix2,outpix3;
pixel0 = src[pixelid.x];
pixel1 = src[pixelid.y];
pixel2 = src[pixelid.z];
int d = y * srcStep + x * 3;
char8 data = (char8)(src[d>>2], src[(d>>2) + 1]);
char temp[8] = {data.s0, data.s1, data.s2, data.s3, data.s4, data.s5, data.s6, data.s7};
int start = d & 3;
char4 ndata = (char4)(temp[start], temp[start + 1], temp[start + 2], 0);
if(y < rows)
dst[y * dstStep + x] = ndata;
}
__kernel void convertC3C4_D1(__global const short* restrict src, __global short4 *dst, int cols, int rows,
int srcStep, int dstStep)
{
int id = get_global_id(0);
int y = id / cols;
int x = id % cols;
outpix0 = (GENTYPE4)(pixel0.x,pixel0.y,pixel0.z,0);
outpix1 = (GENTYPE4)(pixel0.w,pixel1.x,pixel1.y,0);
outpix2 = (GENTYPE4)(pixel1.z,pixel1.w,pixel2.x,0);
outpix3 = (GENTYPE4)(pixel2.y,pixel2.z,pixel2.w,0);
int d = (y * srcStep + x * 6)>>1;
short4 data = *(__global short4 *)(src + ((d>>1)<<1));
short temp[4] = {data.s0, data.s1, data.s2, data.s3};
int start = d & 1;
short4 ndata = (short4)(temp[start], temp[start + 1], temp[start + 2], 0);
if(y < rows)
dst[y * dstStep + x] = ndata;
int4 outy = (id<<2)/cols;
int4 outx = (id<<2)%cols;
outx.y++;
outx.z+=2;
outx.w+=3;
outy = select(outy,outy+1,outx>=cols);
outx = select(outx,outx-cols,outx>=cols);
//outpix3 = select(outpix3, outpix0, (uchar4)(outy.w>=rows));
//outpix2 = select(outpix2, outpix0, (uchar4)(outy.z>=rows));
//outpix1 = select(outpix1, outpix0, (uchar4)(outy.y>=rows));
//outx = select(outx,(int4)outx.x,outy>=rows);
//outy = select(outy,(int4)outy.x,outy>=rows);
int4 addr = mad24(outy,dstStep_in_piexl,outx);
if(outx.w<cols && outy.w<rows)
{
dst[addr.x] = outpix0;
dst[addr.y] = outpix1;
dst[addr.z] = outpix2;
dst[addr.w] = outpix3;
}
else if(outx.z<cols && outy.z<rows)
{
dst[addr.x] = outpix0;
dst[addr.y] = outpix1;
dst[addr.z] = outpix2;
}
else if(outx.y<cols && outy.y<rows)
{
dst[addr.x] = outpix0;
dst[addr.y] = outpix1;
}
else if(outx.x<cols && outy.x<rows)
{
dst[addr.x] = outpix0;
}
}
__kernel void convertC3C4_D2(__global const int * restrict src, __global int4 *dst, int cols, int rows,
int srcStep, int dstStep)
{
int id = get_global_id(0);
int y = id / cols;
int x = id % cols;
int d = (y * srcStep + x * 12)>>2;
int4 data = *(__global int4 *)(src + d);
data.z = 0;
if(y < rows)
dst[y * dstStep + x] = data;
}
__kernel void convertC4C3_D2(__global const int4 * restrict src, __global int *dst, int cols, int rows,
int srcStep, int dstStep)
{
int id = get_global_id(0);
int y = id / cols;
int x = id % cols;
int4 data = src[y * srcStep + x];
if(y < rows)
{
int d = y * dstStep + x * 3;
dst[d] = data.x;
dst[d + 1] = data.y;
dst[d + 2] = data.z;
}
}
__kernel void convertC4C3_D1(__global const short4 * restrict src, __global short *dst, int cols, int rows,
int srcStep, int dstStep)
__kernel void convertC4C3(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows,
int srcStep_in_pixel,int pixel_end)
{
int id = get_global_id(0);
int id = get_global_id(0)<<2;
int y = id / cols;
int x = id % cols;
int4 x4 = (int4)(x,x+1,x+2,x+3);
int4 y4 = select((int4)y,(int4)(y+1),x4>=(int4)cols);
x4 = select(x4,x4-(int4)cols,x4>=(int4)cols);
int4 addr = mad24(y4,(int4)srcStep_in_pixel,x4);
GENTYPE4 pixel0,pixel1,pixel2,pixel3, outpixel1, outpixel2;
pixel0 = src[addr.x];
pixel1 = src[addr.y];
pixel2 = src[addr.z];
pixel3 = src[addr.w];
short4 data = src[y * srcStep + x];
if(y < rows)
pixel0.w = pixel1.x;
outpixel1.x = pixel1.y;
outpixel1.y = pixel1.z;
outpixel1.z = pixel2.x;
outpixel1.w = pixel2.y;
outpixel2.x = pixel2.z;
outpixel2.y = pixel3.x;
outpixel2.z = pixel3.y;
outpixel2.w = pixel3.z;
int4 outaddr = mul24(id>>2 , 3);
outaddr.y++;
outaddr.z+=2;
//printf("%d ",outaddr.z);
if(outaddr.z <= pixel_end)
{
int d = y * dstStep + x * 3;
dst[d] = data.x;
dst[d + 1] = data.y;
dst[d + 2] = data.z;
dst[outaddr.x] = pixel0;
dst[outaddr.y] = outpixel1;
dst[outaddr.z] = outpixel2;
}
}
__kernel void convertC4C3_D0(__global const char4 * restrict src, __global char *dst, int cols, int rows,
int srcStep, int dstStep)
{
int id = get_global_id(0);
int y = id / cols;
int x = id % cols;
char4 data = src[y * srcStep + x];
if(y < rows)
else if(outaddr.y <= pixel_end)
{
int d = y * dstStep + x * 3;
dst[d] = data.x;
dst[d + 1] = data.y;
dst[d + 2] = data.z;
dst[outaddr.x] = pixel0;
dst[outaddr.y] = outpixel1;
}
else if(outaddr.x <= pixel_end)
{
dst[outaddr.x] = pixel0;
}
}
This diff is collapsed.
......@@ -34,13 +34,8 @@
//
//
/*
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
*/
__kernel void set_to_without_mask_C1_D0(float4 scalar,__global uchar * dstMat,
__kernel void set_to_without_mask_C1_D0(uchar scalar,__global uchar * dstMat,
int cols,int rows,int dstStep_in_pixel,int offset_in_pixel)
{
int x=get_global_id(0)<<2;
......@@ -49,7 +44,8 @@ __kernel void set_to_without_mask_C1_D0(float4 scalar,__global uchar * dstMat,
int addr_end = mad24(y,dstStep_in_pixel,cols+offset_in_pixel);
int idx = mad24(y,dstStep_in_pixel,(int)(x+ offset_in_pixel & (int)0xfffffffc));
uchar4 out;
out.x = out.y = out.z = out.w = convert_uchar_sat(scalar.x);
out.x = out.y = out.z = out.w = scalar;
if ( (idx>=addr_start)&(idx+3 < addr_end) & (y < rows))
{
*(__global uchar4*)(dstMat+idx) = out;
......@@ -65,7 +61,7 @@ __kernel void set_to_without_mask_C1_D0(float4 scalar,__global uchar * dstMat,
}
}
__kernel void set_to_without_mask_C4_D0(float4 scalar,__global uchar4 * dstMat,
__kernel void set_to_without_mask(GENTYPE scalar,__global GENTYPE * dstMat,
int cols,int rows,int dstStep_in_pixel,int offset_in_pixel)
{
int x=get_global_id(0);
......@@ -73,52 +69,6 @@ __kernel void set_to_without_mask_C4_D0(float4 scalar,__global uchar4 * dstMat,
if ( (x < cols) & (y < rows))
{
int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel);
dstMat[idx] = convert_uchar4_sat(scalar);
dstMat[idx] = scalar;
}
}
__kernel void set_to_without_mask_C1_D4(float4 scalar,__global int * dstMat,
int cols,int rows,int dstStep_in_pixel,int offset_in_pixel)
{
int x=get_global_id(0);
int y=get_global_id(1);
if ( (x < cols) & (y < rows))
{
int idx = mad24(y, dstStep_in_pixel, x+offset_in_pixel);
dstMat[idx] = convert_int_sat(scalar.x);
}
}
__kernel void set_to_without_mask_C4_D4(float4 scalar,__global int4 * dstMat,
int cols,int rows,int dstStep_in_pixel,int offset_in_pixel)
{
int x=get_global_id(0);
int y=get_global_id(1);
if ( (x < cols) & (y < rows))
{
int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel);
dstMat[idx] = convert_int4_sat(scalar);
}
}
__kernel void set_to_without_mask_C1_D5(float4 scalar,__global float * dstMat,
int cols,int rows,int dstStep_in_pixel,int offset_in_pixel)
{
int x=get_global_id(0);
int y=get_global_id(1);
if ( (x < cols) & (y < rows))
{
int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel);
dstMat[idx] = scalar.x;
}
}
__kernel void set_to_without_mask_C4_D5(float4 scalar,__global float4 * dstMat,
int cols,int rows,int dstStep_in_pixel,int offset_in_pixel)
{
int x=get_global_id(0);
int y=get_global_id(1);
if ( (x < cols) & (y < rows))
{
int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel);
dstMat[idx] = scalar;
}
}
......@@ -35,12 +35,6 @@
//
/*#if defined (__ATI__)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (__NVIDIA__)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
*/
/*
__kernel void set_to_with_mask_C1_D0(
float4 scalar,
......@@ -67,7 +61,7 @@ __kernel void set_to_with_mask_C1_D0(
*/
//#pragma OPENCL EXTENSION cl_amd_printf : enable
__kernel void set_to_with_mask_C1_D0(
float4 scalar,
uchar scalar,
__global uchar* dstMat,
int cols,
int rows,
......@@ -85,7 +79,7 @@ __kernel void set_to_with_mask_C1_D0(
int mask_addr_start = mad24(y,maskStep,maskoffset);
int mask_addr_end = mad24(y,maskStep,cols+maskoffset);
int maskidx = mad24(y,maskStep,x+ maskoffset & (int)0xfffffffc);
uchar out = convert_uchar_sat(scalar.x);
int off_mask = (maskoffset & 3) - (dstoffset_in_pixel & 3) +3;
if ( (x < cols) & (y < rows) )
......@@ -107,104 +101,16 @@ __kernel void set_to_with_mask_C1_D0(
temp_mask2.z = (maskidx+6 >=mask_addr_start)&(maskidx+6 < mask_addr_end) ? temp_mask2.z : 0;
temp_mask2.w = (maskidx+7 >=mask_addr_start)&(maskidx+7 < mask_addr_end) ? temp_mask2.w : 0;
uchar trans_mask[10] = {temp_mask1.y,temp_mask1.z,temp_mask1.w,temp_mask.x,temp_mask.y,temp_mask.z,temp_mask.w,temp_mask2.x,temp_mask2.y,temp_mask2.z};
temp_dst.x = (dstidx>=dst_addr_start)&(dstidx<dst_addr_end)& trans_mask[off_mask] ? out : temp_dst.x;
temp_dst.y = (dstidx+1>=dst_addr_start)&(dstidx+1<dst_addr_end)& trans_mask[off_mask+1] ? out : temp_dst.y;
temp_dst.z = (dstidx+2>=dst_addr_start)&(dstidx+2<dst_addr_end)& trans_mask[off_mask+2] ? out : temp_dst.z;
temp_dst.w = (dstidx+3>=dst_addr_start)&(dstidx+3<dst_addr_end)& trans_mask[off_mask+3] ? out : temp_dst.w;
temp_dst.x = (dstidx>=dst_addr_start)&(dstidx<dst_addr_end)& trans_mask[off_mask] ? scalar : temp_dst.x;
temp_dst.y = (dstidx+1>=dst_addr_start)&(dstidx+1<dst_addr_end)& trans_mask[off_mask+1] ? scalar : temp_dst.y;
temp_dst.z = (dstidx+2>=dst_addr_start)&(dstidx+2<dst_addr_end)& trans_mask[off_mask+2] ? scalar : temp_dst.z;
temp_dst.w = (dstidx+3>=dst_addr_start)&(dstidx+3<dst_addr_end)& trans_mask[off_mask+3] ? scalar : temp_dst.w;
*(__global uchar4*)(dstMat+dstidx) = temp_dst;
}
}
__kernel void set_to_with_mask_C4_D0(
float4 scalar,
__global uchar4 * dstMat,
int cols,
int rows,
int dstStep_in_pixel,
int dstoffset_in_pixel,
__global const uchar * restrict maskMat,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = convert_uchar4_sat(scalar);
}
}
__kernel void set_to_with_mask_C1_D4(
float4 scalar,
__global int * dstMat,
int cols,
int rows,
int dstStep_in_pixel,
int dstoffset_in_pixel,
__global const uchar * restrict maskMat,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = convert_int_sat(scalar.x);
}
}
__kernel void set_to_with_mask_C4_D4(
float4 scalar,
__global int4 * dstMat,
int cols,
int rows,
int dstStep_in_pixel,
int dstoffset_in_pixel,
__global const uchar * restrict maskMat,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = convert_int4_sat(scalar);
}
}
__kernel void set_to_with_mask_C1_D5(
float4 scalar,
__global float * dstMat,
int cols,
int rows,
int dstStep_in_pixel,
int dstoffset_in_pixel,
__global const uchar * restrict maskMat,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = scalar.x;
}
}
__kernel void set_to_with_mask_C4_D5(
float4 scalar,
__global float4 * dstMat,
__kernel void set_to_with_mask(
GENTYPE scalar,
__global GENTYPE * dstMat,
int cols,
int rows,
int dstStep_in_pixel,
......@@ -220,7 +126,7 @@ __kernel void set_to_with_mask_C4_D5(
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = scalar;
dstMat[dstidx] = scalar;
}
}
......
This diff is collapsed.
......@@ -97,7 +97,7 @@ namespace cv
size_t widthInBytes, size_t height);
void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
const void *src, size_t spitch,
size_t width, size_t height, enum openCLMemcpyKind kind);
size_t width, size_t height, enum openCLMemcpyKind kind, int channels=-1);
void openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset,
const void *src, size_t spitch,
size_t width, size_t height, int src_offset, enum openCLMemcpyKind kind);
......@@ -126,8 +126,8 @@ namespace cv
cl_mem openCLMalloc(cl_context clCxt, size_t size, cl_mem_flags flags, void *host_ptr);
void openCLMemcpy2DWithNoPadding(cl_command_queue command_queue, cl_mem buffer, size_t size, size_t offset, void *ptr,
enum openCLMemcpyKind kind, cl_bool blocking_write);
//void openCLMemcpy2DWithNoPadding(cl_command_queue command_queue, cl_mem buffer, size_t size, size_t offset, void *ptr,
// enum openCLMemcpyKind kind, cl_bool blocking_write);
int savetofile(const Context *clcxt, cl_program &program, const char *fileName);
struct Context::Impl
{
......
......@@ -958,7 +958,7 @@ TEST_P(Remap, Mat)
if((interpolation == 1 && map1Type == CV_16SC2) ||(interpolation == 1 && map1Type == CV_16SC1 && map2Type == CV_16SC1))
{
cout << "LINEAR don't support the map1Type and map2Type" << endl;
return;
return;
}
int bordertype[] = {cv::BORDER_CONSTANT,cv::BORDER_REPLICATE/*,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101*/};
const char* borderstr[]={"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/};
......
......@@ -396,6 +396,101 @@ TEST_P(SetTo, With_mask)
}
}
//convertC3C4
PARAM_TEST_CASE(convertC3C4, MatType, cv::Size)
{
int type;
cv::Size ksize;
//src mat
cv::Mat mat1;
cv::Mat dst;
// set up roi
int roicols;
int roirows;
int src1x;
int src1y;
int dstx;
int dsty;
//src mat with roi
cv::Mat mat1_roi;
cv::Mat dst_roi;
std::vector<cv::ocl::Info> oclinfo;
//ocl dst mat for testing
cv::ocl::oclMat gdst_whole;
//ocl mat with roi
cv::ocl::oclMat gmat1;
cv::ocl::oclMat gdst;
virtual void SetUp()
{
type = GET_PARAM(0);
ksize = GET_PARAM(1);
//dst = randomMat(rng, size, type, 5, 16, false);
int devnums = getDevice(oclinfo);
CV_Assert(devnums > 0);
//if you want to use undefault device, set it here
//setDevice(oclinfo[1]);
}
void random_roi()
{
#ifdef RANDOMROI
//randomize ROI
cv::RNG &rng = TS::ptr()->get_rng();
roicols = rng.uniform(2, mat1.cols);
roirows = rng.uniform(2, mat1.rows);
src1x = rng.uniform(0, mat1.cols - roicols);
src1y = rng.uniform(0, mat1.rows - roirows);
dstx = rng.uniform(0, dst.cols - roicols);
dsty = rng.uniform(0, dst.rows - roirows);
#else
roicols = mat1.cols;
roirows = mat1.rows;
src1x = 0;
src1y = 0;
dstx = 0;
dsty = 0;
#endif
mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
gdst_whole = dst;
gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
gmat1 = mat1_roi;
}
};
TEST_P(convertC3C4, Accuracy)
{
cv::RNG &rng = TS::ptr()->get_rng();
for(int j = 0; j < LOOP_TIMES; j++)
{
//random_roi();
int width = rng.uniform(2, MWIDTH);
int height = rng.uniform(2, MHEIGHT);
cv::Size size(width, height);
mat1 = randomMat(rng, size, type, 0, 40, false);
gmat1 = mat1;
cv::Mat cpu_dst;
gmat1.download(cpu_dst);
char sss[1024];
sprintf(sss, "cols=%d,rows=%d", mat1.cols, mat1.rows);
EXPECT_MAT_NEAR(mat1, cpu_dst, 0.0, sss);
}
}
INSTANTIATE_TEST_CASE_P(MatrixOperation, ConvertTo, Combine(
Values(CV_8UC1, CV_8UC4, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4),
......@@ -408,5 +503,8 @@ INSTANTIATE_TEST_CASE_P(MatrixOperation, CopyTo, Combine(
INSTANTIATE_TEST_CASE_P(MatrixOperation, SetTo, Combine(
Values(CV_8UC1, CV_8UC4, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4),
Values(false))); // Values(false) is the reserved parameter
INSTANTIATE_TEST_CASE_P(MatrixOperation, convertC3C4, Combine(
Values(CV_8UC3, CV_32SC3, CV_32FC3),
Values(cv::Size())));
#endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment