Commit 66ac4621 authored by Alexander Karsakov

Final refactoring, fixes

parent 1d2cf0e2
...@@ -292,7 +292,7 @@ OCL_PERF_TEST_P(MagnitudeFixture, Magnitude, ::testing::Combine( ...@@ -292,7 +292,7 @@ OCL_PERF_TEST_P(MagnitudeFixture, Magnitude, ::testing::Combine(
typedef Size_MatType TransposeFixture; typedef Size_MatType TransposeFixture;
OCL_PERF_TEST_P(TransposeFixture, Transpose, ::testing::Combine( OCL_PERF_TEST_P(TransposeFixture, Transpose, ::testing::Combine(
OCL_TEST_SIZES, Values(CV_8UC1, CV_32FC1, CV_8UC2, CV_32FC2, CV_8UC4, CV_32FC4))) OCL_TEST_SIZES, OCL_TEST_TYPES_134))
{ {
const Size_MatType_t params = GetParam(); const Size_MatType_t params = GetParam();
const Size srcSize = get<0>(params); const Size srcSize = get<0>(params);
......
...@@ -54,40 +54,21 @@ namespace ocl { ...@@ -54,40 +54,21 @@ namespace ocl {
///////////// dft //////////////////////// ///////////// dft ////////////////////////
enum OCL_FFT_TYPE typedef tuple<Size, int> DftParams;
{
R2R = 0, // real to real (CCS)
C2R = 1, // complex to real
R2C = 2, // real to complex
C2C = 3 // complex to complex
};
typedef tuple<OCL_FFT_TYPE, Size, int> DftParams;
typedef TestBaseWithParam<DftParams> DftFixture; typedef TestBaseWithParam<DftParams> DftFixture;
OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(C2C, R2R, C2R, R2C), OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3),
Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3, Size(1024, 1024), Size(512, 512), Size(2048, 2048)), Values((int)DFT_ROWS, (int)DFT_SCALE, (int)DFT_INVERSE,
Values((int) 0, (int)DFT_ROWS, (int)DFT_SCALE/*, (int)DFT_INVERSE, (int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE)))
(int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE*/)))
{ {
const DftParams params = GetParam(); const DftParams params = GetParam();
const int dft_type = get<0>(params); const Size srcSize = get<0>(params);
const Size srcSize = get<1>(params); const int flags = get<1>(params);
int flags = get<2>(params);
UMat src(srcSize, CV_32FC2), dst(srcSize, CV_32FC2);
int in_cn, out_cn;
switch (dft_type)
{
case R2R: flags |= cv::DFT_REAL_OUTPUT; in_cn = 1; out_cn = 1; break;
case C2R: flags |= cv::DFT_REAL_OUTPUT; in_cn = 2; out_cn = 2; break;
case R2C: flags |= cv::DFT_COMPLEX_OUTPUT; in_cn = 1; out_cn = 2; break;
case C2C: flags |= cv::DFT_COMPLEX_OUTPUT; in_cn = 2; out_cn = 2; break;
}
UMat src(srcSize, CV_MAKE_TYPE(CV_32F, in_cn)), dst(srcSize, CV_MAKE_TYPE(CV_32F, out_cn));
declare.in(src, WARMUP_RNG).out(dst); declare.in(src, WARMUP_RNG).out(dst);
OCL_TEST_CYCLE() cv::dft(src, dst, flags); OCL_TEST_CYCLE() cv::dft(src, dst, flags | DFT_COMPLEX_OUTPUT);
SANITY_CHECK(dst, 1e-3); SANITY_CHECK(dst, 1e-3);
} }
......
...@@ -1781,251 +1781,11 @@ static bool ippi_DFT_R_32F(const Mat& src, Mat& dst, bool inv, int norm_flag) ...@@ -1781,251 +1781,11 @@ static bool ippi_DFT_R_32F(const Mat& src, Mat& dst, bool inv, int norm_flag)
#endif #endif
} }
#ifdef HAVE_CLAMDFFT #ifdef HAVE_OPENCL
namespace cv {
// Evaluates a clAmdFft call and asserts, via CV_Assert, that it returned CLFFT_SUCCESS.
// The expansion is a braced block, which is why call sites may omit the trailing semicolon.
#define CLAMDDFT_Assert(func) \
{ \
clAmdFftStatus s = (func); \
CV_Assert(s == CLFFT_SUCCESS); \
}
// Cache of baked clAmdFft plans. A plan is identified by every parameter it was
// created with, so repeated transforms with the same configuration reuse the
// already-baked plan instead of building a new one.
class PlanCache
{
    // One clAmdFft plan together with the parameters it was built from
    // (those parameters double as the cache key).
    struct FftPlan
    {
        // Creates, configures and bakes a plan on the default OpenCL context and queue.
        // Any failing clAmdFft call trips CV_Assert through CLAMDDFT_Assert; an
        // unsupported fftType (R2R or unknown) raises Error::StsNotImplemented.
        FftPlan(const Size & _dft_size, int _src_step, int _dst_step, bool _doubleFP, bool _inplace, int _flags, FftType _fftType) :
            dft_size(_dft_size), src_step(_src_step), dst_step(_dst_step),
            doubleFP(_doubleFP), inplace(_inplace), flags(_flags), fftType(_fftType),
            context((cl_context)ocl::Context::getDefault().ptr()), plHandle(0)
        {
            bool dft_inverse = (flags & DFT_INVERSE) != 0;
            bool dft_scale = (flags & DFT_SCALE) != 0;
            bool dft_rows = (flags & DFT_ROWS) != 0;

            clAmdFftLayout inLayout = CLFFT_REAL, outLayout = CLFFT_REAL;
            // A single-row input or DFT_ROWS means independent 1D transforms.
            clAmdFftDim dim = dft_size.height == 1 || dft_rows ? CLFFT_1D : CLFFT_2D;

            // With DFT_ROWS every row is a separate batch item.
            size_t batchSize = dft_rows ? dft_size.height : 1;
            size_t clLengthsIn[3] = { dft_size.width, dft_rows ? 1 : dft_size.height, 1 };
            size_t clStridesIn[3] = { 1, 1, 1 };
            size_t clStridesOut[3] = { 1, 1, 1 };
            int elemSize = doubleFP ? sizeof(double) : sizeof(float);

            // Row strides are the steps divided by the size of one element:
            // a single scalar for real data, two scalars for complex data.
            // NOTE(review): presumably src_step/dst_step are byte steps - confirm against callers.
            switch (fftType)
            {
            case C2C:
                inLayout = CLFFT_COMPLEX_INTERLEAVED;
                outLayout = CLFFT_COMPLEX_INTERLEAVED;
                clStridesIn[1] = src_step / (elemSize << 1);
                clStridesOut[1] = dst_step / (elemSize << 1);
                break;
            case R2C:
                inLayout = CLFFT_REAL;
                outLayout = CLFFT_HERMITIAN_INTERLEAVED;
                clStridesIn[1] = src_step / elemSize;
                clStridesOut[1] = dst_step / (elemSize << 1);
                break;
            case C2R:
                inLayout = CLFFT_HERMITIAN_INTERLEAVED;
                outLayout = CLFFT_REAL;
                clStridesIn[1] = src_step / (elemSize << 1);
                clStridesOut[1] = dst_step / elemSize;
                break;
            case R2R:
            default:
                CV_Error(Error::StsNotImplemented, "AMD Fft does not support this type");
                break;
            }

            clStridesIn[2] = dft_rows ? clStridesIn[1] : dft_size.width * clStridesIn[1];
            clStridesOut[2] = dft_rows ? clStridesOut[1] : dft_size.width * clStridesOut[1];

            CLAMDDFT_Assert(clAmdFftCreateDefaultPlan(&plHandle, (cl_context)ocl::Context::getDefault().ptr(), dim, clLengthsIn))

            // setting plan properties
            CLAMDDFT_Assert(clAmdFftSetPlanPrecision(plHandle, doubleFP ? CLFFT_DOUBLE : CLFFT_SINGLE));
            CLAMDDFT_Assert(clAmdFftSetResultLocation(plHandle, inplace ? CLFFT_INPLACE : CLFFT_OUTOFPLACE))
            CLAMDDFT_Assert(clAmdFftSetLayout(plHandle, inLayout, outLayout))
            CLAMDDFT_Assert(clAmdFftSetPlanBatchSize(plHandle, batchSize))
            CLAMDDFT_Assert(clAmdFftSetPlanInStride(plHandle, dim, clStridesIn))
            CLAMDDFT_Assert(clAmdFftSetPlanOutStride(plHandle, dim, clStridesOut))
            // The dimension value is used directly as the index of the stride entry
            // that gives the distance between consecutive batch items.
            CLAMDDFT_Assert(clAmdFftSetPlanDistance(plHandle, clStridesIn[dim], clStridesOut[dim]))

            // DFT_SCALE normalizes by the row length for row-wise transforms,
            // otherwise by the total number of elements.
            float scale = dft_scale ? 1.0f / (dft_rows ? dft_size.width : dft_size.area()) : 1.0f;
            CLAMDDFT_Assert(clAmdFftSetPlanScale(plHandle, dft_inverse ? CLFFT_BACKWARD : CLFFT_FORWARD, scale))

            // ready to bake
            cl_command_queue queue = (cl_command_queue)ocl::Queue::getDefault().ptr();
            CLAMDDFT_Assert(clAmdFftBakePlan(plHandle, 1, &queue, NULL, NULL))
        }

        // The plan handle is not released here: the destroy call is commented out.
        // NOTE(review): the reason is not visible in this file - confirm before re-enabling.
        ~FftPlan()
        {
            // clAmdFftDestroyPlan(&plHandle);
        }

        friend class PlanCache;

    private:
        Size dft_size;
        int src_step, dst_step;
        bool doubleFP;
        bool inplace;
        int flags;
        FftType fftType;

        cl_context context;         // default context at the time the plan was created
        clAmdFftPlanHandle plHandle;
    };

public:
    // Returns the single shared cache instance (function-local static).
    static PlanCache & getInstance()
    {
        static PlanCache planCache;
        return planCache;
    }

    // Returns the handle of a baked plan matching all given parameters.
    // A cached plan is reused only if it was created for the current default
    // OpenCL context; a matching plan from another context is discarded and a
    // fresh plan is created, stored and returned instead.
    clAmdFftPlanHandle getPlanHandle(const Size & dft_size, int src_step, int dst_step, bool doubleFP,
                                     bool inplace, int flags, FftType fftType)
    {
        cl_context currentContext = (cl_context)ocl::Context::getDefault().ptr();

        for (size_t i = 0, size = planStorage.size(); i < size; ++i)
        {
            const FftPlan * const plan = planStorage[i];

            if (plan->dft_size == dft_size &&
                plan->flags == flags &&
                plan->src_step == src_step &&
                plan->dst_step == dst_step &&
                plan->doubleFP == doubleFP &&
                plan->fftType == fftType &&
                plan->inplace == inplace)
            {
                if (plan->context != currentContext)
                {
                    // The cache owns its plans: free the stale entry before dropping
                    // the pointer, otherwise the FftPlan object is leaked.
                    delete plan;
                    planStorage.erase(planStorage.begin() + i);
                    break;
                }

                return plan->plHandle;
            }
        }

        // no baked plan is found, so let's create a new one
        FftPlan * newPlan = new FftPlan(dft_size, src_step, dst_step, doubleFP, inplace, flags, fftType);
        planStorage.push_back(newPlan);

        return newPlan->plHandle;
    }

    // Frees every cached FftPlan object.
    ~PlanCache()
    {
        for (std::vector<FftPlan *>::iterator i = planStorage.begin(), end = planStorage.end(); i != end; ++i)
            delete (*i);
        planStorage.clear();
    }

protected:
    // Construction is restricted; use getInstance().
    PlanCache() :
        planStorage()
    {
    }

    std::vector<FftPlan *> planStorage;   // owning pointers to all cached plans
};
extern "C" {
// OpenCL event callback: drops one reference on the UMatData passed as user
// data, deallocating it through its current allocator when that was the last
// reference, and then releases the event.
static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p)
{
    UMatData * const data = (UMatData *)p;
    if (data)
    {
        // CV_XADD yields the value before the decrement, so 1 means no references remain.
        const bool wasLastReference = CV_XADD(&data->urefcount, -1) == 1;
        if (wasLastReference)
            data->currAllocator->deallocate(data);
    }
    clReleaseEvent(e);
}
}
// Tries to compute the DFT of _src into _dst with the clAmdFft library.
// Returns false, before _dst is created, when the request is not handled here:
// unsupported element type, 64-bit data on a device without double support,
// a source with a non-zero offset, a size rejected by the getOptimalDFTSize
// check, or any transform kind other than complex-to-complex.
// Returns true once the transform has been enqueued on the default queue.
static bool ocl_dft_amdfft(InputArray _src, OutputArray _dst, int flags)
{
    const int type = _src.type();
    const int depth = CV_MAT_DEPTH(type);
    const int channels = CV_MAT_CN(type);
    const Size ssize = _src.size();

    const bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
    if (depth == CV_64F && !doubleSupport)
        return false;

    const bool supportedType = type == CV_32FC1 || type == CV_32FC2 ||
                               type == CV_64FC1 || type == CV_64FC2;
    if (!supportedType)
        return false;

    if (_src.offset() != 0)
        return false;

    // reject sizes that are not a product of the prime numbers { 2, 3, 5 }
    const int area = ssize.area();
    if (getOptimalDFTSize(area) != area)
        return false;

    int dft_complex_output = (flags & DFT_COMPLEX_OUTPUT) != 0;
    bool dft_real_output = (flags & DFT_REAL_OUTPUT) != 0;
    CV_Assert(dft_complex_output + dft_real_output < 2);

    // Bit 0 encodes complex input, bit 1 encodes complex output.
    const int complexInputBit = channels == 2 ? 1 : 0;
    const FftType fftType = (FftType)(complexInputBit | (dft_complex_output << 1));

    // R2C and C2R are not implemented here yet; R2R is not supported by AMD Fft.
    if (fftType != C2C)
        return false;

    _dst.create(ssize.height, ssize.width, CV_MAKE_TYPE(depth, 2));

    UMat src = _src.getUMat(), dst = _dst.getUMat();
    const bool inplace = src.u == dst.u;
    const bool useDouble = depth == CV_64F;

    PlanCache & cache = PlanCache::getInstance();
    clAmdFftPlanHandle plHandle = cache.getPlanHandle(ssize, (int)src.step, (int)dst.step,
                                                      useDouble, inplace, flags, fftType);

    // Scratch buffer of the size requested by the plan.
    size_t bufferSize = 0;
    CLAMDDFT_Assert(clAmdFftGetTmpBufSize(plHandle, &bufferSize))
    UMat tmpBuffer(1, (int)bufferSize, CV_8UC1);

    cl_mem srcarg = (cl_mem)src.handle(ACCESS_READ);
    cl_mem dstarg = (cl_mem)dst.handle(ACCESS_RW);

    cl_command_queue queue = (cl_command_queue)ocl::Queue::getDefault().ptr();
    cl_event e = 0;
    const bool inverse = (flags & DFT_INVERSE) != 0;

    CLAMDDFT_Assert(clAmdFftEnqueueTransform(plHandle, inverse ? CLFFT_BACKWARD : CLFFT_FORWARD,
                                             1, &queue, 0, NULL, &e,
                                             &srcarg, &dstarg, (cl_mem)tmpBuffer.handle(ACCESS_RW)))

    // Hold an extra reference on the scratch buffer; oclCleanupCallback drops it
    // when the event reports CL_COMPLETE.
    tmpBuffer.addref();
    clSetEventCallback(e, CL_COMPLETE, oclCleanupCallback, tmpBuffer.u);
    return true;
}
// Retire the helper macro used by the clAmdFft code in this section. The previous
// directive named DFT_ASSERT, which this section never defines, so it had no effect.
#undef CLAMDDFT_Assert
}
#endif // HAVE_CLAMDFFT
namespace cv namespace cv
{ {
#ifdef HAVE_OPENCL
enum FftType enum FftType
{ {
R2R = 0, R2R = 0,
...@@ -2038,7 +1798,7 @@ static void ocl_getRadixes(int cols, std::vector<int>& radixes, std::vector<int> ...@@ -2038,7 +1798,7 @@ static void ocl_getRadixes(int cols, std::vector<int>& radixes, std::vector<int>
{ {
int factors[34]; int factors[34];
int nf = DFTFactorize(cols, factors); int nf = DFTFactorize(cols, factors);
int n = 1; int n = 1;
int factor_index = 0; int factor_index = 0;
min_radix = INT_MAX; min_radix = INT_MAX;
...@@ -2118,7 +1878,7 @@ struct OCL_FftPlan ...@@ -2118,7 +1878,7 @@ struct OCL_FftPlan
ocl_getRadixes(dft_size, radixes, blocks, min_radix); ocl_getRadixes(dft_size, radixes, blocks, min_radix);
thread_count = dft_size / min_radix; thread_count = dft_size / min_radix;
if (thread_count > ocl::Device::getDefault().maxWorkGroupSize()) if (thread_count > (int) ocl::Device::getDefault().maxWorkGroupSize())
{ {
status = false; status = false;
return; return;
...@@ -2141,13 +1901,13 @@ struct OCL_FftPlan ...@@ -2141,13 +1901,13 @@ struct OCL_FftPlan
Mat tw(1, twiddle_size, CV_32FC2); Mat tw(1, twiddle_size, CV_32FC2);
float* ptr = tw.ptr<float>(); float* ptr = tw.ptr<float>();
int ptr_index = 0; int ptr_index = 0;
n = 1; n = 1;
for (size_t i=0; i<radixes.size(); i++) for (size_t i=0; i<radixes.size(); i++)
{ {
int radix = radixes[i]; int radix = radixes[i];
n *= radix; n *= radix;
for (int j=1; j<radix; j++) for (int j=1; j<radix; j++)
{ {
double theta = -CV_TWO_PI*j/n; double theta = -CV_TWO_PI*j/n;
...@@ -2157,7 +1917,7 @@ struct OCL_FftPlan ...@@ -2157,7 +1917,7 @@ struct OCL_FftPlan
ptr[ptr_index++] = (float) cos(k*theta); ptr[ptr_index++] = (float) cos(k*theta);
ptr[ptr_index++] = (float) sin(k*theta); ptr[ptr_index++] = (float) sin(k*theta);
} }
} }
} }
twiddles = tw.getUMat(ACCESS_READ); twiddles = tw.getUMat(ACCESS_READ);
...@@ -2165,7 +1925,7 @@ struct OCL_FftPlan ...@@ -2165,7 +1925,7 @@ struct OCL_FftPlan
dft_size, dft_size/thread_count, radix_processing.c_str()); dft_size, dft_size/thread_count, radix_processing.c_str());
} }
bool enqueueTransform(InputArray _src, OutputArray _dst, int dft_size, int flags, int fftType, bool rows = true) const bool enqueueTransform(InputArray _src, OutputArray _dst, int num_dfts, int flags, int fftType, bool rows = true) const
{ {
if (!status) if (!status)
return false; return false;
...@@ -2177,7 +1937,7 @@ struct OCL_FftPlan ...@@ -2177,7 +1937,7 @@ struct OCL_FftPlan
size_t localsize[2]; size_t localsize[2];
String kernel_name; String kernel_name;
bool is1d = (flags & DFT_ROWS) != 0 || dft_size == 1; bool is1d = (flags & DFT_ROWS) != 0 || num_dfts == 1;
bool inv = (flags & DFT_INVERSE) != 0; bool inv = (flags & DFT_INVERSE) != 0;
String options = buildOptions; String options = buildOptions;
...@@ -2191,7 +1951,7 @@ struct OCL_FftPlan ...@@ -2191,7 +1951,7 @@ struct OCL_FftPlan
} }
else else
{ {
globalsize[0] = dft_size; globalsize[1] = thread_count; globalsize[0] = num_dfts; globalsize[1] = thread_count;
localsize[0] = 1; localsize[1] = thread_count; localsize[0] = 1; localsize[1] = thread_count;
kernel_name = !inv ? "fft_multi_radix_cols" : "ifft_multi_radix_cols"; kernel_name = !inv ? "fft_multi_radix_cols" : "ifft_multi_radix_cols";
if (flags & DFT_SCALE) if (flags & DFT_SCALE)
...@@ -2201,7 +1961,7 @@ struct OCL_FftPlan ...@@ -2201,7 +1961,7 @@ struct OCL_FftPlan
options += src.channels() == 1 ? " -D REAL_INPUT" : " -D COMPLEX_INPUT"; options += src.channels() == 1 ? " -D REAL_INPUT" : " -D COMPLEX_INPUT";
options += dst.channels() == 1 ? " -D REAL_OUTPUT" : " -D COMPLEX_OUTPUT"; options += dst.channels() == 1 ? " -D REAL_OUTPUT" : " -D COMPLEX_OUTPUT";
options += is1d ? " -D IS_1D" : ""; options += is1d ? " -D IS_1D" : "";
if (!inv) if (!inv)
{ {
if ((is1d && src.channels() == 1) || (rows && (fftType == R2R))) if ((is1d && src.channels() == 1) || (rows && (fftType == R2R)))
...@@ -2219,7 +1979,7 @@ struct OCL_FftPlan ...@@ -2219,7 +1979,7 @@ struct OCL_FftPlan
if (k.empty()) if (k.empty())
return false; return false;
k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(twiddles), thread_count, dft_size); k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(twiddles), thread_count, num_dfts);
return k.run(2, globalsize, localsize, false); return k.run(2, globalsize, localsize, false);
} }
}; };
...@@ -2232,7 +1992,7 @@ public: ...@@ -2232,7 +1992,7 @@ public:
static OCL_FftPlanCache planCache; static OCL_FftPlanCache planCache;
return planCache; return planCache;
} }
OCL_FftPlan* getFftPlan(int dft_size) OCL_FftPlan* getFftPlan(int dft_size)
{ {
for (size_t i = 0, size = planStorage.size(); i < size; ++i) for (size_t i = 0, size = planStorage.size(); i < size; ++i)
...@@ -2280,11 +2040,9 @@ static bool ocl_dft_C2C_cols(InputArray _src, OutputArray _dst, int nonzero_cols ...@@ -2280,11 +2040,9 @@ static bool ocl_dft_C2C_cols(InputArray _src, OutputArray _dst, int nonzero_cols
static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_rows) static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_rows)
{ {
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); int type = _src.type(), cn = CV_MAT_CN(type);
Size ssize = _src.size(); Size ssize = _src.size();
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; if ( !(type == CV_32FC1 || type == CV_32FC2) )
if ( (!doubleSupport && depth == CV_64F) ||
!(type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2))
return false; return false;
// if is not a multiplication of prime numbers { 2, 3, 5 } // if is not a multiplication of prime numbers { 2, 3, 5 }
...@@ -2325,7 +2083,7 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro ...@@ -2325,7 +2083,7 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro
if (fftType == C2C || fftType == R2C) if (fftType == C2C || fftType == R2C)
{ {
// complex output // complex output
_dst.create(src.size(), CV_32FC2); _dst.create(src.size(), CV_32FC2);
output = _dst.getUMat(); output = _dst.getUMat();
} }
else else
...@@ -2381,7 +2139,7 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro ...@@ -2381,7 +2139,7 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro
int nonzero_cols = src.cols/2 + 1; int nonzero_cols = src.cols/2 + 1;
if (!ocl_dft_C2C_cols(src, output, nonzero_cols, flags, fftType)) if (!ocl_dft_C2C_cols(src, output, nonzero_cols, flags, fftType))
return false; return false;
if (!ocl_dft_C2C_rows(output, _dst, nonzero_rows, flags, fftType)) if (!ocl_dft_C2C_rows(output, _dst, nonzero_rows, flags, fftType))
return false; return false;
} }
...@@ -2390,11 +2148,248 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro ...@@ -2390,11 +2148,248 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro
return true; return true;
} }
} // namespace cv;
#endif #endif
} // namespace cv; #ifdef HAVE_CLAMDFFT
namespace cv {
// Evaluates a clAmdFft call and asserts, via CV_Assert, that it returned CLFFT_SUCCESS.
// The expansion is a braced block, which is why call sites may omit the trailing semicolon.
#define CLAMDDFT_Assert(func) \
{ \
clAmdFftStatus s = (func); \
CV_Assert(s == CLFFT_SUCCESS); \
}
// Cache of baked clAmdFft plans. A plan is identified by every parameter it was
// created with, so repeated transforms with the same configuration reuse the
// already-baked plan instead of building a new one.
class PlanCache
{
    // One clAmdFft plan together with the parameters it was built from
    // (those parameters double as the cache key).
    struct FftPlan
    {
        // Creates, configures and bakes a plan on the default OpenCL context and queue.
        // Any failing clAmdFft call trips CV_Assert through CLAMDDFT_Assert; an
        // unsupported fftType (R2R or unknown) raises Error::StsNotImplemented.
        FftPlan(const Size & _dft_size, int _src_step, int _dst_step, bool _doubleFP, bool _inplace, int _flags, FftType _fftType) :
            dft_size(_dft_size), src_step(_src_step), dst_step(_dst_step),
            doubleFP(_doubleFP), inplace(_inplace), flags(_flags), fftType(_fftType),
            context((cl_context)ocl::Context::getDefault().ptr()), plHandle(0)
        {
            bool dft_inverse = (flags & DFT_INVERSE) != 0;
            bool dft_scale = (flags & DFT_SCALE) != 0;
            bool dft_rows = (flags & DFT_ROWS) != 0;

            clAmdFftLayout inLayout = CLFFT_REAL, outLayout = CLFFT_REAL;
            // A single-row input or DFT_ROWS means independent 1D transforms.
            clAmdFftDim dim = dft_size.height == 1 || dft_rows ? CLFFT_1D : CLFFT_2D;

            // With DFT_ROWS every row is a separate batch item.
            size_t batchSize = dft_rows ? dft_size.height : 1;
            size_t clLengthsIn[3] = { dft_size.width, dft_rows ? 1 : dft_size.height, 1 };
            size_t clStridesIn[3] = { 1, 1, 1 };
            size_t clStridesOut[3] = { 1, 1, 1 };
            int elemSize = doubleFP ? sizeof(double) : sizeof(float);

            // Row strides are the steps divided by the size of one element:
            // a single scalar for real data, two scalars for complex data.
            // NOTE(review): presumably src_step/dst_step are byte steps - confirm against callers.
            switch (fftType)
            {
            case C2C:
                inLayout = CLFFT_COMPLEX_INTERLEAVED;
                outLayout = CLFFT_COMPLEX_INTERLEAVED;
                clStridesIn[1] = src_step / (elemSize << 1);
                clStridesOut[1] = dst_step / (elemSize << 1);
                break;
            case R2C:
                inLayout = CLFFT_REAL;
                outLayout = CLFFT_HERMITIAN_INTERLEAVED;
                clStridesIn[1] = src_step / elemSize;
                clStridesOut[1] = dst_step / (elemSize << 1);
                break;
            case C2R:
                inLayout = CLFFT_HERMITIAN_INTERLEAVED;
                outLayout = CLFFT_REAL;
                clStridesIn[1] = src_step / (elemSize << 1);
                clStridesOut[1] = dst_step / elemSize;
                break;
            case R2R:
            default:
                CV_Error(Error::StsNotImplemented, "AMD Fft does not support this type");
                break;
            }

            clStridesIn[2] = dft_rows ? clStridesIn[1] : dft_size.width * clStridesIn[1];
            clStridesOut[2] = dft_rows ? clStridesOut[1] : dft_size.width * clStridesOut[1];

            CLAMDDFT_Assert(clAmdFftCreateDefaultPlan(&plHandle, (cl_context)ocl::Context::getDefault().ptr(), dim, clLengthsIn))

            // setting plan properties
            CLAMDDFT_Assert(clAmdFftSetPlanPrecision(plHandle, doubleFP ? CLFFT_DOUBLE : CLFFT_SINGLE));
            CLAMDDFT_Assert(clAmdFftSetResultLocation(plHandle, inplace ? CLFFT_INPLACE : CLFFT_OUTOFPLACE))
            CLAMDDFT_Assert(clAmdFftSetLayout(plHandle, inLayout, outLayout))
            CLAMDDFT_Assert(clAmdFftSetPlanBatchSize(plHandle, batchSize))
            CLAMDDFT_Assert(clAmdFftSetPlanInStride(plHandle, dim, clStridesIn))
            CLAMDDFT_Assert(clAmdFftSetPlanOutStride(plHandle, dim, clStridesOut))
            // The dimension value is used directly as the index of the stride entry
            // that gives the distance between consecutive batch items.
            CLAMDDFT_Assert(clAmdFftSetPlanDistance(plHandle, clStridesIn[dim], clStridesOut[dim]))

            // DFT_SCALE normalizes by the row length for row-wise transforms,
            // otherwise by the total number of elements.
            float scale = dft_scale ? 1.0f / (dft_rows ? dft_size.width : dft_size.area()) : 1.0f;
            CLAMDDFT_Assert(clAmdFftSetPlanScale(plHandle, dft_inverse ? CLFFT_BACKWARD : CLFFT_FORWARD, scale))

            // ready to bake
            cl_command_queue queue = (cl_command_queue)ocl::Queue::getDefault().ptr();
            CLAMDDFT_Assert(clAmdFftBakePlan(plHandle, 1, &queue, NULL, NULL))
        }

        // The plan handle is not released here: the destroy call is commented out.
        // NOTE(review): the reason is not visible in this file - confirm before re-enabling.
        ~FftPlan()
        {
            // clAmdFftDestroyPlan(&plHandle);
        }

        friend class PlanCache;

    private:
        Size dft_size;
        int src_step, dst_step;
        bool doubleFP;
        bool inplace;
        int flags;
        FftType fftType;

        cl_context context;         // default context at the time the plan was created
        clAmdFftPlanHandle plHandle;
    };

public:
    // Returns the single shared cache instance (function-local static).
    static PlanCache & getInstance()
    {
        static PlanCache planCache;
        return planCache;
    }

    // Returns the handle of a baked plan matching all given parameters.
    // A cached plan is reused only if it was created for the current default
    // OpenCL context; a matching plan from another context is discarded and a
    // fresh plan is created, stored and returned instead.
    clAmdFftPlanHandle getPlanHandle(const Size & dft_size, int src_step, int dst_step, bool doubleFP,
                                     bool inplace, int flags, FftType fftType)
    {
        cl_context currentContext = (cl_context)ocl::Context::getDefault().ptr();

        for (size_t i = 0, size = planStorage.size(); i < size; ++i)
        {
            const FftPlan * const plan = planStorage[i];

            if (plan->dft_size == dft_size &&
                plan->flags == flags &&
                plan->src_step == src_step &&
                plan->dst_step == dst_step &&
                plan->doubleFP == doubleFP &&
                plan->fftType == fftType &&
                plan->inplace == inplace)
            {
                if (plan->context != currentContext)
                {
                    // The cache owns its plans: free the stale entry before dropping
                    // the pointer, otherwise the FftPlan object is leaked.
                    delete plan;
                    planStorage.erase(planStorage.begin() + i);
                    break;
                }

                return plan->plHandle;
            }
        }

        // no baked plan is found, so let's create a new one
        FftPlan * newPlan = new FftPlan(dft_size, src_step, dst_step, doubleFP, inplace, flags, fftType);
        planStorage.push_back(newPlan);

        return newPlan->plHandle;
    }

    // Frees every cached FftPlan object.
    ~PlanCache()
    {
        for (std::vector<FftPlan *>::iterator i = planStorage.begin(), end = planStorage.end(); i != end; ++i)
            delete (*i);
        planStorage.clear();
    }

protected:
    // Construction is restricted; use getInstance().
    PlanCache() :
        planStorage()
    {
    }

    std::vector<FftPlan *> planStorage;   // owning pointers to all cached plans
};
extern "C" {
// OpenCL event callback: drops one reference on the UMatData passed as user
// data, deallocating it through its current allocator when that was the last
// reference, and then releases the event.
static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p)
{
    UMatData * const data = (UMatData *)p;
    if (data)
    {
        // CV_XADD yields the value before the decrement, so 1 means no references remain.
        const bool wasLastReference = CV_XADD(&data->urefcount, -1) == 1;
        if (wasLastReference)
            data->currAllocator->deallocate(data);
    }
    clReleaseEvent(e);
}
}
// Tries to compute the DFT of _src into _dst with the clAmdFft library.
// Returns false, before _dst is created, when the request is not handled here:
// unsupported element type, 64-bit data on a device without double support,
// a source with a non-zero offset, a size rejected by the getOptimalDFTSize
// check, or any transform kind other than complex-to-complex.
// Returns true once the transform has been enqueued on the default queue.
static bool ocl_dft_amdfft(InputArray _src, OutputArray _dst, int flags)
{
    const int type = _src.type();
    const int depth = CV_MAT_DEPTH(type);
    const int channels = CV_MAT_CN(type);
    const Size ssize = _src.size();

    const bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
    if (depth == CV_64F && !doubleSupport)
        return false;

    const bool supportedType = type == CV_32FC1 || type == CV_32FC2 ||
                               type == CV_64FC1 || type == CV_64FC2;
    if (!supportedType)
        return false;

    if (_src.offset() != 0)
        return false;

    // reject sizes that are not a product of the prime numbers { 2, 3, 5 }
    const int area = ssize.area();
    if (getOptimalDFTSize(area) != area)
        return false;

    int dft_complex_output = (flags & DFT_COMPLEX_OUTPUT) != 0;
    bool dft_real_output = (flags & DFT_REAL_OUTPUT) != 0;
    CV_Assert(dft_complex_output + dft_real_output < 2);

    // Bit 0 encodes complex input, bit 1 encodes complex output.
    const int complexInputBit = channels == 2 ? 1 : 0;
    const FftType fftType = (FftType)(complexInputBit | (dft_complex_output << 1));

    // R2C and C2R are not implemented here yet; R2R is not supported by AMD Fft.
    if (fftType != C2C)
        return false;

    _dst.create(ssize.height, ssize.width, CV_MAKE_TYPE(depth, 2));

    UMat src = _src.getUMat(), dst = _dst.getUMat();
    const bool inplace = src.u == dst.u;
    const bool useDouble = depth == CV_64F;

    PlanCache & cache = PlanCache::getInstance();
    clAmdFftPlanHandle plHandle = cache.getPlanHandle(ssize, (int)src.step, (int)dst.step,
                                                      useDouble, inplace, flags, fftType);

    // Scratch buffer of the size requested by the plan.
    size_t bufferSize = 0;
    CLAMDDFT_Assert(clAmdFftGetTmpBufSize(plHandle, &bufferSize))
    UMat tmpBuffer(1, (int)bufferSize, CV_8UC1);

    cl_mem srcarg = (cl_mem)src.handle(ACCESS_READ);
    cl_mem dstarg = (cl_mem)dst.handle(ACCESS_RW);

    cl_command_queue queue = (cl_command_queue)ocl::Queue::getDefault().ptr();
    cl_event e = 0;
    const bool inverse = (flags & DFT_INVERSE) != 0;

    CLAMDDFT_Assert(clAmdFftEnqueueTransform(plHandle, inverse ? CLFFT_BACKWARD : CLFFT_FORWARD,
                                             1, &queue, 0, NULL, &e,
                                             &srcarg, &dstarg, (cl_mem)tmpBuffer.handle(ACCESS_RW)))

    // Hold an extra reference on the scratch buffer; oclCleanupCallback drops it
    // when the event reports CL_COMPLETE.
    tmpBuffer.addref();
    clSetEventCallback(e, CL_COMPLETE, oclCleanupCallback, tmpBuffer.u);
    return true;
}
// Retire the helper macro used by the clAmdFft code in this section. The previous
// directive named DFT_ASSERT, which this section never defines, so it had no effect.
#undef CLAMDDFT_Assert
}
#endif // HAVE_CLAMDFFT
void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows ) void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
{ {
......
...@@ -3002,8 +3002,7 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[], ...@@ -3002,8 +3002,7 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
sync ? 0 : &p->e); sync ? 0 : &p->e);
if( sync || retval != CL_SUCCESS ) if( sync || retval != CL_SUCCESS )
{ {
int a = clFinish(qq); CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
CV_OclDbgAssert(a == CL_SUCCESS);
p->cleanupUMats(); p->cleanupUMats();
} }
else else
...@@ -3899,9 +3898,8 @@ public: ...@@ -3899,9 +3898,8 @@ public:
if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() ) if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() )
{ {
AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT); AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
int a = clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0, CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
u->size, alignedPtr.getAlignedPtr(), 0, 0, 0); u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
CV_Assert( a == CL_SUCCESS );
u->markHostCopyObsolete(false); u->markHostCopyObsolete(false);
} }
} }
......
...@@ -6,36 +6,36 @@ ...@@ -6,36 +6,36 @@
#define fft5_5 0.363271264002f #define fft5_5 0.363271264002f
__attribute__((always_inline)) __attribute__((always_inline))
float2 mul_float2(float2 a, float2 b) { float2 mul_float2(float2 a, float2 b) {
return (float2)(fma(a.x, b.x, -a.y * b.y), fma(a.x, b.y, a.y * b.x)); return (float2)(fma(a.x, b.x, -a.y * b.y), fma(a.x, b.y, a.y * b.x));
} }
__attribute__((always_inline)) __attribute__((always_inline))
float2 twiddle(float2 a) { float2 twiddle(float2 a) {
return (float2)(a.y, -a.x); return (float2)(a.y, -a.x);
} }
__attribute__((always_inline)) __attribute__((always_inline))
void butterfly2(float2 a0, float2 a1, __local float2* smem, __global const float2* twiddles, void butterfly2(float2 a0, float2 a1, __local float2* smem, __global const float2* twiddles,
const int x, const int block_size) const int x, const int block_size)
{ {
const int k = x & (block_size - 1); const int k = x & (block_size - 1);
a1 = mul_float2(twiddles[k], a1); a1 = mul_float2(twiddles[k], a1);
const int dst_ind = (x << 1) - k; const int dst_ind = (x << 1) - k;
smem[dst_ind] = a0 + a1; smem[dst_ind] = a0 + a1;
smem[dst_ind+block_size] = a0 - a1; smem[dst_ind+block_size] = a0 - a1;
} }
__attribute__((always_inline)) __attribute__((always_inline))
void butterfly4(float2 a0, float2 a1, float2 a2, float2 a3, __local float2* smem, __global const float2* twiddles, void butterfly4(float2 a0, float2 a1, float2 a2, float2 a3, __local float2* smem, __global const float2* twiddles,
const int x, const int block_size) const int x, const int block_size)
{ {
const int k = x & (block_size - 1); const int k = x & (block_size - 1);
a1 = mul_float2(twiddles[k], a1); a1 = mul_float2(twiddles[k], a1);
a2 = mul_float2(twiddles[k + block_size], a2); a2 = mul_float2(twiddles[k + block_size], a2);
a3 = mul_float2(twiddles[k + 2*block_size], a3); a3 = mul_float2(twiddles[k + 2*block_size], a3);
const int dst_ind = ((x - k) << 2) + k; const int dst_ind = ((x - k) << 2) + k;
float2 b0 = a0 + a2; float2 b0 = a0 + a2;
...@@ -50,9 +50,9 @@ void butterfly4(float2 a0, float2 a1, float2 a2, float2 a3, __local float2* smem ...@@ -50,9 +50,9 @@ void butterfly4(float2 a0, float2 a1, float2 a2, float2 a3, __local float2* smem
} }
__attribute__((always_inline)) __attribute__((always_inline))
void butterfly3(float2 a0, float2 a1, float2 a2, __local float2* smem, __global const float2* twiddles, void butterfly3(float2 a0, float2 a1, float2 a2, __local float2* smem, __global const float2* twiddles,
const int x, const int block_size) const int x, const int block_size)
{ {
const int k = x % block_size; const int k = x % block_size;
a1 = mul_float2(twiddles[k], a1); a1 = mul_float2(twiddles[k], a1);
a2 = mul_float2(twiddles[k+block_size], a2); a2 = mul_float2(twiddles[k+block_size], a2);
...@@ -69,8 +69,8 @@ void butterfly3(float2 a0, float2 a1, float2 a2, __local float2* smem, __global ...@@ -69,8 +69,8 @@ void butterfly3(float2 a0, float2 a1, float2 a2, __local float2* smem, __global
__attribute__((always_inline)) __attribute__((always_inline))
void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local float2* smem, __global const float2* twiddles, void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local float2* smem, __global const float2* twiddles,
const int x, const int block_size) const int x, const int block_size)
{ {
const int k = x % block_size; const int k = x % block_size;
a1 = mul_float2(twiddles[k], a1); a1 = mul_float2(twiddles[k], a1);
a2 = mul_float2(twiddles[k + block_size], a2); a2 = mul_float2(twiddles[k + block_size], a2);
...@@ -95,7 +95,7 @@ void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local f ...@@ -95,7 +95,7 @@ void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local f
a4 = fft5_3 * (float2)(-a1.y - a3.y, a1.x + a3.x); a4 = fft5_3 * (float2)(-a1.y - a3.y, a1.x + a3.x);
b5 = (float2)(a4.x - fft5_5 * a1.y, a4.y + fft5_5 * a1.x); b5 = (float2)(a4.x - fft5_5 * a1.y, a4.y + fft5_5 * a1.x);
a4.x += fft5_4 * a3.y; a4.x += fft5_4 * a3.y;
a4.y -= fft5_4 * a3.x; a4.y -= fft5_4 * a3.x;
a1 = b0 + b1; a1 = b0 + b1;
...@@ -109,7 +109,7 @@ void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local f ...@@ -109,7 +109,7 @@ void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local f
} }
__attribute__((always_inline)) __attribute__((always_inline))
void fft_radix2(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) void fft_radix2(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t)
{ {
float2 a0, a1; float2 a0, a1;
...@@ -122,13 +122,13 @@ void fft_radix2(__local float2* smem, __global const float2* twiddles, const int ...@@ -122,13 +122,13 @@ void fft_radix2(__local float2* smem, __global const float2* twiddles, const int
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (x < t) if (x < t)
butterfly2(a0, a1, smem, twiddles, x, block_size); butterfly2(a0, a1, smem, twiddles, x, block_size);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
__attribute__((always_inline)) __attribute__((always_inline))
void fft_radix2_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) void fft_radix2_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
{ {
const int x2 = x1 + t/2; const int x2 = x1 + t/2;
float2 a0, a1, a2, a3; float2 a0, a1, a2, a3;
...@@ -151,7 +151,7 @@ void fft_radix2_B2(__local float2* smem, __global const float2* twiddles, const ...@@ -151,7 +151,7 @@ void fft_radix2_B2(__local float2* smem, __global const float2* twiddles, const
} }
__attribute__((always_inline)) __attribute__((always_inline))
void fft_radix2_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) void fft_radix2_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
{ {
const int x2 = x1 + t/3; const int x2 = x1 + t/3;
const int x3 = x1 + 2*t/3; const int x3 = x1 + 2*t/3;
...@@ -177,7 +177,7 @@ void fft_radix2_B3(__local float2* smem, __global const float2* twiddles, const ...@@ -177,7 +177,7 @@ void fft_radix2_B3(__local float2* smem, __global const float2* twiddles, const
} }
__attribute__((always_inline)) __attribute__((always_inline))
void fft_radix2_B4(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) void fft_radix2_B4(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
{ {
const int thread_block = t/4; const int thread_block = t/4;
const int x2 = x1 + thread_block; const int x2 = x1 + thread_block;
...@@ -207,7 +207,7 @@ void fft_radix2_B4(__local float2* smem, __global const float2* twiddles, const ...@@ -207,7 +207,7 @@ void fft_radix2_B4(__local float2* smem, __global const float2* twiddles, const
} }
__attribute__((always_inline)) __attribute__((always_inline))
void fft_radix2_B5(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) void fft_radix2_B5(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t)
{ {
const int thread_block = t/5; const int thread_block = t/5;
const int x2 = x1 + thread_block; const int x2 = x1 + thread_block;
...@@ -326,7 +326,7 @@ void fft_radix8(__local float2* smem, __global const float2* twiddles, const int ...@@ -326,7 +326,7 @@ void fft_radix8(__local float2* smem, __global const float2* twiddles, const int
a7 = mul_float2(twiddles[k+6*block_size],smem[x+7*t]); a7 = mul_float2(twiddles[k+6*block_size],smem[x+7*t]);
float2 b0, b1, b6, b7; float2 b0, b1, b6, b7;
b0 = a0 + a4; b0 = a0 + a4;
a4 = a0 - a4; a4 = a0 - a4;
b1 = a1 + a5; b1 = a1 + a5;
...@@ -335,7 +335,7 @@ void fft_radix8(__local float2* smem, __global const float2* twiddles, const int ...@@ -335,7 +335,7 @@ void fft_radix8(__local float2* smem, __global const float2* twiddles, const int
b6 = twiddle(a2 - a6); b6 = twiddle(a2 - a6);
a2 = a2 + a6; a2 = a2 + a6;
b7 = a3 - a7; b7 = a3 - a7;
b7 = (float2)(SQRT_2) * (float2)(-b7.x + b7.y, -b7.x - b7.y); b7 = (float2)(SQRT_2) * (float2)(-b7.x + b7.y, -b7.x - b7.y);
a3 = a3 + a7; a3 = a3 + a7;
a0 = b0 + a2; a0 = b0 + a2;
...@@ -571,10 +571,15 @@ __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, ...@@ -571,10 +571,15 @@ __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
} }
else else
{ {
// fill with zero other rows
#ifdef COMPLEX_OUTPUT
__global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, dst_offset)); __global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, dst_offset));
#else
__global float* dst = (__global float*)(dst_ptr + mad24(y, dst_step, dst_offset));
#endif
#pragma unroll #pragma unroll
for (int i=x; i<dst_cols; i+=block_size) for (int i=x; i<dst_cols; i+=block_size)
dst[i] = (float2) 0.f; dst[i] = 0.f;
} }
} }
...@@ -658,7 +663,7 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step, ...@@ -658,7 +663,7 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
__global const float2* twiddles = (__global float2*) twiddles_ptr; __global const float2* twiddles = (__global float2*) twiddles_ptr;
const int ind = x; const int ind = x;
#if defined(COMPLEX_INPUT) && !defined(NO_CONJUGATE) #if defined(COMPLEX_INPUT) && !defined(NO_CONJUGATE)
__global const float2* src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset))); __global const float2* src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset)));
#pragma unroll #pragma unroll
for (int i=0; i<kercn; i++) for (int i=0; i<kercn; i++)
...@@ -667,12 +672,9 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step, ...@@ -667,12 +672,9 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
smem[x+i*block_size].y = -src[i*block_size].y; smem[x+i*block_size].y = -src[i*block_size].y;
} }
#else #else
__global const float2* src;
#if !defined(REAL_INPUT) && defined(NO_CONJUGATE) #if !defined(REAL_INPUT) && defined(NO_CONJUGATE)
src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(2, (int)sizeof(float), src_offset))); __global const float2* src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(2, (int)sizeof(float), src_offset)));
#else
src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(1, (int)sizeof(float), src_offset)));
#endif
#pragma unroll #pragma unroll
for (int i=x; i<(LOCAL_SIZE-1)/2; i+=block_size) for (int i=x; i<(LOCAL_SIZE-1)/2; i+=block_size)
...@@ -681,6 +683,20 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step, ...@@ -681,6 +683,20 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
smem[i+1].y = -src[i].y; smem[i+1].y = -src[i].y;
smem[LOCAL_SIZE-i-1] = src[i]; smem[LOCAL_SIZE-i-1] = src[i];
} }
#else
#pragma unroll
for (int i=x; i<(LOCAL_SIZE-1)/2; i+=block_size)
{
float2 src = vload2(0, (__global const float*)(src_ptr + mad24(y, src_step, mad24(2*i+1, (int)sizeof(float), src_offset))));
smem[i+1].x = src.x;
smem[i+1].y = -src.y;
smem[LOCAL_SIZE-i-1] = src;
}
#endif
if (x==0) if (x==0)
{ {
smem[0].x = *(__global const float*)(src_ptr + mad24(y, src_step, src_offset)); smem[0].x = *(__global const float*)(src_ptr + mad24(y, src_step, src_offset));
...@@ -688,7 +704,11 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step, ...@@ -688,7 +704,11 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
if(LOCAL_SIZE % 2 ==0) if(LOCAL_SIZE % 2 ==0)
{ {
#if !defined(REAL_INPUT) && defined(NO_CONJUGATE)
smem[LOCAL_SIZE/2].x = src[LOCAL_SIZE/2-1].x; smem[LOCAL_SIZE/2].x = src[LOCAL_SIZE/2-1].x;
#else
smem[LOCAL_SIZE/2].x = *(__global const float*)(src_ptr + mad24(y, src_step, mad24(LOCAL_SIZE-1, (int)sizeof(float), src_offset)));
#endif
smem[LOCAL_SIZE/2].y = 0.f; smem[LOCAL_SIZE/2].y = 0.f;
} }
} }
...@@ -718,10 +738,15 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step, ...@@ -718,10 +738,15 @@ __kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step,
} }
else else
{ {
__global float2* dst = (__global float*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)*2), dst_offset))); // fill with zero other rows
#ifdef COMPLEX_OUTPUT
__global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, dst_offset));
#else
__global float* dst = (__global float*)(dst_ptr + mad24(y, dst_step, dst_offset));
#endif
#pragma unroll #pragma unroll
for (int i=0; i<kercn; i++) for (int i=x; i<dst_cols; i+=block_size)
dst[i*block_size] = (float2) 0.f; dst[i] = 0.f;
} }
} }
...@@ -763,13 +788,13 @@ __kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step, ...@@ -763,13 +788,13 @@ __kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step,
rez[0].y = -smem[y + i*block_size].y; rez[0].y = -smem[y + i*block_size].y;
} }
} }
#else #else
if (x < nz) if (x < nz)
{ {
__global const float2* twiddles = (__global float2*) twiddles_ptr; __global const float2* twiddles = (__global float2*) twiddles_ptr;
const int ind = y; const int ind = y;
const int block_size = LOCAL_SIZE/kercn; const int block_size = LOCAL_SIZE/kercn;
__local float2 smem[LOCAL_SIZE]; __local float2 smem[LOCAL_SIZE];
#ifdef EVEN #ifdef EVEN
if (x!=0 && (x!=(nz-1))) if (x!=0 && (x!=(nz-1)))
...@@ -781,7 +806,7 @@ __kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step, ...@@ -781,7 +806,7 @@ __kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step,
#pragma unroll #pragma unroll
for (int i=0; i<kercn; i++) for (int i=0; i<kercn; i++)
{ {
float2 temp = *((__global const float2*)(src + i*block_size*src_step)); float2 temp = vload2(0, (__global const float*)(src + i*block_size*src_step));
smem[y+i*block_size].x = temp.x; smem[y+i*block_size].x = temp.x;
smem[y+i*block_size].y = -temp.y; smem[y+i*block_size].y = -temp.y;
} }
...@@ -819,7 +844,7 @@ __kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step, ...@@ -819,7 +844,7 @@ __kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step,
// copy data to dst // copy data to dst
__global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float2)), dst_offset)); __global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float2)), dst_offset));
#pragma unroll #pragma unroll
for (int i=0; i<kercn; i++) for (int i=0; i<kercn; i++)
{ {
...@@ -827,6 +852,6 @@ __kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step, ...@@ -827,6 +852,6 @@ __kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step,
rez[0].x = smem[y + i*block_size].x; rez[0].x = smem[y + i*block_size].x;
rez[0].y = -smem[y + i*block_size].y; rez[0].y = -smem[y + i*block_size].y;
} }
} }
#endif #endif
} }
\ No newline at end of file
...@@ -48,26 +48,17 @@ ...@@ -48,26 +48,17 @@
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
enum OCL_FFT_TYPE
{
R2R = 0,
C2R = 1,
R2C = 2,
C2C = 3
};
namespace cvtest { namespace cvtest {
namespace ocl { namespace ocl {
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
// Dft // Dft
PARAM_TEST_CASE(Dft, cv::Size, OCL_FFT_TYPE, bool, bool, bool, bool) PARAM_TEST_CASE(Dft, cv::Size, MatDepth, bool, bool, bool, bool)
{ {
cv::Size dft_size; cv::Size dft_size;
int dft_flags, depth, cn, dft_type; int dft_flags, depth;
bool hint; bool inplace;
bool is1d;
TEST_DECLARE_INPUT_PARAMETER(src); TEST_DECLARE_INPUT_PARAMETER(src);
TEST_DECLARE_OUTPUT_PARAMETER(dst); TEST_DECLARE_OUTPUT_PARAMETER(dst);
...@@ -75,60 +66,34 @@ PARAM_TEST_CASE(Dft, cv::Size, OCL_FFT_TYPE, bool, bool, bool, bool) ...@@ -75,60 +66,34 @@ PARAM_TEST_CASE(Dft, cv::Size, OCL_FFT_TYPE, bool, bool, bool, bool)
virtual void SetUp() virtual void SetUp()
{ {
dft_size = GET_PARAM(0); dft_size = GET_PARAM(0);
dft_type = GET_PARAM(1); depth = GET_PARAM(1);
depth = CV_32F; inplace = GET_PARAM(2);
dft_flags = 0; dft_flags = 0;
switch (dft_type)
{
case R2R: dft_flags |= cv::DFT_REAL_OUTPUT; cn = 1; break;
case C2R: dft_flags |= cv::DFT_REAL_OUTPUT; cn = 2; break;
case R2C: dft_flags |= cv::DFT_COMPLEX_OUTPUT; cn = 1; break;
case C2C: dft_flags |= cv::DFT_COMPLEX_OUTPUT; cn = 2; break;
}
if (GET_PARAM(2))
dft_flags |= cv::DFT_INVERSE;
if (GET_PARAM(3)) if (GET_PARAM(3))
dft_flags |= cv::DFT_ROWS; dft_flags |= cv::DFT_ROWS;
if (GET_PARAM(4)) if (GET_PARAM(4))
dft_flags |= cv::DFT_SCALE; dft_flags |= cv::DFT_SCALE;
hint = GET_PARAM(5); if (GET_PARAM(5))
is1d = (dft_flags & DFT_ROWS) != 0 || dft_size.height == 1; dft_flags |= cv::DFT_INVERSE;
} }
void generateTestData() void generateTestData(int cn = 2)
{ {
src = randomMat(dft_size, CV_MAKE_TYPE(depth, cn), 0.0, 100.0); src = randomMat(dft_size, CV_MAKE_TYPE(depth, cn), 0.0, 100.0);
usrc = src.getUMat(ACCESS_READ); usrc = src.getUMat(ACCESS_READ);
if (inplace)
dst = src, udst = usrc;
} }
}; };
OCL_TEST_P(Dft, Mat) OCL_TEST_P(Dft, C2C)
{ {
generateTestData(); generateTestData();
int nonzero_rows = hint ? src.cols - randomInt(1, src.rows-1) : 0; OCL_OFF(cv::dft(src, dst, dft_flags | cv::DFT_COMPLEX_OUTPUT));
OCL_OFF(cv::dft(src, dst, dft_flags, nonzero_rows)); OCL_ON(cv::dft(usrc, udst, dft_flags | cv::DFT_COMPLEX_OUTPUT));
OCL_ON(cv::dft(usrc, udst, dft_flags, nonzero_rows));
if (dft_type == R2C && is1d && (dft_flags & cv::DFT_INVERSE) == 0)
{
dst = dst(cv::Range(0, dst.rows), cv::Range(0, dst.cols/2 + 1));
udst = udst(cv::Range(0, udst.rows), cv::Range(0, udst.cols/2 + 1));
}
//Mat gpu = udst.getMat(ACCESS_READ);
//std::cout << dst << std::endl;
//std::cout << gpu << std::endl;
//int cn = udst.channels();
//
//Mat dst1ch = dst.reshape(1);
//Mat gpu1ch = gpu.reshape(1);
//Mat df;
//absdiff(dst1ch, gpu1ch, df);
//std::cout << Mat_<int>(df) << std::endl;
double eps = src.size().area() * 1e-4; double eps = src.size().area() * 1e-4;
EXPECT_MAT_NEAR(dst, udst, eps); EXPECT_MAT_NEAR(dst, udst, eps);
...@@ -185,15 +150,15 @@ OCL_TEST_P(MulSpectrums, Mat) ...@@ -185,15 +150,15 @@ OCL_TEST_P(MulSpectrums, Mat)
OCL_INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MulSpectrums, testing::Combine(Bool(), Bool())); OCL_INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MulSpectrums, testing::Combine(Bool(), Bool()));
OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(10, 10), cv::Size(36, 36), cv::Size(512, 1), cv::Size(1280, 768)), OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(2, 3), cv::Size(5, 4), cv::Size(25, 20),
Values((OCL_FFT_TYPE) R2C, (OCL_FFT_TYPE) C2C, (OCL_FFT_TYPE) R2R, (OCL_FFT_TYPE) C2R), cv::Size(512, 1), cv::Size(1024, 768)),
Bool(), // DFT_INVERSE Values(CV_32F, CV_64F),
Bool(), // inplace
Bool(), // DFT_ROWS Bool(), // DFT_ROWS
Bool(), // DFT_SCALE Bool(), // DFT_SCALE
Bool() // hint Bool()) // DFT_INVERSE
)
); );
} } // namespace cvtest::ocl } } // namespace cvtest::ocl
#endif // HAVE_OPENCL #endif // HAVE_OPENCL
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment