Commit 539f367d authored by Vladislav Vinogradov's avatar Vladislav Vinogradov

refactored gpu::LUT function:

* converted it to Algorithm, because implementation uses inner buffers and
  requires preprocessing step
* new implementation splits preprocessing and transform,
  what is more effecient
* old API still can be used for source compatibility (marked as deprecated)
parent 0c50d082
...@@ -49,6 +49,17 @@ ...@@ -49,6 +49,17 @@
#include "opencv2/core/gpu.hpp" #include "opencv2/core/gpu.hpp"
#if defined __GNUC__
#define __OPENCV_GPUARITHM_DEPR_BEFORE__
#define __OPENCV_GPUARITHM_DEPR_AFTER__ __attribute__ ((deprecated))
#elif (defined WIN32 || defined _WIN32)
#define __OPENCV_GPUARITHM_DEPR_BEFORE__ __declspec(deprecated)
#define __OPENCV_GPUARITHM_DEPR_AFTER__
#else
#define __OPENCV_GPUARITHM_DEPR_BEFORE__
#define __OPENCV_GPUARITHM_DEPR_AFTER__
#endif
namespace cv { namespace gpu { namespace cv { namespace gpu {
//! adds one matrix to another (dst = src1 + src2) //! adds one matrix to another (dst = src1 + src2)
...@@ -178,14 +189,25 @@ CV_EXPORTS void transpose(InputArray src1, OutputArray dst, Stream& stream = Str ...@@ -178,14 +189,25 @@ CV_EXPORTS void transpose(InputArray src1, OutputArray dst, Stream& stream = Str
//! supports 1, 3 and 4 channels images with CV_8U, CV_16U, CV_32S or CV_32F depth //! supports 1, 3 and 4 channels images with CV_8U, CV_16U, CV_32S or CV_32F depth
CV_EXPORTS void flip(InputArray src, OutputArray dst, int flipCode, Stream& stream = Stream::Null()); CV_EXPORTS void flip(InputArray src, OutputArray dst, int flipCode, Stream& stream = Stream::Null());
//! implements generalized matrix product algorithm GEMM from BLAS
CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha,
const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null());
//! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i)) //! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
//! destination array will have the depth type as lut and the same channels number as source //! destination array will have the depth type as lut and the same channels number as source
//! supports CV_8UC1, CV_8UC3 types //! supports CV_8UC1, CV_8UC3 types
CV_EXPORTS void LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& stream = Stream::Null()); class CV_EXPORTS LookUpTable : public Algorithm
{
public:
virtual void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0;
};
CV_EXPORTS Ptr<LookUpTable> createLookUpTable(InputArray lut);
__OPENCV_GPUARITHM_DEPR_BEFORE__ void LUT(InputArray src, InputArray lut, OutputArray dst, Stream& stream = Stream::Null()) __OPENCV_GPUARITHM_DEPR_AFTER__;
inline void LUT(InputArray src, InputArray lut, OutputArray dst, Stream& stream)
{
createLookUpTable(lut)->transform(src, dst, stream);
}
//! implements generalized matrix product algorithm GEMM from BLAS
CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha,
const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null());
//! scales and shifts array elements so that either the specified norm (alpha) or the minimum (alpha) and maximum (beta) array values get the specified values //! scales and shifts array elements so that either the specified norm (alpha) or the minimum (alpha) and maximum (beta) array values get the specified values
CV_EXPORTS void normalize(const GpuMat& src, GpuMat& dst, double alpha = 1, double beta = 0, CV_EXPORTS void normalize(const GpuMat& src, GpuMat& dst, double alpha = 1, double beta = 0,
...@@ -311,4 +333,7 @@ CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& resul ...@@ -311,4 +333,7 @@ CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& resul
}} // namespace cv { namespace gpu { }} // namespace cv { namespace gpu {
#undef __OPENCV_GPUARITHM_DEPR_BEFORE__
#undef __OPENCV_GPUARITHM_DEPR_AFTER__
#endif /* __OPENCV_GPUARITHM_HPP__ */ #endif /* __OPENCV_GPUARITHM_HPP__ */
...@@ -224,10 +224,12 @@ PERF_TEST_P(Sz_Type, LutOneChannel, ...@@ -224,10 +224,12 @@ PERF_TEST_P(Sz_Type, LutOneChannel,
if (PERF_RUN_GPU()) if (PERF_RUN_GPU())
{ {
cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
const cv::gpu::GpuMat d_src(src); const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst; cv::gpu::GpuMat dst;
TEST_CYCLE() cv::gpu::LUT(d_src, lut, dst); TEST_CYCLE() lutAlg->transform(d_src, dst);
GPU_SANITY_CHECK(dst); GPU_SANITY_CHECK(dst);
} }
...@@ -259,10 +261,12 @@ PERF_TEST_P(Sz_Type, LutMultiChannel, ...@@ -259,10 +261,12 @@ PERF_TEST_P(Sz_Type, LutMultiChannel,
if (PERF_RUN_GPU()) if (PERF_RUN_GPU())
{ {
cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
const cv::gpu::GpuMat d_src(src); const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst; cv::gpu::GpuMat dst;
TEST_CYCLE() cv::gpu::LUT(d_src, lut, dst); TEST_CYCLE() lutAlg->transform(d_src, dst);
GPU_SANITY_CHECK(dst); GPU_SANITY_CHECK(dst);
} }
......
...@@ -57,7 +57,7 @@ void cv::gpu::transpose(InputArray, OutputArray, Stream&) { throw_no_cuda(); } ...@@ -57,7 +57,7 @@ void cv::gpu::transpose(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
void cv::gpu::flip(InputArray, OutputArray, int, Stream&) { throw_no_cuda(); } void cv::gpu::flip(InputArray, OutputArray, int, Stream&) { throw_no_cuda(); }
void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&, Stream&) { throw_no_cuda(); } Ptr<LookUpTable> cv::gpu::createLookUpTable(InputArray) { throw_no_cuda(); return Ptr<LookUpTable>(); }
void cv::gpu::copyMakeBorder(const GpuMat&, GpuMat&, int, int, int, int, int, const Scalar&, Stream&) { throw_no_cuda(); } void cv::gpu::copyMakeBorder(const GpuMat&, GpuMat&, int, int, int, int, int, const Scalar&, Stream&) { throw_no_cuda(); }
...@@ -290,93 +290,214 @@ void cv::gpu::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& stre ...@@ -290,93 +290,214 @@ void cv::gpu::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& stre
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// LUT // LUT
void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s) #if (CUDA_VERSION >= 5000)
namespace
{ {
const int cn = src.channels(); class LookUpTableImpl : public LookUpTable
{
public:
LookUpTableImpl(InputArray lut);
CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 ); void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
CV_Assert( lut.depth() == CV_8U );
CV_Assert( lut.channels() == 1 || lut.channels() == cn );
CV_Assert( lut.rows * lut.cols == 256 && lut.isContinuous() );
dst.create(src.size(), CV_MAKE_TYPE(lut.depth(), cn)); private:
int lut_cn;
NppiSize sz; int nValues3[3];
sz.height = src.rows; const Npp32s* pValues3[3];
sz.width = src.cols; const Npp32s* pLevels3[3];
Mat nppLut; GpuMat d_pLevels;
lut.convertTo(nppLut, CV_32S); GpuMat d_nppLut;
GpuMat d_nppLut3[3];
};
int nValues3[] = {256, 256, 256}; LookUpTableImpl::LookUpTableImpl(InputArray _lut)
{
nValues3[0] = nValues3[1] = nValues3[2] = 256;
Npp32s pLevels[256]; Npp32s pLevels[256];
for (int i = 0; i < 256; ++i) for (int i = 0; i < 256; ++i)
pLevels[i] = i; pLevels[i] = i;
const Npp32s* pLevels3[3]; d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
#if (CUDA_VERSION <= 4020) GpuMat lut;
pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels; if (_lut.kind() == _InputArray::GPU_MAT)
#else {
GpuMat d_pLevels; lut = _lut.getGpuMat();
d_pLevels.upload(Mat(1, 256, CV_32S, pLevels)); }
pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>(); else
#endif {
Mat hLut = _lut.getMat();
CV_Assert( hLut.total() == 256 && hLut.isContinuous() );
lut.upload(Mat(1, 256, hLut.type(), hLut.data));
}
cudaStream_t stream = StreamAccessor::getStream(s); lut_cn = lut.channels();
NppStreamHandler h(stream);
CV_Assert( lut.depth() == CV_8U );
CV_Assert( lut.rows == 1 && lut.cols == 256 );
lut.convertTo(d_nppLut, CV_32S);
if (lut_cn == 1)
{
pValues3[0] = pValues3[1] = pValues3[2] = d_nppLut.ptr<Npp32s>();
}
else
{
gpu::split(d_nppLut, d_nppLut3);
pValues3[0] = d_nppLut3[0].ptr<Npp32s>();
pValues3[1] = d_nppLut3[1].ptr<Npp32s>();
pValues3[2] = d_nppLut3[2].ptr<Npp32s>();
}
}
if (src.type() == CV_8UC1) void LookUpTableImpl::transform(InputArray _src, OutputArray _dst, Stream& _stream)
{ {
#if (CUDA_VERSION <= 4020) GpuMat src = _src.getGpuMat();
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), pLevels, 256) ); const int cn = src.channels();
#else
GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data)); CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), CV_Assert( lut_cn == 1 || lut_cn == cn );
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), d_pLevels.ptr<Npp32s>(), 256) );
#endif _dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
cudaStream_t stream = StreamAccessor::getStream(_stream);
NppStreamHandler h(stream);
NppiSize sz;
sz.height = src.rows;
sz.width = src.cols;
if (src.type() == CV_8UC1)
{
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), d_pLevels.ptr<Npp32s>(), 256) );
}
else
{
nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
} }
else }
#else // (CUDA_VERSION >= 5000)
namespace
{
class LookUpTableImpl : public LookUpTable
{ {
public:
LookUpTableImpl(InputArray lut);
void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
private:
int lut_cn;
Npp32s pLevels[256];
int nValues3[3];
const Npp32s* pValues3[3]; const Npp32s* pValues3[3];
const Npp32s* pLevels3[3];
Mat nppLut;
Mat nppLut3[3]; Mat nppLut3[3];
if (nppLut.channels() == 1) };
LookUpTableImpl::LookUpTableImpl(InputArray _lut)
{
nValues3[0] = nValues3[1] = nValues3[2] = 256;
for (int i = 0; i < 256; ++i)
pLevels[i] = i;
pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
Mat lut;
if (_lut.kind() == _InputArray::GPU_MAT)
{
lut = Mat(_lut.getGpuMat());
}
else
{
Mat hLut = _lut.getMat();
CV_Assert( hLut.total() == 256 && hLut.isContinuous() );
lut = hLut;
}
lut_cn = lut.channels();
CV_Assert( lut.depth() == CV_8U );
CV_Assert( lut.rows == 1 && lut.cols == 256 );
lut.convertTo(nppLut, CV_32S);
if (lut_cn == 1)
{ {
#if (CUDA_VERSION <= 4020)
pValues3[0] = pValues3[1] = pValues3[2] = nppLut.ptr<Npp32s>(); pValues3[0] = pValues3[1] = pValues3[2] = nppLut.ptr<Npp32s>();
#else
GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
pValues3[0] = pValues3[1] = pValues3[2] = d_nppLut.ptr<Npp32s>();
#endif
} }
else else
{ {
cv::split(nppLut, nppLut3); cv::split(nppLut, nppLut3);
#if (CUDA_VERSION <= 4020)
pValues3[0] = nppLut3[0].ptr<Npp32s>(); pValues3[0] = nppLut3[0].ptr<Npp32s>();
pValues3[1] = nppLut3[1].ptr<Npp32s>(); pValues3[1] = nppLut3[1].ptr<Npp32s>();
pValues3[2] = nppLut3[2].ptr<Npp32s>(); pValues3[2] = nppLut3[2].ptr<Npp32s>();
#else }
GpuMat d_nppLut0(Mat(1, 256, CV_32S, nppLut3[0].data)); }
GpuMat d_nppLut1(Mat(1, 256, CV_32S, nppLut3[1].data));
GpuMat d_nppLut2(Mat(1, 256, CV_32S, nppLut3[2].data));
pValues3[0] = d_nppLut0.ptr<Npp32s>(); void LookUpTableImpl::transform(InputArray _src, OutputArray _dst, Stream& _stream)
pValues3[1] = d_nppLut1.ptr<Npp32s>(); {
pValues3[2] = d_nppLut2.ptr<Npp32s>(); GpuMat src = _src.getGpuMat();
#endif
const int cn = src.channels();
CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
CV_Assert( lut_cn == 1 || lut_cn == cn );
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
cudaStream_t stream = StreamAccessor::getStream(_stream);
NppStreamHandler h(stream);
NppiSize sz;
sz.height = src.rows;
sz.width = src.cols;
if (src.type() == CV_8UC1)
{
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), pLevels, 256) );
}
else
{
nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
} }
nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), if (stream == 0)
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) ); cudaSafeCall( cudaDeviceSynchronize() );
} }
}
if (stream == 0) #endif // (CUDA_VERSION >= 5000)
cudaSafeCall( cudaDeviceSynchronize() );
Ptr<LookUpTable> cv::gpu::createLookUpTable(InputArray lut)
{
return new LookUpTableImpl(lut);
} }
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
......
...@@ -323,8 +323,10 @@ GPU_TEST_P(LUT, OneChannel) ...@@ -323,8 +323,10 @@ GPU_TEST_P(LUT, OneChannel)
cv::Mat src = randomMat(size, type); cv::Mat src = randomMat(size, type);
cv::Mat lut = randomMat(cv::Size(256, 1), CV_8UC1); cv::Mat lut = randomMat(cv::Size(256, 1), CV_8UC1);
cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
cv::gpu::GpuMat dst = createMat(size, CV_MAKE_TYPE(lut.depth(), src.channels())); cv::gpu::GpuMat dst = createMat(size, CV_MAKE_TYPE(lut.depth(), src.channels()));
cv::gpu::LUT(loadMat(src, useRoi), lut, dst); lutAlg->transform(loadMat(src, useRoi), dst);
cv::Mat dst_gold; cv::Mat dst_gold;
cv::LUT(src, lut, dst_gold); cv::LUT(src, lut, dst_gold);
...@@ -337,8 +339,10 @@ GPU_TEST_P(LUT, MultiChannel) ...@@ -337,8 +339,10 @@ GPU_TEST_P(LUT, MultiChannel)
cv::Mat src = randomMat(size, type); cv::Mat src = randomMat(size, type);
cv::Mat lut = randomMat(cv::Size(256, 1), CV_MAKE_TYPE(CV_8U, src.channels())); cv::Mat lut = randomMat(cv::Size(256, 1), CV_MAKE_TYPE(CV_8U, src.channels()));
cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
cv::gpu::GpuMat dst = createMat(size, CV_MAKE_TYPE(lut.depth(), src.channels()), useRoi); cv::gpu::GpuMat dst = createMat(size, CV_MAKE_TYPE(lut.depth(), src.channels()), useRoi);
cv::gpu::LUT(loadMat(src, useRoi), lut, dst); lutAlg->transform(loadMat(src, useRoi), dst);
cv::Mat dst_gold; cv::Mat dst_gold;
cv::LUT(src, lut, dst_gold); cv::LUT(src, lut, dst_gold);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment