Commit 54fa600b authored by Vladislav Vinogradov

update docs

minor fixes and refactoring of GPU module
parent 7d42dbdd
\section{Feature Detection and Description}
\cvclass{gpu::SURFParams\_GPU}\label{class.gpu.SURFParams}
Various SURF algorithm parameters.
\begin{lstlisting}
struct SURFParams_GPU
{
    SURFParams_GPU() : threshold(0.1f), nOctaves(4), nIntervals(4),
        initialScale(2.f), l1(3.f/1.5f), l2(5.f/1.5f), l3(3.f/1.5f),
        l4(1.f/1.5f), edgeScale(0.81f), initialStep(1), extended(true),
        featuresRatio(0.01f) {}

    //! The interest operator threshold
    float threshold;
    //! The number of octaves to process
    int nOctaves;
    //! The number of intervals in each octave
    int nIntervals;
    //! The scale associated with the first interval of the first octave
    float initialScale;

    //! mask parameter l_1
    float l1;
    //! mask parameter l_2
    float l2;
    //! mask parameter l_3
    float l3;
    //! mask parameter l_4
    float l4;

    //! The amount to scale the edge rejection mask
    float edgeScale;
    //! The initial sampling step in pixels
    int initialStep;
    //! True to compute extended (128-element) descriptors,
    //! false for 64-element descriptors
    bool extended;
    //! max features = featuresRatio * img.size().area()
    float featuresRatio;
};
\end{lstlisting}
In contrast to \hyperref[cv.class.SURF]{cv::SURF}, \texttt{SURF\_GPU} works with float sources (with values in the range [0..1]). It performs the conversion after the integral has been computed, by dividing the result by 255. Please take this into consideration when changing parameters such as the Hessian threshold.
The current \texttt{SURF\_GPU} implementation supports the number of intervals in each octave in the range [3..21].
See also: \hyperref[class.gpu.SURF]{cv::gpu::SURF\_GPU}.
\cvclass{gpu::SURF\_GPU}\label{class.gpu.SURF}
Class for extracting Speeded Up Robust Features from an image.
\begin{lstlisting}
@@ -62,7 +110,7 @@ The class \texttt{SURF\_GPU} can store results to GPU and CPU memory and provide
The class \texttt{SURF\_GPU} uses some buffers and provides access to them. All buffers can be safely released between function calls.
See also: \hyperref[cv.class.SURF]{cv::SURF}.
See also: \hyperref[cv.class.SURF]{cv::SURF}, \hyperref[class.gpu.SURFParams]{cv::gpu::SURFParams\_GPU}.
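For illustration, a minimal usage sketch follows (error handling omitted; it assumes \texttt{SURF\_GPU} exposes the \texttt{SURFParams\_GPU} fields directly, and the input file name is illustrative):
\begin{lstlisting}
cv::Mat img = cv::imread("scene.png", 0);   // 8-bit grayscale input
cv::gpu::GpuMat d_img(img);                 // upload to the device

cv::gpu::SURF_GPU surf;
surf.threshold = 0.1f;                      // interest operator threshold

cv::gpu::GpuMat d_keypoints, d_descriptors;
surf(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);

std::vector<cv::KeyPoint> keypoints;
surf.downloadKeypoints(d_keypoints, keypoints);
\end{lstlisting}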
\cvclass{gpu::BruteForceMatcher\_GPU}
@@ -269,7 +317,7 @@ void radiusMatch(const GpuMat\& queryDescs, \par const GpuMat\& trainDescs, \par
void radiusMatch(const GpuMat\& queryDescs, \par std::vector< std::vector<DMatch> >\& matches, \par float maxDistance, \par const std::vector<GpuMat>\& masks = std::vector<GpuMat>(), \par bool compactResult = false);
}
This function works only on devices with Compute Capability $>=$ 1.1.
\textbf{Please note:} This function works only on devices with Compute Capability $>=$ 1.1.
See also: \cvCppCross{DescriptorMatcher::radiusMatch}.
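A sketch of guarding the call at run time, assuming the \texttt{DeviceInfo}/\texttt{FeatureSet} API of the OpenCV 2.x gpu module (\texttt{GLOBAL\_ATOMICS} corresponds to Compute Capability 1.1) and an existing \texttt{matcher} with uploaded descriptors:
\begin{lstlisting}
cv::gpu::DeviceInfo info;                    // current device
if (info.supports(cv::gpu::GLOBAL_ATOMICS))  // CC >= 1.1
{
    std::vector< std::vector<cv::DMatch> > matches;
    matcher.radiusMatch(queryDescs, trainDescs, matches, maxDistance);
}
\end{lstlisting}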
@@ -293,7 +341,8 @@ void radiusMatch(const GpuMat\& queryDescs, \par const GpuMat\& trainDescs, \par
In contrast to \hyperref[cppfunc.gpu.BruteForceMatcher.radiusMatch]{cv::gpu::BruteForceMatcher\_GPU::radiusMatch}, the results are not sorted in order of increasing distance.
This function works only on devices with Compute Capability $>=$ 1.1.
\textbf{Please note:} This function works only on devices with Compute Capability $>=$ 1.1.
\cvfunc{cv::gpu::BruteForceMatcher\_GPU::radiusMatchDownload}\label{cppfunc.gpu.BruteForceMatcher.radiusMatchDownload}
Downloads the \texttt{trainIdx}, \texttt{nMatches} and \texttt{distance} matrices obtained via \hyperref[cppfunc.gpu.BruteForceMatcher.radiusMatchSingle]{radiusMatch} to a CPU vector of \hyperref[cv.class.DMatch]{cv::DMatch}. If \texttt{compactResult} is true, the \texttt{matches} vector will not contain matches for fully masked-out query descriptors.
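For illustration, the two-step flow might look as follows (a sketch only; \texttt{queryDescs}, \texttt{trainDescs} and \texttt{maxDistance} are assumed to exist, and the overloads follow the declarations cited in this section):
\begin{lstlisting}
cv::gpu::BruteForceMatcher_GPU< cv::L2<float> > matcher;

cv::gpu::GpuMat trainIdx, nMatches, distance;
matcher.radiusMatch(queryDescs, trainDescs, trainIdx, nMatches,
                    distance, maxDistance);

std::vector< std::vector<cv::DMatch> > matches;
matcher.radiusMatchDownload(trainIdx, nMatches, distance, matches);
\end{lstlisting}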
......
@@ -17,6 +17,8 @@ Performs mean-shift filtering for each point of the source image. It maps each p
\cvarg{criteria}{Termination criteria. See \hyperref[TermCriteria]{cv::TermCriteria}.}
\end{description}
\textbf{Please note:} This function works only on devices with Compute Capability $>=$ 1.2.
\cvCppFunc{gpu::meanShiftProc}
Performs the mean-shift procedure and stores information about the processed points (i.e. their colors and positions) in two images.
@@ -35,6 +37,8 @@ Performs mean-shift procedure and stores information about processed points (i.e
\cvarg{criteria}{Termination criteria. See \hyperref[TermCriteria]{cv::TermCriteria}.}
\end{description}
\textbf{Please note:} This function works only on devices with Compute Capability $>=$ 1.2.
See also: \cvCppCross{gpu::meanShiftFiltering}.
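A minimal sketch of a mean-shift call (these functions require an 8-bit, 4-channel input; the window radii shown are illustrative):
\begin{lstlisting}
cv::gpu::GpuMat src(cv::imread("scene.png", 1));  // upload BGR image
cv::gpu::GpuMat rgba, dst;
cv::gpu::cvtColor(src, rgba, CV_BGR2BGRA);        // make it 8UC4

cv::gpu::meanShiftFiltering(rgba, dst, 20, 20,
    cv::TermCriteria(cv::TermCriteria::MAX_ITER + cv::TermCriteria::EPS,
                     5, 1));
\end{lstlisting}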
@@ -55,6 +59,8 @@ Performs mean-shift segmentation of the source image and eliminates small segmen
\cvarg{criteria}{Termination criteria. See \hyperref[TermCriteria]{cv::TermCriteria}.}
\end{description}
\textbf{Please note:} This function works only on devices with Compute Capability $>=$ 1.2.
\cvCppFunc{gpu::integral}
Computes the integral image and the squared integral image.
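A short sketch (the output depths shown follow the gpu module documentation of this period and are stated here as an assumption):
\begin{lstlisting}
cv::gpu::GpuMat src;         // CV_8UC1 source
cv::gpu::GpuMat sum, sqsum;  // sum: CV_32SC1, sqsum: CV_32FC1
cv::gpu::integral(src, sum, sqsum);
\end{lstlisting}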
@@ -319,7 +325,7 @@ double threshold(const GpuMat\& src, GpuMat\& dst, double thresh, \par double ma
}
\begin{description}
\cvarg{src}{Source array (single-channel, \texttt{CV\_64F} depth isn't supported).}
\cvarg{src}{Source array (single-channel).}
\cvarg{dst}{Destination array; will have the same size and the same type as \texttt{src}.}
\cvarg{thresh}{Threshold value.}
\cvarg{maxVal}{Maximum value to use with \texttt{THRESH\_BINARY} and \texttt{THRESH\_BINARY\_INV} thresholding types.}
......
No preview for this file type
@@ -582,10 +582,10 @@ namespace cv { namespace gpu { namespace bfmatcher
}
///////////////////////////////////////////////////////////////////////////////
// Match kernel chooser
// Match caller
template <typename Dist, typename T, typename Train, typename Mask>
void match_chooser(const DevMem2D_<T>& queryDescs, const Train& train,
void matchDispatcher(const DevMem2D_<T>& queryDescs, const Train& train,
const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
bool cc_12)
{
@@ -616,11 +616,11 @@ namespace cv { namespace gpu { namespace bfmatcher
if (mask.data)
{
SingleMask m(mask);
match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
matchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
}
else
{
match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
matchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
}
}
@@ -640,11 +640,11 @@ namespace cv { namespace gpu { namespace bfmatcher
if (mask.data)
{
SingleMask m(mask);
match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
}
else
{
match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
}
}
@@ -664,11 +664,11 @@ namespace cv { namespace gpu { namespace bfmatcher
if (maskCollection.data)
{
MaskCollection mask(maskCollection.data);
match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
matchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
}
else
{
match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
matchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
}
}
@@ -688,11 +688,11 @@ namespace cv { namespace gpu { namespace bfmatcher
if (maskCollection.data)
{
MaskCollection mask(maskCollection.data);
match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
}
else
{
match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
}
}
@@ -942,22 +942,35 @@ namespace cv { namespace gpu { namespace bfmatcher
///////////////////////////////////////////////////////////////////////////////
// knn match caller
template <typename Dist, typename T, typename Mask>
void calcDistanceDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs,
const Mask& mask, const DevMem2Df& allDist)
{
calcDistance_caller<16, 16, Dist>(queryDescs, trainDescs, mask, allDist);
}
void findKnnMatchDispatcher(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance,
const DevMem2Df& allDist)
{
findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);
}
template <typename T>
void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)
{
if (mask.data)
{
calcDistance_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
calcDistanceDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
SingleMask(mask), allDist);
}
else
{
calcDistance_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
calcDistanceDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
WithOutMask(), allDist);
}
findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);
findKnnMatchDispatcher(knn, trainIdx, distance, allDist);
}
template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
@@ -973,16 +986,16 @@ namespace cv { namespace gpu { namespace bfmatcher
{
if (mask.data)
{
calcDistance_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
calcDistanceDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
SingleMask(mask), allDist);
}
else
{
calcDistance_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
calcDistanceDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
WithOutMask(), allDist);
}
findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);
findKnnMatchDispatcher(knn, trainIdx, distance, allDist);
}
template void knnMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
@@ -1061,7 +1074,16 @@ namespace cv { namespace gpu { namespace bfmatcher
}
///////////////////////////////////////////////////////////////////////////////
// Radius Match kernel chooser
// Radius Match caller
template <typename Dist, typename T, typename Mask>
void radiusMatchDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs,
float maxDistance, const Mask& mask, const DevMem2Di& trainIdx, unsigned int* nMatches,
const DevMem2Df& distance)
{
radiusMatch_caller<16, 16, Dist>(queryDescs, trainDescs, maxDistance, mask,
trainIdx, nMatches, distance);
}
template <typename T>
void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
@@ -1069,12 +1091,12 @@ namespace cv { namespace gpu { namespace bfmatcher
{
if (mask.data)
{
radiusMatch_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
radiusMatchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
maxDistance, SingleMask(mask), trainIdx, nMatches, distance);
}
else
{
radiusMatch_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
radiusMatchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
maxDistance, WithOutMask(), trainIdx, nMatches, distance);
}
}
@@ -1092,12 +1114,12 @@ namespace cv { namespace gpu { namespace bfmatcher
{
if (mask.data)
{
radiusMatch_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
radiusMatchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
maxDistance, SingleMask(mask), trainIdx, nMatches, distance);
}
else
{
radiusMatch_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
radiusMatchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
maxDistance, WithOutMask(), trainIdx, nMatches, distance);
}
}
......
@@ -44,6 +44,7 @@
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vecmath.hpp"
#include "opencv2/gpu/device/limits_gpu.hpp"
#include "opencv2/gpu/device/transform.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
@@ -94,20 +95,27 @@ namespace cv { namespace gpu { namespace color
return vec.w;
}
template <typename Cvt>
void callConvert(const DevMem2D& src, const DevMem2D& dst, const Cvt& cvt, cudaStream_t stream)
{
typedef typename Cvt::src_t src_t;
typedef typename Cvt::dst_t dst_t;
transform((DevMem2D_<src_t>)src, (DevMem2D_<dst_t>)dst, cvt, stream);
}
////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////
template <int SRCCN, int DSTCN, typename T>
__global__ void RGB2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)
template <typename T, int SRCCN, int DSTCN>
struct RGB2RGB
{
typedef typename TypeVec<T, SRCCN>::vec_t src_t;
typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
explicit RGB2RGB(int bidx) : bidx(bidx) {}
if (y < rows && x < cols)
__device__ dst_t operator()(const src_t& src) const
{
src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T));
dst_t dst;
dst.x = (&src.x)[bidx];
@@ -115,25 +123,18 @@ namespace cv { namespace gpu { namespace color
dst.z = (&src.x)[bidx ^ 2];
setAlpha(dst, getAlpha<T>(src));
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
}
return dst;
}
private:
int bidx;
};
template <typename T, int SRCCN, int DSTCN>
void RGB2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
RGB2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
RGB2RGB<T, SRCCN, DSTCN> cvt(bidx);
callConvert(src, dst, cvt, stream);
}
void RGB2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, cudaStream_t stream)
@@ -174,110 +175,90 @@ namespace cv { namespace gpu { namespace color
/////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB //////////
template <int GREEN_BITS, int DSTCN> struct RGB5x52RGBConverter {};
template <int DSTCN> struct RGB5x52RGBConverter<5, DSTCN>
template <int GREEN_BITS> struct RGB5x52RGBConverter;
template <> struct RGB5x52RGBConverter<5>
{
typedef typename TypeVec<uchar, DSTCN>::vec_t dst_t;
static __device__ dst_t cvt(uint src, int bidx)
template <typename D>
static __device__ void cvt(uint src, D& dst, int bidx)
{
dst_t dst;
(&dst.x)[bidx] = (uchar)(src << 3);
dst.y = (uchar)((src >> 2) & ~7);
(&dst.x)[bidx ^ 2] = (uchar)((src >> 7) & ~7);
setAlpha(dst, (uchar)(src & 0x8000 ? 255 : 0));
return dst;
}
};
template <int DSTCN> struct RGB5x52RGBConverter<6, DSTCN>
template <> struct RGB5x52RGBConverter<6>
{
typedef typename TypeVec<uchar, DSTCN>::vec_t dst_t;
static __device__ dst_t cvt(uint src, int bidx)
template <typename D>
static __device__ void cvt(uint src, D& dst, int bidx)
{
dst_t dst;
(&dst.x)[bidx] = (uchar)(src << 3);
dst.y = (uchar)((src >> 3) & ~3);
(&dst.x)[bidx ^ 2] = (uchar)((src >> 8) & ~7);
setAlpha(dst, (uchar)(255));
return dst;
}
};
template <int GREEN_BITS, int DSTCN>
__global__ void RGB5x52RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)
template <int GREEN_BITS, int DSTCN> struct RGB5x52RGB
{
typedef ushort src_t;
typedef typename TypeVec<uchar, DSTCN>::vec_t dst_t;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
explicit RGB5x52RGB(int bidx) : bidx(bidx) {}
if (y < rows && x < cols)
__device__ dst_t operator()(ushort src) const
{
uint src = *(const ushort*)(src_ + y * src_step + (x << 1));
*(dst_t*)(dst_ + y * dst_step + x * DSTCN) = RGB5x52RGBConverter<GREEN_BITS, DSTCN>::cvt(src, bidx);
}
dst_t dst;
RGB5x52RGBConverter<GREEN_BITS>::cvt((uint)src, dst, bidx);
return dst;
}
template <int SRCCN, int GREEN_BITS> struct RGB2RGB5x5Converter {};
template<int SRCCN> struct RGB2RGB5x5Converter<SRCCN, 6>
private:
int bidx;
};
template <int GREEN_BITS> struct RGB2RGB5x5Converter;
template<> struct RGB2RGB5x5Converter<6>
{
static __device__ ushort cvt(const uchar* src, int bidx)
template <typename T>
static __device__ ushort cvt(const T& src, int bidx)
{
return (ushort)((src[bidx] >> 3) | ((src[1] & ~3) << 3) | ((src[bidx^2] & ~7) << 8));
return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~3) << 3) | (((&src.x)[bidx^2] & ~7) << 8));
}
};
template<> struct RGB2RGB5x5Converter<3, 5>
template<> struct RGB2RGB5x5Converter<5>
{
static __device__ ushort cvt(const uchar* src, int bidx)
static __device__ ushort cvt(const uchar3& src, int bidx)
{
return (ushort)((src[bidx] >> 3) | ((src[1] & ~7) << 2) | ((src[bidx^2] & ~7) << 7));
return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~7) << 2) | (((&src.x)[bidx^2] & ~7) << 7));
}
};
template<> struct RGB2RGB5x5Converter<4, 5>
{
static __device__ ushort cvt(const uchar* src, int bidx)
static __device__ ushort cvt(const uchar4& src, int bidx)
{
return (ushort)((src[bidx] >> 3) | ((src[1] & ~7) << 2) | ((src[bidx^2] & ~7) << 7) | (src[3] ? 0x8000 : 0));
return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~7) << 2) | (((&src.x)[bidx^2] & ~7) << 7) | (src.w ? 0x8000 : 0));
}
};
template<int SRCCN, int GREEN_BITS>
__global__ void RGB2RGB5x5(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)
template<int SRCCN, int GREEN_BITS> struct RGB2RGB5x5
{
typedef typename TypeVec<uchar, SRCCN>::vec_t src_t;
typedef ushort dst_t;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
explicit RGB2RGB5x5(int bidx) : bidx(bidx) {}
if (y < rows && x < cols)
__device__ ushort operator()(const src_t& src)
{
src_t src = *(src_t*)(src_ + y * src_step + x * SRCCN);
*(ushort*)(dst_ + y * dst_step + (x << 1)) = RGB2RGB5x5Converter<SRCCN, GREEN_BITS>::cvt(&src.x, bidx);
}
return RGB2RGB5x5Converter<GREEN_BITS>::cvt(src, bidx);
}
private:
int bidx;
};
template <int GREEN_BITS, int DSTCN>
void RGB5x52RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
RGB5x52RGB<GREEN_BITS, DSTCN><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
RGB5x52RGB<GREEN_BITS, DSTCN> cvt(bidx);
callConvert(src, dst, cvt, stream);
}
void RGB5x52RGB_gpu(const DevMem2D& src, int green_bits, const DevMem2D& dst, int dstcn, int bidx, cudaStream_t stream)
@@ -295,18 +276,8 @@ namespace cv { namespace gpu { namespace color
template <int SRCCN, int GREEN_BITS>
void RGB2RGB5x5_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
RGB2RGB5x5<SRCCN, GREEN_BITS><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
RGB2RGB5x5<SRCCN, GREEN_BITS> cvt(bidx);
callConvert(src, dst, cvt, stream);
}
void RGB2RGB5x5_gpu(const DevMem2D& src, int srccn, const DevMem2D& dst, int green_bits, int bidx, cudaStream_t stream)
@@ -323,27 +294,23 @@ namespace cv { namespace gpu { namespace color
///////////////////////////////// Grayscale to Color ////////////////////////////////
template <int DSTCN, typename T>
__global__ void Gray2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols)
template <int DSTCN, typename T> struct Gray2RGB
{
typedef T src_t;
typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (y < rows && x < cols)
__device__ dst_t operator()(const T& src) const
{
T src = *(const T*)(src_ + y * src_step + x * sizeof(T));
dst_t dst;
dst.x = src;
dst.y = src;
dst.z = src;
dst.z = dst.y = dst.x = src;
setAlpha(dst, ColorChannel<T>::max());
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
}
return dst;
}
};
template <int GREEN_BITS> struct Gray2RGB5x5Converter {};
template <int GREEN_BITS> struct Gray2RGB5x5Converter;
template<> struct Gray2RGB5x5Converter<6>
{
static __device__ ushort cvt(uint t)
@@ -360,35 +327,22 @@ namespace cv { namespace gpu { namespace color
}
};
template<int GREEN_BITS>
__global__ void Gray2RGB5x5(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols)
template<int GREEN_BITS> struct Gray2RGB5x5
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
typedef uchar src_t;
typedef ushort dst_t;
if (y < rows && x < cols)
__device__ ushort operator()(uchar src) const
{
uint src = src_[y * src_step + x];
*(ushort*)(dst_ + y * dst_step + (x << 1)) = Gray2RGB5x5Converter<GREEN_BITS>::cvt(src);
}
return Gray2RGB5x5Converter<GREEN_BITS>::cvt((uint)src);
}
};
template <typename T, int DSTCN>
void Gray2RGB_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
Gray2RGB<DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
Gray2RGB<DSTCN, T> cvt;
callConvert(src, dst, cvt, stream);
}
void Gray2RGB_gpu_8u(const DevMem2D& src, const DevMem2D& dst, int dstcn, cudaStream_t stream)
@@ -418,18 +372,8 @@ namespace cv { namespace gpu { namespace color
template <int GREEN_BITS>
void Gray2RGB5x5_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
Gray2RGB5x5<GREEN_BITS><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
Gray2RGB5x5<GREEN_BITS> cvt;
callConvert(src, dst, cvt, stream);
}
void Gray2RGB5x5_gpu(const DevMem2D& src, const DevMem2D& dst, int green_bits, cudaStream_t stream)
@@ -459,7 +403,7 @@ namespace cv { namespace gpu { namespace color
BLOCK_SIZE = 256
};
template <int GREEN_BITS> struct RGB5x52GrayConverter {};
template <int GREEN_BITS> struct RGB5x52GrayConverter;
template<> struct RGB5x52GrayConverter<6>
{
static __device__ uchar cvt(uint t)
@@ -475,30 +419,23 @@ namespace cv { namespace gpu { namespace color
}
};
template<int GREEN_BITS>
__global__ void RGB5x52Gray(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols)
template<int GREEN_BITS> struct RGB5x52Gray
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
typedef ushort src_t;
typedef uchar dst_t;
if (y < rows && x < cols)
__device__ uchar operator()(ushort src) const
{
uint src = *(ushort*)(src_ + y * src_step + (x << 1));
dst_[y * dst_step + x] = RGB5x52GrayConverter<GREEN_BITS>::cvt(src);
}
return RGB5x52GrayConverter<GREEN_BITS>::cvt((uint)src);
}
};
template <typename T> struct RGB2GrayConvertor
{
static __device__ T cvt(const T* src, int bidx)
template <typename T>
__device__ T RGB2GrayConvert(const T* src, int bidx)
{
return (T)CV_DESCALE((unsigned)(src[bidx] * B2Y + src[1] * G2Y + src[bidx^2] * R2Y), yuv_shift);
}
};
template <> struct RGB2GrayConvertor<float>
{
static __device__ float cvt(const float* src, int bidx)
__device__ float RGB2GrayConvert(const float* src, int bidx)
{
const float cr = 0.299f;
const float cg = 0.587f;
@@ -506,39 +443,28 @@ namespace cv { namespace gpu { namespace color
return src[bidx] * cb + src[1] * cg + src[bidx^2] * cr;
}
};
template <int SRCCN, typename T>
__global__ void RGB2Gray(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)
template <int SRCCN, typename T> struct RGB2Gray
{
typedef typename TypeVec<T, SRCCN>::vec_t src_t;
typedef T dst_t;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
explicit RGB2Gray(int bidx) : bidx(bidx) {}
if (y < rows && x < cols)
__device__ T operator()(const src_t& src)
{
src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T));
*(T*)(dst_ + y * dst_step + x * sizeof(T)) = RGB2GrayConvertor<T>::cvt(&src.x, bidx);
}
return RGB2GrayConvert(&src.x, bidx);
}
private:
int bidx;
};
template <typename T, int SRCCN>
void RGB2Gray_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
RGB2Gray<SRCCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
RGB2Gray<SRCCN, T> cvt(bidx);
callConvert(src, dst, cvt, stream);
}
void RGB2Gray_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, cudaStream_t stream)
@@ -568,18 +494,8 @@ namespace cv { namespace gpu { namespace color
template <int GREEN_BITS>
void RGB5x52Gray_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
RGB5x52Gray<GREEN_BITS><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
RGB5x52Gray<GREEN_BITS> cvt;
callConvert(src, dst, cvt, stream);
}
void RGB5x52Gray_gpu(const DevMem2D& src, int green_bits, const DevMem2D& dst, cudaStream_t stream)
@@ -595,13 +511,11 @@ namespace cv { namespace gpu { namespace color
///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
__constant__ float cYCrCbCoeffs_f[5];
__constant__ int cYCrCbCoeffs_i[5];
__constant__ float cYCrCbCoeffs_f[5];
template <typename T> struct RGB2YCrCbConverter
{
template <typename D>
static __device__ void cvt(const T* src, D& dst, int bidx)
template <typename T, typename D>
__device__ void RGB2YCrCbConvert(const T* src, D& dst, int bidx)
{
const int delta = ColorChannel<T>::half() * (1 << yuv_shift);
@@ -613,42 +527,53 @@ namespace cv { namespace gpu { namespace color
dst.y = saturate_cast<T>(Cr);
dst.z = saturate_cast<T>(Cb);
}
};
template<> struct RGB2YCrCbConverter<float>
{
template <typename D>
static __device__ void cvt(const float* src, D& dst, int bidx)
static __device__ void RGB2YCrCbConvert(const float* src, D& dst, int bidx)
{
dst.x = src[0] * cYCrCbCoeffs_f[0] + src[1] * cYCrCbCoeffs_f[1] + src[2] * cYCrCbCoeffs_f[2];
dst.y = (src[bidx^2] - dst.x) * cYCrCbCoeffs_f[3] + ColorChannel<float>::half();
dst.z = (src[bidx] - dst.x) * cYCrCbCoeffs_f[4] + ColorChannel<float>::half();
}
template<typename T> struct RGB2YCrCbBase
{
typedef int coeff_t;
explicit RGB2YCrCbBase(const coeff_t coeffs[5])
{
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) );
}
};
template<> struct RGB2YCrCbBase<float>
{
typedef float coeff_t;
template <int SRCCN, int DSTCN, typename T>
__global__ void RGB2YCrCb(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)
explicit RGB2YCrCbBase(const coeff_t coeffs[5])
{
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_f, coeffs, 5 * sizeof(float)) );
}
};
template <int SRCCN, int DSTCN, typename T> struct RGB2YCrCb : RGB2YCrCbBase<T>
{
typedef typename RGB2YCrCbBase<T>::coeff_t coeff_t;
typedef typename TypeVec<T, SRCCN>::vec_t src_t;
typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
RGB2YCrCb(int bidx, const coeff_t coeffs[5]) : RGB2YCrCbBase<T>(coeffs), bidx(bidx) {}
if (y < rows && x < cols)
__device__ dst_t operator()(const src_t& src) const
{
src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T));
dst_t dst;
RGB2YCrCbConverter<T>::cvt(&src.x, dst, bidx);
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
}
RGB2YCrCbConvert(&src.x, dst, bidx);
return dst;
}
template <typename D> struct YCrCb2RGBConvertor
{
template <typename T>
static __device__ void cvt(const T& src, D* dst, int bidx)
private:
int bidx;
};
template <typename T, typename D>
__device__ void YCrCb2RGBConvert(const T& src, D* dst, int bidx)
{
const int b = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * cYCrCbCoeffs_i[3], yuv_shift);
const int g = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * cYCrCbCoeffs_i[2] + (src.y - ColorChannel<D>::half()) * cYCrCbCoeffs_i[1], yuv_shift);
@@ -658,372 +583,374 @@ namespace cv { namespace gpu { namespace color
dst[1] = saturate_cast<D>(g);
dst[bidx^2] = saturate_cast<D>(r);
}
};
template <> struct YCrCb2RGBConvertor<float>
{
template <typename T>
static __device__ void cvt(const T& src, float* dst, int bidx)
__device__ void YCrCb2RGBConvert(const T& src, float* dst, int bidx)
{
dst[bidx] = src.x + (src.z - ColorChannel<float>::half()) * cYCrCbCoeffs_f[3];
dst[1] = src.x + (src.z - ColorChannel<float>::half()) * cYCrCbCoeffs_f[2] + (src.y - ColorChannel<float>::half()) * cYCrCbCoeffs_f[1];
dst[bidx^2] = src.x + (src.y - ColorChannel<float>::half()) * cYCrCbCoeffs_f[0];
}
template<typename T> struct YCrCb2RGBBase
{
typedef int coeff_t;
explicit YCrCb2RGBBase(const coeff_t coeffs[4])
{
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) );
}
};
template<> struct YCrCb2RGBBase<float>
{
typedef float coeff_t;
template <int SRCCN, int DSTCN, typename T>
__global__ void YCrCb2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)
explicit YCrCb2RGBBase(const coeff_t coeffs[4])
{
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_f, coeffs, 4 * sizeof(float)) );
}
};
template <int SRCCN, int DSTCN, typename T> struct YCrCb2RGB : YCrCb2RGBBase<T>
{
typedef typename YCrCb2RGBBase<T>::coeff_t coeff_t;
typedef typename TypeVec<T, SRCCN>::vec_t src_t;
typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
YCrCb2RGB(int bidx, const coeff_t coeffs[4]) : YCrCb2RGBBase<T>(coeffs), bidx(bidx) {}
if (y < rows && x < cols)
__device__ dst_t operator()(const src_t& src) const
{
src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T));
dst_t dst;
YCrCb2RGBConvertor<T>::cvt(src, &dst.x, bidx);
YCrCb2RGBConvert(src, &dst.x, bidx);
setAlpha(dst, ColorChannel<T>::max());
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
}
return dst;
}
private:
int bidx;
};
template <typename T, int SRCCN, int DSTCN>
void RGB2YCrCb_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
void RGB2YCrCb_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
RGB2YCrCb<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
typedef typename RGB2YCrCb<SRCCN, DSTCN, T>::coeff_t coeff_t;
RGB2YCrCb<SRCCN, DSTCN, T> cvt(bidx, (const coeff_t*)coeffs);
callConvert(src, dst, cvt, stream);
}
void RGB2YCrCb_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream)
{
typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);
typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream);
static const RGB2YCrCb_caller_t RGB2YCrCb_callers[2][2] =
{
{RGB2YCrCb_caller<uchar, 3, 3>, RGB2YCrCb_caller<uchar, 3, 4>},
{RGB2YCrCb_caller<uchar, 4, 3>, RGB2YCrCb_caller<uchar, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) );
RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, coeffs, stream);
}
void RGB2YCrCb_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream)
{
typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);
typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream);
static const RGB2YCrCb_caller_t RGB2YCrCb_callers[2][2] =
{
{RGB2YCrCb_caller<ushort, 3, 3>, RGB2YCrCb_caller<ushort, 3, 4>},
{RGB2YCrCb_caller<ushort, 4, 3>, RGB2YCrCb_caller<ushort, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) );
RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, coeffs, stream);
}
void RGB2YCrCb_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream)
{
typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);
typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream);
static const RGB2YCrCb_caller_t RGB2YCrCb_callers[2][2] =
{
{RGB2YCrCb_caller<float, 3, 3>, RGB2YCrCb_caller<float, 3, 4>},
{RGB2YCrCb_caller<float, 4, 3>, RGB2YCrCb_caller<float, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_f, coeffs, 5 * sizeof(float)) );
RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, coeffs, stream);
}
template <typename T, int SRCCN, int DSTCN>
void YCrCb2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
void YCrCb2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
YCrCb2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
typedef typename YCrCb2RGB<SRCCN, DSTCN, T>::coeff_t coeff_t;
YCrCb2RGB<SRCCN, DSTCN, T> cvt(bidx, (const coeff_t*)coeffs);
callConvert(src, dst, cvt, stream);
}
void YCrCb2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream)
{
typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);
typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream);
static const YCrCb2RGB_caller_t YCrCb2RGB_callers[2][2] =
{
{YCrCb2RGB_caller<uchar, 3, 3>, YCrCb2RGB_caller<uchar, 3, 4>},
{YCrCb2RGB_caller<uchar, 4, 3>, YCrCb2RGB_caller<uchar, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) );
YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, coeffs, stream);
}
void YCrCb2RGB_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream)
{
typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);
typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream);
static const YCrCb2RGB_caller_t YCrCb2RGB_callers[2][2] =
{
{YCrCb2RGB_caller<ushort, 3, 3>, YCrCb2RGB_caller<ushort, 3, 4>},
{YCrCb2RGB_caller<ushort, 4, 3>, YCrCb2RGB_caller<ushort, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) );
YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, coeffs, stream);
}
void YCrCb2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream)
{
typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);
typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream);
static const YCrCb2RGB_caller_t YCrCb2RGB_callers[2][2] =
{
{YCrCb2RGB_caller<float, 3, 3>, YCrCb2RGB_caller<float, 3, 4>},
{YCrCb2RGB_caller<float, 4, 3>, YCrCb2RGB_caller<float, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_f, coeffs, 4 * sizeof(float)) );
YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, coeffs, stream);
}
////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////
__constant__ float cXYZ_D65f[9];
__constant__ int cXYZ_D65i[9];
__constant__ float cXYZ_D65f[9];
template <typename T> struct RGB2XYZConvertor
{
template <typename D>
static __device__ void cvt(const T* src, D& dst)
template <typename T, typename D>
__device__ void RGB2XYZConvert(const T* src, D& dst)
{
dst.x = saturate_cast<T>(CV_DESCALE(src[0] * cXYZ_D65i[0] + src[1] * cXYZ_D65i[1] + src[2] * cXYZ_D65i[2], xyz_shift));
dst.y = saturate_cast<T>(CV_DESCALE(src[0] * cXYZ_D65i[3] + src[1] * cXYZ_D65i[4] + src[2] * cXYZ_D65i[5], xyz_shift));
dst.z = saturate_cast<T>(CV_DESCALE(src[0] * cXYZ_D65i[6] + src[1] * cXYZ_D65i[7] + src[2] * cXYZ_D65i[8], xyz_shift));
}
};
template <> struct RGB2XYZConvertor<float>
{
template <typename D>
static __device__ void cvt(const float* src, D& dst)
__device__ void RGB2XYZConvert(const float* src, D& dst)
{
dst.x = src[0] * cXYZ_D65f[0] + src[1] * cXYZ_D65f[1] + src[2] * cXYZ_D65f[2];
dst.y = src[0] * cXYZ_D65f[3] + src[1] * cXYZ_D65f[4] + src[2] * cXYZ_D65f[5];
dst.z = src[0] * cXYZ_D65f[6] + src[1] * cXYZ_D65f[7] + src[2] * cXYZ_D65f[8];
}
template <typename T> struct RGB2XYZBase
{
typedef int coeff_t;
explicit RGB2XYZBase(const coeff_t coeffs[9])
{
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) );
}
};
template <> struct RGB2XYZBase<float>
{
typedef float coeff_t;
template <int SRCCN, int DSTCN, typename T>
__global__ void RGB2XYZ(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols)
explicit RGB2XYZBase(const coeff_t coeffs[9])
{
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65f, coeffs, 9 * sizeof(float)) );
}
};
template <int SRCCN, int DSTCN, typename T> struct RGB2XYZ : RGB2XYZBase<T>
{
typedef typename RGB2XYZBase<T>::coeff_t coeff_t;
typedef typename TypeVec<T, SRCCN>::vec_t src_t;
typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
explicit RGB2XYZ(const coeff_t coeffs[9]) : RGB2XYZBase<T>(coeffs) {}
if (y < rows && x < cols)
__device__ dst_t operator()(const src_t& src) const
{
src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T));
dst_t dst;
RGB2XYZConvertor<T>::cvt(&src.x, dst);
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
}
RGB2XYZConvert(&src.x, dst);
return dst;
}
};
template <typename D> struct XYZ2RGBConvertor
{
template <typename T>
static __device__ void cvt(const T& src, D* dst)
template <typename T, typename D>
__device__ void XYZ2RGBConvert(const T& src, D* dst)
{
dst[0] = saturate_cast<D>(CV_DESCALE(src.x * cXYZ_D65i[0] + src.y * cXYZ_D65i[1] + src.z * cXYZ_D65i[2], xyz_shift));
dst[1] = saturate_cast<D>(CV_DESCALE(src.x * cXYZ_D65i[3] + src.y * cXYZ_D65i[4] + src.z * cXYZ_D65i[5], xyz_shift));
dst[2] = saturate_cast<D>(CV_DESCALE(src.x * cXYZ_D65i[6] + src.y * cXYZ_D65i[7] + src.z * cXYZ_D65i[8], xyz_shift));
}
};
template <> struct XYZ2RGBConvertor<float>
{
template <typename T>
static __device__ void cvt(const T& src, float* dst)
__device__ void XYZ2RGBConvert(const T& src, float* dst)
{
dst[0] = src.x * cXYZ_D65f[0] + src.y * cXYZ_D65f[1] + src.z * cXYZ_D65f[2];
dst[1] = src.x * cXYZ_D65f[3] + src.y * cXYZ_D65f[4] + src.z * cXYZ_D65f[5];
dst[2] = src.x * cXYZ_D65f[6] + src.y * cXYZ_D65f[7] + src.z * cXYZ_D65f[8];
}
template <typename T> struct XYZ2RGBBase
{
typedef int coeff_t;
explicit XYZ2RGBBase(const coeff_t coeffs[9])
{
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) );
}
};
template <> struct XYZ2RGBBase<float>
{
typedef float coeff_t;
template <int SRCCN, int DSTCN, typename T>
__global__ void XYZ2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols)
explicit XYZ2RGBBase(const coeff_t coeffs[9])
{
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65f, coeffs, 9 * sizeof(float)) );
}
};
template <int SRCCN, int DSTCN, typename T> struct XYZ2RGB : XYZ2RGBBase<T>
{
typedef typename RGB2XYZBase<T>::coeff_t coeff_t;
typedef typename TypeVec<T, SRCCN>::vec_t src_t;
typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
explicit XYZ2RGB(const coeff_t coeffs[9]) : XYZ2RGBBase<T>(coeffs) {}
if (y < rows && x < cols)
__device__ dst_t operator()(const src_t& src) const
{
src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T));
dst_t dst;
XYZ2RGBConvertor<T>::cvt(src, &dst.x);
XYZ2RGBConvert(src, &dst.x);
setAlpha(dst, ColorChannel<T>::max());
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
}
return dst;
}
};
template <typename T, int SRCCN, int DSTCN>
void RGB2XYZ_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
void RGB2XYZ_caller(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
RGB2XYZ<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
typedef typename RGB2XYZ<SRCCN, DSTCN, T>::coeff_t coeff_t;
RGB2XYZ<SRCCN, DSTCN, T> cvt((const coeff_t*)coeffs);
callConvert(src, dst, cvt, stream);
}
void RGB2XYZ_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream)
{
typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream);
static const RGB2XYZ_caller_t RGB2XYZ_callers[2][2] =
{
{RGB2XYZ_caller<uchar, 3, 3>, RGB2XYZ_caller<uchar, 3, 4>},
{RGB2XYZ_caller<uchar, 4, 3>, RGB2XYZ_caller<uchar, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) );
RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream);
RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, coeffs, stream);
}
void RGB2XYZ_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream)
{
typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream);
static const RGB2XYZ_caller_t RGB2XYZ_callers[2][2] =
{
{RGB2XYZ_caller<ushort, 3, 3>, RGB2XYZ_caller<ushort, 3, 4>},
{RGB2XYZ_caller<ushort, 4, 3>, RGB2XYZ_caller<ushort, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) );
RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream);
RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, coeffs, stream);
}
void RGB2XYZ_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream)
{
typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream);
static const RGB2XYZ_caller_t RGB2XYZ_callers[2][2] =
{
{RGB2XYZ_caller<float, 3, 3>, RGB2XYZ_caller<float, 3, 4>},
{RGB2XYZ_caller<float, 4, 3>, RGB2XYZ_caller<float, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65f, coeffs, 9 * sizeof(float)) );
RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream);
RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, coeffs, stream);
}
template <typename T, int SRCCN, int DSTCN>
void XYZ2RGB_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
void XYZ2RGB_caller(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
XYZ2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
typedef typename XYZ2RGB<SRCCN, DSTCN, T>::coeff_t coeff_t;
XYZ2RGB<SRCCN, DSTCN, T> cvt((const coeff_t*)coeffs);
callConvert(src, dst, cvt, stream);
}
void XYZ2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream)
{
typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream);
static const XYZ2RGB_caller_t XYZ2RGB_callers[2][2] =
{
{XYZ2RGB_caller<uchar, 3, 3>, XYZ2RGB_caller<uchar, 3, 4>},
{XYZ2RGB_caller<uchar, 4, 3>, XYZ2RGB_caller<uchar, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) );
XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream);
XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, coeffs, stream);
}
void XYZ2RGB_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream)
{
typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream);
static const XYZ2RGB_caller_t XYZ2RGB_callers[2][2] =
{
{XYZ2RGB_caller<ushort, 3, 3>, XYZ2RGB_caller<ushort, 3, 4>},
{XYZ2RGB_caller<ushort, 4, 3>, XYZ2RGB_caller<ushort, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) );
XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream);
XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, coeffs, stream);
}
void XYZ2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream)
{
typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream);
static const XYZ2RGB_caller_t XYZ2RGB_callers[2][2] =
{
{XYZ2RGB_caller<float, 3, 3>, XYZ2RGB_caller<float, 3, 4>},
{XYZ2RGB_caller<float, 4, 3>, XYZ2RGB_caller<float, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65f, coeffs, 9 * sizeof(float)) );
XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream);
XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, coeffs, stream);
}
////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////
__constant__ int cHsvDivTable[256];
template<typename T, int HR> struct RGB2HSVConvertor;
template<int HR> struct RGB2HSVConvertor<uchar, HR>
__constant__ int cHsvDivTable[256] =
{
template <typename D>
static __device__ void cvt(const uchar* src, D& dst, int bidx)
0, 1044480, 522240, 348160, 261120, 208896, 174080, 149211,
130560, 116053, 104448, 94953, 87040, 80345, 74606, 69632,
65280, 61440, 58027, 54973, 52224, 49737, 47476, 45412,
43520, 41779, 40172, 38684, 37303, 36017, 34816, 33693,
32640, 31651, 30720, 29842, 29013, 28229, 27486, 26782,
26112, 25475, 24869, 24290, 23738, 23211, 22706, 22223,
21760, 21316, 20890, 20480, 20086, 19707, 19342, 18991,
18651, 18324, 18008, 17703, 17408, 17123, 16846, 16579,
16320, 16069, 15825, 15589, 15360, 15137, 14921, 14711,
14507, 14308, 14115, 13926, 13743, 13565, 13391, 13221,
13056, 12895, 12738, 12584, 12434, 12288, 12145, 12006,
11869, 11736, 11605, 11478, 11353, 11231, 11111, 10995,
10880, 10768, 10658, 10550, 10445, 10341, 10240, 10141,
10043, 9947, 9854, 9761, 9671, 9582, 9495, 9410,
9326, 9243, 9162, 9082, 9004, 8927, 8852, 8777,
8704, 8632, 8561, 8492, 8423, 8356, 8290, 8224,
8160, 8097, 8034, 7973, 7913, 7853, 7795, 7737,
7680, 7624, 7569, 7514, 7461, 7408, 7355, 7304,
7253, 7203, 7154, 7105, 7057, 7010, 6963, 6917,
6872, 6827, 6782, 6739, 6695, 6653, 6611, 6569,
6528, 6487, 6447, 6408, 6369, 6330, 6292, 6254,
6217, 6180, 6144, 6108, 6073, 6037, 6003, 5968,
5935, 5901, 5868, 5835, 5803, 5771, 5739, 5708,
5677, 5646, 5615, 5585, 5556, 5526, 5497, 5468,
5440, 5412, 5384, 5356, 5329, 5302, 5275, 5249,
5222, 5196, 5171, 5145, 5120, 5095, 5070, 5046,
5022, 4998, 4974, 4950, 4927, 4904, 4881, 4858,
4836, 4813, 4791, 4769, 4748, 4726, 4705, 4684,
4663, 4642, 4622, 4601, 4581, 4561, 4541, 4522,
4502, 4483, 4464, 4445, 4426, 4407, 4389, 4370,
4352, 4334, 4316, 4298, 4281, 4263, 4246, 4229,
4212, 4195, 4178, 4161, 4145, 4128, 4112, 4096
};
template <int HR, typename D>
__device__ void RGB2HSVConvert(const uchar* src, D& dst, int bidx)
{
const int hsv_shift = 12;
const int hscale = HR == 180 ? 15 : 21;
@@ -1051,11 +978,8 @@ namespace cv { namespace gpu { namespace color
dst.y = (uchar)s;
dst.z = (uchar)v;
}
};
template<int HR> struct RGB2HSVConvertor<float, HR>
{
template <typename D>
static __device__ void cvt(const float* src, D& dst, int bidx)
template<int HR, typename D>
__device__ void RGB2HSVConvert(const float* src, D& dst, int bidx)
{
const float hscale = HR * (1.f / 360.f);
@@ -1087,35 +1011,32 @@ namespace cv { namespace gpu { namespace color
dst.y = s;
dst.z = v;
}
};
template <int SRCCN, int DSTCN, int HR, typename T>
__global__ void RGB2HSV(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)
template <int SRCCN, int DSTCN, int HR, typename T> struct RGB2HSV
{
typedef typename TypeVec<T, SRCCN>::vec_t src_t;
typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
explicit RGB2HSV(int bidx) : bidx(bidx) {}
if (y < rows && x < cols)
__device__ dst_t operator()(const src_t& src) const
{
src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T));
dst_t dst;
RGB2HSVConvertor<T, HR>::cvt(&src.x, dst, bidx);
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
}
RGB2HSVConvert<HR>(&src.x, dst, bidx);
return dst;
}
__constant__ int cHsvSectorData[6][3];
private:
int bidx;
};
template<typename T, int HR> struct HSV2RGBConvertor;
template<int HR> struct HSV2RGBConvertor<float, HR>
__constant__ int cHsvSectorData[6][3] =
{
template <typename T>
static __device__ void cvt(const T& src, float* dst, int bidx)
{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}
};
template <int HR, typename T>
__device__ void HSV2RGBConvert(const T& src, float* dst, int bidx)
{
const float hscale = 6.f / HR;
@@ -1150,11 +1071,8 @@ namespace cv { namespace gpu { namespace color
dst[1] = g;
dst[bidx^2] = r;
}
};
template<int HR> struct HSV2RGBConvertor<uchar, HR>
{
template <typename T>
static __device__ void cvt(const T& src, uchar* dst, int bidx)
template <int HR, typename T>
__device__ void HSV2RGBConvert(const T& src, uchar* dst, int bidx)
{
float3 buf;
@@ -1162,55 +1080,45 @@ namespace cv { namespace gpu { namespace color
buf.y = src.y * (1.f/255.f);
buf.z = src.z * (1.f/255.f);
HSV2RGBConvertor<float, HR>::cvt(buf, &buf.x, bidx);
HSV2RGBConvert<HR>(buf, &buf.x, bidx);
dst[0] = saturate_cast<uchar>(buf.x * 255.f);
dst[1] = saturate_cast<uchar>(buf.y * 255.f);
dst[2] = saturate_cast<uchar>(buf.z * 255.f);
}
};
template <int SRCCN, int DSTCN, int HR, typename T>
__global__ void HSV2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)
template <int SRCCN, int DSTCN, int HR, typename T> struct HSV2RGB
{
typedef typename TypeVec<T, SRCCN>::vec_t src_t;
typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
explicit HSV2RGB(int bidx) : bidx(bidx) {}
if (y < rows && x < cols)
__device__ dst_t operator()(const src_t& src) const
{
src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T));
dst_t dst;
HSV2RGBConvertor<T, HR>::cvt(src, &dst.x, bidx);
HSV2RGBConvert<HR>(src, &dst.x, bidx);
setAlpha(dst, ColorChannel<T>::max());
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
}
return dst;
}
private:
int bidx;
};
template <typename T, int SRCCN, int DSTCN>
void RGB2HSV_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
if (hrange == 180)
RGB2HSV<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
{
RGB2HSV<SRCCN, DSTCN, 180, T> cvt(bidx);
callConvert(src, dst, cvt, stream);
}
else
RGB2HSV<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
{
RGB2HSV<SRCCN, DSTCN, 255, T> cvt(bidx);
callConvert(src, dst, cvt, stream);
}
}
void RGB2HSV_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream)
@@ -1222,43 +1130,6 @@ namespace cv { namespace gpu { namespace color
{RGB2HSV_caller<uchar, 4, 3>, RGB2HSV_caller<uchar, 4, 4>}
};
static const int div_table[] =
{
0, 1044480, 522240, 348160, 261120, 208896, 174080, 149211,
130560, 116053, 104448, 94953, 87040, 80345, 74606, 69632,
65280, 61440, 58027, 54973, 52224, 49737, 47476, 45412,
43520, 41779, 40172, 38684, 37303, 36017, 34816, 33693,
32640, 31651, 30720, 29842, 29013, 28229, 27486, 26782,
26112, 25475, 24869, 24290, 23738, 23211, 22706, 22223,
21760, 21316, 20890, 20480, 20086, 19707, 19342, 18991,
18651, 18324, 18008, 17703, 17408, 17123, 16846, 16579,
16320, 16069, 15825, 15589, 15360, 15137, 14921, 14711,
14507, 14308, 14115, 13926, 13743, 13565, 13391, 13221,
13056, 12895, 12738, 12584, 12434, 12288, 12145, 12006,
11869, 11736, 11605, 11478, 11353, 11231, 11111, 10995,
10880, 10768, 10658, 10550, 10445, 10341, 10240, 10141,
10043, 9947, 9854, 9761, 9671, 9582, 9495, 9410,
9326, 9243, 9162, 9082, 9004, 8927, 8852, 8777,
8704, 8632, 8561, 8492, 8423, 8356, 8290, 8224,
8160, 8097, 8034, 7973, 7913, 7853, 7795, 7737,
7680, 7624, 7569, 7514, 7461, 7408, 7355, 7304,
7253, 7203, 7154, 7105, 7057, 7010, 6963, 6917,
6872, 6827, 6782, 6739, 6695, 6653, 6611, 6569,
6528, 6487, 6447, 6408, 6369, 6330, 6292, 6254,
6217, 6180, 6144, 6108, 6073, 6037, 6003, 5968,
5935, 5901, 5868, 5835, 5803, 5771, 5739, 5708,
5677, 5646, 5615, 5585, 5556, 5526, 5497, 5468,
5440, 5412, 5384, 5356, 5329, 5302, 5275, 5249,
5222, 5196, 5171, 5145, 5120, 5095, 5070, 5046,
5022, 4998, 4974, 4950, 4927, 4904, 4881, 4858,
4836, 4813, 4791, 4769, 4748, 4726, 4705, 4684,
4663, 4642, 4622, 4601, 4581, 4561, 4541, 4522,
4502, 4483, 4464, 4445, 4426, 4407, 4389, 4370,
4352, 4334, 4316, 4298, 4281, 4263, 4246, 4229,
4212, 4195, 4178, 4161, 4145, 4128, 4112, 4096
};
cudaSafeCall( cudaMemcpyToSymbol(cHsvDivTable, div_table, sizeof(div_table)) );
RGB2HSV_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
}
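// The table above follows a closed form: entry i appears to be round((255 << 12) / i),
// with div_table[0] == 0. A hypothetical host-side generator (not part of the commit),
// useful for checking or regenerating the values:
void genHsvDivTable(int table[256])
{
    table[0] = 0;
    for (int i = 1; i < 256; ++i)
        table[i] = (int)(((255 << 12) / (double)i) + 0.5);   // round to nearest
}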
......@@ -1274,27 +1145,19 @@ namespace cv { namespace gpu { namespace color
RGB2HSV_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
}
template <typename T, int SRCCN, int DSTCN>
void HSV2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream)
{
    if (hrange == 180)
    {
        HSV2RGB<SRCCN, DSTCN, 180, T> cvt(bidx);
        callConvert(src, dst, cvt, stream);
    }
    else
    {
        HSV2RGB<SRCCN, DSTCN, 255, T> cvt(bidx);
        callConvert(src, dst, cvt, stream);
    }
}
void HSV2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream)
......@@ -1306,11 +1169,6 @@ namespace cv { namespace gpu { namespace color
{HSV2RGB_caller<uchar, 4, 3>, HSV2RGB_caller<uchar, 4, 4>}
};
static const int sector_data[][3] =
{{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
cudaSafeCall( cudaMemcpyToSymbol(cHsvSectorData, sector_data, sizeof(sector_data)) );
HSV2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
}
......@@ -1323,21 +1181,13 @@ namespace cv { namespace gpu { namespace color
{HSV2RGB_caller<float, 4, 3>, HSV2RGB_caller<float, 4, 4>}
};
static const int sector_data[][3] =
{{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
cudaSafeCall( cudaMemcpyToSymbol(cHsvSectorData, sector_data, sizeof(sector_data)) );
HSV2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
}
/////////////////////////////////////// RGB <-> HLS ////////////////////////////////////////
template <int HR, typename D>
__device__ void RGB2HLSConvert(const float* src, D& dst, int bidx)
{
    const float hscale = HR * (1.f/360.f);
......@@ -1373,11 +1223,8 @@ namespace cv { namespace gpu { namespace color
dst.y = l;
dst.z = s;
}
template <int HR, typename D>
__device__ void RGB2HLSConvert(const uchar* src, D& dst, int bidx)
{
float3 buf;
......@@ -1385,41 +1232,38 @@ namespace cv { namespace gpu { namespace color
buf.y = src[1]*(1.f/255.f);
buf.z = src[2]*(1.f/255.f);
RGB2HLSConvert<HR>(&buf.x, buf, bidx);
dst.x = saturate_cast<uchar>(buf.x);
dst.y = saturate_cast<uchar>(buf.y*255.f);
dst.z = saturate_cast<uchar>(buf.z*255.f);
}
template <int SRCCN, int DSTCN, int HR, typename T> struct RGB2HLS
{
    typedef typename TypeVec<T, SRCCN>::vec_t src_t;
    typedef typename TypeVec<T, DSTCN>::vec_t dst_t;

    explicit RGB2HLS(int bidx) : bidx(bidx) {}

    __device__ dst_t operator()(const src_t& src) const
    {
        dst_t dst;
        RGB2HLSConvert<HR>(&src.x, dst, bidx);
        return dst;
    }

private:
    int bidx;
};
// Per hue sector: indices into the four interpolants selecting which feeds b, g and r,
// following the usual cvtColor scheme. Initialized statically here instead of being
// uploaded with cudaMemcpyToSymbol on every call.
__constant__ int cHlsSectorData[6][3] =
{
    {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}
};

template <int HR, typename T>
__device__ void HLS2RGBConvert(const T& src, float* dst, int bidx)
{
const float hscale = 6.0f / HR;
......@@ -1460,11 +1304,8 @@ namespace cv { namespace gpu { namespace color
dst[1] = g;
dst[bidx^2] = r;
}
template <int HR, typename T>
__device__ void HLS2RGBConvert(const T& src, uchar* dst, int bidx)
{
float3 buf;
......@@ -1472,53 +1313,45 @@ namespace cv { namespace gpu { namespace color
buf.y = src.y*(1.f/255.f);
buf.z = src.z*(1.f/255.f);
HLS2RGBConvert<HR>(buf, &buf.x, bidx);
dst[0] = saturate_cast<uchar>(buf.x*255.f);
dst[1] = saturate_cast<uchar>(buf.y*255.f);
dst[2] = saturate_cast<uchar>(buf.z*255.f);
}
template <int SRCCN, int DSTCN, int HR, typename T> struct HLS2RGB
{
    typedef typename TypeVec<T, SRCCN>::vec_t src_t;
    typedef typename TypeVec<T, DSTCN>::vec_t dst_t;

    explicit HLS2RGB(int bidx) : bidx(bidx) {}

    __device__ dst_t operator()(const src_t& src) const
    {
        dst_t dst;
        HLS2RGBConvert<HR>(src, &dst.x, bidx);
        setAlpha(dst, ColorChannel<T>::max());
        return dst;
    }

private:
    int bidx;
};
template <typename T, int SRCCN, int DSTCN>
void RGB2HLS_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream)
{
    if (hrange == 180)
    {
        RGB2HLS<SRCCN, DSTCN, 180, T> cvt(bidx);
        callConvert(src, dst, cvt, stream);
    }
    else
    {
        RGB2HLS<SRCCN, DSTCN, 255, T> cvt(bidx);
        callConvert(src, dst, cvt, stream);
    }
}
void RGB2HLS_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream)
......@@ -1549,23 +1382,16 @@ namespace cv { namespace gpu { namespace color
template <typename T, int SRCCN, int DSTCN>
void HLS2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream)
{
    if (hrange == 180)
    {
        HLS2RGB<SRCCN, DSTCN, 180, T> cvt(bidx);
        callConvert(src, dst, cvt, stream);
    }
    else
    {
        HLS2RGB<SRCCN, DSTCN, 255, T> cvt(bidx);
        callConvert(src, dst, cvt, stream);
    }
}
void HLS2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream)
......@@ -1577,11 +1403,6 @@ namespace cv { namespace gpu { namespace color
{HLS2RGB_caller<uchar, 4, 3>, HLS2RGB_caller<uchar, 4, 4>}
};
static const int sector_data[][3]=
{{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
cudaSafeCall( cudaMemcpyToSymbol(cHlsSectorData, sector_data, sizeof(sector_data)) );
HLS2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
}
......@@ -1594,11 +1415,6 @@ namespace cv { namespace gpu { namespace color
{HLS2RGB_caller<float, 4, 3>, HLS2RGB_caller<float, 4, 4>}
};
static const int sector_data[][3]=
{{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
cudaSafeCall( cudaMemcpyToSymbol(cHlsSectorData, sector_data, sizeof(sector_data)) );
HLS2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
}
}}}
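// For context, these callers sit behind the public color-conversion entry point. A short
// usage sketch assuming the OpenCV 2.x GPU API of this commit (the file name is a placeholder):
//
//     cv::Mat bgr = cv::imread("image.png");            // 8UC3 input
//     cv::gpu::GpuMat d_bgr(bgr), d_hsv;
//     cv::gpu::cvtColor(d_bgr, d_hsv, CV_BGR2HSV);      // routes to RGB2HSV_gpu_8u, hrange == 180
//     cv::Mat hsv(d_hsv);                               // download the result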
......@@ -190,6 +190,9 @@ void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst) { devcopy(src,
void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val)
{
CV_Assert((src.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, cudaStream_t stream);
static const set_caller_t set_callers[] =
{
......@@ -201,6 +204,11 @@ void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val)
void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
{
CV_Assert((src.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
CV_Assert(mask.type() == CV_8UC1);
typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream);
static const set_caller_t set_callers[] =
{
......@@ -212,6 +220,9 @@ void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype, double alpha, double beta)
{
CV_Assert((src.depth() != CV_64F && CV_MAT_DEPTH(rtype) != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
bool noScale = fabs(alpha-1) < std::numeric_limits<double>::epsilon() && fabs(beta) < std::numeric_limits<double>::epsilon();
if( rtype < 0 )
......
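// The new CV_Assert guard above recurs verbatim across the stream, min/max and threshold
// entry points below. A hypothetical helper (not part of the commit) naming the check once:
void requireNativeDouble(int depth)
{
    using namespace cv::gpu;
    // CV_64F work is allowed only if built with NATIVE_DOUBLE and the device supports it
    CV_Assert(depth != CV_64F ||
        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
}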
......@@ -626,6 +626,10 @@ namespace
void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
{
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
CV_Assert((src1.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
......@@ -637,6 +641,10 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream)
{
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
CV_Assert((src1.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
......@@ -648,6 +656,9 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Str
void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst)
{
CV_Assert((src1.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
......@@ -659,6 +670,9 @@ void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst)
void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream)
{
CV_Assert((src1.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
......@@ -670,6 +684,10 @@ void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& st
void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
{
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
CV_Assert((src1.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
......@@ -681,6 +699,10 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream)
{
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
CV_Assert((src1.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
......@@ -692,6 +714,9 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Str
void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst)
{
CV_Assert((src1.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
......@@ -703,6 +728,9 @@ void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst)
void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream)
{
CV_Assert((src1.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
......@@ -749,6 +777,9 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
}
else
{
CV_Assert((src.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
typedef void (*caller_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type,
cudaStream_t stream);
......
......@@ -205,6 +205,9 @@ namespace
void cv::gpu::GpuMat::convertTo( GpuMat& dst, int rtype, double alpha, double beta ) const
{
CV_Assert((depth() != CV_64F && CV_MAT_DEPTH(rtype) != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
bool noScale = fabs(alpha-1) < std::numeric_limits<double>::epsilon() && fabs(beta) < std::numeric_limits<double>::epsilon();
if( rtype < 0 )
......@@ -428,6 +431,9 @@ GpuMat& GpuMat::setTo(const Scalar& s, const GpuMat& mask)
{
CV_Assert(mask.type() == CV_8UC1);
CV_Assert((depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
CV_DbgAssert(!this->empty());
NppiSize sz;
......
......@@ -393,11 +393,37 @@ namespace cv
}
};
// Compile-time predicates deciding whether the "smart" (vectorized) transform path applies:
// only for single-channel types whose read/write traits request a shift other than 1.
template <typename T, typename D, int scn, int dcn> struct UseSmartUn_
{
static const bool value = false;
};
template <typename T, typename D> struct UseSmartUn_<T, D, 1, 1>
{
static const bool value = device::UnReadWriteTraits<T, D>::shift != 1;
};
template <typename T, typename D> struct UseSmartUn
{
static const bool value = UseSmartUn_<T, D, device::VecTraits<T>::cn, device::VecTraits<D>::cn>::value;
};
template <typename T1, typename T2, typename D, int src1cn, int src2cn, int dstcn> struct UseSmartBin_
{
static const bool value = false;
};
template <typename T1, typename T2, typename D> struct UseSmartBin_<T1, T2, D, 1, 1, 1>
{
static const bool value = device::BinReadWriteTraits<T1, T2, D>::shift != 1;
};
template <typename T1, typename T2, typename D> struct UseSmartBin
{
static const bool value = UseSmartBin_<T1, T2, D, device::VecTraits<T1>::cn, device::VecTraits<T2>::cn, device::VecTraits<D>::cn>::value;
};
template <typename T, typename D, typename UnOp, typename Mask>
static void transform_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, const Mask& mask,
cudaStream_t stream = 0)
{
TransformDispatcher< UseSmartUn<T, D>::value >::call(src, dst, op, mask, stream);
}
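// UseSmartUn/UseSmartBin merely name the long inline conditions that previously sat at the
// call sites; TransformDispatcher is then specialized on the resulting compile-time bool.
// A reduced, host-side sketch of that dispatch idiom (illustrative only -- the real
// dispatcher launches plain or vectorized CUDA kernels):
template <bool UseSmart> struct DispatcherSketch;

template <> struct DispatcherSketch<false>
{
    template <typename T, typename D, typename Op>
    static void call(const T* src, D* dst, int n, Op op)
    {
        for (int i = 0; i < n; ++i)            // plain one-element-per-step path
            dst[i] = op(src[i]);
    }
};

template <> struct DispatcherSketch<true>
{
    template <typename T, typename D, typename Op>
    static void call(const T* src, D* dst, int n, Op op)
    {
        int i = 0;
        for (; i + 4 <= n; i += 4)             // "smart" path: several elements per step
        {
            dst[i + 0] = op(src[i + 0]);
            dst[i + 1] = op(src[i + 1]);
            dst[i + 2] = op(src[i + 2]);
            dst[i + 3] = op(src[i + 3]);
        }
        for (; i < n; ++i)                     // leftover tail
            dst[i] = op(src[i]);
    }
};

// Selection happens entirely at compile time:
//   DispatcherSketch< UseSmartUn<T, D>::value >::call(src, dst, n, op);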
template <typename T, typename D, typename UnOp>
......@@ -416,7 +442,7 @@ namespace cv
static void transform_caller(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst,
BinOp op, const Mask& mask, cudaStream_t stream = 0)
{
TransformDispatcher< UseSmartBin<T1, T2, D>::value >::call(src1, src2, dst, op, mask, stream);
}
template <typename T1, typename T2, typename D, typename BinOp>
......
......@@ -681,3 +681,66 @@ TEST(erode)
GPU_OFF;
}
}
TEST(threshold)
{
Mat src, dst;
gpu::GpuMat d_src, d_dst;
for (int size = 2000; size <= 4000; size += 1000)
{
SUBTEST << "size " << size << ", 8U, THRESH_TRUNC";
gen(src, size, size, CV_8U, 0, 100);
dst.create(size, size, CV_8U);
CPU_ON;
threshold(src, dst, 50.0, 0.0, THRESH_TRUNC);
CPU_OFF;
d_src = src;
d_dst.create(size, size, CV_8U);
GPU_ON;
gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
GPU_OFF;
}
for (int size = 2000; size <= 4000; size += 1000)
{
SUBTEST << "size " << size << ", 8U, THRESH_BINARY";
gen(src, size, size, CV_8U, 0, 100);
dst.create(size, size, CV_8U);
CPU_ON;
threshold(src, dst, 50.0, 0.0, THRESH_BINARY);
CPU_OFF;
d_src = src;
d_dst.create(size, size, CV_8U);
GPU_ON;
gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
GPU_OFF;
}
for (int size = 2000; size <= 4000; size += 1000)
{
SUBTEST << "size " << size << ", 32F, THRESH_TRUNC";
gen(src, size, size, CV_32F, 0, 100);
dst.create(size, size, CV_32F);
CPU_ON;
threshold(src, dst, 50.0, 0.0, THRESH_TRUNC);
CPU_OFF;
d_src = src;
d_dst.create(size, size, CV_32F);
GPU_ON;
gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
GPU_OFF;
}
}
\ No newline at end of file
......@@ -384,7 +384,7 @@ void CV_GpuBruteForceMatcherTest::knnMatchTest( const GpuMat& query, const GpuMa
void CV_GpuBruteForceMatcherTest::radiusMatchTest( const GpuMat& query, const GpuMat& train )
{
bool atomics_ok = TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS);
if (!atomics_ok)
{
ts->printf(CvTS::CONSOLE, "\nGlobal atomics support in both the built code and the device is required for radiusMatch (CC >= 1.1)");
......
......@@ -53,7 +53,7 @@ struct CV_GpuMeanShiftTest : public CvTest
void run(int)
{
bool cc12_ok = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
if (!cc12_ok)
{
ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");
......@@ -67,8 +67,8 @@ struct CV_GpuMeanShiftTest : public CvTest
cv::Mat img = cv::imread(std::string(ts->get_data_path()) + "meanshift/cones.png");
cv::Mat img_template;
if (cv::gpu::TargetArchs::builtWith(cv::gpu::FEATURE_SET_COMPUTE_20) &&
    cv::gpu::DeviceInfo().supports(cv::gpu::FEATURE_SET_COMPUTE_20))
img_template = cv::imread(std::string(ts->get_data_path()) + "meanshift/con_result.png");
else
img_template = cv::imread(std::string(ts->get_data_path()) + "meanshift/con_result_CC1X.png");
......@@ -145,7 +145,7 @@ struct CV_GpuMeanShiftProcTest : public CvTest
void run(int)
{
bool cc12_ok = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
if (!cc12_ok)
{
ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");
......@@ -219,8 +219,8 @@ struct CV_GpuMeanShiftProcTest : public CvTest
cv::Mat spmap_template;
cv::FileStorage fs;
if (cv::gpu::TargetArchs::builtWith(cv::gpu::FEATURE_SET_COMPUTE_20) &&
    cv::gpu::DeviceInfo().supports(cv::gpu::FEATURE_SET_COMPUTE_20))
fs.open(std::string(ts->get_data_path()) + "meanshift/spmap.yaml", cv::FileStorage::READ);
else
fs.open(std::string(ts->get_data_path()) + "meanshift/spmap_CC1X.yaml", cv::FileStorage::READ);
......
......@@ -54,7 +54,7 @@ struct CV_GpuMeanShiftSegmentationTest : public CvTest {
{
try
{
bool cc12_ok = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
if (!cc12_ok)
{
ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");
......@@ -77,7 +77,7 @@ struct CV_GpuMeanShiftSegmentationTest : public CvTest {
{
stringstream path;
path << ts->get_data_path() << "meanshift/cones_segmented_sp10_sr10_minsize" << minsize;
if (TargetArchs::builtWith(FEATURE_SET_COMPUTE_20) && DeviceInfo().supports(FEATURE_SET_COMPUTE_20))
path << ".png";
else
path << "_CC1X.png";
......
......@@ -66,21 +66,24 @@ void CV_GpuMatOpConvertToTest::run(int /* start_from */)
{
const Size img_size(67, 35);
const char* types_str[] = {"CV_8U", "CV_8S", "CV_16U", "CV_16S", "CV_32S", "CV_32F", "CV_64F"};

bool passed = true;

try
{
    // CV_8U..CV_64F are the contiguous depth codes 0..6, so depths can be iterated
    // directly; CV_64F is included only when the device supports native doubles.
    int lastType = CV_32F;

    if (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))
        lastType = CV_64F;

    for (int i = 0; i <= lastType && passed; ++i)
    {
        for (int j = 0; j <= lastType && passed; ++j)
        {
            for (int c = 1; c < 5 && passed; ++c)
            {
                const int src_type = CV_MAKETYPE(i, c);
                const int dst_type = j;

                cv::RNG rng(*ts->get_rng());
......
......@@ -126,7 +126,12 @@ void CV_GpuMatOpCopyToTest::run( int /* start_from */)
try
{
int lastType = CV_32F;

if (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))
    lastType = CV_64F;

for (int i = 0; i <= lastType; i++)
{
Mat cpumat(rows, cols, i);
cpumat.setTo(Scalar::all(127));
......
......@@ -101,7 +101,12 @@ void CV_GpuMatOpSetToTest::run( int /* start_from */)
rng.fill(cpumask, RNG::UNIFORM, cv::Scalar::all(0.0), cv::Scalar(1.5));
cv::gpu::GpuMat gpumask(cpumask);
int lastType = CV_32F;

if (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))
    lastType = CV_64F;

for (int i = 0; i <= lastType; i++)
{
for (int cn = 1; cn <= 4; ++cn)
{
......