Commit ada6ab37 authored by Vladislav Vinogradov's avatar Vladislav Vinogradov

fixed compilation for old compute capabilities

parent 65bef258
...@@ -61,12 +61,20 @@ namespace cv { namespace gpu { namespace device ...@@ -61,12 +61,20 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) ); cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );
} }
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int PATCH_PER_BLOCK, int HALO_SIZE, int KSIZE, typename T, typename D, typename B> template <int KSIZE, typename T, typename D, typename B>
__global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, const int anchor, const B brd) __global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, const int anchor, const B brd)
{ {
Static<KSIZE <= MAX_KERNEL_SIZE>::check(); #if __CUDA_ARCH__ >= 200
Static<HALO_SIZE * BLOCK_DIM_Y >= KSIZE>::check(); const int BLOCK_DIM_X = 16;
Static<VecTraits<T>::cn == VecTraits<D>::cn>::check(); const int BLOCK_DIM_Y = 16;
const int PATCH_PER_BLOCK = 4;
const int HALO_SIZE = KSIZE <= 16 ? 1 : 2;
#else
const int BLOCK_DIM_X = 16;
const int BLOCK_DIM_Y = 8;
const int PATCH_PER_BLOCK = 2;
const int HALO_SIZE = 2;
#endif
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t; typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
...@@ -103,32 +111,45 @@ namespace cv { namespace gpu { namespace device ...@@ -103,32 +111,45 @@ namespace cv { namespace gpu { namespace device
{ {
const int y = yStart + j * BLOCK_DIM_Y; const int y = yStart + j * BLOCK_DIM_Y;
if (y >= src.rows) if (y < src.rows)
return; {
sum_t sum = VecTraits<sum_t>::all(0);
sum_t sum = VecTraits<sum_t>::all(0);
#pragma unroll #pragma unroll
for (int k = 0; k < KSIZE; ++k) for (int k = 0; k < KSIZE; ++k)
sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k]; sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k];
dst(y, x) = saturate_cast<D>(sum); dst(y, x) = saturate_cast<D>(sum);
}
} }
} }
template <int KSIZE, typename T, typename D, template<typename> class B> template <int KSIZE, typename T, typename D, template<typename> class B>
void linearColumnFilter_caller(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, cudaStream_t stream) void linearColumnFilter_caller(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, int cc, cudaStream_t stream)
{ {
const int BLOCK_DIM_X = 16; int BLOCK_DIM_X;
const int BLOCK_DIM_Y = 16; int BLOCK_DIM_Y;
const int PATCH_PER_BLOCK = 4; int PATCH_PER_BLOCK;
if (cc >= 20)
{
BLOCK_DIM_X = 16;
BLOCK_DIM_Y = 16;
PATCH_PER_BLOCK = 4;
}
else
{
BLOCK_DIM_X = 16;
BLOCK_DIM_Y = 8;
PATCH_PER_BLOCK = 2;
}
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK)); const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK));
B<T> brd(src.rows); B<T> brd(src.rows);
linearColumnFilter<BLOCK_DIM_X, BLOCK_DIM_Y, PATCH_PER_BLOCK, KSIZE <= 16 ? 1 : 2, KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd); linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
...@@ -137,9 +158,9 @@ namespace cv { namespace gpu { namespace device ...@@ -137,9 +158,9 @@ namespace cv { namespace gpu { namespace device
} }
template <typename T, typename D> template <typename T, typename D>
void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream) void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
{ {
typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, cudaStream_t stream); typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, int cc, cudaStream_t stream);
static const caller_t callers[5][33] = static const caller_t callers[5][33] =
{ {
...@@ -322,13 +343,13 @@ namespace cv { namespace gpu { namespace device ...@@ -322,13 +343,13 @@ namespace cv { namespace gpu { namespace device
loadKernel(kernel, ksize); loadKernel(kernel, ksize);
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream); callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, cc, stream);
} }
template void linearColumnFilter_gpu<float , uchar >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); template void linearColumnFilter_gpu<float , uchar >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float4, uchar4>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); template void linearColumnFilter_gpu<float4, uchar4>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float3, short3>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); template void linearColumnFilter_gpu<float3, short3>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float , int >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); template void linearColumnFilter_gpu<float , int >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); template void linearColumnFilter_gpu<float , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
} // namespace column_filter } // namespace column_filter
}}} // namespace cv { namespace gpu { namespace device }}} // namespace cv { namespace gpu { namespace device
...@@ -61,12 +61,20 @@ namespace cv { namespace gpu { namespace device ...@@ -61,12 +61,20 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) ); cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );
} }
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int PATCH_PER_BLOCK, int HALO_SIZE, int KSIZE, typename T, typename D, typename B> template <int KSIZE, typename T, typename D, typename B>
__global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep<D> dst, const int anchor, const B brd) __global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep<D> dst, const int anchor, const B brd)
{ {
Static<KSIZE <= MAX_KERNEL_SIZE>::check(); #if __CUDA_ARCH__ >= 200
Static<HALO_SIZE * BLOCK_DIM_X >= KSIZE>::check(); const int BLOCK_DIM_X = 32;
Static<VecTraits<T>::cn == VecTraits<D>::cn>::check(); const int BLOCK_DIM_Y = 8;
const int PATCH_PER_BLOCK = 4;
const int HALO_SIZE = 1;
#else
const int BLOCK_DIM_X = 32;
const int BLOCK_DIM_Y = 4;
const int PATCH_PER_BLOCK = 4;
const int HALO_SIZE = 1;
#endif
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t; typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
...@@ -103,32 +111,45 @@ namespace cv { namespace gpu { namespace device ...@@ -103,32 +111,45 @@ namespace cv { namespace gpu { namespace device
{ {
const int x = xStart + j * BLOCK_DIM_X; const int x = xStart + j * BLOCK_DIM_X;
if (x >= src.cols) if (x < src.cols)
return; {
sum_t sum = VecTraits<sum_t>::all(0);
sum_t sum = VecTraits<sum_t>::all(0);
#pragma unroll #pragma unroll
for (int k = 0; k < KSIZE; ++k) for (int k = 0; k < KSIZE; ++k)
sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k]; sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k];
dst(y, x) = saturate_cast<D>(sum); dst(y, x) = saturate_cast<D>(sum);
}
} }
} }
template <int KSIZE, typename T, typename D, template<typename> class B> template <int KSIZE, typename T, typename D, template<typename> class B>
void linearRowFilter_caller(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, cudaStream_t stream) void linearRowFilter_caller(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, int cc, cudaStream_t stream)
{ {
const int BLOCK_DIM_X = 32; int BLOCK_DIM_X;
const int BLOCK_DIM_Y = 8; int BLOCK_DIM_Y;
const int PATCH_PER_BLOCK = 4; int PATCH_PER_BLOCK;
if (cc >= 20)
{
BLOCK_DIM_X = 32;
BLOCK_DIM_Y = 8;
PATCH_PER_BLOCK = 4;
}
else
{
BLOCK_DIM_X = 32;
BLOCK_DIM_Y = 4;
PATCH_PER_BLOCK = 4;
}
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
const dim3 grid(divUp(src.cols, BLOCK_DIM_X * PATCH_PER_BLOCK), divUp(src.rows, BLOCK_DIM_Y)); const dim3 grid(divUp(src.cols, BLOCK_DIM_X * PATCH_PER_BLOCK), divUp(src.rows, BLOCK_DIM_Y));
B<T> brd(src.cols); B<T> brd(src.cols);
linearRowFilter<BLOCK_DIM_X, BLOCK_DIM_Y, PATCH_PER_BLOCK, 1, KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd); linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
if (stream == 0) if (stream == 0)
...@@ -136,9 +157,9 @@ namespace cv { namespace gpu { namespace device ...@@ -136,9 +157,9 @@ namespace cv { namespace gpu { namespace device
} }
template <typename T, typename D> template <typename T, typename D>
void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream) void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
{ {
typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, cudaStream_t stream); typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, int cc, cudaStream_t stream);
static const caller_t callers[5][33] = static const caller_t callers[5][33] =
{ {
...@@ -321,13 +342,13 @@ namespace cv { namespace gpu { namespace device ...@@ -321,13 +342,13 @@ namespace cv { namespace gpu { namespace device
loadKernel(kernel, ksize); loadKernel(kernel, ksize);
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream); callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, cc, stream);
} }
template void linearRowFilter_gpu<uchar , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); template void linearRowFilter_gpu<uchar , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearRowFilter_gpu<uchar4, float4>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); template void linearRowFilter_gpu<uchar4, float4>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearRowFilter_gpu<short3, float3>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); template void linearRowFilter_gpu<short3, float3>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearRowFilter_gpu<int , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); template void linearRowFilter_gpu<int , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearRowFilter_gpu<float , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); template void linearRowFilter_gpu<float , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
} // namespace row_filter } // namespace row_filter
}}} // namespace cv { namespace gpu { namespace device }}} // namespace cv { namespace gpu { namespace device
...@@ -740,13 +740,13 @@ namespace cv { namespace gpu { namespace device ...@@ -740,13 +740,13 @@ namespace cv { namespace gpu { namespace device
namespace row_filter namespace row_filter
{ {
template <typename T, typename D> template <typename T, typename D>
void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); void linearRowFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
} }
namespace column_filter namespace column_filter
{ {
template <typename T, typename D> template <typename T, typename D>
void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
} }
}}} }}}
...@@ -755,7 +755,7 @@ namespace ...@@ -755,7 +755,7 @@ namespace
typedef NppStatus (*nppFilter1D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oROI, typedef NppStatus (*nppFilter1D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oROI,
const Npp32s * pKernel, Npp32s nMaskSize, Npp32s nAnchor, Npp32s nDivisor); const Npp32s * pKernel, Npp32s nMaskSize, Npp32s nAnchor, Npp32s nDivisor);
typedef void (*gpuFilter1D_t)(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); typedef void (*gpuFilter1D_t)(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
struct NppLinearRowFilter : public BaseRowFilter_GPU struct NppLinearRowFilter : public BaseRowFilter_GPU
{ {
...@@ -791,7 +791,9 @@ namespace ...@@ -791,7 +791,9 @@ namespace
virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null()) virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
{ {
func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, StreamAccessor::getStream(s)); DeviceInfo devInfo;
int cc = devInfo.majorVersion() * 10 + devInfo.minorVersion();
func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, cc, StreamAccessor::getStream(s));
} }
Mat kernel; Mat kernel;
...@@ -899,7 +901,10 @@ namespace ...@@ -899,7 +901,10 @@ namespace
virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null()) virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
{ {
func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, StreamAccessor::getStream(s)); DeviceInfo devInfo;
int cc = devInfo.majorVersion() * 10 + devInfo.minorVersion();
CV_Assert(cc >= 20 || ksize <= 16);
func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, cc, StreamAccessor::getStream(s));
} }
Mat kernel; Mat kernel;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment