Commit d661b8e3 authored by Anatoly Baksheev

added PtrStep, PtrElemStep structures. Refactored namespaces.

parent 501e81eb
@@ -50,56 +50,79 @@ namespace cv
 // Simple lightweight structures that encapsulates information about an image on device.
 // It is intended to pass to nvcc-compiled code. GpuMat depends on headers that nvcc can't compile
-template<typename T> struct PtrStep_
-{
-    T* ptr;
-    size_t step;
-
-    typedef T elem_type;
-    enum { elem_size = sizeof(elem_type) };
-
 #if defined(__CUDACC__)
-    __host__ __device__
+    #define __CV_GPU_HOST_DEVICE__ __host__ __device__
+#else
+    #define __CV_GPU_HOST_DEVICE__
 #endif
-    size_t elemSize() const { return elem_size; }
-};
 
 template <typename T> struct DevMem2D_
 {
     int cols;
     int rows;
-    T* ptr;
+    T* data;
     size_t step;
-    size_t elem_step;
 
-    /*__host__*/
-    DevMem2D_() : cols(0), rows(0), ptr(0), step(0), elem_step(0) {}
+    DevMem2D_() : cols(0), rows(0), data(0), step(0) {}
 
-    /*__host__*/
-    DevMem2D_(int rows_, int cols_, T *ptr_, size_t step_)
-        : cols(cols_), rows(rows_), ptr(ptr_), step(step_), elem_step(step_ / sizeof(T)) {}
+    DevMem2D_(int rows_, int cols_, T *data_, size_t step_)
+        : cols(cols_), rows(rows_), data(data_), step(step_) {}
 
-    /*__host__*/
     template <typename U>
     explicit DevMem2D_(const DevMem2D_<U>& d)
-        : cols(d.cols), rows(d.rows), ptr((T*)d.ptr), step(d.step), elem_step(d.step / sizeof(T)) {}
+        : cols(d.cols), rows(d.rows), data((T*)d.data), step(d.step) {}
 
-    typedef typename PtrStep_<T>::elem_type elem_type;
-    enum { elem_size = PtrStep_<T>::elem_size };
+    typedef T elem_type;
+    enum { elem_size = sizeof(elem_type) };
 
-#if defined(__CUDACC__)
-    __host__ __device__
-#endif
-    size_t elemSize() const { return elem_size; }
+    __CV_GPU_HOST_DEVICE__ size_t elemSize() const { return elem_size; }
+    __CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return (T*)( (char*)data + y * step ); }
+    __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)data + y * step ); }
+};
 
-    template <typename U>
-    /*__host__*/
-    operator PtrStep_<U>() const { PtrStep_<U> dt; dt.ptr = ptr; dt.step = step; return dt; }
+template<typename T> struct PtrStep_
+{
+    T* data;
+    size_t step;
+
+    PtrStep_() : data(0), step(0) {}
+    PtrStep_(const DevMem2D_<T>& mem) : data(mem.data), step(mem.step) {}
+
+    typedef T elem_type;
+    enum { elem_size = sizeof(elem_type) };
+
+    __CV_GPU_HOST_DEVICE__ size_t elemSize() const { return elem_size; }
+    __CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return (T*)( (char*)data + y * step); }
+    __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)data + y * step); }
+};
+
+template<typename T> struct PtrElemStep_ : public PtrStep_<T>
+{
+    PtrElemStep_(const DevMem2D_<T>& mem) : PtrStep_<T>(mem)
+    {
+        step /= elem_size;
+    }
+private:
+    template <bool> struct StaticCheck;
+    template <> struct StaticCheck<true>{};
+
+    StaticCheck<256 % sizeof(T) == 0> ElemStepTypeCheck;
 };
 
 typedef DevMem2D_<unsigned char> DevMem2D;
 typedef DevMem2D_<float> DevMem2Df;
 typedef DevMem2D_<int> DevMem2Di;
+
+typedef PtrStep_<unsigned char> PtrStep;
+typedef PtrStep_<float> PtrStepf;
+typedef PtrStep_<int> PtrStepi;
+
+typedef PtrElemStep_<unsigned char> PtrElemStep;
+typedef PtrElemStep_<float> PtrElemStepf;
+typedef PtrElemStep_<int> PtrElemStepi;
+
+#undef __CV_GPU_HOST_DEVICE__
 }
 }
 
 #endif /* __OPENCV_GPU_DEVMEM2D_HPP__ */
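Note: the refactoring separates two addressing conventions. DevMem2D_ and PtrStep_ keep the row stride `step` in bytes (hence the (char*) arithmetic inside ptr()), while PtrElemStep_ divides `step` by the element size once in its constructor so rows can be indexed in whole elements. The StaticCheck<256 % sizeof(T) == 0> member is a compile-time guard: it only instantiates when sizeof(T) divides 256, presumably on the assumption that pitched GPU allocations are 256-byte aligned, which makes the element-step division exact. A minimal sketch of device code consuming the new interface (this kernel is illustrative, not part of the commit):

// writes x*y into each pixel of a pitched float image;
// img.ptr(y) expands to (float*)((char*)data + y * step), so no manual step math is needed
__global__ void fill_product(cv::gpu::DevMem2Df img)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < img.cols && y < img.rows)
        img.ptr(y)[x] = (float)(x * y);
}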
@@ -109,6 +109,7 @@ namespace cv
 //! returns lightweight DevMem2D_ structure for passing to nvcc-compiled code.
 // Contains just image size, data ptr and step.
 template <class T> operator DevMem2D_<T>() const;
+template <class T> operator PtrStep_<T>() const;
 
 //! pefroms blocking upload data to GpuMat. .
 void upload(const cv::Mat& m);
...
@@ -207,6 +207,7 @@ inline GpuMat& GpuMat::operator = (const GpuMat& m)
 inline GpuMat& GpuMat::operator = (const Mat& m) { upload(m); return *this; }
 
 template <class T> inline GpuMat::operator DevMem2D_<T>() const { return DevMem2D_<T>(rows, cols, (T*)data, step); }
+template <class T> inline GpuMat::operator PtrStep_<T>() const { return PtrStep_<T>(*this); }
 
 //CPP: void GpuMat::upload(const Mat& m);
...
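Note: with both conversion operators in place, a GpuMat can be handed to device-side wrappers typed on DevMem2D_<T> or PtrStep_<T>; the cast captures rows/cols, data and step at the call site, exactly as the callers below do with (DevMem2D_<T>)src. An illustrative call site (the kernel name is assumed for the example):

cv::gpu::GpuMat src(480, 640, CV_8UC1);

dim3 threads(16, 16, 1);
dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);

// the explicit cast drives the template conversion operator added above
my_kernel<<<grid, threads>>>((cv::gpu::DevMem2D)src);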
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
@@ -54,9 +54,9 @@ namespace cv
 typedef unsigned char uchar;
 typedef signed char schar;
 typedef unsigned short ushort;
 typedef unsigned int uint;
 
-static inline int divUp(int a, int b) { return (a % b == 0) ? a/b : a/b + 1; }
+static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }
 
 template<class T>
 static inline void uploadConstant(const char* name, const T& value) { cudaSafeCall( cudaMemcpyToSymbol(name, &value, sizeof(T)) ); }
...
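Note: both divUp bodies compute the ceiling of total/grain for positive arguments; the new form trades the remainder test for a single add-and-divide. For example, covering a 1000 x 480 image with 16 x 16 blocks:

// (1000 + 16 - 1) / 16 == 63, same as 1000/16 + 1 under the old form
dim3 threads(16, 16);
dim3 grid(divUp(1000, 16), divUp(480, 16));  // 63 x 30 blocks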
@@ -128,8 +128,8 @@ namespace cv { namespace gpu { namespace filters
     dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
     dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
 
-    filter_krnls::linearRowFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE, CN><<<blocks, threads>>>(src.ptr, src.elem_step,
-        dst.ptr, dst.elem_step, anchor, src.cols, src.rows);
+    filter_krnls::linearRowFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE, CN><<<blocks, threads>>>(src.data, src.step/src.elemSize(),
+        dst.data, dst.step/dst.elemSize(), anchor, src.cols, src.rows);
 
     cudaSafeCall( cudaThreadSynchronize() );
 }
@@ -152,10 +152,12 @@ namespace cv { namespace gpu { namespace filters
     callers[ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);
 }
 
-void linearRowFilter_gpu_8u_8u_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+template void linearRowFilter_gpu<4, uchar4, uchar4>(const DevMem2D&, const DevMem2D&, const float[], int , int);
+
+/*void linearRowFilter_gpu_8u_8u_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
 {
     linearRowFilter_gpu<4, uchar4, uchar4>(src, dst, kernel, ksize, anchor);
-}
+}*/
 
 void linearRowFilter_gpu_8u_8s_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
 {
     linearRowFilter_gpu<4, uchar4, char4>(src, dst, kernel, ksize, anchor);
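Note: the out-of-line wrapper is replaced by an explicit instantiation. `template void linearRowFilter_gpu<4, uchar4, uchar4>(...)` forces this translation unit to emit object code for that specialization so host code (see the filters host-side hunk later in the commit) can name it directly; the old wrapper survives only as a comment. The split looks like this, matching the declarations later in the commit:

// header, visible to host code: declaration only
template <int CN, typename T, typename D>
void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);

// .cu file: the definition, plus an explicit instantiation that emits the symbol
template void linearRowFilter_gpu<4, uchar4, uchar4>(const DevMem2D&, const DevMem2D&, const float[], int, int);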
@@ -262,8 +264,8 @@ namespace cv { namespace gpu { namespace filters
     dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
     dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
 
-    filter_krnls::linearColumnFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE, CN><<<blocks, threads>>>(src.ptr, src.elem_step,
-        dst.ptr, dst.elem_step, anchor, src.cols, src.rows);
+    filter_krnls::linearColumnFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE, CN><<<blocks, threads>>>(src.data, src.step/src.elemSize(),
+        dst.data, dst.step/dst.elemSize(), anchor, src.cols, src.rows);
 
     cudaSafeCall( cudaThreadSynchronize() );
 }
@@ -357,7 +359,7 @@ namespace cv { namespace gpu { namespace bf
 void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc)
 {
     cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_color, &table_color, sizeof(table_color)) );
-    cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.ptr, sizeof(table_space.ptr)) );
+    cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.data, sizeof(table_space.data)) );
 
     size_t table_space_step = table_space.step / sizeof(float);
     cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) );
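Note: load_constants stages kernel parameters in __constant__ memory, including the device pointer table_space.data itself and the row stride rescaled to float elements, so the kernels can read the table without receiving it as an argument. Reduced to its core (a sketch using the symbol names from the hunk, error checking elided):

__constant__ float* ctable_space;       // pointer value lives in constant memory
__constant__ size_t ctable_space_step;  // row stride, in float elements

void upload_space_table(const cv::gpu::DevMem2Df& table)
{
    cudaMemcpyToSymbol(ctable_space, &table.data, sizeof(table.data));

    size_t step = table.step / sizeof(float);
    cudaMemcpyToSymbol(ctable_space_step, &step, sizeof(size_t));
}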
@@ -491,15 +493,15 @@ namespace cv { namespace gpu { namespace bf
 case 1:
     for (int i = 0; i < iters; ++i)
     {
-        bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols);
-        bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols);
+        bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
+        bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
     }
     break;
 case 3:
     for (int i = 0; i < iters; ++i)
     {
-        bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols);
-        bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols);
+        bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
+        bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
     }
     break;
 default:
...
@@ -45,7 +45,7 @@
 using namespace cv::gpu;
 
 /////////////////////////////////// Remap ///////////////////////////////////////////////
-namespace imgproc_krnls
+namespace cv { namespace gpu { namespace imgproc
 {
     texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex_remap;
...
@@ -121,10 +121,7 @@ namespace imgproc_krnls
             *(dst + y * dst_step + 3 * x + 2) = out.z;
         }
     }
-}
 
-namespace cv { namespace gpu { namespace imgproc
-{
     void remap_gpu_1c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst)
     {
         dim3 threads(16, 16, 1);
...
@@ -132,15 +129,15 @@ namespace cv { namespace gpu { namespace imgproc
         grid.x = divUp(dst.cols, threads.x);
         grid.y = divUp(dst.rows, threads.y);
 
-        imgproc_krnls::tex_remap.filterMode = cudaFilterModeLinear;
-        imgproc_krnls::tex_remap.addressMode[0] = imgproc_krnls::tex_remap.addressMode[1] = cudaAddressModeWrap;
+        tex_remap.filterMode = cudaFilterModeLinear;
+        tex_remap.addressMode[0] = tex_remap.addressMode[1] = cudaAddressModeWrap;
         cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
-        cudaSafeCall( cudaBindTexture2D(0, imgproc_krnls::tex_remap, src.ptr, desc, src.cols, src.rows, src.step) );
+        cudaSafeCall( cudaBindTexture2D(0, tex_remap, src.data, desc, src.cols, src.rows, src.step) );
 
-        imgproc_krnls::remap_1c<<<grid, threads>>>(xmap.ptr, ymap.ptr, xmap.step, dst.ptr, dst.step, dst.cols, dst.rows);
+        remap_1c<<<grid, threads>>>(xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
 
         cudaSafeCall( cudaThreadSynchronize() );
-        cudaSafeCall( cudaUnbindTexture(imgproc_krnls::tex_remap) );
+        cudaSafeCall( cudaUnbindTexture(tex_remap) );
     }
 
     void remap_gpu_3c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst)
...
@@ -150,17 +147,13 @@ namespace cv { namespace gpu { namespace imgproc
         grid.x = divUp(dst.cols, threads.x);
         grid.y = divUp(dst.rows, threads.y);
 
-        imgproc_krnls::remap_3c<<<grid, threads>>>(src.ptr, src.step, xmap.ptr, ymap.ptr, xmap.step, dst.ptr, dst.step, dst.cols, dst.rows);
+        remap_3c<<<grid, threads>>>(src.data, src.step, xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
 
         cudaSafeCall( cudaThreadSynchronize() );
     }
-}}}
 
 /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
-namespace imgproc_krnls
-{
     texture<uchar4, 2> tex_meanshift;
 
     __device__ short2 do_mean_shift(int x0, int y0, unsigned char* out,
@@ -252,10 +245,7 @@ namespace imgproc_krnls
         *(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
     }
 }
-}
 
-namespace cv { namespace gpu { namespace imgproc
-{
 extern "C" void meanShiftFiltering_gpu(const DevMem2D& src, DevMem2D dst, int sp, int sr, int maxIter, float eps)
 {
     dim3 grid(1, 1, 1);
...
@@ -264,11 +254,11 @@ namespace cv { namespace gpu { namespace imgproc
     grid.y = divUp(src.rows, threads.y);
 
     cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
-    cudaSafeCall( cudaBindTexture2D( 0, imgproc_krnls::tex_meanshift, src.ptr, desc, src.cols, src.rows, src.step ) );
+    cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
 
-    imgproc_krnls::meanshift_kernel<<< grid, threads >>>( dst.ptr, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
+    meanshift_kernel<<< grid, threads >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
     cudaSafeCall( cudaThreadSynchronize() );
-    cudaSafeCall( cudaUnbindTexture( imgproc_krnls::tex_meanshift ) );
+    cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
 }
 
 extern "C" void meanShiftProc_gpu(const DevMem2D& src, DevMem2D dstr, DevMem2D dstsp, int sp, int sr, int maxIter, float eps)
 {
...
@@ -278,18 +268,15 @@ namespace cv { namespace gpu { namespace imgproc
     grid.y = divUp(src.rows, threads.y);
 
     cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
-    cudaSafeCall( cudaBindTexture2D( 0, imgproc_krnls::tex_meanshift, src.ptr, desc, src.cols, src.rows, src.step ) );
+    cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
 
-    imgproc_krnls::meanshiftproc_kernel<<< grid, threads >>>( dstr.ptr, dstr.step, dstsp.ptr, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
+    meanshiftproc_kernel<<< grid, threads >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
     cudaSafeCall( cudaThreadSynchronize() );
-    cudaSafeCall( cudaUnbindTexture( imgproc_krnls::tex_meanshift ) );
+    cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
 }
-}}}
 
 /////////////////////////////////// drawColorDisp ///////////////////////////////////////////////
-namespace imgproc_krnls
-{
 template <typename T>
 __device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1)
 {
@@ -389,10 +376,8 @@ namespace imgproc_krnls
         line[x >> 1] = res;
     }
 }
-}
 
-namespace cv { namespace gpu { namespace imgproc
-{
 void drawColorDisp_gpu(const DevMem2D& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream)
 {
     dim3 threads(16, 16, 1);
...
@@ -400,7 +385,7 @@ namespace cv { namespace gpu { namespace imgproc
     grid.x = divUp(src.cols, threads.x << 2);
     grid.y = divUp(src.rows, threads.y);
 
-    imgproc_krnls::drawColorDisp<<<grid, threads, 0, stream>>>(src.ptr, src.step, dst.ptr, dst.step, src.cols, src.rows, ndisp);
+    drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step, dst.data, dst.step, src.cols, src.rows, ndisp);
 
     if (stream == 0)
         cudaSafeCall( cudaThreadSynchronize() );
...
@@ -413,17 +398,14 @@ namespace cv { namespace gpu { namespace imgproc
     grid.x = divUp(src.cols, threads.x << 1);
     grid.y = divUp(src.rows, threads.y);
 
-    imgproc_krnls::drawColorDisp<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(short), dst.ptr, dst.step, src.cols, src.rows, ndisp);
+    drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step / sizeof(short), dst.data, dst.step, src.cols, src.rows, ndisp);
 
     if (stream == 0)
         cudaSafeCall( cudaThreadSynchronize() );
 }
-}}}
 
 /////////////////////////////////// reprojectImageTo3D ///////////////////////////////////////////////
-namespace imgproc_krnls
-{
 __constant__ float cq[16];
 
 template <typename T>
@@ -455,10 +437,7 @@ namespace imgproc_krnls
         *(float4*)(xyzw + xyzw_step * y + (x * 4)) = v;
     }
 }
-}
 
-namespace cv { namespace gpu { namespace imgproc
-{
 template <typename T>
 inline void reprojectImageTo3D_caller(const DevMem2D_<T>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)
 {
...
@@ -467,9 +446,9 @@ namespace cv { namespace gpu { namespace imgproc
     grid.x = divUp(disp.cols, threads.x);
     grid.y = divUp(disp.rows, threads.y);
 
-    cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cq, q, 16 * sizeof(float)) );
+    cudaSafeCall( cudaMemcpyToSymbol(cq, q, 16 * sizeof(float)) );
 
-    imgproc_krnls::reprojectImageTo3D<<<grid, threads, 0, stream>>>(disp.ptr, disp.step / sizeof(T), xyzw.ptr, xyzw.step / sizeof(float), disp.rows, disp.cols);
+    reprojectImageTo3D<<<grid, threads, 0, stream>>>(disp.data, disp.step / sizeof(T), xyzw.data, xyzw.step / sizeof(float), disp.rows, disp.cols);
 
     if (stream == 0)
         cudaSafeCall( cudaThreadSynchronize() );
...
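Note: after the merge into cv::gpu::imgproc, the wrappers above all follow the same shape: optionally configure and bind a texture to the pitched source, launch with raw data pointers plus steps, synchronize when no stream was supplied, then unbind. The skeleton, with the kernel call elided (names here are illustrative):

texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex_src;

void launch_like_remap(const cv::gpu::DevMem2D& src, cv::gpu::DevMem2D dst)
{
    dim3 threads(16, 16, 1);
    dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y), 1);

    cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
    cudaSafeCall( cudaBindTexture2D(0, tex_src, src.data, desc, src.cols, src.rows, src.step) );

    // kernel<<<grid, threads>>>(dst.data, dst.step, dst.cols, dst.rows);

    cudaSafeCall( cudaThreadSynchronize() );
    cudaSafeCall( cudaUnbindTexture(tex_src) );
}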
@@ -41,9 +41,7 @@
 //M*/
 
 #include "cuda_shared.hpp"
-#include "saturate_cast.hpp"
 #include "transform.hpp"
-#include "vecmath.hpp"
 
 using namespace cv::gpu;
...
@@ -54,7 +52,7 @@ using namespace cv::gpu;
 //////////////////////////////////////////////////////////////////////////////////////
 // Cart <-> Polar
 
-namespace mathfunc_krnls
+namespace cv { namespace gpu { namespace mathfunc
 {
     struct Nothing
     {
@@ -133,10 +131,7 @@ namespace mathfunc_krnls
             yptr[y * y_step + x] = mag_data * sin_a;
         }
     }
-}
 
-namespace cv { namespace gpu { namespace mathfunc
-{
     template <typename Mag, typename Angle>
     void cartToPolar_caller(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream)
     {
...
@@ -148,9 +143,9 @@ namespace cv { namespace gpu { namespace mathfunc
         const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f;
 
-        mathfunc_krnls::cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
-            x.ptr, x.elem_step, y.ptr, y.elem_step,
-            mag.ptr, mag.elem_step, angle.ptr, angle.elem_step, scale, x.cols, x.rows);
+        cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
+            x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
+            mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);
 
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
@@ -163,27 +158,27 @@ namespace cv { namespace gpu { namespace mathfunc
 {
     {
         {
-            cartToPolar_caller<mathfunc_krnls::Magnitude, mathfunc_krnls::Atan2>,
-            cartToPolar_caller<mathfunc_krnls::Magnitude, mathfunc_krnls::Nothing>
+            cartToPolar_caller<Magnitude, Atan2>,
+            cartToPolar_caller<Magnitude, Nothing>
         },
         {
-            cartToPolar_caller<mathfunc_krnls::MagnitudeSqr, mathfunc_krnls::Atan2>,
-            cartToPolar_caller<mathfunc_krnls::MagnitudeSqr, mathfunc_krnls::Nothing>,
+            cartToPolar_caller<MagnitudeSqr, Atan2>,
+            cartToPolar_caller<MagnitudeSqr, Nothing>,
         }
     },
     {
        {
-            cartToPolar_caller<mathfunc_krnls::Nothing, mathfunc_krnls::Atan2>,
-            cartToPolar_caller<mathfunc_krnls::Nothing, mathfunc_krnls::Nothing>
+            cartToPolar_caller<Nothing, Atan2>,
+            cartToPolar_caller<Nothing, Nothing>
        },
        {
-            cartToPolar_caller<mathfunc_krnls::Nothing, mathfunc_krnls::Atan2>,
-            cartToPolar_caller<mathfunc_krnls::Nothing, mathfunc_krnls::Nothing>,
+            cartToPolar_caller<Nothing, Atan2>,
+            cartToPolar_caller<Nothing, Nothing>,
        }
     }
 };
 
-callers[mag.ptr == 0][magSqr][angle.ptr == 0](x, y, mag, angle, angleInDegrees, stream);
+callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream);
 }
 
 template <typename Mag>
...
@@ -197,8 +192,8 @@ namespace cv { namespace gpu { namespace mathfunc
     const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f;
 
-    mathfunc_krnls::polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.ptr, mag.elem_step,
-        angle.ptr, angle.elem_step, scale, x.ptr, x.elem_step, y.ptr, y.elem_step, mag.cols, mag.rows);
+    polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
+        angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);
 
     if (stream == 0)
         cudaSafeCall( cudaThreadSynchronize() );
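Note: the 2x2x2 callers table above reads as callers[magnitude not wanted][squared][angle not wanted]: a null data pointer stands for "this output was not requested", and each index flips the corresponding template argument to the no-op Nothing functor. For instance, magnitude wanted (not squared) with no angle output resolves like this:

// callers[0][0][1] == cartToPolar_caller<Magnitude, Nothing>
callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream);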
@@ -209,19 +204,16 @@ namespace cv { namespace gpu { namespace mathfunc
     typedef void (*caller_t)(const DevMem2Df& mag, const DevMem2Df& angle, const DevMem2Df& x, const DevMem2Df& y, bool angleInDegrees, cudaStream_t stream);
     static const caller_t callers[2] =
     {
-        polarToCart_caller<mathfunc_krnls::NonEmptyMag>,
-        polarToCart_caller<mathfunc_krnls::EmptyMag>
+        polarToCart_caller<NonEmptyMag>,
+        polarToCart_caller<EmptyMag>
     };
 
-    callers[mag.ptr == 0](mag, angle, x, y, angleInDegrees, stream);
+    callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);
 }
-}}}
 
 //////////////////////////////////////////////////////////////////////////////////////
 // Compare
 
-namespace mathfunc_krnls
-{
     template <typename T1, typename T2>
     struct NotEqual
     {
...
@@ -230,14 +222,11 @@ namespace mathfunc_krnls
             return static_cast<uchar>(static_cast<int>(src1 != src2) * 255);
         }
     };
-}
 
-namespace cv { namespace gpu { namespace mathfunc
-{
     template <typename T1, typename T2>
     inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
     {
-        mathfunc_krnls::NotEqual<T1, T2> op;
+        NotEqual<T1, T2> op;
         transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, 0);
     }
...
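Note: compare_ne shows the intended extension point of the transform framework: a small device functor receiving the source values plus the pixel coordinates. A hypothetical functor in the same mold (not part of the commit):

struct AbsDiff
{
    __device__ uchar operator()(uchar a, uchar b, int /*x*/, int /*y*/) const
    {
        return (uchar)(a > b ? a - b : b - a);
    }
};

// dispatched exactly like NotEqual above:
// transform(static_cast< DevMem2D_<uchar> >(src1), static_cast< DevMem2D_<uchar> >(src2), dst, AbsDiff(), 0);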
@@ -40,16 +40,11 @@
 //
 //M*/
 
-#include <stddef.h>
-#include <stdio.h>
 #include "cuda_shared.hpp"
-#include "cuda_runtime.h"
 #include "saturate_cast.hpp"
 
-using namespace cv::gpu;
-
-namespace matop_krnls
-{
+namespace cv { namespace gpu { namespace matrix_operations {
+
     template <typename T> struct shift_and_sizeof;
     template <> struct shift_and_sizeof<char> { enum { shift = 0 }; };
     template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };
@@ -115,14 +110,11 @@ namespace matop_krnls
         typedef int2 read_type;
         typedef short2 write_type;
     };
-}
 
 ///////////////////////////////////////////////////////////////////////////
 ////////////////////////////////// CopyTo /////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////
 
-namespace matop_krnls
-{
     template<typename T>
     __global__ void copy_to_with_mask(T * mat_src, T * mat_dst, const unsigned char * mask, int cols, int rows, int step_mat, int step_mask, int channels)
     {
@@ -136,10 +128,6 @@ namespace matop_krnls
             mat_dst[idx] = mat_src[idx];
         }
     }
-}
 
-namespace cv { namespace gpu { namespace matrix_operations
-{
     typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream);
 
     template<typename T>
@@ -147,17 +135,12 @@ namespace cv { namespace gpu { namespace matrix_operations
     {
         dim3 threadsPerBlock(16,16, 1);
         dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);
 
+        copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>
+                ((T*)mat_src.data, (T*)mat_dst.data, (unsigned char*)mask.data, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
+
         if (stream == 0)
-        {
-            ::matop_krnls::copy_to_with_mask<T><<<numBlocks,threadsPerBlock>>>
-                    ((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
-            cudaSafeCall ( cudaThreadSynchronize() );
-        }
-        else
-        {
-            ::matop_krnls::copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>
-                    ((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
-        }
+            cudaSafeCall ( cudaThreadSynchronize() );
     }
 
     void copy_to_with_mask(const DevMem2D& mat_src, DevMem2D mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream)
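Note: the duplicated launch branches collapse into one pattern, reused for copy_to, set_to and convert_to below: always launch on `stream` (stream 0 is CUDA's default stream, so passing it is equivalent to the old stream-less launch) and block only when the caller supplied no stream. In outline, with the kernel and its arguments standing in for any of the launches above:

kernel<<<numBlocks, threadsPerBlock, 0, stream>>>(/* args */);  // stream == 0 -> default stream
if (stream == 0)
    cudaSafeCall( cudaThreadSynchronize() );                    // keep the no-stream path blocking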
@@ -180,14 +163,11 @@ namespace cv { namespace gpu { namespace matrix_operations
         func(mat_src, mat_dst, mask, channels, stream);
     }
-}}}
 
 ///////////////////////////////////////////////////////////////////////////
 ////////////////////////////////// SetTo //////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////
 
-namespace matop_krnls
-{
     __constant__ double scalar_d[4];
 
     template<typename T>
@@ -216,10 +196,6 @@ namespace matop_krnls
             mat[idx] = scalar_d[ x % channels ];
         }
     }
-}
 
-namespace cv { namespace gpu { namespace matrix_operations
-{
     typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream);
     typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels, const cudaStream_t & stream);
@@ -229,16 +205,9 @@ namespace cv { namespace gpu { namespace matrix_operations
         dim3 threadsPerBlock(32, 8, 1);
         dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
 
+        set_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.data, (unsigned char *)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);
+
         if (stream == 0)
-        {
-            ::matop_krnls::set_to_with_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
             cudaSafeCall ( cudaThreadSynchronize() );
-        }
-        else
-        {
-            ::matop_krnls::set_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
-        }
     }
 
     template <typename T>
@@ -247,20 +216,15 @@ namespace cv { namespace gpu { namespace matrix_operations
         dim3 threadsPerBlock(32, 8, 1);
         dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
 
+        set_to_without_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels);
+
         if (stream == 0)
-        {
-            matop_krnls::set_to_without_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
             cudaSafeCall ( cudaThreadSynchronize() );
-        }
-        else
-        {
-            matop_krnls::set_to_without_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
-        }
     }
 
     void set_to_without_mask(DevMem2D mat, int depth, const double *scalar, int channels, const cudaStream_t & stream)
     {
-        cudaSafeCall( cudaMemcpyToSymbol(matop_krnls::scalar_d, scalar, sizeof(double) * 4));
+        cudaSafeCall( cudaMemcpyToSymbol(scalar_d, scalar, sizeof(double) * 4));
 
         static SetToFunc_without_mask tab[8] =
         {
@@ -284,7 +248,7 @@ namespace cv { namespace gpu { namespace matrix_operations
     void set_to_with_mask(DevMem2D mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream)
     {
-        cudaSafeCall( cudaMemcpyToSymbol(matop_krnls::scalar_d, scalar, sizeof(double) * 4));
+        cudaSafeCall( cudaMemcpyToSymbol(scalar_d, scalar, sizeof(double) * 4));
 
         static SetToFunc_with_mask tab[8] =
         {
@@ -305,14 +269,11 @@ namespace cv { namespace gpu { namespace matrix_operations
         func(mat, mask, channels, stream);
     }
-}}}
 
 ///////////////////////////////////////////////////////////////////////////
 //////////////////////////////// ConvertTo ////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////
 
-namespace matop_krnls
-{
     template <typename T, typename DT>
     __global__ static void convert_to(uchar* srcmat, size_t src_step, uchar* dstmat, size_t dst_step, size_t width, size_t height, double alpha, double beta)
     {
@@ -348,29 +309,20 @@ namespace matop_krnls
             }
         }
     }
-}
 
-namespace cv { namespace gpu { namespace matrix_operations
-{
     typedef void (*CvtFunc)(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream);
 
     template<typename T, typename DT>
     void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream)
     {
-        const int shift = ::matop_krnls::ReadWriteTraits<T, DT, sizeof(T), sizeof(DT)>::shift;
+        const int shift = ReadWriteTraits<T, DT, sizeof(T), sizeof(DT)>::shift;
 
         dim3 block(32, 8);
        dim3 grid(divUp(width, block.x * shift), divUp(height, block.y));
 
+        convert_to<T, DT><<<grid, block, 0, stream>>>(src.data, src.step, dst.data, dst.step, width, height, alpha, beta);
+
         if (stream == 0)
-        {
-            matop_krnls::convert_to<T, DT><<<grid, block>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
             cudaSafeCall( cudaThreadSynchronize() );
-        }
-        else
-        {
-            matop_krnls::convert_to<T, DT><<<grid, block, 0, stream>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
-        }
     }
 
     void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, int channels, double alpha, double beta, const cudaStream_t & stream)
...
@@ -230,9 +230,9 @@ namespace cv { namespace gpu { namespace split_merge {
     dim3 blockDim(32, 8);
     dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
     mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(
-            src[0].ptr, src[0].step,
-            src[1].ptr, src[1].step,
-            dst.rows, dst.cols, dst.ptr, dst.step);
+            src[0].data, src[0].step,
+            src[1].data, src[1].step,
+            dst.rows, dst.cols, dst.data, dst.step);
     if (stream == 0)
         cudaSafeCall(cudaThreadSynchronize());
 }
...
@@ -244,10 +244,10 @@ namespace cv { namespace gpu { namespace split_merge {
     dim3 blockDim(32, 8);
     dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
     mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(
-            src[0].ptr, src[0].step,
-            src[1].ptr, src[1].step,
-            src[2].ptr, src[2].step,
-            dst.rows, dst.cols, dst.ptr, dst.step);
+            src[0].data, src[0].step,
+            src[1].data, src[1].step,
+            src[2].data, src[2].step,
+            dst.rows, dst.cols, dst.data, dst.step);
     if (stream == 0)
         cudaSafeCall(cudaThreadSynchronize());
 }
...
@@ -259,11 +259,11 @@ namespace cv { namespace gpu { namespace split_merge {
     dim3 blockDim(32, 8);
     dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
     mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(
-            src[0].ptr, src[0].step,
-            src[1].ptr, src[1].step,
-            src[2].ptr, src[2].step,
-            src[3].ptr, src[3].step,
-            dst.rows, dst.cols, dst.ptr, dst.step);
+            src[0].data, src[0].step,
+            src[1].data, src[1].step,
+            src[2].data, src[2].step,
+            src[3].data, src[3].step,
+            dst.rows, dst.cols, dst.data, dst.step);
     if (stream == 0)
         cudaSafeCall(cudaThreadSynchronize());
 }
...
@@ -433,9 +433,9 @@ namespace cv { namespace gpu { namespace split_merge {
     dim3 blockDim(32, 8);
     dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
     splitC2_<T><<<gridDim, blockDim, 0, stream>>>(
-            src.ptr, src.step, src.rows, src.cols,
-            dst[0].ptr, dst[0].step,
-            dst[1].ptr, dst[1].step);
+            src.data, src.step, src.rows, src.cols,
+            dst[0].data, dst[0].step,
+            dst[1].data, dst[1].step);
     if (stream == 0)
         cudaSafeCall(cudaThreadSynchronize());
 }
...
@@ -447,10 +447,10 @@ namespace cv { namespace gpu { namespace split_merge {
     dim3 blockDim(32, 8);
     dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
     splitC3_<T><<<gridDim, blockDim, 0, stream>>>(
-            src.ptr, src.step, src.rows, src.cols,
-            dst[0].ptr, dst[0].step,
-            dst[1].ptr, dst[1].step,
-            dst[2].ptr, dst[2].step);
+            src.data, src.step, src.rows, src.cols,
+            dst[0].data, dst[0].step,
+            dst[1].data, dst[1].step,
+            dst[2].data, dst[2].step);
     if (stream == 0)
         cudaSafeCall(cudaThreadSynchronize());
 }
...
@@ -462,11 +462,11 @@ namespace cv { namespace gpu { namespace split_merge {
     dim3 blockDim(32, 8);
     dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
     splitC4_<T><<<gridDim, blockDim, 0, stream>>>(
-            src.ptr, src.step, src.rows, src.cols,
-            dst[0].ptr, dst[0].step,
-            dst[1].ptr, dst[1].step,
-            dst[2].ptr, dst[2].step,
-            dst[3].ptr, dst[3].step);
+            src.data, src.step, src.rows, src.cols,
+            dst[0].data, dst[0].step,
+            dst[1].data, dst[1].step,
+            dst[2].data, dst[2].step,
+            dst[3].data, dst[3].step);
     if (stream == 0)
         cudaSafeCall(cudaThreadSynchronize());
 }
...
This diff is collapsed.
@@ -44,36 +44,32 @@
 #define __OPENCV_GPU_TRANSFORM_HPP__
 
 #include "cuda_shared.hpp"
-#include "saturate_cast.hpp"
-#include "vecmath.hpp"
 
-namespace cv { namespace gpu { namespace algo_krnls
+namespace cv { namespace gpu { namespace device
 {
     template <typename T, typename D, typename UnOp>
-    static __global__ void transform(const T* src, size_t src_step,
-        D* dst, size_t dst_step, int width, int height, UnOp op)
+    static __global__ void transform(const DevMem2D_<T> src, PtrStep_<D> dst, UnOp op)
     {
         const int x = blockDim.x * blockIdx.x + threadIdx.x;
         const int y = blockDim.y * blockIdx.y + threadIdx.y;
 
-        if (x < width && y < height)
+        if (x < src.cols && y < src.rows)
         {
-            T src_data = src[y * src_step + x];
-            dst[y * dst_step + x] = op(src_data, x, y);
+            T src_data = src.ptr(y)[x];
+            dst.ptr(y)[x] = op(src_data, x, y);
         }
     }
 
     template <typename T1, typename T2, typename D, typename BinOp>
-    static __global__ void transform(const T1* src1, size_t src1_step, const T2* src2, size_t src2_step,
-        D* dst, size_t dst_step, int width, int height, BinOp op)
+    static __global__ void transform(const DevMem2D_<T1> src1, const PtrStep_<T2> src2, PtrStep_<D> dst, BinOp op)
     {
         const int x = blockDim.x * blockIdx.x + threadIdx.x;
         const int y = blockDim.y * blockIdx.y + threadIdx.y;
 
-        if (x < width && y < height)
+        if (x < src1.cols && y < src1.rows)
         {
-            T1 src1_data = src1[y * src1_step + x];
-            T2 src2_data = src2[y * src2_step + x];
-            dst[y * dst_step + x] = op(src1_data, src2_data, x, y);
+            T1 src1_data = src1.ptr(y)[x];
+            T2 src2_data = src2.ptr(y)[x];
+            dst.ptr(y)[x] = op(src1_data, src2_data, x, y);
         }
     }
 }}}
@@ -83,7 +79,7 @@ namespace cv
 namespace gpu
 {
     template <typename T, typename D, typename UnOp>
-    static void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, cudaStream_t stream)
+    static void transform2(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, cudaStream_t stream)
     {
         dim3 threads(16, 16, 1);
         dim3 grid(1, 1, 1);
...
@@ -91,8 +87,7 @@ namespace cv
         grid.x = divUp(src.cols, threads.x);
         grid.y = divUp(src.rows, threads.y);
 
-        algo_krnls::transform<<<grid, threads, 0, stream>>>(src.ptr, src.elem_step,
-            dst.ptr, dst.elem_step, src.cols, src.rows, op);
+        device::transform<T, D, UnOp><<<grid, threads, 0, stream>>>(src, dst, op);
 
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
...
@@ -106,11 +101,10 @@ namespace cv
         grid.x = divUp(src1.cols, threads.x);
         grid.y = divUp(src1.rows, threads.y);
 
-        algo_krnls::transform<<<grid, threads, 0, stream>>>(src1.ptr, src1.elem_step,
-            src2.ptr, src2.elem_step, dst.ptr, dst.elem_step, src1.cols, src1.rows, op);
+        device::transform<T1, T2, D, BinOp><<<grid, threads, 0, stream>>>(src1, src2, dst, op);
 
         if (stream == 0)
             cudaSafeCall( cudaThreadSynchronize() );
     }
 }
 }
...
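Note: with the kernels now taking DevMem2D_/PtrStep_ by value (they are plain aggregates, so passing them as kernel arguments just copies pointer, step and size), adding a new per-pixel operation reduces to writing a functor and calling the host wrapper. A hypothetical unary example (the functor name is not part of the commit):

struct Scale
{
    float s;
    explicit Scale(float s_) : s(s_) {}
    __device__ float operator()(float v, int /*x*/, int /*y*/) const { return v * s; }
};

// host side: grid shape, launch and the optional synchronization all live inside transform2
// cv::gpu::transform2(src, dst, Scale(0.5f), stream);   // src, dst : DevMem2Df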
@@ -384,7 +384,14 @@ namespace cv
 template <typename VecD, typename VecS> static __device__ VecD saturate_cast_caller(const VecS& v)
 {
-    SatCast<VecTraits<VecD>::cn, VecD> cast;
+    SatCast<
+        VecTraits<VecD>::cn,
+        VecD
+    >
+    cast;
+
     return cast(v);
 }
...
@@ -577,7 +577,10 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke
 namespace cv { namespace gpu { namespace filters
 {
-    void linearRowFilter_gpu_8u_8u_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
+    template <int CN, typename T, typename D>
+    void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
+
+    //void linearRowFilter_gpu_8u_8u_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
     void linearRowFilter_gpu_8u_8s_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
     void linearRowFilter_gpu_8s_8u_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
     void linearRowFilter_gpu_8s_8s_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
...
@@ -653,7 +656,8 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
     static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R};
 
     static const gpuFilter1D_t gpuFilter1D_callers[6][6] =
     {
-        {linearRowFilter_gpu_8u_8u_c4,linearRowFilter_gpu_8u_8s_c4,0,0,0,0},
+        {linearRowFilter_gpu<4, uchar4, uchar4>/*linearRowFilter_gpu_8u_8u_c4*/,linearRowFilter_gpu_8u_8s_c4,0,0,0,0},
         {linearRowFilter_gpu_8s_8u_c4,linearRowFilter_gpu_8s_8s_c4,0,0,0,0},
         {0,0,linearRowFilter_gpu_16u_16u_c2,linearRowFilter_gpu_16u_16s_c2,0,0},
         {0,0,linearRowFilter_gpu_16s_16u_c2,linearRowFilter_gpu_16s_16s_c2,0,0},
...
@@ -61,9 +61,9 @@ namespace cv { namespace gpu
     namespace bm
     {
         //extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int ndisp, int winsz, const DevMem2D_<uint>& minSSD_buf);
-        extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int ndisp, int winsz, const DevMem2D_<uint>& minSSD_buf, const cudaStream_t & stream);
-        extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output, int prefilterCap /*= 31*/, const cudaStream_t & stream);
-        extern "C" void postfilter_textureness(const DevMem2D& input, int winsz, float avgTexturenessThreshold, const DevMem2D& disp, const cudaStream_t & stream);
+        extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int ndisp, int winsz, const DevMem2D_<uint>& minSSD_buf, cudaStream_t & stream);
+        extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D output, int prefilterCap /*= 31*/, cudaStream_t & stream);
+        extern "C" void postfilter_textureness(const DevMem2D& input, int winsz, float avgTexturenessThreshold, const DevMem2D& disp, cudaStream_t & stream);
     }
 }}
...
@@ -98,7 +98,7 @@ bool cv::gpu::StereoBM_GPU::checkIfGpuCallReasonable()
     return false;
 }
 
-static void stereo_bm_gpu_operator ( GpuMat& minSSD, GpuMat& leBuf, GpuMat& riBuf, int preset, int ndisp, int winSize, float avergeTexThreshold, const GpuMat& left, const GpuMat& right, GpuMat& disparity, const cudaStream_t & stream)
+static void stereo_bm_gpu_operator ( GpuMat& minSSD, GpuMat& leBuf, GpuMat& riBuf, int preset, int ndisp, int winSize, float avergeTexThreshold, const GpuMat& left, const GpuMat& right, GpuMat& disparity, cudaStream_t stream)
 {
     CV_DbgAssert(left.rows == right.rows && left.cols == right.cols);
     CV_DbgAssert(left.type() == CV_8UC1);
...