Commit d661b8e3 authored by Anatoly Baksheev's avatar Anatoly Baksheev

added PtrStep PtrElemStep structures. Refactored name spaces,

parent 501e81eb
...@@ -50,56 +50,79 @@ namespace cv ...@@ -50,56 +50,79 @@ namespace cv
// Simple lightweight structures that encapsulates information about an image on device. // Simple lightweight structures that encapsulates information about an image on device.
// It is intended to pass to nvcc-compiled code. GpuMat depends on headers that nvcc can't compile // It is intended to pass to nvcc-compiled code. GpuMat depends on headers that nvcc can't compile
template<typename T> struct PtrStep_
{
T* ptr;
size_t step;
typedef T elem_type;
enum { elem_size = sizeof(elem_type) };
#if defined(__CUDACC__) #if defined(__CUDACC__)
__host__ __device__ #define __CV_GPU_HOST_DEVICE__ __host__ __device__
#else
#define __CV_GPU_HOST_DEVICE__
#endif #endif
size_t elemSize() const { return elem_size; }
};
template <typename T> struct DevMem2D_ template <typename T> struct DevMem2D_
{ {
int cols; int cols;
int rows; int rows;
T* ptr; T* data;
size_t step; size_t step;
size_t elem_step;
/*__host__*/
DevMem2D_() : cols(0), rows(0), ptr(0), step(0), elem_step(0) {}
/*__host__*/
DevMem2D_(int rows_, int cols_, T *ptr_, size_t step_)
: cols(cols_), rows(rows_), ptr(ptr_), step(step_), elem_step(step_ / sizeof(T)) {}
template <typename U> DevMem2D_() : cols(0), rows(0), data(0), step(0) {}
/*__host__*/
DevMem2D_(int rows_, int cols_, T *data_, size_t step_)
: cols(cols_), rows(rows_), data(data_), step(step_) {}
template <typename U>
explicit DevMem2D_(const DevMem2D_<U>& d) explicit DevMem2D_(const DevMem2D_<U>& d)
: cols(d.cols), rows(d.rows), ptr((T*)d.ptr), step(d.step), elem_step(d.step / sizeof(T)) {} : cols(d.cols), rows(d.rows), data((T*)d.data), step(d.step) {}
typedef T elem_type;
enum { elem_size = sizeof(elem_type) };
template <typename U> __CV_GPU_HOST_DEVICE__ size_t elemSize() const { return elem_size; }
/*__host__*/ __CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return (T*)( (char*)data + y * step ); }
operator PtrStep_<U>() const { PtrStep_<U> dt; dt.ptr = ptr; dt.step = step; return dt; } __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)data + y * step ); }
};
template<typename T> struct PtrStep_
{
T* data;
size_t step;
typedef typename PtrStep_<T>::elem_type elem_type; PtrStep_() : data(0), step(0) {}
enum { elem_size = PtrStep_<T>::elem_size }; PtrStep_(const DevMem2D_<T>& mem) : data(mem.data), step(mem.step) {}
#if defined(__CUDACC__)
__host__ __device__ typedef T elem_type;
#endif enum { elem_size = sizeof(elem_type) };
size_t elemSize() const { return elem_size; }
__CV_GPU_HOST_DEVICE__ size_t elemSize() const { return elem_size; }
__CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return (T*)( (char*)data + y * step); }
__CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)data + y * step); }
};
template<typename T> struct PtrElemStep_ : public PtrStep_<T>
{
PtrElemStep_(const DevMem2D_<T>& mem) : PtrStep_<T>(mem)
{
step /= elem_size;
}
private:
template <bool> struct StaticCheck;
template <> struct StaticCheck<true>{};
StaticCheck<256 % sizeof(T) == 0> ElemStepTypeCheck;
}; };
typedef DevMem2D_<unsigned char> DevMem2D; typedef DevMem2D_<unsigned char> DevMem2D;
typedef DevMem2D_<float> DevMem2Df; typedef DevMem2D_<float> DevMem2Df;
typedef DevMem2D_<int> DevMem2Di; typedef DevMem2D_<int> DevMem2Di;
}
typedef PtrStep_<unsigned char> PtrStep;
typedef PtrStep_<float> PtrStepf;
typedef PtrStep_<int> PtrStepi;
typedef PtrElemStep_<unsigned char> PtrElemStep;
typedef PtrElemStep_<float> PtrElemStepf;
typedef PtrElemStep_<int> PtrElemStepi;
#undef __CV_GPU_HOST_DEVICE__
}
} }
#endif /* __OPENCV_GPU_DEVMEM2D_HPP__ */ #endif /* __OPENCV_GPU_DEVMEM2D_HPP__ */
...@@ -109,6 +109,7 @@ namespace cv ...@@ -109,6 +109,7 @@ namespace cv
//! returns lightweight DevMem2D_ structure for passing to nvcc-compiled code. //! returns lightweight DevMem2D_ structure for passing to nvcc-compiled code.
// Contains just image size, data ptr and step. // Contains just image size, data ptr and step.
template <class T> operator DevMem2D_<T>() const; template <class T> operator DevMem2D_<T>() const;
template <class T> operator PtrStep_<T>() const;
//! pefroms blocking upload data to GpuMat. . //! pefroms blocking upload data to GpuMat. .
void upload(const cv::Mat& m); void upload(const cv::Mat& m);
......
...@@ -207,6 +207,7 @@ inline GpuMat& GpuMat::operator = (const GpuMat& m) ...@@ -207,6 +207,7 @@ inline GpuMat& GpuMat::operator = (const GpuMat& m)
inline GpuMat& GpuMat::operator = (const Mat& m) { upload(m); return *this; } inline GpuMat& GpuMat::operator = (const Mat& m) { upload(m); return *this; }
template <class T> inline GpuMat::operator DevMem2D_<T>() const { return DevMem2D_<T>(rows, cols, (T*)data, step); } template <class T> inline GpuMat::operator DevMem2D_<T>() const { return DevMem2D_<T>(rows, cols, (T*)data, step); }
template <class T> inline GpuMat::operator PtrStep_<T>() const { return PtrStep_<T>(*this); }
//CPP: void GpuMat::upload(const Mat& m); //CPP: void GpuMat::upload(const Mat& m);
......
...@@ -50,36 +50,32 @@ using namespace cv::gpu; ...@@ -50,36 +50,32 @@ using namespace cv::gpu;
#define FLT_MAX 3.402823466e+38F #define FLT_MAX 3.402823466e+38F
#endif #endif
namespace cv { namespace gpu { namespace bp {
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
/////////////////////// load constants //////////////////////// /////////////////////// load constants ////////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
namespace bp_kernels
{
__constant__ int cndisp; __constant__ int cndisp;
__constant__ float cmax_data_term; __constant__ float cmax_data_term;
__constant__ float cdata_weight; __constant__ float cdata_weight;
__constant__ float cmax_disc_term; __constant__ float cmax_disc_term;
__constant__ float cdisc_single_jump; __constant__ float cdisc_single_jump;
};
namespace cv { namespace gpu { namespace bp {
void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump) void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump)
{ {
cudaSafeCall( cudaMemcpyToSymbol(bp_kernels::cndisp, &ndisp, sizeof(int )) ); cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int )) );
cudaSafeCall( cudaMemcpyToSymbol(bp_kernels::cmax_data_term, &max_data_term, sizeof(float)) ); cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term, &max_data_term, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(bp_kernels::cdata_weight, &data_weight, sizeof(float)) ); cudaSafeCall( cudaMemcpyToSymbol(cdata_weight, &data_weight, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(bp_kernels::cmax_disc_term, &max_disc_term, sizeof(float)) ); cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term, &max_disc_term, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(bp_kernels::cdisc_single_jump, &disc_single_jump, sizeof(float)) ); cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );
} }
}}}
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
////////////////////////// comp data ////////////////////////// ////////////////////////// comp data //////////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
namespace bp_kernels
{
template <typename T> template <typename T>
__global__ void comp_data_gray(const uchar* l, const uchar* r, size_t step, T* data, size_t data_step, int cols, int rows) __global__ void comp_data_gray(const uchar* l, const uchar* r, size_t step, T* data, size_t data_step, int cols, int rows)
{ {
...@@ -145,9 +141,7 @@ namespace bp_kernels ...@@ -145,9 +141,7 @@ namespace bp_kernels
} }
} }
} }
}
namespace cv { namespace gpu { namespace bp {
typedef void (*CompDataFunc)(const DevMem2D& l, const DevMem2D& r, int channels, DevMem2D mdata, const cudaStream_t& stream); typedef void (*CompDataFunc)(const DevMem2D& l, const DevMem2D& r, int channels, DevMem2D mdata, const cudaStream_t& stream);
template<typename T> template<typename T>
...@@ -160,9 +154,9 @@ namespace cv { namespace gpu { namespace bp { ...@@ -160,9 +154,9 @@ namespace cv { namespace gpu { namespace bp {
grid.y = divUp(l.rows, threads.y); grid.y = divUp(l.rows, threads.y);
if (channels == 1) if (channels == 1)
bp_kernels::comp_data_gray<T><<<grid, threads, 0, stream>>>(l.ptr, r.ptr, l.step, (T*)mdata.ptr, mdata.step/sizeof(T), l.cols, l.rows); comp_data_gray<T><<<grid, threads, 0, stream>>>(l.data, r.data, l.step, (T*)mdata.data, mdata.step/sizeof(T), l.cols, l.rows);
else else
bp_kernels::comp_data_bgr<T><<<grid, threads, 0, stream>>>(l.ptr, r.ptr, l.step, (T*)mdata.ptr, mdata.step/sizeof(T), l.cols, l.rows); comp_data_bgr<T><<<grid, threads, 0, stream>>>(l.data, r.data, l.step, (T*)mdata.data, mdata.step/sizeof(T), l.cols, l.rows);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -187,14 +181,11 @@ namespace cv { namespace gpu { namespace bp { ...@@ -187,14 +181,11 @@ namespace cv { namespace gpu { namespace bp {
cv::gpu::error("Unsupported message type", __FILE__, __LINE__); cv::gpu::error("Unsupported message type", __FILE__, __LINE__);
func(l, r, channels, mdata, stream); func(l, r, channels, mdata, stream);
} }
}}}
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
//////////////////////// data step down /////////////////////// //////////////////////// data step down ///////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
namespace bp_kernels
{
template <typename T> template <typename T>
__global__ void data_step_down(int dst_cols, int dst_rows, int src_rows, const T* src, size_t src_step, T* dst, size_t dst_step) __global__ void data_step_down(int dst_cols, int dst_rows, int src_rows, const T* src, size_t src_step, T* dst, size_t dst_step)
{ {
...@@ -217,9 +208,7 @@ namespace bp_kernels ...@@ -217,9 +208,7 @@ namespace bp_kernels
} }
} }
} }
}
namespace cv { namespace gpu { namespace bp {
typedef void (*DataStepDownFunc)(int dst_cols, int dst_rows, int src_rows, const DevMem2D& src, DevMem2D dst, const cudaStream_t& stream); typedef void (*DataStepDownFunc)(int dst_cols, int dst_rows, int src_rows, const DevMem2D& src, DevMem2D dst, const cudaStream_t& stream);
template<typename T> template<typename T>
...@@ -231,7 +220,7 @@ namespace cv { namespace gpu { namespace bp { ...@@ -231,7 +220,7 @@ namespace cv { namespace gpu { namespace bp {
grid.x = divUp(dst_cols, threads.x); grid.x = divUp(dst_cols, threads.x);
grid.y = divUp(dst_rows, threads.y); grid.y = divUp(dst_rows, threads.y);
bp_kernels::data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (const T*)src.ptr, src.step/sizeof(T), (T*)dst.ptr, dst.step/sizeof(T)); data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (const T*)src.data, src.step/sizeof(T), (T*)dst.data, dst.step/sizeof(T));
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -256,14 +245,11 @@ namespace cv { namespace gpu { namespace bp { ...@@ -256,14 +245,11 @@ namespace cv { namespace gpu { namespace bp {
cv::gpu::error("Unsupported message type", __FILE__, __LINE__); cv::gpu::error("Unsupported message type", __FILE__, __LINE__);
func(dst_cols, dst_rows, src_rows, src, dst, stream); func(dst_cols, dst_rows, src_rows, src, dst, stream);
} }
}}}
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
/////////////////// level up messages //////////////////////// /////////////////// level up messages ////////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
namespace bp_kernels
{
template <typename T> template <typename T>
__global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const T* src, size_t src_step, T* dst, size_t dst_step) __global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const T* src, size_t src_step, T* dst, size_t dst_step)
{ {
...@@ -282,9 +268,7 @@ namespace bp_kernels ...@@ -282,9 +268,7 @@ namespace bp_kernels
dstr[d * dst_disp_step] = srcr[d * src_disp_step]; dstr[d * dst_disp_step] = srcr[d * src_disp_step];
} }
} }
}
namespace cv { namespace gpu { namespace bp {
typedef void (*LevelUpMessagesFunc)(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2D* mus, DevMem2D* mds, DevMem2D* mls, DevMem2D* mrs, const cudaStream_t& stream); typedef void (*LevelUpMessagesFunc)(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2D* mus, DevMem2D* mds, DevMem2D* mls, DevMem2D* mrs, const cudaStream_t& stream);
template<typename T> template<typename T>
...@@ -298,10 +282,10 @@ namespace cv { namespace gpu { namespace bp { ...@@ -298,10 +282,10 @@ namespace cv { namespace gpu { namespace bp {
int src_idx = (dst_idx + 1) & 1; int src_idx = (dst_idx + 1) & 1;
bp_kernels::level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (const T*)mus[src_idx].ptr, mus[src_idx].step/sizeof(T), (T*)mus[dst_idx].ptr, mus[dst_idx].step/sizeof(T)); level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (const T*)mus[src_idx].data, mus[src_idx].step/sizeof(T), (T*)mus[dst_idx].data, mus[dst_idx].step/sizeof(T));
bp_kernels::level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (const T*)mds[src_idx].ptr, mds[src_idx].step/sizeof(T), (T*)mds[dst_idx].ptr, mds[dst_idx].step/sizeof(T)); level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (const T*)mds[src_idx].data, mds[src_idx].step/sizeof(T), (T*)mds[dst_idx].data, mds[dst_idx].step/sizeof(T));
bp_kernels::level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (const T*)mls[src_idx].ptr, mls[src_idx].step/sizeof(T), (T*)mls[dst_idx].ptr, mls[dst_idx].step/sizeof(T)); level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (const T*)mls[src_idx].data, mls[src_idx].step/sizeof(T), (T*)mls[dst_idx].data, mls[dst_idx].step/sizeof(T));
bp_kernels::level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (const T*)mrs[src_idx].ptr, mrs[src_idx].step/sizeof(T), (T*)mrs[dst_idx].ptr, mrs[dst_idx].step/sizeof(T)); level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (const T*)mrs[src_idx].data, mrs[src_idx].step/sizeof(T), (T*)mrs[dst_idx].data, mrs[dst_idx].step/sizeof(T));
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -326,14 +310,11 @@ namespace cv { namespace gpu { namespace bp { ...@@ -326,14 +310,11 @@ namespace cv { namespace gpu { namespace bp {
cv::gpu::error("Unsupported message type", __FILE__, __LINE__); cv::gpu::error("Unsupported message type", __FILE__, __LINE__);
func(dst_idx, dst_cols, dst_rows, src_rows, mus, mds, mls, mrs, stream); func(dst_idx, dst_cols, dst_rows, src_rows, mus, mds, mls, mrs, stream);
} }
}}}
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
//////////////////// calc all iterations ///////////////////// //////////////////// calc all iterations /////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
namespace bp_kernels
{
template <typename T> template <typename T>
__device__ void calc_min_linear_penalty(T* dst, size_t step) __device__ void calc_min_linear_penalty(T* dst, size_t step)
{ {
...@@ -427,9 +408,7 @@ namespace bp_kernels ...@@ -427,9 +408,7 @@ namespace bp_kernels
message(us + msg_step, ds - msg_step, ls + 1, dt, ls, msg_disp_step, data_disp_step); message(us + msg_step, ds - msg_step, ls + 1, dt, ls, msg_disp_step, data_disp_step);
} }
} }
}
namespace cv { namespace gpu { namespace bp {
typedef void (*CalcAllIterationFunc)(int cols, int rows, int iters, DevMem2D& u, DevMem2D& d, DevMem2D& l, DevMem2D& r, const DevMem2D& data, const cudaStream_t& stream); typedef void (*CalcAllIterationFunc)(int cols, int rows, int iters, DevMem2D& u, DevMem2D& d, DevMem2D& l, DevMem2D& r, const DevMem2D& data, const cudaStream_t& stream);
template<typename T> template<typename T>
...@@ -443,7 +422,7 @@ namespace cv { namespace gpu { namespace bp { ...@@ -443,7 +422,7 @@ namespace cv { namespace gpu { namespace bp {
for(int t = 0; t < iters; ++t) for(int t = 0; t < iters; ++t)
{ {
bp_kernels::one_iteration<T><<<grid, threads, 0, stream>>>(t, (T*)u.ptr, (T*)d.ptr, (T*)l.ptr, (T*)r.ptr, u.step/sizeof(T), (const T*)data.ptr, data.step/sizeof(T), cols, rows); one_iteration<T><<<grid, threads, 0, stream>>>(t, (T*)u.data, (T*)d.data, (T*)l.data, (T*)r.data, u.step/sizeof(T), (const T*)data.data, data.step/sizeof(T), cols, rows);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -469,14 +448,11 @@ namespace cv { namespace gpu { namespace bp { ...@@ -469,14 +448,11 @@ namespace cv { namespace gpu { namespace bp {
cv::gpu::error("Unsupported message type", __FILE__, __LINE__); cv::gpu::error("Unsupported message type", __FILE__, __LINE__);
func(cols, rows, iters, u, d, l, r, data, stream); func(cols, rows, iters, u, d, l, r, data, stream);
} }
}}}
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
/////////////////////////// output //////////////////////////// /////////////////////////// output ////////////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
namespace bp_kernels
{
template <typename T> template <typename T>
__global__ void output(int cols, int rows, const T* u, const T* d, const T* l, const T* r, const T* data, size_t step, short* disp, size_t res_step) __global__ void output(int cols, int rows, const T* u, const T* d, const T* l, const T* r, const T* data, size_t step, short* disp, size_t res_step)
{ {
...@@ -513,9 +489,7 @@ namespace bp_kernels ...@@ -513,9 +489,7 @@ namespace bp_kernels
disp[res_step * y + x] = saturate_cast<short>(best); disp[res_step * y + x] = saturate_cast<short>(best);
} }
} }
}
namespace cv { namespace gpu { namespace bp {
typedef void (*OutputFunc)(const DevMem2D& u, const DevMem2D& d, const DevMem2D& l, const DevMem2D& r, const DevMem2D& data, DevMem2D disp, const cudaStream_t& stream); typedef void (*OutputFunc)(const DevMem2D& u, const DevMem2D& d, const DevMem2D& l, const DevMem2D& r, const DevMem2D& data, DevMem2D disp, const cudaStream_t& stream);
template<typename T> template<typename T>
...@@ -527,7 +501,7 @@ namespace cv { namespace gpu { namespace bp { ...@@ -527,7 +501,7 @@ namespace cv { namespace gpu { namespace bp {
grid.x = divUp(disp.cols, threads.x); grid.x = divUp(disp.cols, threads.x);
grid.y = divUp(disp.rows, threads.y); grid.y = divUp(disp.rows, threads.y);
bp_kernels::output<T><<<grid, threads, 0, stream>>>(disp.cols, disp.rows, (const T*)u.ptr, (const T*)d.ptr, (const T*)l.ptr, (const T*)r.ptr, (const T*)data.ptr, u.step/sizeof(T), (short*)disp.ptr, disp.step/sizeof(short)); output<T><<<grid, threads, 0, stream>>>(disp.cols, disp.rows, (const T*)u.data, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, u.step/sizeof(T), (short*)disp.data, disp.step/sizeof(short));
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -552,4 +526,5 @@ namespace cv { namespace gpu { namespace bp { ...@@ -552,4 +526,5 @@ namespace cv { namespace gpu { namespace bp {
cv::gpu::error("Unsupported message type", __FILE__, __LINE__); cv::gpu::error("Unsupported message type", __FILE__, __LINE__);
func(u, d, l, r, data, disp, stream); func(u, d, l, r, data, disp, stream);
} }
}}} }}}
\ No newline at end of file
...@@ -54,7 +54,7 @@ using namespace cv::gpu; ...@@ -54,7 +54,7 @@ using namespace cv::gpu;
#define FLT_EPSILON 1.192092896e-07F #define FLT_EPSILON 1.192092896e-07F
#endif #endif
namespace color_krnls namespace cv { namespace gpu { namespace color
{ {
template<typename T> struct ColorChannel {}; template<typename T> struct ColorChannel {};
template<> struct ColorChannel<uchar> template<> struct ColorChannel<uchar>
...@@ -95,12 +95,9 @@ namespace color_krnls ...@@ -95,12 +95,9 @@ namespace color_krnls
{ {
return vec.w; return vec.w;
} }
}
////////////////// Various 3/4-channel to 3/4-channel RGB transformations ///////////////// ////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////
namespace color_krnls
{
template <int SRCCN, int DSTCN, typename T> template <int SRCCN, int DSTCN, typename T>
__global__ void RGB2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx) __global__ void RGB2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)
{ {
...@@ -123,10 +120,7 @@ namespace color_krnls ...@@ -123,10 +120,7 @@ namespace color_krnls
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst; *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
} }
} }
}
namespace cv { namespace gpu { namespace color
{
template <typename T, int SRCCN, int DSTCN> template <typename T, int SRCCN, int DSTCN>
void RGB2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream) void RGB2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
{ {
...@@ -136,8 +130,8 @@ namespace cv { namespace gpu { namespace color ...@@ -136,8 +130,8 @@ namespace cv { namespace gpu { namespace color
grid.x = divUp(src.cols, threads.x); grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y); grid.y = divUp(src.rows, threads.y);
color_krnls::RGB2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr, src.step, RGB2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.ptr, dst.step, src.rows, src.cols, bidx); dst.data, dst.step, src.rows, src.cols, bidx);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -178,12 +172,9 @@ namespace cv { namespace gpu { namespace color ...@@ -178,12 +172,9 @@ namespace cv { namespace gpu { namespace color
RGB2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream); RGB2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
} }
}}}
/////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB ////////// /////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB //////////
namespace color_krnls
{
template <int GREEN_BITS, int DSTCN> struct RGB5x52RGBConverter {}; template <int GREEN_BITS, int DSTCN> struct RGB5x52RGBConverter {};
template <int DSTCN> struct RGB5x52RGBConverter<5, DSTCN> template <int DSTCN> struct RGB5x52RGBConverter<5, DSTCN>
{ {
...@@ -272,10 +263,7 @@ namespace color_krnls ...@@ -272,10 +263,7 @@ namespace color_krnls
*(ushort*)(dst_ + y * dst_step + (x << 1)) = RGB2RGB5x5Converter<SRCCN, GREEN_BITS>::cvt(&src.x, bidx); *(ushort*)(dst_ + y * dst_step + (x << 1)) = RGB2RGB5x5Converter<SRCCN, GREEN_BITS>::cvt(&src.x, bidx);
} }
} }
}
namespace cv { namespace gpu { namespace color
{
template <int GREEN_BITS, int DSTCN> template <int GREEN_BITS, int DSTCN>
void RGB5x52RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream) void RGB5x52RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
{ {
...@@ -285,8 +273,8 @@ namespace cv { namespace gpu { namespace color ...@@ -285,8 +273,8 @@ namespace cv { namespace gpu { namespace color
grid.x = divUp(src.cols, threads.x); grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y); grid.y = divUp(src.rows, threads.y);
color_krnls::RGB5x52RGB<GREEN_BITS, DSTCN><<<grid, threads, 0, stream>>>(src.ptr, src.step, RGB5x52RGB<GREEN_BITS, DSTCN><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.ptr, dst.step, src.rows, src.cols, bidx); dst.data, dst.step, src.rows, src.cols, bidx);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -313,8 +301,8 @@ namespace cv { namespace gpu { namespace color ...@@ -313,8 +301,8 @@ namespace cv { namespace gpu { namespace color
grid.x = divUp(src.cols, threads.x); grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y); grid.y = divUp(src.rows, threads.y);
color_krnls::RGB2RGB5x5<SRCCN, GREEN_BITS><<<grid, threads, 0, stream>>>(src.ptr, src.step, RGB2RGB5x5<SRCCN, GREEN_BITS><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.ptr, dst.step, src.rows, src.cols, bidx); dst.data, dst.step, src.rows, src.cols, bidx);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -331,12 +319,9 @@ namespace cv { namespace gpu { namespace color ...@@ -331,12 +319,9 @@ namespace cv { namespace gpu { namespace color
RGB2RGB5x5_callers[srccn - 3][green_bits - 5](src, dst, bidx, stream); RGB2RGB5x5_callers[srccn - 3][green_bits - 5](src, dst, bidx, stream);
} }
}}}
///////////////////////////////// Grayscale to Color //////////////////////////////// ///////////////////////////////// Grayscale to Color ////////////////////////////////
namespace color_krnls
{
template <int DSTCN, typename T> template <int DSTCN, typename T>
__global__ void Gray2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols) __global__ void Gray2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols)
{ {
...@@ -387,10 +372,7 @@ namespace color_krnls ...@@ -387,10 +372,7 @@ namespace color_krnls
*(ushort*)(dst_ + y * dst_step + (x << 1)) = Gray2RGB5x5Converter<GREEN_BITS>::cvt(src); *(ushort*)(dst_ + y * dst_step + (x << 1)) = Gray2RGB5x5Converter<GREEN_BITS>::cvt(src);
} }
} }
}
namespace cv { namespace gpu { namespace color
{
template <typename T, int DSTCN> template <typename T, int DSTCN>
void Gray2RGB_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream) void Gray2RGB_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
{ {
...@@ -400,8 +382,8 @@ namespace cv { namespace gpu { namespace color ...@@ -400,8 +382,8 @@ namespace cv { namespace gpu { namespace color
grid.x = divUp(src.cols, threads.x); grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y); grid.y = divUp(src.rows, threads.y);
color_krnls::Gray2RGB<DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr, src.step, Gray2RGB<DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.ptr, dst.step, src.rows, src.cols); dst.data, dst.step, src.rows, src.cols);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -440,8 +422,8 @@ namespace cv { namespace gpu { namespace color ...@@ -440,8 +422,8 @@ namespace cv { namespace gpu { namespace color
grid.x = divUp(src.cols, threads.x); grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y); grid.y = divUp(src.rows, threads.y);
color_krnls::Gray2RGB5x5<GREEN_BITS><<<grid, threads, 0, stream>>>(src.ptr, src.step, Gray2RGB5x5<GREEN_BITS><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.ptr, dst.step, src.rows, src.cols); dst.data, dst.step, src.rows, src.cols);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -457,12 +439,9 @@ namespace cv { namespace gpu { namespace color ...@@ -457,12 +439,9 @@ namespace cv { namespace gpu { namespace color
Gray2RGB5x5_callers[green_bits - 5](src, dst, stream); Gray2RGB5x5_callers[green_bits - 5](src, dst, stream);
} }
}}}
///////////////////////////////// Color to Grayscale //////////////////////////////// ///////////////////////////////// Color to Grayscale ////////////////////////////////
namespace color_krnls
{
#undef R2Y #undef R2Y
#undef G2Y #undef G2Y
#undef B2Y #undef B2Y
...@@ -541,10 +520,7 @@ namespace color_krnls ...@@ -541,10 +520,7 @@ namespace color_krnls
*(T*)(dst_ + y * dst_step + x * sizeof(T)) = RGB2GrayConvertor<T>::cvt(&src.x, bidx); *(T*)(dst_ + y * dst_step + x * sizeof(T)) = RGB2GrayConvertor<T>::cvt(&src.x, bidx);
} }
} }
}
namespace cv { namespace gpu { namespace color
{
template <typename T, int SRCCN> template <typename T, int SRCCN>
void RGB2Gray_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream) void RGB2Gray_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
{ {
...@@ -554,8 +530,8 @@ namespace cv { namespace gpu { namespace color ...@@ -554,8 +530,8 @@ namespace cv { namespace gpu { namespace color
grid.x = divUp(src.cols, threads.x); grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y); grid.y = divUp(src.rows, threads.y);
color_krnls::RGB2Gray<SRCCN, T><<<grid, threads, 0, stream>>>(src.ptr, src.step, RGB2Gray<SRCCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.ptr, dst.step, src.rows, src.cols, bidx); dst.data, dst.step, src.rows, src.cols, bidx);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -594,8 +570,8 @@ namespace cv { namespace gpu { namespace color ...@@ -594,8 +570,8 @@ namespace cv { namespace gpu { namespace color
grid.x = divUp(src.cols, threads.x); grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y); grid.y = divUp(src.rows, threads.y);
color_krnls::RGB5x52Gray<GREEN_BITS><<<grid, threads, 0, stream>>>(src.ptr, src.step, RGB5x52Gray<GREEN_BITS><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.ptr, dst.step, src.rows, src.cols); dst.data, dst.step, src.rows, src.cols);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -611,12 +587,9 @@ namespace cv { namespace gpu { namespace color ...@@ -611,12 +587,9 @@ namespace cv { namespace gpu { namespace color
RGB5x52Gray_callers[green_bits - 5](src, dst, stream); RGB5x52Gray_callers[green_bits - 5](src, dst, stream);
} }
}}}
///////////////////////////////////// RGB <-> YCrCb ////////////////////////////////////// ///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
namespace color_krnls
{
__constant__ float cYCrCbCoeffs_f[5]; __constant__ float cYCrCbCoeffs_f[5];
__constant__ int cYCrCbCoeffs_i[5]; __constant__ int cYCrCbCoeffs_i[5];
...@@ -712,10 +685,7 @@ namespace color_krnls ...@@ -712,10 +685,7 @@ namespace color_krnls
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst; *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
} }
} }
}
namespace cv { namespace gpu { namespace color
{
template <typename T, int SRCCN, int DSTCN> template <typename T, int SRCCN, int DSTCN>
void RGB2YCrCb_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream) void RGB2YCrCb_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
{ {
...@@ -725,8 +695,8 @@ namespace cv { namespace gpu { namespace color ...@@ -725,8 +695,8 @@ namespace cv { namespace gpu { namespace color
grid.x = divUp(src.cols, threads.x); grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y); grid.y = divUp(src.rows, threads.y);
color_krnls::RGB2YCrCb<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr, src.step, RGB2YCrCb<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.ptr, dst.step, src.rows, src.cols, bidx); dst.data, dst.step, src.rows, src.cols, bidx);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -741,7 +711,7 @@ namespace cv { namespace gpu { namespace color ...@@ -741,7 +711,7 @@ namespace cv { namespace gpu { namespace color
{RGB2YCrCb_caller<uchar, 4, 3>, RGB2YCrCb_caller<uchar, 4, 4>} {RGB2YCrCb_caller<uchar, 4, 3>, RGB2YCrCb_caller<uchar, 4, 4>}
}; };
cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) ); cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) );
RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream); RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
} }
...@@ -755,7 +725,7 @@ namespace cv { namespace gpu { namespace color ...@@ -755,7 +725,7 @@ namespace cv { namespace gpu { namespace color
{RGB2YCrCb_caller<ushort, 4, 3>, RGB2YCrCb_caller<ushort, 4, 4>} {RGB2YCrCb_caller<ushort, 4, 3>, RGB2YCrCb_caller<ushort, 4, 4>}
}; };
cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) ); cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) );
RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream); RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
} }
...@@ -769,7 +739,7 @@ namespace cv { namespace gpu { namespace color ...@@ -769,7 +739,7 @@ namespace cv { namespace gpu { namespace color
{RGB2YCrCb_caller<float, 4, 3>, RGB2YCrCb_caller<float, 4, 4>} {RGB2YCrCb_caller<float, 4, 3>, RGB2YCrCb_caller<float, 4, 4>}
}; };
cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cYCrCbCoeffs_f, coeffs, 5 * sizeof(float)) ); cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_f, coeffs, 5 * sizeof(float)) );
RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream); RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
} }
...@@ -783,8 +753,8 @@ namespace cv { namespace gpu { namespace color ...@@ -783,8 +753,8 @@ namespace cv { namespace gpu { namespace color
grid.x = divUp(src.cols, threads.x); grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y); grid.y = divUp(src.rows, threads.y);
color_krnls::YCrCb2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr, src.step, YCrCb2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.ptr, dst.step, src.rows, src.cols, bidx); dst.data, dst.step, src.rows, src.cols, bidx);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -799,7 +769,7 @@ namespace cv { namespace gpu { namespace color ...@@ -799,7 +769,7 @@ namespace cv { namespace gpu { namespace color
{YCrCb2RGB_caller<uchar, 4, 3>, YCrCb2RGB_caller<uchar, 4, 4>} {YCrCb2RGB_caller<uchar, 4, 3>, YCrCb2RGB_caller<uchar, 4, 4>}
}; };
cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) ); cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) );
YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream); YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
} }
...@@ -813,7 +783,7 @@ namespace cv { namespace gpu { namespace color ...@@ -813,7 +783,7 @@ namespace cv { namespace gpu { namespace color
{YCrCb2RGB_caller<ushort, 4, 3>, YCrCb2RGB_caller<ushort, 4, 4>} {YCrCb2RGB_caller<ushort, 4, 3>, YCrCb2RGB_caller<ushort, 4, 4>}
}; };
cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) ); cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) );
YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream); YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
} }
...@@ -827,16 +797,13 @@ namespace cv { namespace gpu { namespace color ...@@ -827,16 +797,13 @@ namespace cv { namespace gpu { namespace color
{YCrCb2RGB_caller<float, 4, 3>, YCrCb2RGB_caller<float, 4, 4>} {YCrCb2RGB_caller<float, 4, 3>, YCrCb2RGB_caller<float, 4, 4>}
}; };
cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cYCrCbCoeffs_f, coeffs, 4 * sizeof(float)) ); cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_f, coeffs, 4 * sizeof(float)) );
YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream); YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
} }
}}}
////////////////////////////////////// RGB <-> XYZ /////////////////////////////////////// ////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////
namespace color_krnls
{
__constant__ float cXYZ_D65f[9]; __constant__ float cXYZ_D65f[9];
__constant__ int cXYZ_D65i[9]; __constant__ int cXYZ_D65i[9];
...@@ -922,10 +889,7 @@ namespace color_krnls ...@@ -922,10 +889,7 @@ namespace color_krnls
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst; *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
} }
} }
}
namespace cv { namespace gpu { namespace color
{
template <typename T, int SRCCN, int DSTCN> template <typename T, int SRCCN, int DSTCN>
void RGB2XYZ_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream) void RGB2XYZ_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
{ {
...@@ -935,8 +899,8 @@ namespace cv { namespace gpu { namespace color ...@@ -935,8 +899,8 @@ namespace cv { namespace gpu { namespace color
grid.x = divUp(src.cols, threads.x); grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y); grid.y = divUp(src.rows, threads.y);
color_krnls::RGB2XYZ<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr, src.step, RGB2XYZ<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.ptr, dst.step, src.rows, src.cols); dst.data, dst.step, src.rows, src.cols);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -951,7 +915,7 @@ namespace cv { namespace gpu { namespace color ...@@ -951,7 +915,7 @@ namespace cv { namespace gpu { namespace color
{RGB2XYZ_caller<uchar, 4, 3>, RGB2XYZ_caller<uchar, 4, 4>} {RGB2XYZ_caller<uchar, 4, 3>, RGB2XYZ_caller<uchar, 4, 4>}
}; };
cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) );
RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream); RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream);
} }
...@@ -965,7 +929,7 @@ namespace cv { namespace gpu { namespace color ...@@ -965,7 +929,7 @@ namespace cv { namespace gpu { namespace color
{RGB2XYZ_caller<ushort, 4, 3>, RGB2XYZ_caller<ushort, 4, 4>} {RGB2XYZ_caller<ushort, 4, 3>, RGB2XYZ_caller<ushort, 4, 4>}
}; };
cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) );
RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream); RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream);
} }
...@@ -979,7 +943,7 @@ namespace cv { namespace gpu { namespace color ...@@ -979,7 +943,7 @@ namespace cv { namespace gpu { namespace color
{RGB2XYZ_caller<float, 4, 3>, RGB2XYZ_caller<float, 4, 4>} {RGB2XYZ_caller<float, 4, 3>, RGB2XYZ_caller<float, 4, 4>}
}; };
cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cXYZ_D65f, coeffs, 9 * sizeof(float)) ); cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65f, coeffs, 9 * sizeof(float)) );
RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream); RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream);
} }
...@@ -993,8 +957,8 @@ namespace cv { namespace gpu { namespace color ...@@ -993,8 +957,8 @@ namespace cv { namespace gpu { namespace color
grid.x = divUp(src.cols, threads.x); grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y); grid.y = divUp(src.rows, threads.y);
color_krnls::XYZ2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr, src.step, XYZ2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.ptr, dst.step, src.rows, src.cols); dst.data, dst.step, src.rows, src.cols);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -1009,7 +973,7 @@ namespace cv { namespace gpu { namespace color ...@@ -1009,7 +973,7 @@ namespace cv { namespace gpu { namespace color
{XYZ2RGB_caller<uchar, 4, 3>, XYZ2RGB_caller<uchar, 4, 4>} {XYZ2RGB_caller<uchar, 4, 3>, XYZ2RGB_caller<uchar, 4, 4>}
}; };
cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) );
XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream); XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream);
} }
...@@ -1023,7 +987,7 @@ namespace cv { namespace gpu { namespace color ...@@ -1023,7 +987,7 @@ namespace cv { namespace gpu { namespace color
{XYZ2RGB_caller<ushort, 4, 3>, XYZ2RGB_caller<ushort, 4, 4>} {XYZ2RGB_caller<ushort, 4, 3>, XYZ2RGB_caller<ushort, 4, 4>}
}; };
cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cXYZ_D65i, coeffs, 9 * sizeof(int)) ); cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) );
XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream); XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream);
} }
...@@ -1037,16 +1001,13 @@ namespace cv { namespace gpu { namespace color ...@@ -1037,16 +1001,13 @@ namespace cv { namespace gpu { namespace color
{XYZ2RGB_caller<float, 4, 3>, XYZ2RGB_caller<float, 4, 4>} {XYZ2RGB_caller<float, 4, 3>, XYZ2RGB_caller<float, 4, 4>}
}; };
cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cXYZ_D65f, coeffs, 9 * sizeof(float)) ); cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65f, coeffs, 9 * sizeof(float)) );
XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream); XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream);
} }
}}}
////////////////////////////////////// RGB <-> HSV /////////////////////////////////////// ////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////
namespace color_krnls
{
__constant__ int cHsvDivTable[256]; __constant__ int cHsvDivTable[256];
template<typename T, int HR> struct RGB2HSVConvertor; template<typename T, int HR> struct RGB2HSVConvertor;
...@@ -1220,10 +1181,7 @@ namespace color_krnls ...@@ -1220,10 +1181,7 @@ namespace color_krnls
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst; *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
} }
} }
}
namespace cv { namespace gpu { namespace color
{
template <typename T, int SRCCN, int DSTCN> template <typename T, int SRCCN, int DSTCN>
void RGB2HSV_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream) void RGB2HSV_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream)
{ {
...@@ -1234,11 +1192,11 @@ namespace cv { namespace gpu { namespace color ...@@ -1234,11 +1192,11 @@ namespace cv { namespace gpu { namespace color
grid.y = divUp(src.rows, threads.y); grid.y = divUp(src.rows, threads.y);
if (hrange == 180) if (hrange == 180)
color_krnls::RGB2HSV<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.ptr, src.step, RGB2HSV<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.ptr, dst.step, src.rows, src.cols, bidx); dst.data, dst.step, src.rows, src.cols, bidx);
else else
color_krnls::RGB2HSV<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.ptr, src.step, RGB2HSV<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.ptr, dst.step, src.rows, src.cols, bidx); dst.data, dst.step, src.rows, src.cols, bidx);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -1288,7 +1246,7 @@ namespace cv { namespace gpu { namespace color ...@@ -1288,7 +1246,7 @@ namespace cv { namespace gpu { namespace color
4352, 4334, 4316, 4298, 4281, 4263, 4246, 4229, 4352, 4334, 4316, 4298, 4281, 4263, 4246, 4229,
4212, 4195, 4178, 4161, 4145, 4128, 4112, 4096 4212, 4195, 4178, 4161, 4145, 4128, 4112, 4096
}; };
cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cHsvDivTable, div_table, sizeof(div_table)) ); cudaSafeCall( cudaMemcpyToSymbol(cHsvDivTable, div_table, sizeof(div_table)) );
RGB2HSV_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); RGB2HSV_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
} }
...@@ -1316,11 +1274,11 @@ namespace cv { namespace gpu { namespace color ...@@ -1316,11 +1274,11 @@ namespace cv { namespace gpu { namespace color
grid.y = divUp(src.rows, threads.y); grid.y = divUp(src.rows, threads.y);
if (hrange == 180) if (hrange == 180)
color_krnls::HSV2RGB<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.ptr, src.step, HSV2RGB<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.ptr, dst.step, src.rows, src.cols, bidx); dst.data, dst.step, src.rows, src.cols, bidx);
else else
color_krnls::HSV2RGB<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.ptr, src.step, HSV2RGB<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.ptr, dst.step, src.rows, src.cols, bidx); dst.data, dst.step, src.rows, src.cols, bidx);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -1338,7 +1296,7 @@ namespace cv { namespace gpu { namespace color ...@@ -1338,7 +1296,7 @@ namespace cv { namespace gpu { namespace color
static const int sector_data[][3] = static const int sector_data[][3] =
{{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cHsvSectorData, sector_data, sizeof(sector_data)) ); cudaSafeCall( cudaMemcpyToSymbol(cHsvSectorData, sector_data, sizeof(sector_data)) );
HSV2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); HSV2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
} }
...@@ -1355,16 +1313,13 @@ namespace cv { namespace gpu { namespace color ...@@ -1355,16 +1313,13 @@ namespace cv { namespace gpu { namespace color
static const int sector_data[][3] = static const int sector_data[][3] =
{{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cHsvSectorData, sector_data, sizeof(sector_data)) ); cudaSafeCall( cudaMemcpyToSymbol(cHsvSectorData, sector_data, sizeof(sector_data)) );
HSV2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); HSV2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
} }
}}}
/////////////////////////////////////// RGB <-> HLS //////////////////////////////////////// /////////////////////////////////////// RGB <-> HLS ////////////////////////////////////////
namespace color_krnls
{
template<typename T, int HR> struct RGB2HLSConvertor; template<typename T, int HR> struct RGB2HLSConvertor;
template<int HR> struct RGB2HLSConvertor<float, HR> template<int HR> struct RGB2HLSConvertor<float, HR>
{ {
...@@ -1532,10 +1487,7 @@ namespace color_krnls ...@@ -1532,10 +1487,7 @@ namespace color_krnls
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst; *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
} }
} }
}
namespace cv { namespace gpu { namespace color
{
template <typename T, int SRCCN, int DSTCN> template <typename T, int SRCCN, int DSTCN>
void RGB2HLS_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream) void RGB2HLS_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream)
{ {
...@@ -1546,11 +1498,11 @@ namespace cv { namespace gpu { namespace color ...@@ -1546,11 +1498,11 @@ namespace cv { namespace gpu { namespace color
grid.y = divUp(src.rows, threads.y); grid.y = divUp(src.rows, threads.y);
if (hrange == 180) if (hrange == 180)
color_krnls::RGB2HLS<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.ptr, src.step, RGB2HLS<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.ptr, dst.step, src.rows, src.cols, bidx); dst.data, dst.step, src.rows, src.cols, bidx);
else else
color_krnls::RGB2HLS<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.ptr, src.step, RGB2HLS<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.ptr, dst.step, src.rows, src.cols, bidx); dst.data, dst.step, src.rows, src.cols, bidx);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -1591,11 +1543,11 @@ namespace cv { namespace gpu { namespace color ...@@ -1591,11 +1543,11 @@ namespace cv { namespace gpu { namespace color
grid.y = divUp(src.rows, threads.y); grid.y = divUp(src.rows, threads.y);
if (hrange == 180) if (hrange == 180)
color_krnls::HLS2RGB<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.ptr, src.step, HLS2RGB<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.ptr, dst.step, src.rows, src.cols, bidx); dst.data, dst.step, src.rows, src.cols, bidx);
else else
color_krnls::HLS2RGB<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.ptr, src.step, HLS2RGB<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.ptr, dst.step, src.rows, src.cols, bidx); dst.data, dst.step, src.rows, src.cols, bidx);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -1613,7 +1565,7 @@ namespace cv { namespace gpu { namespace color ...@@ -1613,7 +1565,7 @@ namespace cv { namespace gpu { namespace color
static const int sector_data[][3]= static const int sector_data[][3]=
{{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cHlsSectorData, sector_data, sizeof(sector_data)) ); cudaSafeCall( cudaMemcpyToSymbol(cHlsSectorData, sector_data, sizeof(sector_data)) );
HLS2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); HLS2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
} }
...@@ -1630,7 +1582,7 @@ namespace cv { namespace gpu { namespace color ...@@ -1630,7 +1582,7 @@ namespace cv { namespace gpu { namespace color
static const int sector_data[][3]= static const int sector_data[][3]=
{{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}}; {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
cudaSafeCall( cudaMemcpyToSymbol(color_krnls::cHlsSectorData, sector_data, sizeof(sector_data)) ); cudaSafeCall( cudaMemcpyToSymbol(cHlsSectorData, sector_data, sizeof(sector_data)) );
HLS2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream); HLS2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
} }
......
...@@ -54,8 +54,9 @@ using namespace cv::gpu; ...@@ -54,8 +54,9 @@ using namespace cv::gpu;
#define SHRT_MAX 32767 #define SHRT_MAX 32767
#endif #endif
namespace csbp_krnls namespace cv { namespace gpu { namespace csbp
{ {
template <typename T> struct TypeLimits; template <typename T> struct TypeLimits;
template <> struct TypeLimits<short> template <> struct TypeLimits<short>
{ {
...@@ -65,14 +66,11 @@ namespace csbp_krnls ...@@ -65,14 +66,11 @@ namespace csbp_krnls
{ {
static __device__ float max() {return FLT_MAX;} static __device__ float max() {return FLT_MAX;}
}; };
}
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
/////////////////////// load constants //////////////////////// /////////////////////// load constants ////////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
namespace csbp_krnls
{
__constant__ int cndisp; __constant__ int cndisp;
__constant__ float cmax_data_term; __constant__ float cmax_data_term;
...@@ -91,36 +89,30 @@ namespace csbp_krnls ...@@ -91,36 +89,30 @@ namespace csbp_krnls
__constant__ uchar* cleft; __constant__ uchar* cleft;
__constant__ uchar* cright; __constant__ uchar* cright;
__constant__ uchar* ctemp; __constant__ uchar* ctemp;
}
namespace cv { namespace gpu { namespace csbp
{
void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th, void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,
const DevMem2D& left, const DevMem2D& right, const DevMem2D& temp) const DevMem2D& left, const DevMem2D& right, const DevMem2D& temp)
{ {
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cndisp, &ndisp, sizeof(int)) ); cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cmax_data_term, &max_data_term, sizeof(float)) ); cudaSafeCall( cudaMemcpyToSymbol(cdata_weight, &data_weight, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cdata_weight, &data_weight, sizeof(float)) ); cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term, &max_disc_term, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cmax_disc_term, &max_disc_term, sizeof(float)) ); cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cdisc_single_jump, &disc_single_jump, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cth, &min_disp_th, sizeof(int)) ); cudaSafeCall( cudaMemcpyToSymbol(cth, &min_disp_th, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cimg_step, &left.step, sizeof(size_t)) ); cudaSafeCall( cudaMemcpyToSymbol(cimg_step, &left.step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cleft, &left.ptr, sizeof(left.ptr)) ); cudaSafeCall( cudaMemcpyToSymbol(cleft, &left.data, sizeof(left.data)) );
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cright, &right.ptr, sizeof(right.ptr)) ); cudaSafeCall( cudaMemcpyToSymbol(cright, &right.data, sizeof(right.data)) );
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::ctemp, &temp.ptr, sizeof(temp.ptr)) ); cudaSafeCall( cudaMemcpyToSymbol(ctemp, &temp.data, sizeof(temp.data)) );
} }
}}}
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
/////////////////////// init data cost //////////////////////// /////////////////////// init data cost ////////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
namespace csbp_krnls
{
template <int channels> template <int channels>
struct DataCostPerPixel struct DataCostPerPixel
{ {
...@@ -334,10 +326,8 @@ namespace csbp_krnls ...@@ -334,10 +326,8 @@ namespace csbp_krnls
data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]); data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);
} }
} }
}
namespace cv { namespace gpu { namespace csbp
{
template <typename T> template <typename T>
void init_data_cost_caller_(int /*rows*/, int /*cols*/, int h, int w, int level, int /*ndisp*/, int channels, cudaStream_t stream) void init_data_cost_caller_(int /*rows*/, int /*cols*/, int h, int w, int level, int /*ndisp*/, int channels, cudaStream_t stream)
{ {
...@@ -349,8 +339,8 @@ namespace cv { namespace gpu { namespace csbp ...@@ -349,8 +339,8 @@ namespace cv { namespace gpu { namespace csbp
switch (channels) switch (channels)
{ {
case 1: csbp_krnls::init_data_cost<T, 1><<<grid, threads, 0, stream>>>(h, w, level); break; case 1: init_data_cost<T, 1><<<grid, threads, 0, stream>>>(h, w, level); break;
case 3: csbp_krnls::init_data_cost<T, 3><<<grid, threads, 0, stream>>>(h, w, level); break; case 3: init_data_cost<T, 3><<<grid, threads, 0, stream>>>(h, w, level); break;
default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
} }
} }
...@@ -367,8 +357,8 @@ namespace cv { namespace gpu { namespace csbp ...@@ -367,8 +357,8 @@ namespace cv { namespace gpu { namespace csbp
switch (channels) switch (channels)
{ {
case 1: csbp_krnls::init_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break; case 1: init_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
case 3: csbp_krnls::init_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break; case 3: init_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
} }
} }
...@@ -388,8 +378,8 @@ namespace cv { namespace gpu { namespace csbp ...@@ -388,8 +378,8 @@ namespace cv { namespace gpu { namespace csbp
}; };
size_t disp_step = msg_step * h; size_t disp_step = msg_step * h;
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cdisp_step1, &disp_step, sizeof(size_t)) ); cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cmsg_step1, &msg_step, sizeof(size_t)) ); cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step, sizeof(size_t)) );
init_data_cost_callers[level](rows, cols, h, w, level, ndisp, channels, stream); init_data_cost_callers[level](rows, cols, h, w, level, ndisp, channels, stream);
if (stream == 0) if (stream == 0)
...@@ -402,9 +392,9 @@ namespace cv { namespace gpu { namespace csbp ...@@ -402,9 +392,9 @@ namespace cv { namespace gpu { namespace csbp
grid.y = divUp(h, threads.y); grid.y = divUp(h, threads.y);
if (use_local_init_data_cost == true) if (use_local_init_data_cost == true)
csbp_krnls::get_first_k_initial_local<<<grid, threads, 0, stream>>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane); get_first_k_initial_local<<<grid, threads, 0, stream>>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane);
else else
csbp_krnls::get_first_k_initial_global<<<grid, threads, 0, stream>>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane); get_first_k_initial_global<<<grid, threads, 0, stream>>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
} }
...@@ -421,14 +411,10 @@ namespace cv { namespace gpu { namespace csbp ...@@ -421,14 +411,10 @@ namespace cv { namespace gpu { namespace csbp
init_data_cost_tmpl(rows, cols, disp_selected_pyr, data_cost_selected, msg_step, h, w, level, nr_plane, ndisp, channels, use_local_init_data_cost, stream); init_data_cost_tmpl(rows, cols, disp_selected_pyr, data_cost_selected, msg_step, h, w, level, nr_plane, ndisp, channels, use_local_init_data_cost, stream);
} }
}}}
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
////////////////////// compute data cost ////////////////////// ////////////////////// compute data cost //////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
namespace csbp_krnls
{
template <typename T, int channels> template <typename T, int channels>
__global__ void compute_data_cost(const T* selected_disp_pyr, T* data_cost_, int h, int w, int level, int nr_plane) __global__ void compute_data_cost(const T* selected_disp_pyr, T* data_cost_, int h, int w, int level, int nr_plane)
{ {
...@@ -536,10 +522,7 @@ namespace csbp_krnls ...@@ -536,10 +522,7 @@ namespace csbp_krnls
data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]); data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);
} }
} }
}
namespace cv { namespace gpu { namespace csbp
{
template <typename T> template <typename T>
void compute_data_cost_caller_(const T* disp_selected_pyr, T* data_cost, int /*rows*/, int /*cols*/, void compute_data_cost_caller_(const T* disp_selected_pyr, T* data_cost, int /*rows*/, int /*cols*/,
int h, int w, int level, int nr_plane, int channels, cudaStream_t stream) int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)
...@@ -552,8 +535,8 @@ namespace cv { namespace gpu { namespace csbp ...@@ -552,8 +535,8 @@ namespace cv { namespace gpu { namespace csbp
switch(channels) switch(channels)
{ {
case 1: csbp_krnls::compute_data_cost<T, 1><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break; case 1: compute_data_cost<T, 1><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
case 3: csbp_krnls::compute_data_cost<T, 3><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break; case 3: compute_data_cost<T, 3><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
} }
} }
...@@ -571,13 +554,12 @@ namespace cv { namespace gpu { namespace csbp ...@@ -571,13 +554,12 @@ namespace cv { namespace gpu { namespace csbp
switch (channels) switch (channels)
{ {
case 1: csbp_krnls::compute_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break; case 1: compute_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
case 3: csbp_krnls::compute_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break; case 3: compute_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
} }
} }
template<class T> template<class T>
void compute_data_cost_tmpl(const T* disp_selected_pyr, T* data_cost, size_t msg_step1, size_t msg_step2, void compute_data_cost_tmpl(const T* disp_selected_pyr, T* data_cost, size_t msg_step1, size_t msg_step2,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream) int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream)
...@@ -594,10 +576,10 @@ namespace cv { namespace gpu { namespace csbp ...@@ -594,10 +576,10 @@ namespace cv { namespace gpu { namespace csbp
size_t disp_step1 = msg_step1 * h; size_t disp_step1 = msg_step1 * h;
size_t disp_step2 = msg_step2 * h2; size_t disp_step2 = msg_step2 * h2;
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cdisp_step1, &disp_step1, sizeof(size_t)) ); cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cdisp_step2, &disp_step2, sizeof(size_t)) ); cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cmsg_step1, &msg_step1, sizeof(size_t)) ); cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step1, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cmsg_step2, &msg_step2, sizeof(size_t)) ); cudaSafeCall( cudaMemcpyToSymbol(cmsg_step2, &msg_step2, sizeof(size_t)) );
callers[level](disp_selected_pyr, data_cost, rows, cols, h, w, level, nr_plane, channels, stream); callers[level](disp_selected_pyr, data_cost, rows, cols, h, w, level, nr_plane, channels, stream);
...@@ -616,15 +598,12 @@ namespace cv { namespace gpu { namespace csbp ...@@ -616,15 +598,12 @@ namespace cv { namespace gpu { namespace csbp
compute_data_cost_tmpl(disp_selected_pyr, data_cost, msg_step1, msg_step2, rows, cols, h, w, h2, level, nr_plane, channels, stream); compute_data_cost_tmpl(disp_selected_pyr, data_cost, msg_step1, msg_step2, rows, cols, h, w, h2, level, nr_plane, channels, stream);
} }
}}}
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
//////////////////////// init message ///////////////////////// //////////////////////// init message /////////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
namespace csbp_krnls
{ template <typename T>
template <typename T>
__device__ void get_first_k_element_increase(T* u_new, T* d_new, T* l_new, T* r_new, __device__ void get_first_k_element_increase(T* u_new, T* d_new, T* l_new, T* r_new,
const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur, const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
T* data_cost_selected, T* disparity_selected_new, T* data_cost_new, T* data_cost_selected, T* disparity_selected_new, T* data_cost_new,
...@@ -705,10 +684,8 @@ namespace csbp_krnls ...@@ -705,10 +684,8 @@ namespace csbp_krnls
data_cost, disparity_selected_cur, nr_plane, nr_plane2); data_cost, disparity_selected_cur, nr_plane, nr_plane2);
} }
} }
}
namespace cv { namespace gpu { namespace csbp
{
template<class T> template<class T>
void init_message_tmpl(T* u_new, T* d_new, T* l_new, T* r_new, void init_message_tmpl(T* u_new, T* d_new, T* l_new, T* r_new,
const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur, const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
...@@ -719,10 +696,10 @@ namespace cv { namespace gpu { namespace csbp ...@@ -719,10 +696,10 @@ namespace cv { namespace gpu { namespace csbp
size_t disp_step1 = msg_step1 * h; size_t disp_step1 = msg_step1 * h;
size_t disp_step2 = msg_step2 * h2; size_t disp_step2 = msg_step2 * h2;
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cdisp_step1, &disp_step1, sizeof(size_t)) ); cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cdisp_step2, &disp_step2, sizeof(size_t)) ); cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cmsg_step1, &msg_step1, sizeof(size_t)) ); cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step1, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cmsg_step2, &msg_step2, sizeof(size_t)) ); cudaSafeCall( cudaMemcpyToSymbol(cmsg_step2, &msg_step2, sizeof(size_t)) );
dim3 threads(32, 8, 1); dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
...@@ -730,7 +707,7 @@ namespace cv { namespace gpu { namespace csbp ...@@ -730,7 +707,7 @@ namespace cv { namespace gpu { namespace csbp
grid.x = divUp(w, threads.x); grid.x = divUp(w, threads.x);
grid.y = divUp(h, threads.y); grid.y = divUp(h, threads.y);
csbp_krnls::init_message<<<grid, threads, 0, stream>>>(u_new, d_new, l_new, r_new, init_message<<<grid, threads, 0, stream>>>(u_new, d_new, l_new, r_new,
u_cur, d_cur, l_cur, r_cur, u_cur, d_cur, l_cur, r_cur,
selected_disp_pyr_new, selected_disp_pyr_cur, selected_disp_pyr_new, selected_disp_pyr_cur,
data_cost_selected, data_cost, data_cost_selected, data_cost,
...@@ -761,14 +738,11 @@ namespace cv { namespace gpu { namespace csbp ...@@ -761,14 +738,11 @@ namespace cv { namespace gpu { namespace csbp
selected_disp_pyr_new, selected_disp_pyr_cur, data_cost_selected, data_cost, msg_step1, msg_step2, selected_disp_pyr_new, selected_disp_pyr_cur, data_cost_selected, data_cost, msg_step1, msg_step2,
h, w, nr_plane, h2, w2, nr_plane2, stream); h, w, nr_plane, h2, w2, nr_plane2, stream);
} }
}}}
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
//////////////////// calc all iterations ///////////////////// //////////////////// calc all iterations /////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
namespace csbp_krnls
{
template <typename T> template <typename T>
__device__ void message_per_pixel(const T* data, T* msg_dst, const T* msg1, const T* msg2, const T* msg3, __device__ void message_per_pixel(const T* data, T* msg_dst, const T* msg1, const T* msg2, const T* msg3,
const T* dst_disp, const T* src_disp, int nr_plane, T* temp) const T* dst_disp, const T* src_disp, int nr_plane, T* temp)
...@@ -829,17 +803,15 @@ namespace csbp_krnls ...@@ -829,17 +803,15 @@ namespace csbp_krnls
message_per_pixel(data, r, u + cmsg_step1, d - cmsg_step1, r - 1, disp, disp + 1, nr_plane, temp); message_per_pixel(data, r, u + cmsg_step1, d - cmsg_step1, r - 1, disp, disp + 1, nr_plane, temp);
} }
} }
}
namespace cv { namespace gpu { namespace csbp
{
template<class T> template<class T>
void calc_all_iterations_tmpl(T* u, T* d, T* l, T* r, const T* data_cost_selected, void calc_all_iterations_tmpl(T* u, T* d, T* l, T* r, const T* data_cost_selected,
const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream) const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream)
{ {
size_t disp_step = msg_step * h; size_t disp_step = msg_step * h;
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cdisp_step1, &disp_step, sizeof(size_t)) ); cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cmsg_step1, &msg_step, sizeof(size_t)) ); cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step, sizeof(size_t)) );
dim3 threads(32, 8, 1); dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
...@@ -849,7 +821,7 @@ namespace cv { namespace gpu { namespace csbp ...@@ -849,7 +821,7 @@ namespace cv { namespace gpu { namespace csbp
for(int t = 0; t < iters; ++t) for(int t = 0; t < iters; ++t)
{ {
csbp_krnls::compute_message<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, t & 1); compute_message<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, t & 1);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -868,14 +840,12 @@ namespace cv { namespace gpu { namespace csbp ...@@ -868,14 +840,12 @@ namespace cv { namespace gpu { namespace csbp
calc_all_iterations_tmpl(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, msg_step, h, w, nr_plane, iters, stream); calc_all_iterations_tmpl(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, msg_step, h, w, nr_plane, iters, stream);
} }
}}}
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
/////////////////////////// output //////////////////////////// /////////////////////////// output ////////////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
namespace csbp_krnls
{
template <typename T> template <typename T>
__global__ void compute_disp(const T* u_, const T* d_, const T* l_, const T* r_, __global__ void compute_disp(const T* u_, const T* d_, const T* l_, const T* r_,
const T* data_cost_selected, const T* disp_selected_pyr, const T* data_cost_selected, const T* disp_selected_pyr,
...@@ -910,17 +880,15 @@ namespace csbp_krnls ...@@ -910,17 +880,15 @@ namespace csbp_krnls
disp[res_step * y + x] = best; disp[res_step * y + x] = best;
} }
} }
}
namespace cv { namespace gpu { namespace csbp
{
template<class T> template<class T>
void compute_disp_tmpl(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step, void compute_disp_tmpl(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,
const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream) const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream)
{ {
size_t disp_step = disp.rows * msg_step; size_t disp_step = disp.rows * msg_step;
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cdisp_step1, &disp_step, sizeof(size_t)) ); cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(csbp_krnls::cmsg_step1, &msg_step, sizeof(size_t)) ); cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step, sizeof(size_t)) );
dim3 threads(32, 8, 1); dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
...@@ -928,8 +896,8 @@ namespace cv { namespace gpu { namespace csbp ...@@ -928,8 +896,8 @@ namespace cv { namespace gpu { namespace csbp
grid.x = divUp(disp.cols, threads.x); grid.x = divUp(disp.cols, threads.x);
grid.y = divUp(disp.rows, threads.y); grid.y = divUp(disp.rows, threads.y);
csbp_krnls::compute_disp<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, disp_selected, compute_disp<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, disp_selected,
disp.ptr, disp.step / disp.elemSize(), disp.cols, disp.rows, nr_plane); disp.data, disp.step / disp.elemSize(), disp.cols, disp.rows, nr_plane);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
} }
......
...@@ -54,9 +54,9 @@ namespace cv ...@@ -54,9 +54,9 @@ namespace cv
typedef unsigned char uchar; typedef unsigned char uchar;
typedef signed char schar; typedef signed char schar;
typedef unsigned short ushort; typedef unsigned short ushort;
typedef unsigned int uint; typedef unsigned int uint;
static inline int divUp(int a, int b) { return (a % b == 0) ? a/b : a/b + 1; } static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }
template<class T> template<class T>
static inline void uploadConstant(const char* name, const T& value) { cudaSafeCall( cudaMemcpyToSymbol(name, &value, sizeof(T)) ); } static inline void uploadConstant(const char* name, const T& value) { cudaSafeCall( cudaMemcpyToSymbol(name, &value, sizeof(T)) ); }
......
...@@ -128,8 +128,8 @@ namespace cv { namespace gpu { namespace filters ...@@ -128,8 +128,8 @@ namespace cv { namespace gpu { namespace filters
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y)); dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
filter_krnls::linearRowFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE, CN><<<blocks, threads>>>(src.ptr, src.elem_step, filter_krnls::linearRowFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE, CN><<<blocks, threads>>>(src.data, src.step/src.elemSize(),
dst.ptr, dst.elem_step, anchor, src.cols, src.rows); dst.data, dst.step/dst.elemSize(), anchor, src.cols, src.rows);
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
} }
...@@ -152,10 +152,12 @@ namespace cv { namespace gpu { namespace filters ...@@ -152,10 +152,12 @@ namespace cv { namespace gpu { namespace filters
callers[ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor); callers[ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);
} }
void linearRowFilter_gpu_8u_8u_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) template void linearRowFilter_gpu<4, uchar4, uchar4>(const DevMem2D&, const DevMem2D&, const float[], int , int);
/* void linearRowFilter_gpu_8u_8u_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
{ {
linearRowFilter_gpu<4, uchar4, uchar4>(src, dst, kernel, ksize, anchor); linearRowFilter_gpu<4, uchar4, uchar4>(src, dst, kernel, ksize, anchor);
} }*/
void linearRowFilter_gpu_8u_8s_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor) void linearRowFilter_gpu_8u_8s_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
{ {
linearRowFilter_gpu<4, uchar4, char4>(src, dst, kernel, ksize, anchor); linearRowFilter_gpu<4, uchar4, char4>(src, dst, kernel, ksize, anchor);
...@@ -262,8 +264,8 @@ namespace cv { namespace gpu { namespace filters ...@@ -262,8 +264,8 @@ namespace cv { namespace gpu { namespace filters
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y)); dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
filter_krnls::linearColumnFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE, CN><<<blocks, threads>>>(src.ptr, src.elem_step, filter_krnls::linearColumnFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE, CN><<<blocks, threads>>>(src.data, src.step/src.elemSize(),
dst.ptr, dst.elem_step, anchor, src.cols, src.rows); dst.data, dst.step/dst.elemSize(), anchor, src.cols, src.rows);
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
} }
...@@ -357,7 +359,7 @@ namespace cv { namespace gpu { namespace bf ...@@ -357,7 +359,7 @@ namespace cv { namespace gpu { namespace bf
void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc) void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc)
{ {
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_color, &table_color, sizeof(table_color)) ); cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_color, &table_color, sizeof(table_color)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.ptr, sizeof(table_space.ptr)) ); cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.data, sizeof(table_space.data)) );
size_t table_space_step = table_space.step / sizeof(float); size_t table_space_step = table_space.step / sizeof(float);
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) ); cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) );
...@@ -491,15 +493,15 @@ namespace cv { namespace gpu { namespace bf ...@@ -491,15 +493,15 @@ namespace cv { namespace gpu { namespace bf
case 1: case 1:
for (int i = 0; i < iters; ++i) for (int i = 0; i < iters; ++i)
{ {
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols); bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols); bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
} }
break; break;
case 3: case 3:
for (int i = 0; i < iters; ++i) for (int i = 0; i < iters; ++i)
{ {
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols); bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.ptr, disp.step/sizeof(T), img.ptr, img.step, disp.rows, disp.cols); bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
} }
break; break;
default: default:
......
...@@ -45,7 +45,7 @@ ...@@ -45,7 +45,7 @@
using namespace cv::gpu; using namespace cv::gpu;
/////////////////////////////////// Remap /////////////////////////////////////////////// /////////////////////////////////// Remap ///////////////////////////////////////////////
namespace imgproc_krnls namespace cv { namespace gpu { namespace imgproc
{ {
texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex_remap; texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex_remap;
...@@ -121,10 +121,7 @@ namespace imgproc_krnls ...@@ -121,10 +121,7 @@ namespace imgproc_krnls
*(dst + y * dst_step + 3 * x + 2) = out.z; *(dst + y * dst_step + 3 * x + 2) = out.z;
} }
} }
}
namespace cv { namespace gpu { namespace imgproc
{
void remap_gpu_1c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst) void remap_gpu_1c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst)
{ {
dim3 threads(16, 16, 1); dim3 threads(16, 16, 1);
...@@ -132,15 +129,15 @@ namespace cv { namespace gpu { namespace imgproc ...@@ -132,15 +129,15 @@ namespace cv { namespace gpu { namespace imgproc
grid.x = divUp(dst.cols, threads.x); grid.x = divUp(dst.cols, threads.x);
grid.y = divUp(dst.rows, threads.y); grid.y = divUp(dst.rows, threads.y);
imgproc_krnls::tex_remap.filterMode = cudaFilterModeLinear; tex_remap.filterMode = cudaFilterModeLinear;
imgproc_krnls::tex_remap.addressMode[0] = imgproc_krnls::tex_remap.addressMode[1] = cudaAddressModeWrap; tex_remap.addressMode[0] = tex_remap.addressMode[1] = cudaAddressModeWrap;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>(); cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaSafeCall( cudaBindTexture2D(0, imgproc_krnls::tex_remap, src.ptr, desc, src.cols, src.rows, src.step) ); cudaSafeCall( cudaBindTexture2D(0, tex_remap, src.data, desc, src.cols, src.rows, src.step) );
imgproc_krnls::remap_1c<<<grid, threads>>>(xmap.ptr, ymap.ptr, xmap.step, dst.ptr, dst.step, dst.cols, dst.rows); remap_1c<<<grid, threads>>>(xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaUnbindTexture(imgproc_krnls::tex_remap) ); cudaSafeCall( cudaUnbindTexture(tex_remap) );
} }
void remap_gpu_3c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst) void remap_gpu_3c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst)
...@@ -150,17 +147,13 @@ namespace cv { namespace gpu { namespace imgproc ...@@ -150,17 +147,13 @@ namespace cv { namespace gpu { namespace imgproc
grid.x = divUp(dst.cols, threads.x); grid.x = divUp(dst.cols, threads.x);
grid.y = divUp(dst.rows, threads.y); grid.y = divUp(dst.rows, threads.y);
imgproc_krnls::remap_3c<<<grid, threads>>>(src.ptr, src.step, xmap.ptr, ymap.ptr, xmap.step, dst.ptr, dst.step, dst.cols, dst.rows); remap_3c<<<grid, threads>>>(src.data, src.step, xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
} }
}}}
/////////////////////////////////// MeanShiftfiltering /////////////////////////////////////////////// /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
namespace imgproc_krnls
{
texture<uchar4, 2> tex_meanshift; texture<uchar4, 2> tex_meanshift;
__device__ short2 do_mean_shift(int x0, int y0, unsigned char* out, __device__ short2 do_mean_shift(int x0, int y0, unsigned char* out,
...@@ -252,10 +245,7 @@ namespace imgproc_krnls ...@@ -252,10 +245,7 @@ namespace imgproc_krnls
*(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps); *(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
} }
} }
}
namespace cv { namespace gpu { namespace imgproc
{
extern "C" void meanShiftFiltering_gpu(const DevMem2D& src, DevMem2D dst, int sp, int sr, int maxIter, float eps) extern "C" void meanShiftFiltering_gpu(const DevMem2D& src, DevMem2D dst, int sp, int sr, int maxIter, float eps)
{ {
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
...@@ -264,11 +254,11 @@ namespace cv { namespace gpu { namespace imgproc ...@@ -264,11 +254,11 @@ namespace cv { namespace gpu { namespace imgproc
grid.y = divUp(src.rows, threads.y); grid.y = divUp(src.rows, threads.y);
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>(); cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
cudaSafeCall( cudaBindTexture2D( 0, imgproc_krnls::tex_meanshift, src.ptr, desc, src.cols, src.rows, src.step ) ); cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
imgproc_krnls::meanshift_kernel<<< grid, threads >>>( dst.ptr, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps ); meanshift_kernel<<< grid, threads >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaUnbindTexture( imgproc_krnls::tex_meanshift ) ); cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
} }
extern "C" void meanShiftProc_gpu(const DevMem2D& src, DevMem2D dstr, DevMem2D dstsp, int sp, int sr, int maxIter, float eps) extern "C" void meanShiftProc_gpu(const DevMem2D& src, DevMem2D dstr, DevMem2D dstsp, int sp, int sr, int maxIter, float eps)
{ {
...@@ -278,18 +268,15 @@ namespace cv { namespace gpu { namespace imgproc ...@@ -278,18 +268,15 @@ namespace cv { namespace gpu { namespace imgproc
grid.y = divUp(src.rows, threads.y); grid.y = divUp(src.rows, threads.y);
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>(); cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
cudaSafeCall( cudaBindTexture2D( 0, imgproc_krnls::tex_meanshift, src.ptr, desc, src.cols, src.rows, src.step ) ); cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
imgproc_krnls::meanshiftproc_kernel<<< grid, threads >>>( dstr.ptr, dstr.step, dstsp.ptr, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps ); meanshiftproc_kernel<<< grid, threads >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaUnbindTexture( imgproc_krnls::tex_meanshift ) ); cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
} }
}}}
/////////////////////////////////// drawColorDisp /////////////////////////////////////////////// /////////////////////////////////// drawColorDisp ///////////////////////////////////////////////
namespace imgproc_krnls
{
template <typename T> template <typename T>
__device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1) __device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1)
{ {
...@@ -389,10 +376,8 @@ namespace imgproc_krnls ...@@ -389,10 +376,8 @@ namespace imgproc_krnls
line[x >> 1] = res; line[x >> 1] = res;
} }
} }
}
namespace cv { namespace gpu { namespace imgproc
{
void drawColorDisp_gpu(const DevMem2D& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream) void drawColorDisp_gpu(const DevMem2D& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream)
{ {
dim3 threads(16, 16, 1); dim3 threads(16, 16, 1);
...@@ -400,7 +385,7 @@ namespace cv { namespace gpu { namespace imgproc ...@@ -400,7 +385,7 @@ namespace cv { namespace gpu { namespace imgproc
grid.x = divUp(src.cols, threads.x << 2); grid.x = divUp(src.cols, threads.x << 2);
grid.y = divUp(src.rows, threads.y); grid.y = divUp(src.rows, threads.y);
imgproc_krnls::drawColorDisp<<<grid, threads, 0, stream>>>(src.ptr, src.step, dst.ptr, dst.step, src.cols, src.rows, ndisp); drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step, dst.data, dst.step, src.cols, src.rows, ndisp);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -413,17 +398,14 @@ namespace cv { namespace gpu { namespace imgproc ...@@ -413,17 +398,14 @@ namespace cv { namespace gpu { namespace imgproc
grid.x = divUp(src.cols, threads.x << 1); grid.x = divUp(src.cols, threads.x << 1);
grid.y = divUp(src.rows, threads.y); grid.y = divUp(src.rows, threads.y);
imgproc_krnls::drawColorDisp<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(short), dst.ptr, dst.step, src.cols, src.rows, ndisp); drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step / sizeof(short), dst.data, dst.step, src.cols, src.rows, ndisp);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
} }
}}}
/////////////////////////////////// reprojectImageTo3D /////////////////////////////////////////////// /////////////////////////////////// reprojectImageTo3D ///////////////////////////////////////////////
namespace imgproc_krnls
{
__constant__ float cq[16]; __constant__ float cq[16];
template <typename T> template <typename T>
...@@ -455,10 +437,7 @@ namespace imgproc_krnls ...@@ -455,10 +437,7 @@ namespace imgproc_krnls
*(float4*)(xyzw + xyzw_step * y + (x * 4)) = v; *(float4*)(xyzw + xyzw_step * y + (x * 4)) = v;
} }
} }
}
namespace cv { namespace gpu { namespace imgproc
{
template <typename T> template <typename T>
inline void reprojectImageTo3D_caller(const DevMem2D_<T>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream) inline void reprojectImageTo3D_caller(const DevMem2D_<T>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)
{ {
...@@ -467,9 +446,9 @@ namespace cv { namespace gpu { namespace imgproc ...@@ -467,9 +446,9 @@ namespace cv { namespace gpu { namespace imgproc
grid.x = divUp(disp.cols, threads.x); grid.x = divUp(disp.cols, threads.x);
grid.y = divUp(disp.rows, threads.y); grid.y = divUp(disp.rows, threads.y);
cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cq, q, 16 * sizeof(float)) ); cudaSafeCall( cudaMemcpyToSymbol(cq, q, 16 * sizeof(float)) );
imgproc_krnls::reprojectImageTo3D<<<grid, threads, 0, stream>>>(disp.ptr, disp.step / sizeof(T), xyzw.ptr, xyzw.step / sizeof(float), disp.rows, disp.cols); reprojectImageTo3D<<<grid, threads, 0, stream>>>(disp.data, disp.step / sizeof(T), xyzw.data, xyzw.step / sizeof(float), disp.rows, disp.cols);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
......
...@@ -41,9 +41,7 @@ ...@@ -41,9 +41,7 @@
//M*/ //M*/
#include "cuda_shared.hpp" #include "cuda_shared.hpp"
#include "saturate_cast.hpp"
#include "transform.hpp" #include "transform.hpp"
#include "vecmath.hpp"
using namespace cv::gpu; using namespace cv::gpu;
...@@ -54,7 +52,7 @@ using namespace cv::gpu; ...@@ -54,7 +52,7 @@ using namespace cv::gpu;
////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////
// Cart <-> Polar // Cart <-> Polar
namespace mathfunc_krnls namespace cv { namespace gpu { namespace mathfunc
{ {
struct Nothing struct Nothing
{ {
...@@ -133,10 +131,7 @@ namespace mathfunc_krnls ...@@ -133,10 +131,7 @@ namespace mathfunc_krnls
yptr[y * y_step + x] = mag_data * sin_a; yptr[y * y_step + x] = mag_data * sin_a;
} }
} }
}
namespace cv { namespace gpu { namespace mathfunc
{
template <typename Mag, typename Angle> template <typename Mag, typename Angle>
void cartToPolar_caller(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream) void cartToPolar_caller(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream)
{ {
...@@ -148,9 +143,9 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -148,9 +143,9 @@ namespace cv { namespace gpu { namespace mathfunc
const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f; const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f;
mathfunc_krnls::cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>( cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
x.ptr, x.elem_step, y.ptr, y.elem_step, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
mag.ptr, mag.elem_step, angle.ptr, angle.elem_step, scale, x.cols, x.rows); mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -163,27 +158,27 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -163,27 +158,27 @@ namespace cv { namespace gpu { namespace mathfunc
{ {
{ {
{ {
cartToPolar_caller<mathfunc_krnls::Magnitude, mathfunc_krnls::Atan2>, cartToPolar_caller<Magnitude, Atan2>,
cartToPolar_caller<mathfunc_krnls::Magnitude, mathfunc_krnls::Nothing> cartToPolar_caller<Magnitude, Nothing>
}, },
{ {
cartToPolar_caller<mathfunc_krnls::MagnitudeSqr, mathfunc_krnls::Atan2>, cartToPolar_caller<MagnitudeSqr, Atan2>,
cartToPolar_caller<mathfunc_krnls::MagnitudeSqr, mathfunc_krnls::Nothing>, cartToPolar_caller<MagnitudeSqr, Nothing>,
} }
}, },
{ {
{ {
cartToPolar_caller<mathfunc_krnls::Nothing, mathfunc_krnls::Atan2>, cartToPolar_caller<Nothing, Atan2>,
cartToPolar_caller<mathfunc_krnls::Nothing, mathfunc_krnls::Nothing> cartToPolar_caller<Nothing, Nothing>
}, },
{ {
cartToPolar_caller<mathfunc_krnls::Nothing, mathfunc_krnls::Atan2>, cartToPolar_caller<Nothing, Atan2>,
cartToPolar_caller<mathfunc_krnls::Nothing, mathfunc_krnls::Nothing>, cartToPolar_caller<Nothing, Nothing>,
} }
} }
}; };
callers[mag.ptr == 0][magSqr][angle.ptr == 0](x, y, mag, angle, angleInDegrees, stream); callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream);
} }
template <typename Mag> template <typename Mag>
...@@ -197,8 +192,8 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -197,8 +192,8 @@ namespace cv { namespace gpu { namespace mathfunc
const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f; const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f;
mathfunc_krnls::polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.ptr, mag.elem_step, polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
angle.ptr, angle.elem_step, scale, x.ptr, x.elem_step, y.ptr, y.elem_step, mag.cols, mag.rows); angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -209,19 +204,16 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -209,19 +204,16 @@ namespace cv { namespace gpu { namespace mathfunc
typedef void (*caller_t)(const DevMem2Df& mag, const DevMem2Df& angle, const DevMem2Df& x, const DevMem2Df& y, bool angleInDegrees, cudaStream_t stream); typedef void (*caller_t)(const DevMem2Df& mag, const DevMem2Df& angle, const DevMem2Df& x, const DevMem2Df& y, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2] = static const caller_t callers[2] =
{ {
polarToCart_caller<mathfunc_krnls::NonEmptyMag>, polarToCart_caller<NonEmptyMag>,
polarToCart_caller<mathfunc_krnls::EmptyMag> polarToCart_caller<EmptyMag>
}; };
callers[mag.ptr == 0](mag, angle, x, y, angleInDegrees, stream); callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);
} }
}}}
////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////
// Compare // Compare
namespace mathfunc_krnls
{
template <typename T1, typename T2> template <typename T1, typename T2>
struct NotEqual struct NotEqual
{ {
...@@ -230,14 +222,11 @@ namespace mathfunc_krnls ...@@ -230,14 +222,11 @@ namespace mathfunc_krnls
return static_cast<uchar>(static_cast<int>(src1 != src2) * 255); return static_cast<uchar>(static_cast<int>(src1 != src2) * 255);
} }
}; };
}
namespace cv { namespace gpu { namespace mathfunc
{
template <typename T1, typename T2> template <typename T1, typename T2>
inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst) inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
{ {
mathfunc_krnls::NotEqual<T1, T2> op; NotEqual<T1, T2> op;
transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, 0); transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, 0);
} }
......
...@@ -40,16 +40,11 @@ ...@@ -40,16 +40,11 @@
// //
//M*/ //M*/
#include <stddef.h>
#include <stdio.h>
#include "cuda_shared.hpp" #include "cuda_shared.hpp"
#include "cuda_runtime.h"
#include "saturate_cast.hpp" #include "saturate_cast.hpp"
using namespace cv::gpu; namespace cv { namespace gpu { namespace matrix_operations {
namespace matop_krnls
{
template <typename T> struct shift_and_sizeof; template <typename T> struct shift_and_sizeof;
template <> struct shift_and_sizeof<char> { enum { shift = 0 }; }; template <> struct shift_and_sizeof<char> { enum { shift = 0 }; };
template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; }; template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };
...@@ -115,14 +110,11 @@ namespace matop_krnls ...@@ -115,14 +110,11 @@ namespace matop_krnls
typedef int2 read_type; typedef int2 read_type;
typedef short2 write_type; typedef short2 write_type;
}; };
}
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
////////////////////////////////// CopyTo ///////////////////////////////// ////////////////////////////////// CopyTo /////////////////////////////////
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
namespace matop_krnls
{
template<typename T> template<typename T>
__global__ void copy_to_with_mask(T * mat_src, T * mat_dst, const unsigned char * mask, int cols, int rows, int step_mat, int step_mask, int channels) __global__ void copy_to_with_mask(T * mat_src, T * mat_dst, const unsigned char * mask, int cols, int rows, int step_mat, int step_mask, int channels)
{ {
...@@ -136,10 +128,6 @@ namespace matop_krnls ...@@ -136,10 +128,6 @@ namespace matop_krnls
mat_dst[idx] = mat_src[idx]; mat_dst[idx] = mat_src[idx];
} }
} }
}
namespace cv { namespace gpu { namespace matrix_operations
{
typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream); typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream);
template<typename T> template<typename T>
...@@ -147,17 +135,12 @@ namespace cv { namespace gpu { namespace matrix_operations ...@@ -147,17 +135,12 @@ namespace cv { namespace gpu { namespace matrix_operations
{ {
dim3 threadsPerBlock(16,16, 1); dim3 threadsPerBlock(16,16, 1);
dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1); dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);
copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>
((T*)mat_src.data, (T*)mat_dst.data, (unsigned char*)mask.data, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
if (stream == 0) if (stream == 0)
{ cudaSafeCall ( cudaThreadSynchronize() );
::matop_krnls::copy_to_with_mask<T><<<numBlocks,threadsPerBlock>>>
((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
cudaSafeCall ( cudaThreadSynchronize() );
}
else
{
::matop_krnls::copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>
((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
}
} }
void copy_to_with_mask(const DevMem2D& mat_src, DevMem2D mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream) void copy_to_with_mask(const DevMem2D& mat_src, DevMem2D mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream)
...@@ -180,14 +163,11 @@ namespace cv { namespace gpu { namespace matrix_operations ...@@ -180,14 +163,11 @@ namespace cv { namespace gpu { namespace matrix_operations
func(mat_src, mat_dst, mask, channels, stream); func(mat_src, mat_dst, mask, channels, stream);
} }
}}}
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
////////////////////////////////// SetTo ////////////////////////////////// ////////////////////////////////// SetTo //////////////////////////////////
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
namespace matop_krnls
{
__constant__ double scalar_d[4]; __constant__ double scalar_d[4];
template<typename T> template<typename T>
...@@ -216,10 +196,6 @@ namespace matop_krnls ...@@ -216,10 +196,6 @@ namespace matop_krnls
mat[idx] = scalar_d[ x % channels ]; mat[idx] = scalar_d[ x % channels ];
} }
} }
}
namespace cv { namespace gpu { namespace matrix_operations
{
typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream); typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream);
typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels, const cudaStream_t & stream); typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels, const cudaStream_t & stream);
...@@ -229,16 +205,9 @@ namespace cv { namespace gpu { namespace matrix_operations ...@@ -229,16 +205,9 @@ namespace cv { namespace gpu { namespace matrix_operations
dim3 threadsPerBlock(32, 8, 1); dim3 threadsPerBlock(32, 8, 1);
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1); dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
set_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.data, (unsigned char *)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);
if (stream == 0) if (stream == 0)
{
::matop_krnls::set_to_with_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
cudaSafeCall ( cudaThreadSynchronize() ); cudaSafeCall ( cudaThreadSynchronize() );
}
else
{
::matop_krnls::set_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
}
} }
template <typename T> template <typename T>
...@@ -247,20 +216,15 @@ namespace cv { namespace gpu { namespace matrix_operations ...@@ -247,20 +216,15 @@ namespace cv { namespace gpu { namespace matrix_operations
dim3 threadsPerBlock(32, 8, 1); dim3 threadsPerBlock(32, 8, 1);
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1); dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
set_to_without_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels);
if (stream == 0) if (stream == 0)
{
matop_krnls::set_to_without_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
cudaSafeCall ( cudaThreadSynchronize() ); cudaSafeCall ( cudaThreadSynchronize() );
}
else
{
matop_krnls::set_to_without_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
}
} }
void set_to_without_mask(DevMem2D mat, int depth, const double *scalar, int channels, const cudaStream_t & stream) void set_to_without_mask(DevMem2D mat, int depth, const double *scalar, int channels, const cudaStream_t & stream)
{ {
cudaSafeCall( cudaMemcpyToSymbol(matop_krnls::scalar_d, scalar, sizeof(double) * 4)); cudaSafeCall( cudaMemcpyToSymbol(scalar_d, scalar, sizeof(double) * 4));
static SetToFunc_without_mask tab[8] = static SetToFunc_without_mask tab[8] =
{ {
...@@ -284,7 +248,7 @@ namespace cv { namespace gpu { namespace matrix_operations ...@@ -284,7 +248,7 @@ namespace cv { namespace gpu { namespace matrix_operations
void set_to_with_mask(DevMem2D mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream) void set_to_with_mask(DevMem2D mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{ {
cudaSafeCall( cudaMemcpyToSymbol(matop_krnls::scalar_d, scalar, sizeof(double) * 4)); cudaSafeCall( cudaMemcpyToSymbol(scalar_d, scalar, sizeof(double) * 4));
static SetToFunc_with_mask tab[8] = static SetToFunc_with_mask tab[8] =
{ {
...@@ -305,14 +269,11 @@ namespace cv { namespace gpu { namespace matrix_operations ...@@ -305,14 +269,11 @@ namespace cv { namespace gpu { namespace matrix_operations
func(mat, mask, channels, stream); func(mat, mask, channels, stream);
} }
}}}
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
//////////////////////////////// ConvertTo //////////////////////////////// //////////////////////////////// ConvertTo ////////////////////////////////
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
namespace matop_krnls
{
template <typename T, typename DT> template <typename T, typename DT>
__global__ static void convert_to(uchar* srcmat, size_t src_step, uchar* dstmat, size_t dst_step, size_t width, size_t height, double alpha, double beta) __global__ static void convert_to(uchar* srcmat, size_t src_step, uchar* dstmat, size_t dst_step, size_t width, size_t height, double alpha, double beta)
{ {
...@@ -348,29 +309,20 @@ namespace matop_krnls ...@@ -348,29 +309,20 @@ namespace matop_krnls
} }
} }
} }
}
namespace cv { namespace gpu { namespace matrix_operations
{
typedef void (*CvtFunc)(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream); typedef void (*CvtFunc)(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream);
template<typename T, typename DT> template<typename T, typename DT>
void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream) void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream)
{ {
const int shift = ::matop_krnls::ReadWriteTraits<T, DT, sizeof(T), sizeof(DT)>::shift; const int shift = ReadWriteTraits<T, DT, sizeof(T), sizeof(DT)>::shift;
dim3 block(32, 8); dim3 block(32, 8);
dim3 grid(divUp(width, block.x * shift), divUp(height, block.y)); dim3 grid(divUp(width, block.x * shift), divUp(height, block.y));
convert_to<T, DT><<<grid, block, 0, stream>>>(src.data, src.step, dst.data, dst.step, width, height, alpha, beta);
if (stream == 0) if (stream == 0)
{
matop_krnls::convert_to<T, DT><<<grid, block>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
}
else
{
matop_krnls::convert_to<T, DT><<<grid, block, 0, stream>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
}
} }
void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, int channels, double alpha, double beta, const cudaStream_t & stream) void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, int channels, double alpha, double beta, const cudaStream_t & stream)
......
...@@ -230,9 +230,9 @@ namespace cv { namespace gpu { namespace split_merge { ...@@ -230,9 +230,9 @@ namespace cv { namespace gpu { namespace split_merge {
dim3 blockDim(32, 8); dim3 blockDim(32, 8);
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y)); dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
mergeC2_<T><<<gridDim, blockDim, 0, stream>>>( mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(
src[0].ptr, src[0].step, src[0].data, src[0].step,
src[1].ptr, src[1].step, src[1].data, src[1].step,
dst.rows, dst.cols, dst.ptr, dst.step); dst.rows, dst.cols, dst.data, dst.step);
if (stream == 0) if (stream == 0)
cudaSafeCall(cudaThreadSynchronize()); cudaSafeCall(cudaThreadSynchronize());
} }
...@@ -244,10 +244,10 @@ namespace cv { namespace gpu { namespace split_merge { ...@@ -244,10 +244,10 @@ namespace cv { namespace gpu { namespace split_merge {
dim3 blockDim(32, 8); dim3 blockDim(32, 8);
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y)); dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
mergeC3_<T><<<gridDim, blockDim, 0, stream>>>( mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(
src[0].ptr, src[0].step, src[0].data, src[0].step,
src[1].ptr, src[1].step, src[1].data, src[1].step,
src[2].ptr, src[2].step, src[2].data, src[2].step,
dst.rows, dst.cols, dst.ptr, dst.step); dst.rows, dst.cols, dst.data, dst.step);
if (stream == 0) if (stream == 0)
cudaSafeCall(cudaThreadSynchronize()); cudaSafeCall(cudaThreadSynchronize());
} }
...@@ -259,11 +259,11 @@ namespace cv { namespace gpu { namespace split_merge { ...@@ -259,11 +259,11 @@ namespace cv { namespace gpu { namespace split_merge {
dim3 blockDim(32, 8); dim3 blockDim(32, 8);
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y)); dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
mergeC4_<T><<<gridDim, blockDim, 0, stream>>>( mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(
src[0].ptr, src[0].step, src[0].data, src[0].step,
src[1].ptr, src[1].step, src[1].data, src[1].step,
src[2].ptr, src[2].step, src[2].data, src[2].step,
src[3].ptr, src[3].step, src[3].data, src[3].step,
dst.rows, dst.cols, dst.ptr, dst.step); dst.rows, dst.cols, dst.data, dst.step);
if (stream == 0) if (stream == 0)
cudaSafeCall(cudaThreadSynchronize()); cudaSafeCall(cudaThreadSynchronize());
} }
...@@ -433,9 +433,9 @@ namespace cv { namespace gpu { namespace split_merge { ...@@ -433,9 +433,9 @@ namespace cv { namespace gpu { namespace split_merge {
dim3 blockDim(32, 8); dim3 blockDim(32, 8);
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y)); dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
splitC2_<T><<<gridDim, blockDim, 0, stream>>>( splitC2_<T><<<gridDim, blockDim, 0, stream>>>(
src.ptr, src.step, src.rows, src.cols, src.data, src.step, src.rows, src.cols,
dst[0].ptr, dst[0].step, dst[0].data, dst[0].step,
dst[1].ptr, dst[1].step); dst[1].data, dst[1].step);
if (stream == 0) if (stream == 0)
cudaSafeCall(cudaThreadSynchronize()); cudaSafeCall(cudaThreadSynchronize());
} }
...@@ -447,10 +447,10 @@ namespace cv { namespace gpu { namespace split_merge { ...@@ -447,10 +447,10 @@ namespace cv { namespace gpu { namespace split_merge {
dim3 blockDim(32, 8); dim3 blockDim(32, 8);
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y)); dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
splitC3_<T><<<gridDim, blockDim, 0, stream>>>( splitC3_<T><<<gridDim, blockDim, 0, stream>>>(
src.ptr, src.step, src.rows, src.cols, src.data, src.step, src.rows, src.cols,
dst[0].ptr, dst[0].step, dst[0].data, dst[0].step,
dst[1].ptr, dst[1].step, dst[1].data, dst[1].step,
dst[2].ptr, dst[2].step); dst[2].data, dst[2].step);
if (stream == 0) if (stream == 0)
cudaSafeCall(cudaThreadSynchronize()); cudaSafeCall(cudaThreadSynchronize());
} }
...@@ -462,11 +462,11 @@ namespace cv { namespace gpu { namespace split_merge { ...@@ -462,11 +462,11 @@ namespace cv { namespace gpu { namespace split_merge {
dim3 blockDim(32, 8); dim3 blockDim(32, 8);
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y)); dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
splitC4_<T><<<gridDim, blockDim, 0, stream>>>( splitC4_<T><<<gridDim, blockDim, 0, stream>>>(
src.ptr, src.step, src.rows, src.cols, src.data, src.step, src.rows, src.cols,
dst[0].ptr, dst[0].step, dst[0].data, dst[0].step,
dst[1].ptr, dst[1].step, dst[1].data, dst[1].step,
dst[2].ptr, dst[2].step, dst[2].data, dst[2].step,
dst[3].ptr, dst[3].step); dst[3].data, dst[3].step);
if (stream == 0) if (stream == 0)
cudaSafeCall(cudaThreadSynchronize()); cudaSafeCall(cudaThreadSynchronize());
} }
......
...@@ -43,8 +43,7 @@ ...@@ -43,8 +43,7 @@
//#include "cuda_shared.hpp" //#include "cuda_shared.hpp"
#include "opencv2/gpu/devmem2d.hpp" #include "opencv2/gpu/devmem2d.hpp"
#include "safe_call.hpp" #include "safe_call.hpp"
static inline int divUp(int a, int b) { return (a % b == 0) ? a/b : a/b + 1; } static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }
using namespace cv::gpu; using namespace cv::gpu;
...@@ -55,7 +54,7 @@ using namespace cv::gpu; ...@@ -55,7 +54,7 @@ using namespace cv::gpu;
#define ROWSperTHREAD 21 // the number of rows a thread will process #define ROWSperTHREAD 21 // the number of rows a thread will process
namespace stereobm_gpu namespace cv { namespace gpu { namespace bm
{ {
#define BLOCK_W 128 // the thread block width (464) #define BLOCK_W 128 // the thread block width (464)
...@@ -233,7 +232,7 @@ __device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* im ...@@ -233,7 +232,7 @@ __device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* im
} }
template<int RADIUS> template<int RADIUS>
__global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, unsigned char* disp, size_t disp_pitch, int maxdisp) __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStep disp, int maxdisp)
{ {
extern __shared__ unsigned int col_ssd_cache[]; extern __shared__ unsigned int col_ssd_cache[];
volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x; volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x;
...@@ -246,7 +245,7 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i ...@@ -246,7 +245,7 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i
//int Y = blockIdx.y * ROWSperTHREAD + RADIUS; //int Y = blockIdx.y * ROWSperTHREAD + RADIUS;
unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step; unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
unsigned char* disparImage = disp + X + Y * disp_pitch; unsigned char* disparImage = disp.data + X + Y * disp.step;
/* if (X < cwidth) /* if (X < cwidth)
{ {
unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step; unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;
...@@ -305,7 +304,7 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i ...@@ -305,7 +304,7 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i
uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd); uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
if (minSSD.x < minSSDImage[idx]) if (minSSD.x < minSSDImage[idx])
{ {
disparImage[disp_pitch * row] = (unsigned char)(d + minSSD.y); disparImage[disp.step * row] = (unsigned char)(d + minSSD.y);
minSSDImage[idx] = minSSD.x; minSSDImage[idx] = minSSD.x;
} }
} }
...@@ -313,88 +312,73 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i ...@@ -313,88 +312,73 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i
} // for d loop } // for d loop
} }
}
template<int RADIUS> void kernel_caller(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, cudaStream_t & stream)
namespace cv { namespace gpu { namespace bm
{ {
template<int RADIUS> void kernel_caller(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, const cudaStream_t & stream) dim3 grid(1,1,1);
{ dim3 threads(BLOCK_W, 1, 1);
dim3 grid(1,1,1);
dim3 threads(BLOCK_W, 1, 1);
grid.x = divUp(left.cols - maxdisp - 2 * RADIUS, BLOCK_W); grid.x = divUp(left.cols - maxdisp - 2 * RADIUS, BLOCK_W);
grid.y = divUp(left.rows - 2 * RADIUS, ROWSperTHREAD); grid.y = divUp(left.rows - 2 * RADIUS, ROWSperTHREAD);
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS) //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int); size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);
if (stream == 0)
{
stereobm_gpu::stereoKernel<RADIUS><<<grid, threads, smem_size>>>(left.ptr, right.ptr, left.step, disp.ptr, disp.step, maxdisp);
cudaSafeCall( cudaThreadSynchronize() );
}
else
{
stereobm_gpu::stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.ptr, right.ptr, left.step, disp.ptr, disp.step, maxdisp);
}
}; stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
};
typedef void (*kernel_caller_t)(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, const cudaStream_t & stream); typedef void (*kernel_caller_t)(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, cudaStream_t & stream);
const static kernel_caller_t callers[] = const static kernel_caller_t callers[] =
{ {
0, 0,
kernel_caller< 1>, kernel_caller< 2>, kernel_caller< 3>, kernel_caller< 4>, kernel_caller< 5>, kernel_caller< 1>, kernel_caller< 2>, kernel_caller< 3>, kernel_caller< 4>, kernel_caller< 5>,
kernel_caller< 6>, kernel_caller< 7>, kernel_caller< 8>, kernel_caller< 9>, kernel_caller<10>, kernel_caller< 6>, kernel_caller< 7>, kernel_caller< 8>, kernel_caller< 9>, kernel_caller<10>,
kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<15>, kernel_caller<15>, kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<15>, kernel_caller<15>,
kernel_caller<16>, kernel_caller<17>, kernel_caller<18>, kernel_caller<19>, kernel_caller<20>, kernel_caller<16>, kernel_caller<17>, kernel_caller<18>, kernel_caller<19>, kernel_caller<20>,
kernel_caller<21>, kernel_caller<22>, kernel_caller<23>, kernel_caller<24>, kernel_caller<25> kernel_caller<21>, kernel_caller<22>, kernel_caller<23>, kernel_caller<24>, kernel_caller<25>
//0,0,0, 0,0,0, 0,0,kernel_caller<9> //0,0,0, 0,0,0, 0,0,kernel_caller<9>
}; };
const int calles_num = sizeof(callers)/sizeof(callers[0]); const int calles_num = sizeof(callers)/sizeof(callers[0]);
extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, const cudaStream_t & stream) extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, cudaStream_t& stream)
{ {
int winsz2 = winsz >> 1; int winsz2 = winsz >> 1;
if (winsz2 == 0 || winsz2 >= calles_num) if (winsz2 == 0 || winsz2 >= calles_num)
cv::gpu::error("Unsupported window size", __FILE__, __LINE__); cv::gpu::error("Unsupported window size", __FILE__, __LINE__);
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) ); //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) ); //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );
cudaSafeCall( cudaMemset2D(disp.ptr, disp.step, 0, disp.cols, disp.rows) ); cudaSafeCall( cudaMemset2D(disp.data, disp.step, 0, disp.cols, disp.rows) );
cudaSafeCall( cudaMemset2D(minSSD_buf.ptr, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) ); cudaSafeCall( cudaMemset2D(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );
cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::cwidth, &left.cols, sizeof(left.cols) ) ); cudaSafeCall( cudaMemcpyToSymbol( cwidth, &left.cols, sizeof(left.cols) ) );
cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::cheight, &left.rows, sizeof(left.rows) ) ); cudaSafeCall( cudaMemcpyToSymbol( cheight, &left.rows, sizeof(left.rows) ) );
cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::cminSSDImage, &minSSD_buf.ptr, sizeof(minSSD_buf.ptr) ) ); cudaSafeCall( cudaMemcpyToSymbol( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) ) );
size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize(); size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize();
cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::cminSSD_step, &minssd_step, sizeof(minssd_step) ) ); cudaSafeCall( cudaMemcpyToSymbol( cminSSD_step, &minssd_step, sizeof(minssd_step) ) );
callers[winsz2](left, right, disp, maxdisp, stream); callers[winsz2](left, right, disp, maxdisp, stream);
} }
}}}
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////// Sobel Prefiler /////////////////////////////////////////// /////////////////////////////////////// Sobel Prefiler ///////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
namespace stereobm_gpu
{
texture<unsigned char, 2, cudaReadModeElementType> texForSobel; texture<unsigned char, 2, cudaReadModeElementType> texForSobel;
extern "C" __global__ void prefilter_kernel(unsigned char *output, size_t step, int width, int height, int prefilterCap) extern "C" __global__ void prefilter_kernel(DevMem2D output, int prefilterCap)
{ {
int x = blockDim.x * blockIdx.x + threadIdx.x; int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y; int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height) if (x < output.cols && y < output.rows)
{ {
int conv = (int)tex2D(texForSobel, x - 1, y - 1) * (-1) + (int)tex2D(texForSobel, x + 1, y - 1) * (1) + int conv = (int)tex2D(texForSobel, x - 1, y - 1) * (-1) + (int)tex2D(texForSobel, x + 1, y - 1) * (1) +
(int)tex2D(texForSobel, x - 1, y ) * (-2) + (int)tex2D(texForSobel, x + 1, y ) * (2) + (int)tex2D(texForSobel, x - 1, y ) * (-2) + (int)tex2D(texForSobel, x + 1, y ) * (2) +
...@@ -402,48 +386,35 @@ extern "C" __global__ void prefilter_kernel(unsigned char *output, size_t step, ...@@ -402,48 +386,35 @@ extern "C" __global__ void prefilter_kernel(unsigned char *output, size_t step,
conv = min(min(max(-prefilterCap, conv), prefilterCap) + prefilterCap, 255); conv = min(min(max(-prefilterCap, conv), prefilterCap) + prefilterCap, 255);
output[y * step + x] = conv & 0xFF; output.ptr(y)[x] = conv & 0xFF;
} }
} }
}
namespace cv { namespace gpu { namespace bm extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output, int prefilterCap, cudaStream_t & stream)
{ {
extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output, int prefilterCap, const cudaStream_t & stream) cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
{ cudaSafeCall( cudaBindTexture2D( 0, texForSobel, input.data, desc, input.cols, input.rows, input.step ) );
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaSafeCall( cudaBindTexture2D( 0, stereobm_gpu::texForSobel, input.ptr, desc, input.cols, input.rows, input.step ) );
dim3 threads(16, 16, 1); dim3 threads(16, 16, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
grid.x = divUp(input.cols, threads.x); grid.x = divUp(input.cols, threads.x);
grid.y = divUp(input.rows, threads.y); grid.y = divUp(input.rows, threads.y);
if (stream == 0) prefilter_kernel<<<grid, threads, 0, stream>>>(output, prefilterCap);
{
stereobm_gpu::prefilter_kernel<<<grid, threads>>>(output.ptr, output.step, output.cols, output.rows, prefilterCap);
cudaSafeCall( cudaThreadSynchronize() );
}
else
{
stereobm_gpu::prefilter_kernel<<<grid, threads, 0, stream>>>(output.ptr, output.step, output.cols, output.rows, prefilterCap);
}
cudaSafeCall( cudaUnbindTexture (stereobm_gpu::texForSobel ) ); if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
} cudaSafeCall( cudaUnbindTexture (texForSobel ) );
}
}}}
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////// Textureness filtering //////////////////////////////////////// /////////////////////////////////// Textureness filtering ////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
namespace stereobm_gpu
{
texture<unsigned char, 2, cudaReadModeNormalizedFloat> texForTF; texture<unsigned char, 2, cudaReadModeNormalizedFloat> texForTF;
__device__ float sobel(int x, int y) __device__ float sobel(int x, int y)
...@@ -478,7 +449,7 @@ __device__ float CalcSums(float *cols, float *cols_cache, int winsz) ...@@ -478,7 +449,7 @@ __device__ float CalcSums(float *cols, float *cols_cache, int winsz)
#define RpT (2 * ROWSperTHREAD) // got experimentally #define RpT (2 * ROWSperTHREAD) // got experimentally
extern "C" __global__ void textureness_kernel(unsigned char *disp, size_t disp_step, int winsz, float threshold, int width, int height) extern "C" __global__ void textureness_kernel(DevMem2D disp, int winsz, float threshold)
{ {
int winsz2 = winsz/2; int winsz2 = winsz/2;
int n_dirty_pixels = (winsz2) * 2; int n_dirty_pixels = (winsz2) * 2;
...@@ -489,9 +460,9 @@ extern "C" __global__ void textureness_kernel(unsigned char *disp, size_t disp_s ...@@ -489,9 +460,9 @@ extern "C" __global__ void textureness_kernel(unsigned char *disp, size_t disp_s
int x = blockIdx.x * blockDim.x + threadIdx.x; int x = blockIdx.x * blockDim.x + threadIdx.x;
int beg_row = blockIdx.y * RpT; int beg_row = blockIdx.y * RpT;
int end_row = min(beg_row + RpT, height); int end_row = min(beg_row + RpT, disp.rows);
if (x < width) if (x < disp.cols)
{ {
int y = beg_row; int y = beg_row;
...@@ -512,7 +483,7 @@ extern "C" __global__ void textureness_kernel(unsigned char *disp, size_t disp_s ...@@ -512,7 +483,7 @@ extern "C" __global__ void textureness_kernel(unsigned char *disp, size_t disp_s
float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255; float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
if (sum_win < threshold) if (sum_win < threshold)
disp[y * disp_step + x] = 0; disp.data[y * disp.step + x] = 0;
__syncthreads(); __syncthreads();
...@@ -530,45 +501,37 @@ extern "C" __global__ void textureness_kernel(unsigned char *disp, size_t disp_s ...@@ -530,45 +501,37 @@ extern "C" __global__ void textureness_kernel(unsigned char *disp, size_t disp_s
__syncthreads(); __syncthreads();
float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255; float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
if (sum_win < threshold) if (sum_win < threshold)
disp[y * disp_step + x] = 0; disp.data[y * disp.step + x] = 0;
__syncthreads(); __syncthreads();
} }
} }
} }
}
namespace cv { namespace gpu { namespace bm extern "C" void postfilter_textureness(const DevMem2D& input, int winsz, float avgTexturenessThreshold, const DevMem2D& disp, cudaStream_t & stream)
{ {
extern "C" void postfilter_textureness(const DevMem2D& input, int winsz, float avgTexturenessThreshold, const DevMem2D& disp, const cudaStream_t & stream) avgTexturenessThreshold *= winsz * winsz;
{
avgTexturenessThreshold *= winsz * winsz;
stereobm_gpu::texForTF.filterMode = cudaFilterModeLinear; texForTF.filterMode = cudaFilterModeLinear;
stereobm_gpu::texForTF.addressMode[0] = cudaAddressModeWrap; texForTF.addressMode[0] = cudaAddressModeWrap;
stereobm_gpu::texForTF.addressMode[1] = cudaAddressModeWrap; texForTF.addressMode[1] = cudaAddressModeWrap;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>(); cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaSafeCall( cudaBindTexture2D( 0, stereobm_gpu::texForTF, input.ptr, desc, input.cols, input.rows, input.step ) ); cudaSafeCall( cudaBindTexture2D( 0, texForTF, input.data, desc, input.cols, input.rows, input.step ) );
dim3 threads(128, 1, 1); dim3 threads(128, 1, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
grid.x = divUp(input.cols, threads.x); grid.x = divUp(input.cols, threads.x);
grid.y = divUp(input.rows, RpT); grid.y = divUp(input.rows, RpT);
size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float); size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);
textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);
if (stream == 0) if (stream == 0)
{ cudaSafeCall( cudaThreadSynchronize() );
stereobm_gpu::textureness_kernel<<<grid, threads, smem_size>>>(disp.ptr, disp.step, winsz, avgTexturenessThreshold, disp.cols, disp.rows); cudaSafeCall( cudaUnbindTexture (texForTF) );
cudaSafeCall( cudaThreadSynchronize() );
} }
else
{
stereobm_gpu::textureness_kernel<<<grid, threads, smem_size, stream>>>(disp.ptr, disp.step, winsz, avgTexturenessThreshold, disp.cols, disp.rows);
}
cudaSafeCall( cudaUnbindTexture (stereobm_gpu::texForTF) );
}
}}} }}}
...@@ -44,36 +44,32 @@ ...@@ -44,36 +44,32 @@
#define __OPENCV_GPU_TRANSFORM_HPP__ #define __OPENCV_GPU_TRANSFORM_HPP__
#include "cuda_shared.hpp" #include "cuda_shared.hpp"
#include "saturate_cast.hpp"
#include "vecmath.hpp"
namespace cv { namespace gpu { namespace algo_krnls namespace cv { namespace gpu { namespace device
{ {
template <typename T, typename D, typename UnOp> template <typename T, typename D, typename UnOp>
static __global__ void transform(const T* src, size_t src_step, static __global__ void transform(const DevMem2D_<T> src, PtrStep_<D> dst, UnOp op)
D* dst, size_t dst_step, int width, int height, UnOp op)
{ {
const int x = blockDim.x * blockIdx.x + threadIdx.x; const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y; const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height) if (x < src.cols && y < src.rows)
{ {
T src_data = src[y * src_step + x]; T src_data = src.ptr(y)[x];
dst[y * dst_step + x] = op(src_data, x, y); dst.ptr(y)[x] = op(src_data, x, y);
} }
} }
template <typename T1, typename T2, typename D, typename BinOp> template <typename T1, typename T2, typename D, typename BinOp>
static __global__ void transform(const T1* src1, size_t src1_step, const T2* src2, size_t src2_step, static __global__ void transform(const DevMem2D_<T1> src1, const PtrStep_<T2> src2, PtrStep_<D> dst, BinOp op)
D* dst, size_t dst_step, int width, int height, BinOp op)
{ {
const int x = blockDim.x * blockIdx.x + threadIdx.x; const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y; const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height) if (x < src1.cols && y < src1.rows)
{ {
T1 src1_data = src1[y * src1_step + x]; T1 src1_data = src1.ptr(y)[x];
T2 src2_data = src2[y * src2_step + x]; T2 src2_data = src2.ptr(y)[x];
dst[y * dst_step + x] = op(src1_data, src2_data, x, y); dst.ptr(y)[x] = op(src1_data, src2_data, x, y);
} }
} }
}}} }}}
...@@ -83,7 +79,7 @@ namespace cv ...@@ -83,7 +79,7 @@ namespace cv
namespace gpu namespace gpu
{ {
template <typename T, typename D, typename UnOp> template <typename T, typename D, typename UnOp>
static void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, cudaStream_t stream) static void transform2(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, cudaStream_t stream)
{ {
dim3 threads(16, 16, 1); dim3 threads(16, 16, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
...@@ -91,8 +87,7 @@ namespace cv ...@@ -91,8 +87,7 @@ namespace cv
grid.x = divUp(src.cols, threads.x); grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y); grid.y = divUp(src.rows, threads.y);
algo_krnls::transform<<<grid, threads, 0, stream>>>(src.ptr, src.elem_step, device::transform<T, D, UnOp><<<grid, threads, 0, stream>>>(src, dst, op);
dst.ptr, dst.elem_step, src.cols, src.rows, op);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -106,11 +101,10 @@ namespace cv ...@@ -106,11 +101,10 @@ namespace cv
grid.x = divUp(src1.cols, threads.x); grid.x = divUp(src1.cols, threads.x);
grid.y = divUp(src1.rows, threads.y); grid.y = divUp(src1.rows, threads.y);
algo_krnls::transform<<<grid, threads, 0, stream>>>(src1.ptr, src1.elem_step, device::transform<T1, T2, D, BinOp><<<grid, threads, 0, stream>>>(src1, src2, dst, op);
src2.ptr, src2.elem_step, dst.ptr, dst.elem_step, src1.cols, src1.rows, op);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
} }
} }
} }
......
...@@ -384,7 +384,14 @@ namespace cv ...@@ -384,7 +384,14 @@ namespace cv
template <typename VecD, typename VecS> static __device__ VecD saturate_cast_caller(const VecS& v) template <typename VecD, typename VecS> static __device__ VecD saturate_cast_caller(const VecS& v)
{ {
SatCast<VecTraits<VecD>::cn, VecD> cast; SatCast<
VecTraits<VecD>::cn,
VecD
>
cast;
return cast(v); return cast(v);
} }
......
...@@ -577,7 +577,10 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke ...@@ -577,7 +577,10 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke
namespace cv { namespace gpu { namespace filters namespace cv { namespace gpu { namespace filters
{ {
void linearRowFilter_gpu_8u_8u_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor); template <int CN, typename T, typename D>
void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
//void linearRowFilter_gpu_8u_8u_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
void linearRowFilter_gpu_8u_8s_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor); void linearRowFilter_gpu_8u_8s_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
void linearRowFilter_gpu_8s_8u_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor); void linearRowFilter_gpu_8s_8u_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
void linearRowFilter_gpu_8s_8s_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor); void linearRowFilter_gpu_8s_8s_c4(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
...@@ -653,7 +656,8 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, ...@@ -653,7 +656,8 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R}; static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R};
static const gpuFilter1D_t gpuFilter1D_callers[6][6] = static const gpuFilter1D_t gpuFilter1D_callers[6][6] =
{ {
{linearRowFilter_gpu_8u_8u_c4,linearRowFilter_gpu_8u_8s_c4,0,0,0,0}, {linearRowFilter_gpu<4, uchar4, uchar4>/*linearRowFilter_gpu_8u_8u_c4*/,linearRowFilter_gpu_8u_8s_c4,0,0,0,0},
{linearRowFilter_gpu_8s_8u_c4,linearRowFilter_gpu_8s_8s_c4,0,0,0,0}, {linearRowFilter_gpu_8s_8u_c4,linearRowFilter_gpu_8s_8s_c4,0,0,0,0},
{0,0,linearRowFilter_gpu_16u_16u_c2,linearRowFilter_gpu_16u_16s_c2,0,0}, {0,0,linearRowFilter_gpu_16u_16u_c2,linearRowFilter_gpu_16u_16s_c2,0,0},
{0,0,linearRowFilter_gpu_16s_16u_c2,linearRowFilter_gpu_16s_16s_c2,0,0}, {0,0,linearRowFilter_gpu_16s_16u_c2,linearRowFilter_gpu_16s_16s_c2,0,0},
......
...@@ -61,9 +61,9 @@ namespace cv { namespace gpu ...@@ -61,9 +61,9 @@ namespace cv { namespace gpu
namespace bm namespace bm
{ {
//extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int ndisp, int winsz, const DevMem2D_<uint>& minSSD_buf); //extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int ndisp, int winsz, const DevMem2D_<uint>& minSSD_buf);
extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int ndisp, int winsz, const DevMem2D_<uint>& minSSD_buf, const cudaStream_t & stream); extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int ndisp, int winsz, const DevMem2D_<uint>& minSSD_buf, cudaStream_t & stream);
extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output, int prefilterCap /*= 31*/, const cudaStream_t & stream); extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D output, int prefilterCap /*= 31*/, cudaStream_t & stream);
extern "C" void postfilter_textureness(const DevMem2D& input, int winsz, float avgTexturenessThreshold, const DevMem2D& disp, const cudaStream_t & stream); extern "C" void postfilter_textureness(const DevMem2D& input, int winsz, float avgTexturenessThreshold, const DevMem2D& disp, cudaStream_t & stream);
} }
}} }}
...@@ -98,7 +98,7 @@ bool cv::gpu::StereoBM_GPU::checkIfGpuCallReasonable() ...@@ -98,7 +98,7 @@ bool cv::gpu::StereoBM_GPU::checkIfGpuCallReasonable()
return false; return false;
} }
static void stereo_bm_gpu_operator ( GpuMat& minSSD, GpuMat& leBuf, GpuMat& riBuf, int preset, int ndisp, int winSize, float avergeTexThreshold, const GpuMat& left, const GpuMat& right, GpuMat& disparity, const cudaStream_t & stream) static void stereo_bm_gpu_operator ( GpuMat& minSSD, GpuMat& leBuf, GpuMat& riBuf, int preset, int ndisp, int winSize, float avergeTexThreshold, const GpuMat& left, const GpuMat& right, GpuMat& disparity, cudaStream_t stream)
{ {
CV_DbgAssert(left.rows == right.rows && left.cols == right.cols); CV_DbgAssert(left.rows == right.rows && left.cols == right.cols);
CV_DbgAssert(left.type() == CV_8UC1); CV_DbgAssert(left.type() == CV_8UC1);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment