Commit 90ae1e3a authored by Alexey Spizhevoy

refactored gpu module

parent 8503f752
@@ -69,22 +69,33 @@ Returns true, if the specified GPU has atomics support, otherwise false.
\end{description}

-\cvCppFunc{gpu::checkPtxVersion}
+\cvCppFunc{gpu::ptxVersionIs}
Returns true, if the GPU module was built with PTX support of the given compute capability, otherwise false.

-\cvdefCpp{template $<$unsigned int cmp\_op$>$\newline
-bool checkPtxVersion(int major, int minor);}
-\begin{description}
-\cvarg{cmp\_op}{Comparison operation:
-\begin{description}
-\cvarg{CMP\_EQ}{Return true, if at least one of GPU module PTX versions matches the given one, otherwise false}
-\cvarg{CMP\_LT}{Return true, if at least one of GPU module PTX versions is less than the given one, otherwise false}
-\cvarg{CMP\_LE}{Return true, if at least one of GPU module PTX versions is less or equal to the given one, otherwise false}
-\cvarg{CMP\_GT}{Return true, if at least one of GPU module PTX versions is greater than the given one, otherwise false}
-\cvarg{CMP\_GE}{Return true, if at least one of GPU module PTX versions is greater or equal to the given one, otherwise false}
-\end{description}}
-\cvarg{major}{Major CC version.}
-\cvarg{minor}{Minor CC version.}
-\end{description}
+\cvdefCpp{bool ptxVersionIs(int major, int minor);}
+\begin{description}
+\cvarg{major}{Major compute capability version.}
+\cvarg{minor}{Minor compute capability version.}
+\end{description}
+
+\cvCppFunc{gpu::ptxVersionIsLessOrEqual}
+Returns true, if the GPU module was built with PTX support of the given compute capability or less, otherwise false.
+
+\cvdefCpp{bool ptxVersionIsLessOrEqual(int major, int minor);}
+\begin{description}
+\cvarg{major}{Major compute capability version.}
+\cvarg{minor}{Minor compute capability version.}
+\end{description}
+
+\cvCppFunc{gpu::ptxVersionIsGreaterOrEqual}
+Returns true, if the GPU module was built with PTX support of the given compute capability or greater, otherwise false.
+
+\cvdefCpp{bool ptxVersionIsGreaterOrEqual(int major, int minor);}
+\begin{description}
+\cvarg{major}{Major compute capability version.}
+\cvarg{minor}{Minor compute capability version.}
+\end{description}
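Note: the three new queries replace the single comparison-templated checkPtxVersion. A minimal, hypothetical calling sketch for the documented functions (illustrative only, not part of this commit):

    #include <cstdio>
    #include <opencv2/gpu/gpu.hpp>

    void reportPtxSupport()
    {
        // Exact match against one of the PTX versions the module was built with.
        bool exact = cv::gpu::ptxVersionIs(1, 1);
        // Match against the given compute capability or anything lower / higher.
        bool le = cv::gpu::ptxVersionIsLessOrEqual(1, 1);
        bool ge = cv::gpu::ptxVersionIsGreaterOrEqual(1, 1);
        std::printf("PTX 1.1: exact=%d <=%d >=%d\n", exact, le, ge);
    }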
@@ -72,8 +72,9 @@ namespace cv
CV_EXPORTS bool hasNativeDoubleSupport(int device);
CV_EXPORTS bool hasAtomicsSupport(int device);

-template <unsigned int cmp_op>
-CV_EXPORTS bool checkPtxVersion(int major, int minor);
+CV_EXPORTS bool ptxVersionIs(int major, int minor);
+CV_EXPORTS bool ptxVersionIsLessOrEqual(int major, int minor);
+CV_EXPORTS bool ptxVersionIsGreaterOrEqual(int major, int minor);

//! Checks if the GPU module is PTX compatible with the given NVIDIA device
CV_EXPORTS bool isCompatibleWith(int device);
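Together with the other exported queries, these let client code pick a usable device up front. A hedged sketch (getCudaEnabledDeviceCount and setDevice are the module's standard device-management calls; the selection policy here is illustrative only):

    #include <opencv2/gpu/gpu.hpp>

    // Returns the first device the GPU module's PTX is compatible with, or -1 if none.
    int pickCompatibleDevice()
    {
        int count = cv::gpu::getCudaEnabledDeviceCount();
        for (int dev = 0; dev < count; ++dev)
        {
            if (cv::gpu::isCompatibleWith(dev))
            {
                cv::gpu::setDevice(dev);
                return dev;
            }
        }
        return -1;
    }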
@@ -719,7 +719,7 @@ namespace cv { namespace gpu { namespace imgproc
////////////////////////////// Column Sum //////////////////////////////////////
-__global__ void column_sum_kernel_32F(int cols, int rows, const PtrStep src, const PtrStep dst)
+__global__ void column_sumKernel_32F(int cols, int rows, const PtrStep src, const PtrStep dst)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -745,7 +745,7 @@ namespace cv { namespace gpu { namespace imgproc
dim3 threads(256);
dim3 grid(divUp(src.cols, threads.x));
-column_sum_kernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
+column_sumKernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
cudaSafeCall(cudaThreadSynchronize());
}
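The launch configuration above runs one thread per image column in 256-thread blocks. The kernel body is outside this hunk; a plausible sketch of the per-column running-sum pattern the name suggests (an assumption, not the module's actual code):

    __global__ void columnSumSketch_32F(int cols, int rows, const PtrStep src, PtrStep dst)
    {
        int x = blockIdx.x * blockDim.x + threadIdx.x;   // one column per thread
        if (x < cols)
        {
            float sum = 0.f;
            for (int y = 0; y < rows; ++y)
            {
                sum += ((const float*)src.ptr(y))[x];    // read src(y, x)
                ((float*)dst.ptr(y))[x] = sum;           // write the running sum down the column
            }
        }
    }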
@@ -54,7 +54,7 @@ namespace cv { namespace gpu { namespace mathfunc
// Performs reduction in shared memory
template <int size, typename T>
-__device__ void sum_in_smem(volatile T* data, const uint tid)
+__device__ void sumInSmem(volatile T* data, const uint tid)
{
T sum = data[tid];
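sumInSmem (previously sum_in_smem) is the usual power-of-two tree reduction over a shared-memory array, the additive counterpart of the findMinMaxInSmem helper further down. A self-contained sketch of that pattern, assuming size is the block size, a compile-time power of two no larger than 512, and pre-Fermi warp-synchronous execution for the last 32 threads:

    template <int size, typename T>
    __device__ void sumInSmemSketch(volatile T* data, const unsigned int tid)
    {
        T sum = data[tid];

        // Fold the upper half onto the lower half, halving the active range each step.
        if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); }
        if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); }
        if (size >= 128) { if (tid < 64)  { data[tid] = sum = sum + data[tid + 64];  } __syncthreads(); }

        // The final steps stay inside one warp, so no further barriers are used here.
        if (tid < 32)
        {
            if (size >= 64) data[tid] = sum = sum + data[tid + 32];
            if (size >= 32) data[tid] = sum = sum + data[tid + 16];
            if (size >= 16) data[tid] = sum = sum + data[tid + 8];
            if (size >= 8)  data[tid] = sum = sum + data[tid + 4];
            if (size >= 4)  data[tid] = sum = sum + data[tid + 2];
            if (size >= 2)  data[tid] = sum = sum + data[tid + 1];
        }
        // data[0] now holds the block-wide sum.
    }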
@@ -122,7 +122,7 @@ namespace cv { namespace gpu { namespace mathfunc
// Estimates good thread configuration
// - threads variable satisfies to threads.x * threads.y == 256
-void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
+void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
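With threads = (32, 8), each block covers a 256x256 tile of the image: every thread handles an 8-column by 32-row strip. For example, a 1000x500 image gives grid = (divUp(1000, 256), divUp(500, 256)) = (4, 2), i.e. eight blocks of 256 threads.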
@@ -132,17 +132,17 @@ namespace cv { namespace gpu { namespace mathfunc
// Returns required buffer sizes
-void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows)
+void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows)
{
dim3 threads, grid;
-estimate_thread_cfg(cols, rows, threads, grid);
+estimateThreadCfg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * elem_size;
bufrows = 2;
}
// Estimates device constants which are used in the kernels using specified thread configuration
-void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
+void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
@@ -161,7 +161,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <int size, typename T>
-__device__ void find_min_max_in_smem(volatile T* minval, volatile T* maxval, const uint tid)
+__device__ void findMinMaxInSmem(volatile T* minval, volatile T* maxval, const uint tid)
{
if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval); } __syncthreads(); }
if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval); } __syncthreads(); }
@@ -180,7 +180,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <int nthreads, typename T, typename Mask>
-__global__ void min_max_kernel(const DevMem2D src, Mask mask, T* minval, T* maxval)
+__global__ void minMaxKernel(const DevMem2D src, Mask mask, T* minval, T* maxval)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
@@ -212,7 +212,7 @@ namespace cv { namespace gpu { namespace mathfunc
smaxval[tid] = mymax;
__syncthreads();
-find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
+findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);
if (tid == 0)
{
@@ -243,7 +243,7 @@ namespace cv { namespace gpu { namespace mathfunc
smaxval[tid] = maxval[idx];
__syncthreads();
-find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
+findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);
if (tid == 0)
{
@@ -263,16 +263,16 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T>
-void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
+void minMaxMaskCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
-estimate_thread_cfg(src.cols, src.rows, threads, grid);
-set_kernel_consts(src.cols, src.rows, threads, grid);
+estimateThreadCfg(src.cols, src.rows, threads, grid);
+setKernelConsts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
-min_max_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
+minMaxKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
@@ -282,26 +282,26 @@ namespace cv { namespace gpu { namespace mathfunc
*maxval = maxval_;
}
-template void min_max_mask_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
-template void min_max_mask_caller<char>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
-template void min_max_mask_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
-template void min_max_mask_caller<short>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
-template void min_max_mask_caller<int>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
-template void min_max_mask_caller<float>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
-template void min_max_mask_caller<double>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
+template void minMaxMaskCaller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
+template void minMaxMaskCaller<char>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
+template void minMaxMaskCaller<ushort>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
+template void minMaxMaskCaller<short>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
+template void minMaxMaskCaller<int>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
+template void minMaxMaskCaller<float>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
+template void minMaxMaskCaller<double>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template <typename T>
-void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
+void minMaxCaller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
-estimate_thread_cfg(src.cols, src.rows, threads, grid);
-set_kernel_consts(src.cols, src.rows, threads, grid);
+estimateThreadCfg(src.cols, src.rows, threads, grid);
+setKernelConsts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
-min_max_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
+minMaxKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
@@ -311,17 +311,17 @@ namespace cv { namespace gpu { namespace mathfunc
*maxval = maxval_;
}
-template void min_max_caller<uchar>(const DevMem2D, double*, double*, PtrStep);
-template void min_max_caller<char>(const DevMem2D, double*, double*, PtrStep);
-template void min_max_caller<ushort>(const DevMem2D, double*, double*, PtrStep);
-template void min_max_caller<short>(const DevMem2D, double*, double*, PtrStep);
-template void min_max_caller<int>(const DevMem2D, double*, double*, PtrStep);
-template void min_max_caller<float>(const DevMem2D, double*,double*, PtrStep);
-template void min_max_caller<double>(const DevMem2D, double*, double*, PtrStep);
+template void minMaxCaller<uchar>(const DevMem2D, double*, double*, PtrStep);
+template void minMaxCaller<char>(const DevMem2D, double*, double*, PtrStep);
+template void minMaxCaller<ushort>(const DevMem2D, double*, double*, PtrStep);
+template void minMaxCaller<short>(const DevMem2D, double*, double*, PtrStep);
+template void minMaxCaller<int>(const DevMem2D, double*, double*, PtrStep);
+template void minMaxCaller<float>(const DevMem2D, double*,double*, PtrStep);
+template void minMaxCaller<double>(const DevMem2D, double*, double*, PtrStep);
template <int nthreads, typename T>
-__global__ void min_max_pass2_kernel(T* minval, T* maxval, int size)
+__global__ void minMaxPass2Kernel(T* minval, T* maxval, int size)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
@@ -334,7 +334,7 @@ namespace cv { namespace gpu { namespace mathfunc
smaxval[tid] = maxval[idx];
__syncthreads();
-find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
+findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);
if (tid == 0)
{
@@ -345,17 +345,17 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T>
-void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
+void minMaxMaskMultipassCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
-estimate_thread_cfg(src.cols, src.rows, threads, grid);
-set_kernel_consts(src.cols, src.rows, threads, grid);
+estimateThreadCfg(src.cols, src.rows, threads, grid);
+setKernelConsts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
-min_max_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
-min_max_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
+minMaxKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
+minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
@@ -365,26 +365,26 @@ namespace cv { namespace gpu { namespace mathfunc
*maxval = maxval_;
}
-template void min_max_mask_multipass_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
-template void min_max_mask_multipass_caller<char>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
-template void min_max_mask_multipass_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
-template void min_max_mask_multipass_caller<short>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
-template void min_max_mask_multipass_caller<int>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
-template void min_max_mask_multipass_caller<float>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
+template void minMaxMaskMultipassCaller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
+template void minMaxMaskMultipassCaller<char>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
+template void minMaxMaskMultipassCaller<ushort>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
+template void minMaxMaskMultipassCaller<short>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
+template void minMaxMaskMultipassCaller<int>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
+template void minMaxMaskMultipassCaller<float>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template <typename T>
-void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
+void minMaxMultipassCaller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
-estimate_thread_cfg(src.cols, src.rows, threads, grid);
-set_kernel_consts(src.cols, src.rows, threads, grid);
+estimateThreadCfg(src.cols, src.rows, threads, grid);
+setKernelConsts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
-min_max_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
-min_max_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
+minMaxKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
+minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
@@ -394,12 +394,12 @@ namespace cv { namespace gpu { namespace mathfunc
*maxval = maxval_;
}
-template void min_max_multipass_caller<uchar>(const DevMem2D, double*, double*, PtrStep);
-template void min_max_multipass_caller<char>(const DevMem2D, double*, double*, PtrStep);
-template void min_max_multipass_caller<ushort>(const DevMem2D, double*, double*, PtrStep);
-template void min_max_multipass_caller<short>(const DevMem2D, double*, double*, PtrStep);
-template void min_max_multipass_caller<int>(const DevMem2D, double*, double*, PtrStep);
-template void min_max_multipass_caller<float>(const DevMem2D, double*, double*, PtrStep);
+template void minMaxMultipassCaller<uchar>(const DevMem2D, double*, double*, PtrStep);
+template void minMaxMultipassCaller<char>(const DevMem2D, double*, double*, PtrStep);
+template void minMaxMultipassCaller<ushort>(const DevMem2D, double*, double*, PtrStep);
+template void minMaxMultipassCaller<short>(const DevMem2D, double*, double*, PtrStep);
+template void minMaxMultipassCaller<int>(const DevMem2D, double*, double*, PtrStep);
+template void minMaxMultipassCaller<float>(const DevMem2D, double*, double*, PtrStep);
} // namespace minmax
@@ -417,7 +417,7 @@ namespace cv { namespace gpu { namespace mathfunc
// Estimates good thread configuration
// - threads variable satisfies to threads.x * threads.y == 256
-void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
+void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
@@ -427,11 +427,11 @@ namespace cv { namespace gpu { namespace mathfunc
// Returns required buffer sizes
-void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols,
+void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols,
int& b1rows, int& b2cols, int& b2rows)
{
dim3 threads, grid;
-estimate_thread_cfg(cols, rows, threads, grid);
+estimateThreadCfg(cols, rows, threads, grid);
b1cols = grid.x * grid.y * elem_size; // For values
b1rows = 2;
b2cols = grid.x * grid.y * sizeof(int); // For locations
@@ -440,7 +440,7 @@ namespace cv { namespace gpu { namespace mathfunc
// Estimates device constants which are used in the kernels using specified thread configuration
-void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
+void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
@@ -469,8 +469,8 @@ namespace cv { namespace gpu { namespace mathfunc
template <int size, typename T>
-__device__ void find_min_max_loc_in_smem(volatile T* minval, volatile T* maxval, volatile uint* minloc,
-volatile uint* maxloc, const uint tid)
+__device__ void findMinMaxLocInSmem(volatile T* minval, volatile T* maxval, volatile uint* minloc,
+volatile uint* maxloc, const uint tid)
{
if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval, minloc, maxloc); } __syncthreads(); }
if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval, minloc, maxloc); } __syncthreads(); }
@@ -489,8 +489,8 @@ namespace cv { namespace gpu { namespace mathfunc
template <int nthreads, typename T, typename Mask>
-__global__ void min_max_loc_kernel(const DevMem2D src, Mask mask, T* minval, T* maxval,
-uint* minloc, uint* maxloc)
+__global__ void minMaxLocKernel(const DevMem2D src, Mask mask, T* minval, T* maxval,
+uint* minloc, uint* maxloc)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
@@ -503,7 +503,8 @@ namespace cv { namespace gpu { namespace mathfunc
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
T mymin = numeric_limits_gpu<T>::max();
-T mymax = numeric_limits_gpu<T>::is_signed ? -numeric_limits_gpu<T>::max() : numeric_limits_gpu<T>::min();
+T mymax = numeric_limits_gpu<T>::is_signed ? -numeric_limits_gpu<T>::max() :
+numeric_limits_gpu<T>::min();
uint myminloc = 0;
uint mymaxloc = 0;
uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
@@ -529,7 +530,7 @@ namespace cv { namespace gpu { namespace mathfunc
smaxloc[tid] = mymaxloc;
__syncthreads();
-find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
+findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
@@ -558,7 +559,7 @@ namespace cv { namespace gpu { namespace mathfunc
smaxloc[tid] = maxloc[idx];
__syncthreads();
-find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
+findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
if (tid == 0)
{
@@ -582,19 +583,20 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T>
-void min_max_loc_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
-int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
+void minMaxLocMaskCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
+int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
-estimate_thread_cfg(src.cols, src.rows, threads, grid);
-set_kernel_consts(src.cols, src.rows, threads, grid);
+estimateThreadCfg(src.cols, src.rows, threads, grid);
+setKernelConsts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
-min_max_loc_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
+minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf,
+minloc_buf, maxloc_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
@@ -610,29 +612,30 @@ namespace cv { namespace gpu { namespace mathfunc
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
-template void min_max_loc_mask_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
-template void min_max_loc_mask_caller<char>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
-template void min_max_loc_mask_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
-template void min_max_loc_mask_caller<short>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
-template void min_max_loc_mask_caller<int>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
-template void min_max_loc_mask_caller<float>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
-template void min_max_loc_mask_caller<double>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocMaskCaller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocMaskCaller<char>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocMaskCaller<ushort>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocMaskCaller<short>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocMaskCaller<int>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocMaskCaller<float>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocMaskCaller<double>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template <typename T>
-void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval,
-int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
+void minMaxLocCaller(const DevMem2D src, double* minval, double* maxval,
+int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
-estimate_thread_cfg(src.cols, src.rows, threads, grid);
-set_kernel_consts(src.cols, src.rows, threads, grid);
+estimateThreadCfg(src.cols, src.rows, threads, grid);
+setKernelConsts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
-min_max_loc_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
+minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf,
+minloc_buf, maxloc_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
@@ -648,18 +651,18 @@ namespace cv { namespace gpu { namespace mathfunc
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
-template void min_max_loc_caller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
-template void min_max_loc_caller<char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
-template void min_max_loc_caller<ushort>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
-template void min_max_loc_caller<short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
-template void min_max_loc_caller<int>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
-template void min_max_loc_caller<float>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
-template void min_max_loc_caller<double>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocCaller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocCaller<char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocCaller<ushort>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocCaller<short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocCaller<int>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocCaller<float>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocCaller<double>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
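The callers above decode the linear indices written by the kernels back into coordinates: maxloc[1] = idx / src.cols gives the row, and maxloc[0] = idx - maxloc[1] * src.cols the column. For example, with src.cols = 640 and a linear index of 1283, the location is row 1283 / 640 = 2, column 1283 - 2 * 640 = 3.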
// This kernel will be used only when compute capability is 1.0
template <int nthreads, typename T>
-__global__ void min_max_loc_pass2_kernel(T* minval, T* maxval, uint* minloc, uint* maxloc, int size)
+__global__ void minMaxLocPass2Kernel(T* minval, T* maxval, uint* minloc, uint* maxloc, int size)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
@@ -676,7 +679,7 @@ namespace cv { namespace gpu { namespace mathfunc
smaxloc[tid] = maxloc[idx];
__syncthreads();
-find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
+findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
if (tid == 0)
{
@@ -689,20 +692,21 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T>
-void min_max_loc_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
-int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
+void minMaxLocMaskMultipassCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
+int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
-estimate_thread_cfg(src.cols, src.rows, threads, grid);
-set_kernel_consts(src.cols, src.rows, threads, grid);
+estimateThreadCfg(src.cols, src.rows, threads, grid);
+setKernelConsts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
-min_max_loc_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
-min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
+minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf,
+minloc_buf, maxloc_buf);
+minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
@@ -718,29 +722,30 @@ namespace cv { namespace gpu { namespace mathfunc
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
-template void min_max_loc_mask_multipass_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
-template void min_max_loc_mask_multipass_caller<char>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
-template void min_max_loc_mask_multipass_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
-template void min_max_loc_mask_multipass_caller<short>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
-template void min_max_loc_mask_multipass_caller<int>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
-template void min_max_loc_mask_multipass_caller<float>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocMaskMultipassCaller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocMaskMultipassCaller<char>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocMaskMultipassCaller<ushort>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocMaskMultipassCaller<short>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocMaskMultipassCaller<int>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocMaskMultipassCaller<float>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template <typename T>
-void min_max_loc_multipass_caller(const DevMem2D src, double* minval, double* maxval,
-int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
+void minMaxLocMultipassCaller(const DevMem2D src, double* minval, double* maxval,
+int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
-estimate_thread_cfg(src.cols, src.rows, threads, grid);
-set_kernel_consts(src.cols, src.rows, threads, grid);
+estimateThreadCfg(src.cols, src.rows, threads, grid);
+setKernelConsts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
-min_max_loc_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
-min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
+minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf,
+minloc_buf, maxloc_buf);
+minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
@@ -756,12 +761,12 @@ namespace cv { namespace gpu { namespace mathfunc
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
-template void min_max_loc_multipass_caller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
-template void min_max_loc_multipass_caller<char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
-template void min_max_loc_multipass_caller<ushort>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
-template void min_max_loc_multipass_caller<short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
-template void min_max_loc_multipass_caller<int>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
-template void min_max_loc_multipass_caller<float>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocMultipassCaller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocMultipassCaller<char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocMultipassCaller<ushort>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocMultipassCaller<short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocMultipassCaller<int>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+template void minMaxLocMultipassCaller<float>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
} // namespace minmaxloc
@@ -776,7 +781,7 @@ namespace cv { namespace gpu { namespace mathfunc
__device__ uint blocks_finished = 0;
-void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
+void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
@@ -785,16 +790,16 @@ namespace cv { namespace gpu { namespace mathfunc
}
-void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows)
+void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows)
{
dim3 threads, grid;
-estimate_thread_cfg(cols, rows, threads, grid);
+estimateThreadCfg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * sizeof(int);
bufrows = 1;
}
-void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
+void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
@@ -804,7 +809,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <int nthreads, typename T>
-__global__ void count_non_zero_kernel(const DevMem2D src, volatile uint* count)
+__global__ void countNonZeroKernel(const DevMem2D src, volatile uint* count)
{
__shared__ uint scount[nthreads];
@@ -823,7 +828,7 @@ namespace cv { namespace gpu { namespace mathfunc
scount[tid] = cnt;
__syncthreads();
-sum_in_smem<nthreads, uint>(scount, tid);
+sumInSmem<nthreads, uint>(scount, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
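countNonZeroKernel, like the min/max and sum kernels above, can finish in a single launch on devices with global atomics (the __CUDA_ARCH__ >= 110 branch): each block publishes its partial result, atomically bumps a global counter (blocks_finished here), and the last block to arrive re-reduces the partials. A hedged sketch of that idiom with hypothetical names (partial, finalTotal, blocksFinishedSketch); the kernel's actual bookkeeping may differ:

    __device__ unsigned int blocksFinishedSketch = 0;

    template <int nthreads>
    __global__ void reduceSinglePassSketch(unsigned int* partial, unsigned int* finalTotal, int numBlocks)
    {
        __shared__ unsigned int smem[nthreads];
        __shared__ bool isLast;
        unsigned int tid = threadIdx.x;

        // ... per-block reduction goes here; each block writes its result to partial[blockIdx.x] ...

        if (tid == 0)
        {
            __threadfence();                                    // make this block's partial visible to others
            unsigned int ticket = atomicInc(&blocksFinishedSketch, numBlocks);
            isLast = (ticket == numBlocks - 1);                 // true only in the last block to finish
        }
        __syncthreads();

        if (isLast)                                             // last block folds all partials together
        {
            smem[tid] = tid < numBlocks ? partial[tid] : 0;     // assumes numBlocks <= nthreads
            __syncthreads();
            for (int s = nthreads / 2; s > 0; s >>= 1)
            {
                if (tid < s) smem[tid] += smem[tid + s];
                __syncthreads();
            }
            if (tid == 0) { *finalTotal = smem[0]; blocksFinishedSketch = 0; }
        }
    }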
@@ -844,7 +849,7 @@ namespace cv { namespace gpu { namespace mathfunc
scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0;
__syncthreads();
-sum_in_smem<nthreads, uint>(scount, tid);
+sumInSmem<nthreads, uint>(scount, tid);
if (tid == 0)
{
@@ -859,15 +864,15 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T>
-int count_non_zero_caller(const DevMem2D src, PtrStep buf)
+int countNonZeroCaller(const DevMem2D src, PtrStep buf)
{
dim3 threads, grid;
-estimate_thread_cfg(src.cols, src.rows, threads, grid);
-set_kernel_consts(src.cols, src.rows, threads, grid);
+estimateThreadCfg(src.cols, src.rows, threads, grid);
+setKernelConsts(src.cols, src.rows, threads, grid);
uint* count_buf = (uint*)buf.ptr(0);
-count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
+countNonZeroKernel<256, T><<<grid, threads>>>(src, count_buf);
cudaSafeCall(cudaThreadSynchronize());
uint count;
@@ -876,17 +881,17 @@ namespace cv { namespace gpu { namespace mathfunc
return count;
}
-template int count_non_zero_caller<uchar>(const DevMem2D, PtrStep);
-template int count_non_zero_caller<char>(const DevMem2D, PtrStep);
-template int count_non_zero_caller<ushort>(const DevMem2D, PtrStep);
-template int count_non_zero_caller<short>(const DevMem2D, PtrStep);
-template int count_non_zero_caller<int>(const DevMem2D, PtrStep);
-template int count_non_zero_caller<float>(const DevMem2D, PtrStep);
-template int count_non_zero_caller<double>(const DevMem2D, PtrStep);
+template int countNonZeroCaller<uchar>(const DevMem2D, PtrStep);
+template int countNonZeroCaller<char>(const DevMem2D, PtrStep);
+template int countNonZeroCaller<ushort>(const DevMem2D, PtrStep);
+template int countNonZeroCaller<short>(const DevMem2D, PtrStep);
+template int countNonZeroCaller<int>(const DevMem2D, PtrStep);
+template int countNonZeroCaller<float>(const DevMem2D, PtrStep);
+template int countNonZeroCaller<double>(const DevMem2D, PtrStep);
template <int nthreads, typename T>
-__global__ void count_non_zero_pass2_kernel(uint* count, int size)
+__global__ void countNonZeroPass2Kernel(uint* count, int size)
{
__shared__ uint scount[nthreads];
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
@@ -894,7 +899,7 @@ namespace cv { namespace gpu { namespace mathfunc
scount[tid] = tid < size ? count[tid] : 0;
__syncthreads();
-sum_in_smem<nthreads, uint>(scount, tid);
+sumInSmem<nthreads, uint>(scount, tid);
if (tid == 0)
count[0] = scount[0];
@@ -902,16 +907,16 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T>
-int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf)
+int countNonZeroMultipassCaller(const DevMem2D src, PtrStep buf)
{
dim3 threads, grid;
-estimate_thread_cfg(src.cols, src.rows, threads, grid);
-set_kernel_consts(src.cols, src.rows, threads, grid);
+estimateThreadCfg(src.cols, src.rows, threads, grid);
+setKernelConsts(src.cols, src.rows, threads, grid);
uint* count_buf = (uint*)buf.ptr(0);
-count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
-count_non_zero_pass2_kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);
+countNonZeroKernel<256, T><<<grid, threads>>>(src, count_buf);
+countNonZeroPass2Kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
uint count;
@@ -920,12 +925,12 @@ namespace cv { namespace gpu { namespace mathfunc
return count;
}
-template int count_non_zero_multipass_caller<uchar>(const DevMem2D, PtrStep);
-template int count_non_zero_multipass_caller<char>(const DevMem2D, PtrStep);
-template int count_non_zero_multipass_caller<ushort>(const DevMem2D, PtrStep);
-template int count_non_zero_multipass_caller<short>(const DevMem2D, PtrStep);
-template int count_non_zero_multipass_caller<int>(const DevMem2D, PtrStep);
-template int count_non_zero_multipass_caller<float>(const DevMem2D, PtrStep);
+template int countNonZeroMultipassCaller<uchar>(const DevMem2D, PtrStep);
+template int countNonZeroMultipassCaller<char>(const DevMem2D, PtrStep);
+template int countNonZeroMultipassCaller<ushort>(const DevMem2D, PtrStep);
+template int countNonZeroMultipassCaller<short>(const DevMem2D, PtrStep);
+template int countNonZeroMultipassCaller<int>(const DevMem2D, PtrStep);
+template int countNonZeroMultipassCaller<float>(const DevMem2D, PtrStep);
} // namespace countnonzero
@@ -958,7 +963,7 @@ namespace cv { namespace gpu { namespace mathfunc
const int threads_x = 32;
const int threads_y = 8;
-void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
+void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, threads.x * threads.y),
@@ -968,16 +973,16 @@ namespace cv { namespace gpu { namespace mathfunc
}
-void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows)
+void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows)
{
dim3 threads, grid;
-estimate_thread_cfg(cols, rows, threads, grid);
+estimateThreadCfg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * sizeof(double) * cn;
bufrows = 1;
}
-void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
+void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
@@ -986,7 +991,7 @@ namespace cv { namespace gpu { namespace mathfunc
}
template <typename T, typename R, typename Op, int nthreads>
-__global__ void sum_kernel(const DevMem2D src, R* result)
+__global__ void sumKernel(const DevMem2D src, R* result)
{
__shared__ R smem[nthreads];
@@ -1006,7 +1011,7 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid] = sum;
__syncthreads();
-sum_in_smem<nthreads, R>(smem, tid);
+sumInSmem<nthreads, R>(smem, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
@@ -1027,7 +1032,7 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid] = tid < gridDim.x * gridDim.y ? result[tid] : 0;
__syncthreads();
-sum_in_smem<nthreads, R>(smem, tid);
+sumInSmem<nthreads, R>(smem, tid);
if (tid == 0)
{
@@ -1042,7 +1047,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T, typename R, int nthreads>
-__global__ void sum_pass2_kernel(R* result, int size)
+__global__ void sumPass2Kernel(R* result, int size)
{
__shared__ R smem[nthreads];
int tid = threadIdx.y * blockDim.x + threadIdx.x;
@@ -1050,7 +1055,7 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid] = tid < size ? result[tid] : 0;
__syncthreads();
-sum_in_smem<nthreads, R>(smem, tid);
+sumInSmem<nthreads, R>(smem, tid);
if (tid == 0)
result[0] = smem[0];
@@ -1058,7 +1063,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T, typename R, typename Op, int nthreads>
-__global__ void sum_kernel_C2(const DevMem2D src, typename TypeVec<R, 2>::vec_t* result)
+__global__ void sumKernel_C2(const DevMem2D src, typename TypeVec<R, 2>::vec_t* result)
{
typedef typename TypeVec<T, 2>::vec_t SrcType;
typedef typename TypeVec<R, 2>::vec_t DstType;
@@ -1086,8 +1091,8 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid + nthreads] = sum.y;
__syncthreads();
-sum_in_smem<nthreads, R>(smem, tid);
-sum_in_smem<nthreads, R>(smem + nthreads, tid);
+sumInSmem<nthreads, R>(smem, tid);
+sumInSmem<nthreads, R>(smem + nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
@@ -1113,8 +1118,8 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid + nthreads] = res.y;
__syncthreads();
-sum_in_smem<nthreads, R>(smem, tid);
-sum_in_smem<nthreads, R>(smem + nthreads, tid);
+sumInSmem<nthreads, R>(smem, tid);
+sumInSmem<nthreads, R>(smem + nthreads, tid);
if (tid == 0)
{
@@ -1137,7 +1142,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T, typename R, int nthreads>
-__global__ void sum_pass2_kernel_C2(typename TypeVec<R, 2>::vec_t* result, int size)
+__global__ void sumPass2Kernel_C2(typename TypeVec<R, 2>::vec_t* result, int size)
{
typedef typename TypeVec<R, 2>::vec_t DstType;
@@ -1150,8 +1155,8 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid + nthreads] = res.y;
__syncthreads();
-sum_in_smem<nthreads, R>(smem, tid);
-sum_in_smem<nthreads, R>(smem + nthreads, tid);
+sumInSmem<nthreads, R>(smem, tid);
+sumInSmem<nthreads, R>(smem + nthreads, tid);
if (tid == 0)
{
@@ -1163,7 +1168,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T, typename R, typename Op, int nthreads>
-__global__ void sum_kernel_C3(const DevMem2D src, typename TypeVec<R, 3>::vec_t* result)
+__global__ void sumKernel_C3(const DevMem2D src, typename TypeVec<R, 3>::vec_t* result)
{
typedef typename TypeVec<T, 3>::vec_t SrcType;
typedef typename TypeVec<R, 3>::vec_t DstType;
@@ -1192,9 +1197,9 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid + 2 * nthreads] = sum.z;
__syncthreads();
-sum_in_smem<nthreads, R>(smem, tid);
-sum_in_smem<nthreads, R>(smem + nthreads, tid);
-sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
+sumInSmem<nthreads, R>(smem, tid);
+sumInSmem<nthreads, R>(smem + nthreads, tid);
+sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
@@ -1222,9 +1227,9 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid + 2 * nthreads] = res.z;
__syncthreads();
-sum_in_smem<nthreads, R>(smem, tid);
-sum_in_smem<nthreads, R>(smem + nthreads, tid);
-sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
+sumInSmem<nthreads, R>(smem, tid);
+sumInSmem<nthreads, R>(smem + nthreads, tid);
+sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
if (tid == 0)
{
@@ -1249,7 +1254,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T, typename R, int nthreads>
-__global__ void sum_pass2_kernel_C3(typename TypeVec<R, 3>::vec_t* result, int size)
+__global__ void sumPass2Kernel_C3(typename TypeVec<R, 3>::vec_t* result, int size)
{
typedef typename TypeVec<R, 3>::vec_t DstType;
@@ -1263,9 +1268,9 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid + 2 * nthreads] = res.z;
__syncthreads();
-sum_in_smem<nthreads, R>(smem, tid);
-sum_in_smem<nthreads, R>(smem + nthreads, tid);
-sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
+sumInSmem<nthreads, R>(smem, tid);
+sumInSmem<nthreads, R>(smem + nthreads, tid);
+sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
if (tid == 0)
{
@@ -1277,7 +1282,7 @@ namespace cv { namespace gpu { namespace mathfunc
}
template <typename T, typename R, typename Op, int nthreads>
-__global__ void sum_kernel_C4(const DevMem2D src, typename TypeVec<R, 4>::vec_t* result)
+__global__ void sumKernel_C4(const DevMem2D src, typename TypeVec<R, 4>::vec_t* result)
{
typedef typename TypeVec<T, 4>::vec_t SrcType;
typedef typename TypeVec<R, 4>::vec_t DstType;
@@ -1308,10 +1313,10 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid + 3 * nthreads] = sum.w;
__syncthreads();
-sum_in_smem<nthreads, R>(smem, tid);
-sum_in_smem<nthreads, R>(smem + nthreads, tid);
-sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
-sum_in_smem<nthreads, R>(smem + 3 * nthreads, tid);
+sumInSmem<nthreads, R>(smem, tid);
+sumInSmem<nthreads, R>(smem + nthreads, tid);
+sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
+sumInSmem<nthreads, R>(smem + 3 * nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
@@ -1341,10 +1346,10 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid + 3 * nthreads] = res.w;
__syncthreads();
-sum_in_smem<nthreads, R>(smem, tid);
-sum_in_smem<nthreads, R>(smem + nthreads, tid);
-sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
-sum_in_smem<nthreads, R>(smem + 3 * nthreads, tid);
+sumInSmem<nthreads, R>(smem, tid);
+sumInSmem<nthreads, R>(smem + nthreads, tid);
+sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
+sumInSmem<nthreads, R>(smem + 3 * nthreads, tid);
if (tid == 0)
{
@@ -1371,7 +1376,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T, typename R, int nthreads>
-__global__ void sum_pass2_kernel_C4(typename TypeVec<R, 4>::vec_t* result, int size)
+__global__ void sumPass2Kernel_C4(typename TypeVec<R, 4>::vec_t* result, int size)
{
typedef typename TypeVec<R, 4>::vec_t DstType;
@@ -1386,10 +1391,10 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid + 3 * nthreads] = res.z;
__syncthreads();
-sum_in_smem<nthreads, R>(smem, tid);
-sum_in_smem<nthreads, R>(smem + nthreads, tid);
-sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
-sum_in_smem<nthreads, R>(smem + 3 * nthreads, tid);
+sumInSmem<nthreads, R>(smem, tid);
+sumInSmem<nthreads, R>(smem + nthreads, tid);
+sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
+sumInSmem<nthreads, R>(smem + 3 * nthreads, tid);
if (tid == 0)
{
@@ -1405,36 +1410,36 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T>
-void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
+void sumMultipassCaller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
-estimate_thread_cfg(src.cols, src.rows, threads, grid);
+estimateThreadCfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid); setKernelConsts(src.cols, src.rows, threads, grid);
switch (cn) switch (cn)
{ {
case 1: case 1:
sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>( sumKernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0)); src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>( sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y); (typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
case 2: case 2:
sum_kernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>( sumKernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0)); src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>( sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y); (typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
case 3: case 3:
sum_kernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>( sumKernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0)); src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>( sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y); (typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
case 4: case 4:
sum_kernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>( sumKernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0)); src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>( sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y); (typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
} }
cudaSafeCall(cudaThreadSynchronize()); cudaSafeCall(cudaThreadSynchronize());
...@@ -1448,40 +1453,40 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -1448,40 +1453,40 @@ namespace cv { namespace gpu { namespace mathfunc
sum[3] = result[3]; sum[3] = result[3];
} }
template void sum_multipass_caller<uchar>(const DevMem2D, PtrStep, double*, int); template void sumMultipassCaller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<char>(const DevMem2D, PtrStep, double*, int); template void sumMultipassCaller<char>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<ushort>(const DevMem2D, PtrStep, double*, int); template void sumMultipassCaller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<short>(const DevMem2D, PtrStep, double*, int); template void sumMultipassCaller<short>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<int>(const DevMem2D, PtrStep, double*, int); template void sumMultipassCaller<int>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<float>(const DevMem2D, PtrStep, double*, int); template void sumMultipassCaller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T> template <typename T>
void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn) void sumCaller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{ {
using namespace sum; using namespace sum;
typedef typename SumType<T>::R R; typedef typename SumType<T>::R R;
dim3 threads, grid; dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid); estimateThreadCfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid); setKernelConsts(src.cols, src.rows, threads, grid);
switch (cn) switch (cn)
{ {
case 1: case 1:
sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>( sumKernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0)); src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
break; break;
case 2: case 2:
sum_kernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>( sumKernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0)); src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
break; break;
case 3: case 3:
sum_kernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>( sumKernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0)); src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
break; break;
case 4: case 4:
sum_kernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>( sumKernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0)); src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
break; break;
} }
...@@ -1496,48 +1501,48 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -1496,48 +1501,48 @@ namespace cv { namespace gpu { namespace mathfunc
sum[3] = result[3]; sum[3] = result[3];
} }
template void sum_caller<uchar>(const DevMem2D, PtrStep, double*, int); template void sumCaller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<char>(const DevMem2D, PtrStep, double*, int); template void sumCaller<char>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<ushort>(const DevMem2D, PtrStep, double*, int); template void sumCaller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<short>(const DevMem2D, PtrStep, double*, int); template void sumCaller<short>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<int>(const DevMem2D, PtrStep, double*, int); template void sumCaller<int>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<float>(const DevMem2D, PtrStep, double*, int); template void sumCaller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T> template <typename T>
void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn) void sqrSumMultipassCaller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{ {
using namespace sum; using namespace sum;
typedef typename SumType<T>::R R; typedef typename SumType<T>::R R;
dim3 threads, grid; dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid); estimateThreadCfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid); setKernelConsts(src.cols, src.rows, threads, grid);
switch (cn) switch (cn)
{ {
case 1: case 1:
sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>( sumKernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0)); src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>( sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y); (typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
break; break;
case 2: case 2:
sum_kernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>( sumKernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0)); src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>( sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y); (typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
break; break;
case 3: case 3:
sum_kernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>( sumKernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0)); src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>( sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y); (typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
break; break;
case 4: case 4:
sum_kernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>( sumKernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0)); src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>( sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y); (typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
break; break;
} }
...@@ -1552,40 +1557,40 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -1552,40 +1557,40 @@ namespace cv { namespace gpu { namespace mathfunc
sum[3] = result[3]; sum[3] = result[3];
} }
template void sqsum_multipass_caller<uchar>(const DevMem2D, PtrStep, double*, int); template void sqrSumMultipassCaller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<char>(const DevMem2D, PtrStep, double*, int); template void sqrSumMultipassCaller<char>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<ushort>(const DevMem2D, PtrStep, double*, int); template void sqrSumMultipassCaller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<short>(const DevMem2D, PtrStep, double*, int); template void sqrSumMultipassCaller<short>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<int>(const DevMem2D, PtrStep, double*, int); template void sqrSumMultipassCaller<int>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<float>(const DevMem2D, PtrStep, double*, int); template void sqrSumMultipassCaller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T> template <typename T>
void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn) void sqrSumCaller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{ {
using namespace sum; using namespace sum;
typedef typename SumType<T>::R R; typedef typename SumType<T>::R R;
dim3 threads, grid; dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid); estimateThreadCfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid); setKernelConsts(src.cols, src.rows, threads, grid);
switch (cn) switch (cn)
{ {
case 1: case 1:
sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>( sumKernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0)); src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
break; break;
case 2: case 2:
sum_kernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>( sumKernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0)); src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
break; break;
case 3: case 3:
sum_kernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>( sumKernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0)); src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
break; break;
case 4: case 4:
sum_kernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>( sumKernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0)); src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
break; break;
} }
...@@ -1600,10 +1605,10 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -1600,10 +1605,10 @@ namespace cv { namespace gpu { namespace mathfunc
sum[3] = result[3]; sum[3] = result[3];
} }
template void sqsum_caller<uchar>(const DevMem2D, PtrStep, double*, int); template void sqrSumCaller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<char>(const DevMem2D, PtrStep, double*, int); template void sqrSumCaller<char>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<ushort>(const DevMem2D, PtrStep, double*, int); template void sqrSumCaller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<short>(const DevMem2D, PtrStep, double*, int); template void sqrSumCaller<short>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<int>(const DevMem2D, PtrStep, double*, int); template void sqrSumCaller<int>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<float>(const DevMem2D, PtrStep, double*, int); template void sqrSumCaller<float>(const DevMem2D, PtrStep, double*, int);
}}} }}}
\ No newline at end of file
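The mathfunc kernels above accumulate per-thread partials in shared memory and fold them with the renamed sumInSmem helper, one call per channel on its own nthreads-sized slice of smem. The helper's body is not part of these hunks; the following is only a minimal sketch of the block-wide tree reduction such a helper performs — the name, the volatile qualifier and the plain loop (no warp unrolling) are assumptions, not this file's actual code, and nthreads is assumed to be a power of two:

    // Hypothetical sketch of a block-wide shared-memory tree reduction.
    // Precondition: every thread has stored its partial sum in smem[tid]
    // and the block has synchronized.
    template <int nthreads, typename R>
    __device__ void sumInSmemSketch(volatile R* smem, unsigned int tid)
    {
        for (unsigned int s = nthreads / 2; s > 0; s >>= 1)
        {
            if (tid < s)
                smem[tid] = smem[tid] + smem[tid + s];
            __syncthreads(); // executed by all threads each iteration
        }
        // smem[0] now holds the block-wide sum; thread 0 typically writes it out.
    }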
...@@ -133,85 +133,81 @@ CV_EXPORTS bool cv::gpu::hasAtomicsSupport(int device) ...@@ -133,85 +133,81 @@ CV_EXPORTS bool cv::gpu::hasAtomicsSupport(int device)
namespace namespace
{ {
template <unsigned int cmp_op> struct ComparerEqual
bool comparePairs(int lhs1, int lhs2, int rhs1, int rhs2);
template <>
bool comparePairs<CMP_EQ>(int lhs1, int lhs2, int rhs1, int rhs2)
{ {
return lhs1 == rhs1 && lhs2 == rhs2; bool operator()(int lhs1, int lhs2, int rhs1, int rhs2) const
} {
return lhs1 == rhs1 && lhs2 == rhs2;
}
};
template <>
bool comparePairs<CMP_GT>(int lhs1, int lhs2, int rhs1, int rhs2)
{
return lhs1 > rhs1 || (lhs1 == rhs1 && lhs2 > rhs2);
}
template <> struct ComparerLessOrEqual
bool comparePairs<CMP_GE>(int lhs1, int lhs2, int rhs1, int rhs2)
{ {
return lhs1 > rhs1 || (lhs1 == rhs1 && lhs2 >= rhs2); bool operator()(int lhs1, int lhs2, int rhs1, int rhs2) const
} {
return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 <= rhs2);
}
};
template <>
bool comparePairs<CMP_LT>(int lhs1, int lhs2, int rhs1, int rhs2)
{
return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 < rhs2);
}
struct ComparerGreaterOrEqual
template <>
bool comparePairs<CMP_LE>(int lhs1, int lhs2, int rhs1, int rhs2)
{
return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 <= rhs2);
}
template <>
bool comparePairs<CMP_NE>(int lhs1, int lhs2, int rhs1, int rhs2)
{ {
return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 <= rhs2); bool operator()(int lhs1, int lhs2, int rhs1, int rhs2) const
} {
} return lhs1 > rhs1 || (lhs1 == rhs1 && lhs2 >= rhs2);
}
};
template <unsigned int cmp_op> template <typename Comparer>
CV_EXPORTS bool cv::gpu::checkPtxVersion(int major, int minor) bool checkPtxVersion(int major, int minor, Comparer cmp)
{ {
#ifdef OPENCV_GPU_CUDA_ARCH_10 #ifdef OPENCV_GPU_CUDA_ARCH_10
if (comparePairs<cmp_op>(1, 0, major, minor)) return true; if (cmp(1, 0, major, minor)) return true;
#endif #endif
#ifdef OPENCV_GPU_CUDA_ARCH_11 #ifdef OPENCV_GPU_CUDA_ARCH_11
if (comparePairs<cmp_op>(1, 1, major, minor)) return true; if (cmp(1, 1, major, minor)) return true;
#endif #endif
#ifdef OPENCV_GPU_CUDA_ARCH_12 #ifdef OPENCV_GPU_CUDA_ARCH_12
if (comparePairs<cmp_op>(1, 2, major, minor)) return true; if (cmp(1, 2, major, minor)) return true;
#endif #endif
#ifdef OPENCV_GPU_CUDA_ARCH_13 #ifdef OPENCV_GPU_CUDA_ARCH_13
if (comparePairs<cmp_op>(1, 3, major, minor)) return true; if (cmp(1, 3, major, minor)) return true;
#endif #endif
#ifdef OPENCV_GPU_CUDA_ARCH_20 #ifdef OPENCV_GPU_CUDA_ARCH_20
if (comparePairs<cmp_op>(2, 0, major, minor)) return true; if (cmp(2, 0, major, minor)) return true;
#endif #endif
#ifdef OPENCV_GPU_CUDA_ARCH_21 #ifdef OPENCV_GPU_CUDA_ARCH_21
if (comparePairs<cmp_op>(2, 1, major, minor)) return true; if (cmp(2, 1, major, minor)) return true;
#endif #endif
return false; return false;
}
}
CV_EXPORTS bool cv::gpu::ptxVersionIs(int major, int minor)
{
return checkPtxVersion(major, minor, ComparerEqual());
} }
template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_EQ>(int major, int minor); CV_EXPORTS bool cv::gpu::ptxVersionIsLessOrEqual(int major, int minor)
template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_GT>(int major, int minor); {
template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_GE>(int major, int minor); return checkPtxVersion(major, minor, ComparerLessOrEqual());
template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_LT>(int major, int minor); }
template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_LE>(int major, int minor);
template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_NE>(int major, int minor);
CV_EXPORTS bool cv::gpu::ptxVersionIsGreaterOrEqual(int major, int minor)
{
return checkPtxVersion(major, minor, ComparerGreaterOrEqual());
}
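With the comparer functors above, the old comparison-template entry point collapses into three ordinary functions, so callers no longer name a CMP_* policy. A minimal caller-side sketch, assuming the opencv2/gpu headers from this commit (the helper name is hypothetical; the guard itself is the one the reduction wrappers later in this diff adopt):

    #include <opencv2/gpu/gpu.hpp>

    // True when the module was built with PTX for compute capability 1.1 or
    // newer and the current device exposes atomics, i.e. the single-pass
    // reduction kernels are both compiled in and runnable.
    bool canUseSinglePassReductions()
    {
        using namespace cv::gpu;
        return ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice());
    }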
CV_EXPORTS bool isCompatibleWith(int device) CV_EXPORTS bool isCompatibleWith(int device)
...@@ -223,7 +219,7 @@ CV_EXPORTS bool isCompatibleWith(int device) ...@@ -223,7 +219,7 @@ CV_EXPORTS bool isCompatibleWith(int device)
int major, minor; int major, minor;
getComputeCapability(device, major, minor); getComputeCapability(device, major, minor);
return checkPtxVersion<CMP_LE>(major, minor); return ptxVersionIsLessOrEqual(major, minor);
} }
#endif #endif
......
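isCompatibleWith thus reduces to a single ptxVersionIsLessOrEqual check against the device's compute capability. A short usage sketch (the wrapper name is hypothetical; the gpu-module calls are the ones declared in this commit):

    #include <opencv2/gpu/gpu.hpp>

    // Skip the GPU path when none of the module's PTX versions can run on
    // the currently selected device.
    bool gpuPathAvailable()
    {
        int dev = cv::gpu::getDevice();
        return cv::gpu::isCompatibleWith(dev);
    }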
...@@ -119,20 +119,20 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType) ...@@ -119,20 +119,20 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
namespace cv { namespace gpu { namespace mathfunc namespace cv { namespace gpu { namespace mathfunc
{ {
template <typename T> template <typename T>
void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn); void sumCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);
template <typename T> template <typename T>
void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn); void sumMultipassCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);
template <typename T> template <typename T>
void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn); void sqrSumCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);
template <typename T> template <typename T>
void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn); void sqrSumMultipassCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);
namespace sum namespace sum
{ {
void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows); void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows);
} }
}}} }}}
...@@ -149,19 +149,27 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf) ...@@ -149,19 +149,27 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
using namespace mathfunc; using namespace mathfunc;
typedef void (*Caller)(const DevMem2D, PtrStep, double*, int); typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);
static const Caller callers[2][7] =
{ { sum_multipass_caller<unsigned char>, sum_multipass_caller<char>, static Caller multipass_callers[7] = {
sum_multipass_caller<unsigned short>, sum_multipass_caller<short>, sumMultipassCaller<unsigned char>, sumMultipassCaller<char>,
sum_multipass_caller<int>, sum_multipass_caller<float>, 0 }, sumMultipassCaller<unsigned short>, sumMultipassCaller<short>,
{ sum_caller<unsigned char>, sum_caller<char>, sumMultipassCaller<int>, sumMultipassCaller<float>, 0 };
sum_caller<unsigned short>, sum_caller<short>,
sum_caller<int>, sum_caller<float>, 0 } }; static Caller singlepass_callers[7] = {
sumCaller<unsigned char>, sumCaller<char>,
Size bufSize; sumCaller<unsigned short>, sumCaller<short>,
sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height); sumCaller<int>, sumCaller<float>, 0 };
ensureSizeIsEnough(bufSize, CV_8U, buf);
Size buf_size;
Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()]; sum::getBufSizeRequired(src.cols, src.rows, src.channels(),
buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
Caller* callers = multipass_callers;
if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
callers = singlepass_callers;
Caller caller = callers[src.depth()];
if (!caller) CV_Error(CV_StsBadArg, "sum: unsupported type"); if (!caller) CV_Error(CV_StsBadArg, "sum: unsupported type");
double result[4]; double result[4];
...@@ -182,19 +190,27 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf) ...@@ -182,19 +190,27 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
using namespace mathfunc; using namespace mathfunc;
typedef void (*Caller)(const DevMem2D, PtrStep, double*, int); typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);
static const Caller callers[2][7] =
{ { sqsum_multipass_caller<unsigned char>, sqsum_multipass_caller<char>, static Caller multipass_callers[7] = {
sqsum_multipass_caller<unsigned short>, sqsum_multipass_caller<short>, sqrSumMultipassCaller<unsigned char>, sqrSumMultipassCaller<char>,
sqsum_multipass_caller<int>, sqsum_multipass_caller<float>, 0 }, sqrSumMultipassCaller<unsigned short>, sqrSumMultipassCaller<short>,
{ sqsum_caller<unsigned char>, sqsum_caller<char>, sqrSumMultipassCaller<int>, sqrSumMultipassCaller<float>, 0 };
sqsum_caller<unsigned short>, sqsum_caller<short>,
sqsum_caller<int>, sqsum_caller<float>, 0 } }; static Caller singlepass_callers[7] = {
sqrSumCaller<unsigned char>, sqrSumCaller<char>,
Size bufSize; sqrSumCaller<unsigned short>, sqrSumCaller<short>,
sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height); sqrSumCaller<int>, sqrSumCaller<float>, 0 };
ensureSizeIsEnough(bufSize, CV_8U, buf);
Caller* callers = multipass_callers;
Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()]; if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
callers = singlepass_callers;
Size buf_size;
sum::getBufSizeRequired(src.cols, src.rows, src.channels(),
buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
Caller caller = callers[src.depth()];
if (!caller) CV_Error(CV_StsBadArg, "sqrSum: unsupported type"); if (!caller) CV_Error(CV_StsBadArg, "sqrSum: unsupported type");
double result[4]; double result[4];
...@@ -207,19 +223,19 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf) ...@@ -207,19 +223,19 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
namespace cv { namespace gpu { namespace mathfunc { namespace minmax { namespace cv { namespace gpu { namespace mathfunc { namespace minmax {
void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows); void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
template <typename T> template <typename T>
void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf); void minMaxCaller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
template <typename T> template <typename T>
void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf); void minMaxMaskCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);
template <typename T> template <typename T>
void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf); void minMaxMultipassCaller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
template <typename T> template <typename T>
void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf); void minMaxMaskMultipassCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);
}}}} }}}}
...@@ -238,23 +254,26 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp ...@@ -238,23 +254,26 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
typedef void (*Caller)(const DevMem2D, double*, double*, PtrStep); typedef void (*Caller)(const DevMem2D, double*, double*, PtrStep);
typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, PtrStep); typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, PtrStep);
static const Caller callers[2][7] = static Caller multipass_callers[7] = {
{ { min_max_multipass_caller<unsigned char>, min_max_multipass_caller<char>, minMaxMultipassCaller<unsigned char>, minMaxMultipassCaller<char>,
min_max_multipass_caller<unsigned short>, min_max_multipass_caller<short>, minMaxMultipassCaller<unsigned short>, minMaxMultipassCaller<short>,
min_max_multipass_caller<int>, min_max_multipass_caller<float>, 0 }, minMaxMultipassCaller<int>, minMaxMultipassCaller<float>, 0 };
{ min_max_caller<unsigned char>, min_max_caller<char>,
min_max_caller<unsigned short>, min_max_caller<short>,
min_max_caller<int>, min_max_caller<float>, min_max_caller<double> } };
static const MaskedCaller masked_callers[2][7] = static Caller singlepass_callers[7] = {
{ { min_max_mask_multipass_caller<unsigned char>, min_max_mask_multipass_caller<char>, minMaxCaller<unsigned char>, minMaxCaller<char>,
min_max_mask_multipass_caller<unsigned short>, min_max_mask_multipass_caller<short>, minMaxCaller<unsigned short>, minMaxCaller<short>,
min_max_mask_multipass_caller<int>, min_max_mask_multipass_caller<float>, 0 }, minMaxCaller<int>, minMaxCaller<float>, minMaxCaller<double> };
{ min_max_mask_caller<unsigned char>, min_max_mask_caller<char>,
min_max_mask_caller<unsigned short>, min_max_mask_caller<short>,
min_max_mask_caller<int>, min_max_mask_caller<float>,
min_max_mask_caller<double> } };
static MaskedCaller masked_multipass_callers[7] = {
minMaxMaskMultipassCaller<unsigned char>, minMaxMaskMultipassCaller<char>,
minMaxMaskMultipassCaller<unsigned short>, minMaxMaskMultipassCaller<short>,
minMaxMaskMultipassCaller<int>, minMaxMaskMultipassCaller<float>, 0 };
static MaskedCaller masked_singlepass_callers[7] = {
minMaxMaskCaller<unsigned char>, minMaxMaskCaller<char>,
minMaxMaskCaller<unsigned short>, minMaxMaskCaller<short>,
minMaxMaskCaller<int>, minMaxMaskCaller<float>,
minMaxMaskCaller<double> };
CV_Assert(src.channels() == 1); CV_Assert(src.channels() == 1);
CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size())); CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
...@@ -263,19 +282,27 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp ...@@ -263,19 +282,27 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
double minVal_; if (!minVal) minVal = &minVal_; double minVal_; if (!minVal) minVal = &minVal_;
double maxVal_; if (!maxVal) maxVal = &maxVal_; double maxVal_; if (!maxVal) maxVal = &maxVal_;
Size bufSize; Size buf_size;
get_buf_size_required(src.cols, src.rows, src.elemSize(), bufSize.width, bufSize.height); getBufSizeRequired(src.cols, src.rows, src.elemSize(), buf_size.width, buf_size.height);
ensureSizeIsEnough(bufSize, CV_8U, buf); ensureSizeIsEnough(buf_size, CV_8U, buf);
if (mask.empty()) if (mask.empty())
{ {
Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()]; Caller* callers = multipass_callers;
if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
callers = singlepass_callers;
Caller caller = callers[src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type"); if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
caller(src, minVal, maxVal, buf); caller(src, minVal, maxVal, buf);
} }
else else
{ {
MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()]; MaskedCaller* callers = masked_multipass_callers;
if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
callers = masked_singlepass_callers;
MaskedCaller caller = callers[src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type"); if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
caller(src, mask, minVal, maxVal, buf); caller(src, mask, minVal, maxVal, buf);
} }
...@@ -287,23 +314,23 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp ...@@ -287,23 +314,23 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc { namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc {
void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols, void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols,
int& b1rows, int& b2cols, int& b2rows); int& b1rows, int& b2cols, int& b2rows);
template <typename T> template <typename T>
void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval, void minMaxLocCaller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf); int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf);
template <typename T> template <typename T>
void min_max_loc_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, void minMaxLocMaskCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf); int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf);
template <typename T> template <typename T>
void min_max_loc_multipass_caller(const DevMem2D src, double* minval, double* maxval, void minMaxLocMultipassCaller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf); int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf);
template <typename T> template <typename T>
void min_max_loc_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, void minMaxLocMaskMultipassCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf); int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf);
}}}} }}}}
...@@ -323,21 +350,26 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point ...@@ -323,21 +350,26 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
typedef void (*Caller)(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); typedef void (*Caller)(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
static const Caller callers[2][7] = static Caller multipass_callers[7] = {
{ { min_max_loc_multipass_caller<unsigned char>, min_max_loc_multipass_caller<char>, minMaxLocMultipassCaller<unsigned char>, minMaxLocMultipassCaller<char>,
min_max_loc_multipass_caller<unsigned short>, min_max_loc_multipass_caller<short>, minMaxLocMultipassCaller<unsigned short>, minMaxLocMultipassCaller<short>,
min_max_loc_multipass_caller<int>, min_max_loc_multipass_caller<float>, 0 }, minMaxLocMultipassCaller<int>, minMaxLocMultipassCaller<float>, 0 };
{ min_max_loc_caller<unsigned char>, min_max_loc_caller<char>,
min_max_loc_caller<unsigned short>, min_max_loc_caller<short>, static Caller singlepass_callers[7] = {
min_max_loc_caller<int>, min_max_loc_caller<float>, min_max_loc_caller<double> } }; minMaxLocCaller<unsigned char>, minMaxLocCaller<char>,
minMaxLocCaller<unsigned short>, minMaxLocCaller<short>,
static const MaskedCaller masked_callers[2][7] = minMaxLocCaller<int>, minMaxLocCaller<float>, minMaxLocCaller<double> };
{ { min_max_loc_mask_multipass_caller<unsigned char>, min_max_loc_mask_multipass_caller<char>,
min_max_loc_mask_multipass_caller<unsigned short>, min_max_loc_mask_multipass_caller<short>, static MaskedCaller masked_multipass_callers[7] = {
min_max_loc_mask_multipass_caller<int>, min_max_loc_mask_multipass_caller<float>, 0 }, minMaxLocMaskMultipassCaller<unsigned char>, minMaxLocMaskMultipassCaller<char>,
{ min_max_loc_mask_caller<unsigned char>, min_max_loc_mask_caller<char>, minMaxLocMaskMultipassCaller<unsigned short>, minMaxLocMaskMultipassCaller<short>,
min_max_loc_mask_caller<unsigned short>, min_max_loc_mask_caller<short>, minMaxLocMaskMultipassCaller<int>, minMaxLocMaskMultipassCaller<float>, 0 };
min_max_loc_mask_caller<int>, min_max_loc_mask_caller<float>, min_max_loc_mask_caller<double> } };
static MaskedCaller masked_singlepass_callers[7] = {
minMaxLocMaskCaller<unsigned char>, minMaxLocMaskCaller<char>,
minMaxLocMaskCaller<unsigned short>, minMaxLocMaskCaller<short>,
minMaxLocMaskCaller<int>, minMaxLocMaskCaller<float>,
minMaxLocMaskCaller<double> };
CV_Assert(src.channels() == 1); CV_Assert(src.channels() == 1);
CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size())); CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
...@@ -348,21 +380,29 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point ...@@ -348,21 +380,29 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
int minLoc_[2]; int minLoc_[2];
int maxLoc_[2]; int maxLoc_[2];
Size valBufSize, locBufSize; Size valbuf_size, locbuf_size;
get_buf_size_required(src.cols, src.rows, src.elemSize(), valBufSize.width, getBufSizeRequired(src.cols, src.rows, src.elemSize(), valbuf_size.width,
valBufSize.height, locBufSize.width, locBufSize.height); valbuf_size.height, locbuf_size.width, locbuf_size.height);
ensureSizeIsEnough(valBufSize, CV_8U, valBuf); ensureSizeIsEnough(valbuf_size, CV_8U, valBuf);
ensureSizeIsEnough(locBufSize, CV_8U, locBuf); ensureSizeIsEnough(locbuf_size, CV_8U, locBuf);
if (mask.empty()) if (mask.empty())
{ {
Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()]; Caller* callers = multipass_callers;
if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
callers = singlepass_callers;
Caller caller = callers[src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type"); if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
caller(src, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf); caller(src, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
} }
else else
{ {
MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()]; MaskedCaller* callers = masked_multipass_callers;
if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
callers = masked_singlepass_callers;
MaskedCaller caller = callers[src.type()];
if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type"); if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf); caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
} }
...@@ -376,13 +416,13 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point ...@@ -376,13 +416,13 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero { namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero {
void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows); void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows);
template <typename T> template <typename T>
int count_non_zero_caller(const DevMem2D src, PtrStep buf); int countNonZeroCaller(const DevMem2D src, PtrStep buf);
template <typename T> template <typename T>
int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf); int countNonZeroMultipassCaller(const DevMem2D src, PtrStep buf);
}}}} }}}}
...@@ -400,22 +440,29 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf) ...@@ -400,22 +440,29 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
typedef int (*Caller)(const DevMem2D src, PtrStep buf); typedef int (*Caller)(const DevMem2D src, PtrStep buf);
static const Caller callers[2][7] = static Caller multipass_callers[7] = {
{ { count_non_zero_multipass_caller<unsigned char>, count_non_zero_multipass_caller<char>, countNonZeroMultipassCaller<unsigned char>, countNonZeroMultipassCaller<char>,
count_non_zero_multipass_caller<unsigned short>, count_non_zero_multipass_caller<short>, countNonZeroMultipassCaller<unsigned short>, countNonZeroMultipassCaller<short>,
count_non_zero_multipass_caller<int>, count_non_zero_multipass_caller<float>, 0}, countNonZeroMultipassCaller<int>, countNonZeroMultipassCaller<float>, 0 };
{ count_non_zero_caller<unsigned char>, count_non_zero_caller<char>,
count_non_zero_caller<unsigned short>, count_non_zero_caller<short>, static Caller singlepass_callers[7] = {
count_non_zero_caller<int>, count_non_zero_caller<float>, count_non_zero_caller<double> } }; countNonZeroCaller<unsigned char>, countNonZeroCaller<char>,
countNonZeroCaller<unsigned short>, countNonZeroCaller<short>,
countNonZeroCaller<int>, countNonZeroCaller<float>,
countNonZeroCaller<double> };
CV_Assert(src.channels() == 1); CV_Assert(src.channels() == 1);
CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice())); CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));
Size bufSize; Size buf_size;
get_buf_size_required(src.cols, src.rows, bufSize.width, bufSize.height); getBufSizeRequired(src.cols, src.rows, buf_size.width, buf_size.height);
ensureSizeIsEnough(bufSize, CV_8U, buf); ensureSizeIsEnough(buf_size, CV_8U, buf);
Caller* callers = multipass_callers;
if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
callers = singlepass_callers;
Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()]; Caller caller = callers[src.type()];
if (!caller) CV_Error(CV_StsBadArg, "countNonZero: unsupported type"); if (!caller) CV_Error(CV_StsBadArg, "countNonZero: unsupported type");
return caller(src, buf); return caller(src, buf);
} }
......
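For reference, a small host-side sketch exercising the refactored reduction entry points, using the buffered overloads shown in the hunks above (image contents and type are arbitrary; a single-channel CV_8U source is assumed so every path is supported):

    #include <opencv2/gpu/gpu.hpp>

    void reductionExample(const cv::gpu::GpuMat& img8u)
    {
        cv::gpu::GpuMat buf, valBuf, locBuf;   // scratch buffers, reusable across calls

        cv::Scalar s  = cv::gpu::sum(img8u, buf);     // sum of all pixels
        cv::Scalar s2 = cv::gpu::sqrSum(img8u, buf);  // sum of squared pixels

        double minVal, maxVal;
        cv::Point minLoc, maxLoc;
        cv::gpu::minMax(img8u, &minVal, &maxVal, cv::gpu::GpuMat(), buf);
        cv::gpu::minMaxLoc(img8u, &minVal, &maxVal, &minLoc, &maxLoc,
                           cv::gpu::GpuMat(), valBuf, locBuf);

        int nz = cv::gpu::countNonZero(img8u, buf);
        (void)s; (void)s2; (void)nz;
    }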