Commit 90ae1e3a authored Jan 19, 2011 by Alexey Spizhevoy
refactored gpu module
parent 8503f752
Showing 6 changed files with 460 additions and 399 deletions
doc/gpu_initialization.tex                 +23  -12
modules/gpu/include/opencv2/gpu/gpu.hpp     +3   -2
modules/gpu/src/cuda/imgproc.cu             +2   -2
modules/gpu/src/cuda/matrix_reductions.cu  +249 -243
modules/gpu/src/initialization.cpp          +39  -43
modules/gpu/src/matrix_reductions.cpp      +144  -97
doc/gpu_initialization.tex
...
@@ -69,22 +69,33 @@ Returns true, if the specified GPU has atomics support, otherwise false.
 \end{description}

-\cvCppFunc{gpu::checkPtxVersion}
+\cvCppFunc{gpu::ptxVersionIs}
 Returns true, if the GPU module was built with PTX support of the given compute capability, otherwise false.

-\cvdefCpp{template $<$unsigned int cmp\_op$>$ \newline
-bool checkPtxVersion(int major, int minor);}
+\cvdefCpp{bool ptxVersionIs(int major, int minor);}
 \begin{description}
-\cvarg{cmp\_op}{Comparison operation:
-\begin{description}
-\cvarg{CMP\_EQ}{Return true, if at least one of GPU module PTX versions matches the given one, otherwise false}
-\cvarg{CMP\_LT}{Return true, if at least one of GPU module PTX versions is less than the given one, otherwise false}
-\cvarg{CMP\_LE}{Return true, if at least one of GPU module PTX versions is less or equal to the given one, otherwise false}
-\cvarg{CMP\_GT}{Return true, if at least one of GPU module PTX versions is greater than the given one, otherwise false}
-\cvarg{CMP\_GE}{Return true, if at least one of GPU module PTX versions is greater or equal to the given one, otherwise false}
-\end{description}}
-\cvarg{major}{Major CC version.}
-\cvarg{minor}{Minor CC version.}
+\cvarg{major}{Major compute capability version.}
+\cvarg{minor}{Minor compute capability version.}
 \end{description}

+\cvCppFunc{gpu::ptxVersionIsLessOrEqual}
+Returns true, if the GPU module was built with PTX support of the given compute capability or less, otherwise false.
+\cvdefCpp{bool ptxVersionIsLessOrEqual(int major, int minor);}
+\begin{description}
+\cvarg{major}{Major compute capability version.}
+\cvarg{minor}{Minor compute capability version.}
+\end{description}
+
+\cvCppFunc{gpu::ptxVersionIsGreaterOrEqual}
+Returns true, if the GPU module was built with PTX support of the given compute capability or greater, otherwise false.
+\cvdefCpp{bool ptxVersionIsGreaterOrEqual(int major, int minor);}
+\begin{description}
+\cvarg{major}{Major compute capability version.}
+\cvarg{minor}{Minor compute capability version.}
+\end{description}
...
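For context, a minimal sketch of how the renamed runtime checks documented above might be called from application code. Only the function names and signatures come from this commit; the header path, device index, and fallback policy are assumptions for illustration.

#include <opencv2/gpu/gpu.hpp>
#include <iostream>

int main()
{
    using namespace cv::gpu;

    int device = 0; // hypothetical device index; a real program would enumerate devices first

    // The kernels guarded by __CUDA_ARCH__ >= 110 elsewhere in this commit need PTX
    // compiled for compute capability 1.1 or higher.
    if (!ptxVersionIsGreaterOrEqual(1, 1))
        std::cout << "GPU module has no sm_11 PTX; multipass kernels would be used\n";

    if (!isCompatibleWith(device) || !hasAtomicsSupport(device))
        std::cout << "Device " << device << " not fully supported; falling back to CPU\n";

    return 0;
}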
modules/gpu/include/opencv2/gpu/gpu.hpp
...
@@ -72,8 +72,9 @@ namespace cv
         CV_EXPORTS bool hasNativeDoubleSupport(int device);
         CV_EXPORTS bool hasAtomicsSupport(int device);

-        template <unsigned int cmp_op>
-        CV_EXPORTS bool checkPtxVersion(int major, int minor);
+        CV_EXPORTS bool ptxVersionIs(int major, int minor);
+        CV_EXPORTS bool ptxVersionIsLessOrEqual(int major, int minor);
+        CV_EXPORTS bool ptxVersionIsGreaterOrEqual(int major, int minor);

         //! Checks if the GPU module is PTX compatible with the given NVIDIA device
         CV_EXPORTS bool isCompatibleWith(int device);
...
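The header change replaces the comparison-operation template parameter with three named functions. A purely hypothetical before/after call site (the CMP_GE value belongs to the removed template interface and is shown only for contrast):

#include <opencv2/gpu/gpu.hpp>

// Before this commit, a caller chose the comparison through a non-type template
// parameter (hypothetical call site):
//     bool ok = cv::gpu::checkPtxVersion<CMP_GE>(2, 0);

// After this commit, the comparison is encoded in the function name:
bool builtWithFermiPtxOrNewer()
{
    return cv::gpu::ptxVersionIsGreaterOrEqual(2, 0);
}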
modules/gpu/src/cuda/imgproc.cu
...
@@ -719,7 +719,7 @@ namespace cv { namespace gpu { namespace imgproc
     ////////////////////////////// Column Sum //////////////////////////////////////

-    __global__ void column_sum_kernel_32F(int cols, int rows, const PtrStep src, const PtrStep dst)
+    __global__ void column_sumKernel_32F(int cols, int rows, const PtrStep src, const PtrStep dst)
     {
         int x = blockIdx.x * blockDim.x + threadIdx.x;
...
@@ -745,7 +745,7 @@ namespace cv { namespace gpu { namespace imgproc
         dim3 threads(256);
         dim3 grid(divUp(src.cols, threads.x));

-        column_sum_kernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
+        column_sumKernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);

         cudaSafeCall(cudaThreadSynchronize());
     }
...
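The launch above covers every image column by rounding src.cols up to a whole number of 256-thread blocks. divUp is not part of this hunk; the sketch below shows the usual definition it is assumed to have.

// Integer ceiling division: how many blocks of size b are needed to cover a elements.
static inline int divUp(int a, int b) { return (a + b - 1) / b; }

// Example: a 1000-column image with 256 threads per block launches
// divUp(1000, 256) == 4 blocks, the last one partially idle.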
modules/gpu/src/cuda/matrix_reductions.cu
...
@@ -54,7 +54,7 @@ namespace cv { namespace gpu { namespace mathfunc
     // Performs reduction in shared memory
     template <int size, typename T>
-    __device__ void sum_in_smem(volatile T* data, const uint tid)
+    __device__ void sumInSmem(volatile T* data, const uint tid)
     {
         T sum = data[tid];
...
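sumInSmem (renamed here from sum_in_smem) is the standard block-wide tree reduction in shared memory; its body is elided by the diff view. The sketch below is a generic reconstruction of that pattern under a hypothetical name, not the exact OpenCV implementation.

// Generic block reduction over `size` shared-memory elements, one value per thread.
// Assumes `size` is a power of two and the block has `size` threads.
template <int size, typename T>
__device__ void blockSumInSmem(volatile T* data, unsigned int tid)
{
    T sum = data[tid];

    // Halve the active range each step, synchronizing while more than one warp participates.
    if (size >= 512) { if (tid < 256) data[tid] = sum = sum + data[tid + 256]; __syncthreads(); }
    if (size >= 256) { if (tid < 128) data[tid] = sum = sum + data[tid + 128]; __syncthreads(); }
    if (size >= 128) { if (tid <  64) data[tid] = sum = sum + data[tid +  64]; __syncthreads(); }

    // Final warp: on the pre-Fermi/Fermi devices targeted here the warp runs in lockstep,
    // so no __syncthreads() is needed inside it (volatile keeps the loads/stores ordered).
    if (tid < 32)
    {
        if (size >= 64) data[tid] = sum = sum + data[tid + 32];
        if (size >= 32) data[tid] = sum = sum + data[tid + 16];
        if (size >= 16) data[tid] = sum = sum + data[tid +  8];
        if (size >=  8) data[tid] = sum = sum + data[tid +  4];
        if (size >=  4) data[tid] = sum = sum + data[tid +  2];
        if (size >=  2) data[tid] = sum = sum + data[tid +  1];
    }
}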
@@ -122,7 +122,7 @@ namespace cv { namespace gpu { namespace mathfunc
     // Estimates good thread configuration
     // - threads variable satisfies to threads.x * threads.y == 256
-    void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
+    void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid)
     {
         threads = dim3(32, 8);
         grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
...
@@ -132,17 +132,17 @@ namespace cv { namespace gpu { namespace mathfunc
     // Returns required buffer sizes
-    void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows)
+    void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows)
     {
         dim3 threads, grid;
-        estimate_thread_cfg(cols, rows, threads, grid);
+        estimateThreadCfg(cols, rows, threads, grid);
         bufcols = grid.x * grid.y * elem_size;
         bufrows = 2;
     }

     // Estimates device constants which are used in the kernels using specified thread configuration
-    void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
+    void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)
     {
         int twidth = divUp(divUp(cols, grid.x), threads.x);
         int theight = divUp(divUp(rows, grid.y), threads.y);
...
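estimateThreadCfg and setKernelConsts together decide how many source pixels each 256-thread block and each individual thread processes. A small worked example of that arithmetic, using the constants shown above (illustration only, not part of the diff):

#include <cstdio>

static int divUp(int a, int b) { return (a + b - 1) / b; }

int main()
{
    // For a 2048 x 1536 single-channel image with threads = (32, 8):
    int cols = 2048, rows = 1536, tx = 32, ty = 8;
    int gx = divUp(cols, tx * 8);             // 8 blocks across
    int gy = divUp(rows, ty * 32);            // 6 blocks down
    int twidth  = divUp(divUp(cols, gx), tx); // 8 columns per thread
    int theight = divUp(divUp(rows, gy), ty); // 32 rows per thread
    // 8 x 32 = 256 elements per thread, matching the *8 and *32 factors in the grid size.
    std::printf("grid %dx%d, tile %dx%d per thread\n", gx, gy, twidth, theight);
    return 0;
}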
@@ -161,7 +161,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <int size, typename T>
__device__ void find
_min_max_in_s
mem(volatile T* minval, volatile T* maxval, const uint tid)
__device__ void find
MinMaxInS
mem(volatile T* minval, volatile T* maxval, const uint tid)
{
if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval); } __syncthreads(); }
if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval); } __syncthreads(); }
...
...
@@ -180,7 +180,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <int nthreads, typename T, typename Mask>
__global__ void min
_max_k
ernel(const DevMem2D src, Mask mask, T* minval, T* maxval)
__global__ void min
MaxK
ernel(const DevMem2D src, Mask mask, T* minval, T* maxval)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
...
...
@@ -212,7 +212,7 @@ namespace cv { namespace gpu { namespace mathfunc
smaxval[tid] = mymax;
__syncthreads();
find
_min_max_in_s
mem<nthreads, best_type>(sminval, smaxval, tid);
find
MinMaxInS
mem<nthreads, best_type>(sminval, smaxval, tid);
if (tid == 0)
{
...
...
@@ -243,7 +243,7 @@ namespace cv { namespace gpu { namespace mathfunc
smaxval[tid] = maxval[idx];
__syncthreads();
find
_min_max_in_s
mem<nthreads, best_type>(sminval, smaxval, tid);
find
MinMaxInS
mem<nthreads, best_type>(sminval, smaxval, tid);
if (tid == 0)
{
...
...
@@ -263,16 +263,16 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T>
void min
_max_mask_c
aller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
void min
MaxMaskC
aller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate
_thread_c
fg(src.cols, src.rows, threads, grid);
set
_kernel_c
onsts(src.cols, src.rows, threads, grid);
estimate
ThreadC
fg(src.cols, src.rows, threads, grid);
set
KernelC
onsts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min
_max_k
ernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
min
MaxK
ernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
...
...
@@ -282,26 +282,26 @@ namespace cv { namespace gpu { namespace mathfunc
*maxval = maxval_;
}
template void min
_max_mask_c
aller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min
_max_mask_c
aller<char>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min
_max_mask_c
aller<ushort>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min
_max_mask_c
aller<short>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min
_max_mask_c
aller<int>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min
_max_mask_c
aller<float>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min
_max_mask_c
aller<double>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min
MaxMaskC
aller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min
MaxMaskC
aller<char>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min
MaxMaskC
aller<ushort>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min
MaxMaskC
aller<short>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min
MaxMaskC
aller<int>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min
MaxMaskC
aller<float>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min
MaxMaskC
aller<double>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template <typename T>
void min
_max_c
aller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
void min
MaxC
aller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate
_thread_c
fg(src.cols, src.rows, threads, grid);
set
_kernel_c
onsts(src.cols, src.rows, threads, grid);
estimate
ThreadC
fg(src.cols, src.rows, threads, grid);
set
KernelC
onsts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min
_max_k
ernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
min
MaxK
ernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
...
...
@@ -311,17 +311,17 @@ namespace cv { namespace gpu { namespace mathfunc
*maxval = maxval_;
}
template void min
_max_c
aller<uchar>(const DevMem2D, double*, double*, PtrStep);
template void min
_max_c
aller<char>(const DevMem2D, double*, double*, PtrStep);
template void min
_max_c
aller<ushort>(const DevMem2D, double*, double*, PtrStep);
template void min
_max_c
aller<short>(const DevMem2D, double*, double*, PtrStep);
template void min
_max_c
aller<int>(const DevMem2D, double*, double*, PtrStep);
template void min
_max_c
aller<float>(const DevMem2D, double*,double*, PtrStep);
template void min
_max_c
aller<double>(const DevMem2D, double*, double*, PtrStep);
template void min
MaxC
aller<uchar>(const DevMem2D, double*, double*, PtrStep);
template void min
MaxC
aller<char>(const DevMem2D, double*, double*, PtrStep);
template void min
MaxC
aller<ushort>(const DevMem2D, double*, double*, PtrStep);
template void min
MaxC
aller<short>(const DevMem2D, double*, double*, PtrStep);
template void min
MaxC
aller<int>(const DevMem2D, double*, double*, PtrStep);
template void min
MaxC
aller<float>(const DevMem2D, double*,double*, PtrStep);
template void min
MaxC
aller<double>(const DevMem2D, double*, double*, PtrStep);
template <int nthreads, typename T>
__global__ void min
_max_pass2_k
ernel(T* minval, T* maxval, int size)
__global__ void min
MaxPass2K
ernel(T* minval, T* maxval, int size)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
...
...
@@ -334,7 +334,7 @@ namespace cv { namespace gpu { namespace mathfunc
smaxval[tid] = maxval[idx];
__syncthreads();
find
_min_max_in_s
mem<nthreads, best_type>(sminval, smaxval, tid);
find
MinMaxInS
mem<nthreads, best_type>(sminval, smaxval, tid);
if (tid == 0)
{
...
...
@@ -345,17 +345,17 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T>
void min
_max_mask_multipass_c
aller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
void min
MaxMaskMultipassC
aller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate
_thread_c
fg(src.cols, src.rows, threads, grid);
set
_kernel_c
onsts(src.cols, src.rows, threads, grid);
estimate
ThreadC
fg(src.cols, src.rows, threads, grid);
set
KernelC
onsts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min
_max_k
ernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
min
_max_pass2_k
ernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
min
MaxK
ernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
min
MaxPass2K
ernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
...
...
@@ -365,26 +365,26 @@ namespace cv { namespace gpu { namespace mathfunc
*maxval = maxval_;
}
template void min
_max_mask_multipass_c
aller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min
_max_mask_multipass_c
aller<char>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min
_max_mask_multipass_c
aller<ushort>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min
_max_mask_multipass_c
aller<short>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min
_max_mask_multipass_c
aller<int>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min
_max_mask_multipass_c
aller<float>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min
MaxMaskMultipassC
aller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min
MaxMaskMultipassC
aller<char>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min
MaxMaskMultipassC
aller<ushort>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min
MaxMaskMultipassC
aller<short>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min
MaxMaskMultipassC
aller<int>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min
MaxMaskMultipassC
aller<float>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template <typename T>
void min
_max_multipass_c
aller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
void min
MaxMultipassC
aller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate
_thread_c
fg(src.cols, src.rows, threads, grid);
set
_kernel_c
onsts(src.cols, src.rows, threads, grid);
estimate
ThreadC
fg(src.cols, src.rows, threads, grid);
set
KernelC
onsts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min
_max_k
ernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
min
_max_pass2_k
ernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
min
MaxK
ernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
min
MaxPass2K
ernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
...
...
@@ -394,12 +394,12 @@ namespace cv { namespace gpu { namespace mathfunc
*maxval = maxval_;
}
template void min
_max_multipass_c
aller<uchar>(const DevMem2D, double*, double*, PtrStep);
template void min
_max_multipass_c
aller<char>(const DevMem2D, double*, double*, PtrStep);
template void min
_max_multipass_c
aller<ushort>(const DevMem2D, double*, double*, PtrStep);
template void min
_max_multipass_c
aller<short>(const DevMem2D, double*, double*, PtrStep);
template void min
_max_multipass_c
aller<int>(const DevMem2D, double*, double*, PtrStep);
template void min
_max_multipass_c
aller<float>(const DevMem2D, double*, double*, PtrStep);
template void min
MaxMultipassC
aller<uchar>(const DevMem2D, double*, double*, PtrStep);
template void min
MaxMultipassC
aller<char>(const DevMem2D, double*, double*, PtrStep);
template void min
MaxMultipassC
aller<ushort>(const DevMem2D, double*, double*, PtrStep);
template void min
MaxMultipassC
aller<short>(const DevMem2D, double*, double*, PtrStep);
template void min
MaxMultipassC
aller<int>(const DevMem2D, double*, double*, PtrStep);
template void min
MaxMultipassC
aller<float>(const DevMem2D, double*, double*, PtrStep);
} // namespace minmax
...
...
@@ -417,7 +417,7 @@ namespace cv { namespace gpu { namespace mathfunc
// Estimates good thread configuration
// - threads variable satisfies to threads.x * threads.y == 256
void estimate
_thread_c
fg(int cols, int rows, dim3& threads, dim3& grid)
void estimate
ThreadC
fg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
...
...
@@ -427,11 +427,11 @@ namespace cv { namespace gpu { namespace mathfunc
// Returns required buffer sizes
void get
_buf_size_r
equired(int cols, int rows, int elem_size, int& b1cols,
void get
BufSizeR
equired(int cols, int rows, int elem_size, int& b1cols,
int& b1rows, int& b2cols, int& b2rows)
{
dim3 threads, grid;
estimate
_thread_c
fg(cols, rows, threads, grid);
estimate
ThreadC
fg(cols, rows, threads, grid);
b1cols = grid.x * grid.y * elem_size; // For values
b1rows = 2;
b2cols = grid.x * grid.y * sizeof(int); // For locations
...
...
@@ -440,7 +440,7 @@ namespace cv { namespace gpu { namespace mathfunc
// Estimates device constants which are used in the kernels using specified thread configuration
void set
_kernel_c
onsts(int cols, int rows, const dim3& threads, const dim3& grid)
void set
KernelC
onsts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
...
...
@@ -469,7 +469,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <int size, typename T>
__device__ void find
_min_max_loc_in_s
mem(volatile T* minval, volatile T* maxval, volatile uint* minloc,
__device__ void find
MinMaxLocInS
mem(volatile T* minval, volatile T* maxval, volatile uint* minloc,
volatile uint* maxloc, const uint tid)
{
if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval, minloc, maxloc); } __syncthreads(); }
...
...
@@ -489,7 +489,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <int nthreads, typename T, typename Mask>
__global__ void min
_max_loc_k
ernel(const DevMem2D src, Mask mask, T* minval, T* maxval,
__global__ void min
MaxLocK
ernel(const DevMem2D src, Mask mask, T* minval, T* maxval,
uint* minloc, uint* maxloc)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
...
...
@@ -503,7 +503,8 @@ namespace cv { namespace gpu { namespace mathfunc
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
T mymin = numeric_limits_gpu<T>::max();
T mymax = numeric_limits_gpu<T>::is_signed ? -numeric_limits_gpu<T>::max() : numeric_limits_gpu<T>::min();
T mymax = numeric_limits_gpu<T>::is_signed ? -numeric_limits_gpu<T>::max() :
numeric_limits_gpu<T>::min();
uint myminloc = 0;
uint mymaxloc = 0;
uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
...
...
@@ -529,7 +530,7 @@ namespace cv { namespace gpu { namespace mathfunc
smaxloc[tid] = mymaxloc;
__syncthreads();
find
_min_max_loc_in_s
mem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
find
MinMaxLocInS
mem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
...
...
@@ -558,7 +559,7 @@ namespace cv { namespace gpu { namespace mathfunc
smaxloc[tid] = maxloc[idx];
__syncthreads();
find
_min_max_loc_in_s
mem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
find
MinMaxLocInS
mem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
if (tid == 0)
{
...
...
@@ -582,19 +583,20 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T>
void min
_max_loc_mask_c
aller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
void min
MaxLocMaskC
aller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate
_thread_c
fg(src.cols, src.rows, threads, grid);
set
_kernel_c
onsts(src.cols, src.rows, threads, grid);
estimate
ThreadC
fg(src.cols, src.rows, threads, grid);
set
KernelC
onsts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf,
minloc_buf, maxloc_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
...
...
@@ -610,29 +612,30 @@ namespace cv { namespace gpu { namespace mathfunc
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min
_max_loc_mask_c
aller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
_max_loc_mask_c
aller<char>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
_max_loc_mask_c
aller<ushort>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
_max_loc_mask_c
aller<short>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
_max_loc_mask_c
aller<int>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
_max_loc_mask_c
aller<float>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
_max_loc_mask_c
aller<double>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocMaskC
aller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocMaskC
aller<char>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocMaskC
aller<ushort>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocMaskC
aller<short>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocMaskC
aller<int>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocMaskC
aller<float>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocMaskC
aller<double>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template <typename T>
void min
_max_loc_c
aller(const DevMem2D src, double* minval, double* maxval,
void min
MaxLocC
aller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate
_thread_c
fg(src.cols, src.rows, threads, grid);
set
_kernel_c
onsts(src.cols, src.rows, threads, grid);
estimate
ThreadC
fg(src.cols, src.rows, threads, grid);
set
KernelC
onsts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf,
minloc_buf, maxloc_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
...
...
@@ -648,18 +651,18 @@ namespace cv { namespace gpu { namespace mathfunc
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min
_max_loc_c
aller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
_max_loc_c
aller<char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
_max_loc_c
aller<ushort>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
_max_loc_c
aller<short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
_max_loc_c
aller<int>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
_max_loc_c
aller<float>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
_max_loc_c
aller<double>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocC
aller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocC
aller<char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocC
aller<ushort>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocC
aller<short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocC
aller<int>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocC
aller<float>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocC
aller<double>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
// This kernel will be used only when compute capability is 1.0
template <int nthreads, typename T>
__global__ void min
_max_loc_pass2_k
ernel(T* minval, T* maxval, uint* minloc, uint* maxloc, int size)
__global__ void min
MaxLocPass2K
ernel(T* minval, T* maxval, uint* minloc, uint* maxloc, int size)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
...
...
@@ -676,7 +679,7 @@ namespace cv { namespace gpu { namespace mathfunc
smaxloc[tid] = maxloc[idx];
__syncthreads();
find
_min_max_loc_in_s
mem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
find
MinMaxLocInS
mem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
if (tid == 0)
{
...
...
@@ -689,20 +692,21 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T>
void min
_max_loc_mask_multipass_c
aller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
void min
MaxLocMaskMultipassC
aller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate
_thread_c
fg(src.cols, src.rows, threads, grid);
set
_kernel_c
onsts(src.cols, src.rows, threads, grid);
estimate
ThreadC
fg(src.cols, src.rows, threads, grid);
set
KernelC
onsts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf,
minloc_buf, maxloc_buf);
minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
...
...
@@ -718,29 +722,30 @@ namespace cv { namespace gpu { namespace mathfunc
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min
_max_loc_mask_multipass_c
aller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
_max_loc_mask_multipass_c
aller<char>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
_max_loc_mask_multipass_c
aller<ushort>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
_max_loc_mask_multipass_c
aller<short>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
_max_loc_mask_multipass_c
aller<int>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
_max_loc_mask_multipass_c
aller<float>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocMaskMultipassC
aller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocMaskMultipassC
aller<char>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocMaskMultipassC
aller<ushort>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocMaskMultipassC
aller<short>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocMaskMultipassC
aller<int>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocMaskMultipassC
aller<float>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template <typename T>
void min
_max_loc_multipass_c
aller(const DevMem2D src, double* minval, double* maxval,
void min
MaxLocMultipassC
aller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate
_thread_c
fg(src.cols, src.rows, threads, grid);
set
_kernel_c
onsts(src.cols, src.rows, threads, grid);
estimate
ThreadC
fg(src.cols, src.rows, threads, grid);
set
KernelC
onsts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf,
minloc_buf, maxloc_buf);
minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
...
...
@@ -756,12 +761,12 @@ namespace cv { namespace gpu { namespace mathfunc
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min
_max_loc_multipass_c
aller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
_max_loc_multipass_c
aller<char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
_max_loc_multipass_c
aller<ushort>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
_max_loc_multipass_c
aller<short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
_max_loc_multipass_c
aller<int>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
_max_loc_multipass_c
aller<float>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocMultipassC
aller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocMultipassC
aller<char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocMultipassC
aller<ushort>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocMultipassC
aller<short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocMultipassC
aller<int>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min
MaxLocMultipassC
aller<float>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
} // namespace minmaxloc
...
...
@@ -776,7 +781,7 @@ namespace cv { namespace gpu { namespace mathfunc
__device__ uint blocks_finished = 0;
void estimate
_thread_c
fg(int cols, int rows, dim3& threads, dim3& grid)
void estimate
ThreadC
fg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
...
...
@@ -785,16 +790,16 @@ namespace cv { namespace gpu { namespace mathfunc
}
void get
_buf_size_r
equired(int cols, int rows, int& bufcols, int& bufrows)
void get
BufSizeR
equired(int cols, int rows, int& bufcols, int& bufrows)
{
dim3 threads, grid;
estimate
_thread_c
fg(cols, rows, threads, grid);
estimate
ThreadC
fg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * sizeof(int);
bufrows = 1;
}
void set
_kernel_c
onsts(int cols, int rows, const dim3& threads, const dim3& grid)
void set
KernelC
onsts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
...
...
@@ -804,7 +809,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <int nthreads, typename T>
__global__ void count
_non_zero_k
ernel(const DevMem2D src, volatile uint* count)
__global__ void count
NonZeroK
ernel(const DevMem2D src, volatile uint* count)
{
__shared__ uint scount[nthreads];
...
...
@@ -823,7 +828,7 @@ namespace cv { namespace gpu { namespace mathfunc
scount[tid] = cnt;
__syncthreads();
sum
_in_s
mem<nthreads, uint>(scount, tid);
sum
InS
mem<nthreads, uint>(scount, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
...
...
@@ -844,7 +849,7 @@ namespace cv { namespace gpu { namespace mathfunc
scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0;
__syncthreads();
sum
_in_s
mem<nthreads, uint>(scount, tid);
sum
InS
mem<nthreads, uint>(scount, tid);
if (tid == 0)
{
...
...
@@ -859,15 +864,15 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T>
int count
_non_zero_c
aller(const DevMem2D src, PtrStep buf)
int count
NonZeroC
aller(const DevMem2D src, PtrStep buf)
{
dim3 threads, grid;
estimate
_thread_c
fg(src.cols, src.rows, threads, grid);
set
_kernel_c
onsts(src.cols, src.rows, threads, grid);
estimate
ThreadC
fg(src.cols, src.rows, threads, grid);
set
KernelC
onsts(src.cols, src.rows, threads, grid);
uint* count_buf = (uint*)buf.ptr(0);
count
_non_zero_k
ernel<256, T><<<grid, threads>>>(src, count_buf);
count
NonZeroK
ernel<256, T><<<grid, threads>>>(src, count_buf);
cudaSafeCall(cudaThreadSynchronize());
uint count;
...
...
@@ -876,17 +881,17 @@ namespace cv { namespace gpu { namespace mathfunc
return count;
}
template int count
_non_zero_c
aller<uchar>(const DevMem2D, PtrStep);
template int count
_non_zero_c
aller<char>(const DevMem2D, PtrStep);
template int count
_non_zero_c
aller<ushort>(const DevMem2D, PtrStep);
template int count
_non_zero_c
aller<short>(const DevMem2D, PtrStep);
template int count
_non_zero_c
aller<int>(const DevMem2D, PtrStep);
template int count
_non_zero_c
aller<float>(const DevMem2D, PtrStep);
template int count
_non_zero_c
aller<double>(const DevMem2D, PtrStep);
template int count
NonZeroC
aller<uchar>(const DevMem2D, PtrStep);
template int count
NonZeroC
aller<char>(const DevMem2D, PtrStep);
template int count
NonZeroC
aller<ushort>(const DevMem2D, PtrStep);
template int count
NonZeroC
aller<short>(const DevMem2D, PtrStep);
template int count
NonZeroC
aller<int>(const DevMem2D, PtrStep);
template int count
NonZeroC
aller<float>(const DevMem2D, PtrStep);
template int count
NonZeroC
aller<double>(const DevMem2D, PtrStep);
template <int nthreads, typename T>
__global__ void count
_non_zero_pass2_k
ernel(uint* count, int size)
__global__ void count
NonZeroPass2K
ernel(uint* count, int size)
{
__shared__ uint scount[nthreads];
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
...
...
@@ -894,7 +899,7 @@ namespace cv { namespace gpu { namespace mathfunc
scount[tid] = tid < size ? count[tid] : 0;
__syncthreads();
sum
_in_s
mem<nthreads, uint>(scount, tid);
sum
InS
mem<nthreads, uint>(scount, tid);
if (tid == 0)
count[0] = scount[0];
...
...
@@ -902,16 +907,16 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T>
int count
_non_zero_multipass_c
aller(const DevMem2D src, PtrStep buf)
int count
NonZeroMultipassC
aller(const DevMem2D src, PtrStep buf)
{
dim3 threads, grid;
estimate
_thread_c
fg(src.cols, src.rows, threads, grid);
set
_kernel_c
onsts(src.cols, src.rows, threads, grid);
estimate
ThreadC
fg(src.cols, src.rows, threads, grid);
set
KernelC
onsts(src.cols, src.rows, threads, grid);
uint* count_buf = (uint*)buf.ptr(0);
count
_non_zero_k
ernel<256, T><<<grid, threads>>>(src, count_buf);
count
_non_zero_pass2_k
ernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);
count
NonZeroK
ernel<256, T><<<grid, threads>>>(src, count_buf);
count
NonZeroPass2K
ernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
uint count;
...
...
@@ -920,12 +925,12 @@ namespace cv { namespace gpu { namespace mathfunc
return count;
}
template int count
_non_zero_multipass_c
aller<uchar>(const DevMem2D, PtrStep);
template int count
_non_zero_multipass_c
aller<char>(const DevMem2D, PtrStep);
template int count
_non_zero_multipass_c
aller<ushort>(const DevMem2D, PtrStep);
template int count
_non_zero_multipass_c
aller<short>(const DevMem2D, PtrStep);
template int count
_non_zero_multipass_c
aller<int>(const DevMem2D, PtrStep);
template int count
_non_zero_multipass_c
aller<float>(const DevMem2D, PtrStep);
template int count
NonZeroMultipassC
aller<uchar>(const DevMem2D, PtrStep);
template int count
NonZeroMultipassC
aller<char>(const DevMem2D, PtrStep);
template int count
NonZeroMultipassC
aller<ushort>(const DevMem2D, PtrStep);
template int count
NonZeroMultipassC
aller<short>(const DevMem2D, PtrStep);
template int count
NonZeroMultipassC
aller<int>(const DevMem2D, PtrStep);
template int count
NonZeroMultipassC
aller<float>(const DevMem2D, PtrStep);
} // namespace countnonzero
...
...
@@ -958,7 +963,7 @@ namespace cv { namespace gpu { namespace mathfunc
const int threads_x = 32;
const int threads_y = 8;
void estimate
_thread_c
fg(int cols, int rows, dim3& threads, dim3& grid)
void estimate
ThreadC
fg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, threads.x * threads.y),
...
...
@@ -968,16 +973,16 @@ namespace cv { namespace gpu { namespace mathfunc
}
void get
_buf_size_r
equired(int cols, int rows, int cn, int& bufcols, int& bufrows)
void get
BufSizeR
equired(int cols, int rows, int cn, int& bufcols, int& bufrows)
{
dim3 threads, grid;
estimate
_thread_c
fg(cols, rows, threads, grid);
estimate
ThreadC
fg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * sizeof(double) * cn;
bufrows = 1;
}
void set
_kernel_c
onsts(int cols, int rows, const dim3& threads, const dim3& grid)
void set
KernelC
onsts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
...
...
@@ -986,7 +991,7 @@ namespace cv { namespace gpu { namespace mathfunc
}
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum
_k
ernel(const DevMem2D src, R* result)
__global__ void sum
K
ernel(const DevMem2D src, R* result)
{
__shared__ R smem[nthreads];
...
...
@@ -1006,7 +1011,7 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid] = sum;
__syncthreads();
sum
_in_s
mem<nthreads, R>(smem, tid);
sum
InS
mem<nthreads, R>(smem, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
...
...
@@ -1027,7 +1032,7 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid] = tid < gridDim.x * gridDim.y ? result[tid] : 0;
__syncthreads();
sum
_in_s
mem<nthreads, R>(smem, tid);
sum
InS
mem<nthreads, R>(smem, tid);
if (tid == 0)
{
...
...
@@ -1042,7 +1047,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T, typename R, int nthreads>
__global__ void sum
_pass2_k
ernel(R* result, int size)
__global__ void sum
Pass2K
ernel(R* result, int size)
{
__shared__ R smem[nthreads];
int tid = threadIdx.y * blockDim.x + threadIdx.x;
...
...
@@ -1050,7 +1055,7 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid] = tid < size ? result[tid] : 0;
__syncthreads();
sum
_in_s
mem<nthreads, R>(smem, tid);
sum
InS
mem<nthreads, R>(smem, tid);
if (tid == 0)
result[0] = smem[0];
...
...
@@ -1058,7 +1063,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum
_k
ernel_C2(const DevMem2D src, typename TypeVec<R, 2>::vec_t* result)
__global__ void sum
K
ernel_C2(const DevMem2D src, typename TypeVec<R, 2>::vec_t* result)
{
typedef typename TypeVec<T, 2>::vec_t SrcType;
typedef typename TypeVec<R, 2>::vec_t DstType;
...
...
@@ -1086,8 +1091,8 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid + nthreads] = sum.y;
__syncthreads();
sum
_in_s
mem<nthreads, R>(smem, tid);
sum
_in_s
mem<nthreads, R>(smem + nthreads, tid);
sum
InS
mem<nthreads, R>(smem, tid);
sum
InS
mem<nthreads, R>(smem + nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
...
...
@@ -1113,8 +1118,8 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid + nthreads] = res.y;
__syncthreads();
sum
_in_s
mem<nthreads, R>(smem, tid);
sum
_in_s
mem<nthreads, R>(smem + nthreads, tid);
sum
InS
mem<nthreads, R>(smem, tid);
sum
InS
mem<nthreads, R>(smem + nthreads, tid);
if (tid == 0)
{
...
...
@@ -1137,7 +1142,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T, typename R, int nthreads>
__global__ void sum
_pass2_k
ernel_C2(typename TypeVec<R, 2>::vec_t* result, int size)
__global__ void sum
Pass2K
ernel_C2(typename TypeVec<R, 2>::vec_t* result, int size)
{
typedef typename TypeVec<R, 2>::vec_t DstType;
...
...
@@ -1150,8 +1155,8 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid + nthreads] = res.y;
__syncthreads();
sum
_in_s
mem<nthreads, R>(smem, tid);
sum
_in_s
mem<nthreads, R>(smem + nthreads, tid);
sum
InS
mem<nthreads, R>(smem, tid);
sum
InS
mem<nthreads, R>(smem + nthreads, tid);
if (tid == 0)
{
...
...
@@ -1163,7 +1168,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum
_k
ernel_C3(const DevMem2D src, typename TypeVec<R, 3>::vec_t* result)
__global__ void sum
K
ernel_C3(const DevMem2D src, typename TypeVec<R, 3>::vec_t* result)
{
typedef typename TypeVec<T, 3>::vec_t SrcType;
typedef typename TypeVec<R, 3>::vec_t DstType;
...
...
@@ -1192,9 +1197,9 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid + 2 * nthreads] = sum.z;
__syncthreads();
sum
_in_s
mem<nthreads, R>(smem, tid);
sum
_in_s
mem<nthreads, R>(smem + nthreads, tid);
sum
_in_s
mem<nthreads, R>(smem + 2 * nthreads, tid);
sum
InS
mem<nthreads, R>(smem, tid);
sum
InS
mem<nthreads, R>(smem + nthreads, tid);
sum
InS
mem<nthreads, R>(smem + 2 * nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
...
...
@@ -1222,9 +1227,9 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid + 2 * nthreads] = res.z;
__syncthreads();
sum
_in_s
mem<nthreads, R>(smem, tid);
sum
_in_s
mem<nthreads, R>(smem + nthreads, tid);
sum
_in_s
mem<nthreads, R>(smem + 2 * nthreads, tid);
sum
InS
mem<nthreads, R>(smem, tid);
sum
InS
mem<nthreads, R>(smem + nthreads, tid);
sum
InS
mem<nthreads, R>(smem + 2 * nthreads, tid);
if (tid == 0)
{
...
...
@@ -1249,7 +1254,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T, typename R, int nthreads>
__global__ void sum
_pass2_k
ernel_C3(typename TypeVec<R, 3>::vec_t* result, int size)
__global__ void sum
Pass2K
ernel_C3(typename TypeVec<R, 3>::vec_t* result, int size)
{
typedef typename TypeVec<R, 3>::vec_t DstType;
...
...
@@ -1263,9 +1268,9 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid + 2 * nthreads] = res.z;
__syncthreads();
sum
_in_s
mem<nthreads, R>(smem, tid);
sum
_in_s
mem<nthreads, R>(smem + nthreads, tid);
sum
_in_s
mem<nthreads, R>(smem + 2 * nthreads, tid);
sum
InS
mem<nthreads, R>(smem, tid);
sum
InS
mem<nthreads, R>(smem + nthreads, tid);
sum
InS
mem<nthreads, R>(smem + 2 * nthreads, tid);
if (tid == 0)
{
...
...
@@ -1277,7 +1282,7 @@ namespace cv { namespace gpu { namespace mathfunc
}
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum
_k
ernel_C4(const DevMem2D src, typename TypeVec<R, 4>::vec_t* result)
__global__ void sum
K
ernel_C4(const DevMem2D src, typename TypeVec<R, 4>::vec_t* result)
{
typedef typename TypeVec<T, 4>::vec_t SrcType;
typedef typename TypeVec<R, 4>::vec_t DstType;
...
...
@@ -1308,10 +1313,10 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid + 3 * nthreads] = sum.w;
__syncthreads();
sum
_in_s
mem<nthreads, R>(smem, tid);
sum
_in_s
mem<nthreads, R>(smem + nthreads, tid);
sum
_in_s
mem<nthreads, R>(smem + 2 * nthreads, tid);
sum
_in_s
mem<nthreads, R>(smem + 3 * nthreads, tid);
sum
InS
mem<nthreads, R>(smem, tid);
sum
InS
mem<nthreads, R>(smem + nthreads, tid);
sum
InS
mem<nthreads, R>(smem + 2 * nthreads, tid);
sum
InS
mem<nthreads, R>(smem + 3 * nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
...
...
@@ -1341,10 +1346,10 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid + 3 * nthreads] = res.w;
__syncthreads();
sum
_in_s
mem<nthreads, R>(smem, tid);
sum
_in_s
mem<nthreads, R>(smem + nthreads, tid);
sum
_in_s
mem<nthreads, R>(smem + 2 * nthreads, tid);
sum
_in_s
mem<nthreads, R>(smem + 3 * nthreads, tid);
sum
InS
mem<nthreads, R>(smem, tid);
sum
InS
mem<nthreads, R>(smem + nthreads, tid);
sum
InS
mem<nthreads, R>(smem + 2 * nthreads, tid);
sum
InS
mem<nthreads, R>(smem + 3 * nthreads, tid);
if (tid == 0)
{
...
...
@@ -1371,7 +1376,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T, typename R, int nthreads>
__global__ void sum
_pass2_k
ernel_C4(typename TypeVec<R, 4>::vec_t* result, int size)
__global__ void sum
Pass2K
ernel_C4(typename TypeVec<R, 4>::vec_t* result, int size)
{
typedef typename TypeVec<R, 4>::vec_t DstType;
...
...
@@ -1386,10 +1391,10 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid + 3 * nthreads] = res.z;
__syncthreads();
sum
_in_s
mem<nthreads, R>(smem, tid);
sum
_in_s
mem<nthreads, R>(smem + nthreads, tid);
sum
_in_s
mem<nthreads, R>(smem + 2 * nthreads, tid);
sum
_in_s
mem<nthreads, R>(smem + 3 * nthreads, tid);
sum
InS
mem<nthreads, R>(smem, tid);
sum
InS
mem<nthreads, R>(smem + nthreads, tid);
sum
InS
mem<nthreads, R>(smem + 2 * nthreads, tid);
sum
InS
mem<nthreads, R>(smem + 3 * nthreads, tid);
if (tid == 0)
{
...
...
@@ -1405,36 +1410,36 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T>
void sum
_multipass_c
aller(const DevMem2D src, PtrStep buf, double* sum, int cn)
void sum
MultipassC
aller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate
_thread_c
fg(src.cols, src.rows, threads, grid);
set
_kernel_c
onsts(src.cols, src.rows, threads, grid);
estimate
ThreadC
fg(src.cols, src.rows, threads, grid);
set
KernelC
onsts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum
_k
ernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
sum
_pass2_k
ernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
sum
Pass2K
ernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
case 2:
sum
_k
ernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
sum
_pass2_k
ernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
sum
Pass2K
ernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
case 3:
sum
_k
ernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
sum
_pass2_k
ernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
sum
Pass2K
ernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
case 4:
sum
_k
ernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
sum
_pass2_k
ernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
sum
Pass2K
ernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
}
cudaSafeCall(cudaThreadSynchronize());
...
...
@@ -1448,40 +1453,40 @@ namespace cv { namespace gpu { namespace mathfunc
sum[3] = result[3];
}
template void sum
_multipass_c
aller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sum
_multipass_c
aller<char>(const DevMem2D, PtrStep, double*, int);
template void sum
_multipass_c
aller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sum
_multipass_c
aller<short>(const DevMem2D, PtrStep, double*, int);
template void sum
_multipass_c
aller<int>(const DevMem2D, PtrStep, double*, int);
template void sum
_multipass_c
aller<float>(const DevMem2D, PtrStep, double*, int);
template void sum
MultipassC
aller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sum
MultipassC
aller<char>(const DevMem2D, PtrStep, double*, int);
template void sum
MultipassC
aller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sum
MultipassC
aller<short>(const DevMem2D, PtrStep, double*, int);
template void sum
MultipassC
aller<int>(const DevMem2D, PtrStep, double*, int);
template void sum
MultipassC
aller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T>
void sum
_c
aller(const DevMem2D src, PtrStep buf, double* sum, int cn)
void sum
C
aller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate
_thread_c
fg(src.cols, src.rows, threads, grid);
set
_kernel_c
onsts(src.cols, src.rows, threads, grid);
estimate
ThreadC
fg(src.cols, src.rows, threads, grid);
set
KernelC
onsts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum
_k
ernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
break;
case 2:
sum
_k
ernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
break;
case 3:
sum
_k
ernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
break;
case 4:
sum
_k
ernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
break;
}
...
...
@@ -1496,48 +1501,48 @@ namespace cv { namespace gpu { namespace mathfunc
sum[3] = result[3];
}
template void sum
_c
aller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sum
_c
aller<char>(const DevMem2D, PtrStep, double*, int);
template void sum
_c
aller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sum
_c
aller<short>(const DevMem2D, PtrStep, double*, int);
template void sum
_c
aller<int>(const DevMem2D, PtrStep, double*, int);
template void sum
_c
aller<float>(const DevMem2D, PtrStep, double*, int);
template void sum
C
aller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sum
C
aller<char>(const DevMem2D, PtrStep, double*, int);
template void sum
C
aller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sum
C
aller<short>(const DevMem2D, PtrStep, double*, int);
template void sum
C
aller<int>(const DevMem2D, PtrStep, double*, int);
template void sum
C
aller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T>
void sq
sum_multipass_c
aller(const DevMem2D src, PtrStep buf, double* sum, int cn)
void sq
rSumMultipassC
aller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate
_thread_c
fg(src.cols, src.rows, threads, grid);
set
_kernel_c
onsts(src.cols, src.rows, threads, grid);
estimate
ThreadC
fg(src.cols, src.rows, threads, grid);
set
KernelC
onsts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum
_k
ernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
sum
_pass2_k
ernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
sum
Pass2K
ernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 2:
sum
_k
ernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
sum
_pass2_k
ernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
sum
Pass2K
ernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 3:
sum
_k
ernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
sum
_pass2_k
ernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
sum
Pass2K
ernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 4:
sum
_k
ernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
sum
_pass2_k
ernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
sum
Pass2K
ernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
}
...
...
@@ -1552,40 +1557,40 @@ namespace cv { namespace gpu { namespace mathfunc
sum[3] = result[3];
}
template void sqsum_multipass_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<float>(const DevMem2D, PtrStep, double*, int);
template void sqrSumMultipassCaller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sqrSumMultipassCaller<char>(const DevMem2D, PtrStep, double*, int);
template void sqrSumMultipassCaller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sqrSumMultipassCaller<short>(const DevMem2D, PtrStep, double*, int);
template void sqrSumMultipassCaller<int>(const DevMem2D, PtrStep, double*, int);
template void sqrSumMultipassCaller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T>
void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
void sqrSumCaller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
estimateThreadCfg(src.cols, src.rows, threads, grid);
setKernelConsts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
sumKernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
break;
case 2:
sum_kernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
sumKernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
break;
case 3:
sum_kernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
sumKernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
break;
case 4:
sum_kernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
sumKernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
break;
}
...
...
@@ -1600,10 +1605,10 @@ namespace cv { namespace gpu { namespace mathfunc
sum[3] = result[3];
}
template void sqsum_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<float>(const DevMem2D, PtrStep, double*, int);
template void sqrSumCaller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sqrSumCaller<char>(const DevMem2D, PtrStep, double*, int);
template void sqrSumCaller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sqrSumCaller<short>(const DevMem2D, PtrStep, double*, int);
template void sqrSumCaller<int>(const DevMem2D, PtrStep, double*, int);
template void sqrSumCaller<float>(const DevMem2D, PtrStep, double*, int);
}}}
\ No newline at end of file
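As a reading aid for the refactoring above: the multipass callers launch the reduction kernel and then a second pass over the per-block partial sums, while the single-pass callers launch only the first kernel (presumably finishing the accumulation with device atomics). A minimal plain-C++ sketch of that two-pass idea follows; data, block, and the use of std::accumulate are illustrative stand-ins, not the CUDA kernels.

// Illustrative sketch only (assumed semantics, plain C++): pass 1 produces one
// partial sum per block, pass 2 folds the partials -- the roles played by the
// first kernel and the pass-2 kernel in the CUDA code above.
#include <algorithm>
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
    std::vector<int> data(1000, 1);
    const std::size_t block = 256; // stand-in for threads_x * threads_y

    // Pass 1: one partial sum per "block" (what the first kernel writes into buf).
    std::vector<int> partials;
    for (std::size_t i = 0; i < data.size(); i += block)
    {
        const std::size_t end = std::min(i + block, data.size());
        partials.push_back(std::accumulate(data.begin() + i, data.begin() + end, 0));
    }

    // Pass 2: fold the partials (what the pass-2 kernel does in a single block).
    const int total = std::accumulate(partials.begin(), partials.end(), 0);
    std::cout << total << std::endl; // prints 1000
    return 0;
}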
modules/gpu/src/initialization.cpp
View file @ 90ae1e3a
...
...
@@ -133,85 +133,81 @@ CV_EXPORTS bool cv::gpu::hasAtomicsSupport(int device)
namespace
{
    template <unsigned int cmp_op>
    bool comparePairs(int lhs1, int lhs2, int rhs1, int rhs2);
    template <>
    bool comparePairs<CMP_EQ>(int lhs1, int lhs2, int rhs1, int rhs2)
    struct ComparerEqual
    {
        bool operator()(int lhs1, int lhs2, int rhs1, int rhs2) const
        {
            return lhs1 == rhs1 && lhs2 == rhs2;
        }
    };
    template <>
    bool comparePairs<CMP_GT>(int lhs1, int lhs2, int rhs1, int rhs2)
    {
        return lhs1 > rhs1 || (lhs1 == rhs1 && lhs2 > rhs2);
    }
    template <>
    bool comparePairs<CMP_GE>(int lhs1, int lhs2, int rhs1, int rhs2)
    struct ComparerLessOrEqual
    {
        return lhs1 > rhs1 || (lhs1 == rhs1 && lhs2 >= rhs2);
    }
    template <>
    bool comparePairs<CMP_LT>(int lhs1, int lhs2, int rhs1, int rhs2)
        bool operator()(int lhs1, int lhs2, int rhs1, int rhs2) const
    {
        return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 < rhs2);
        return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 <= rhs2);
    }
    };
    template <>
    bool comparePairs<CMP_LE>(int lhs1, int lhs2, int rhs1, int rhs2)
    struct ComparerGreaterOrEqual
    {
        return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 <= rhs2);
    }
    template <>
    bool comparePairs<CMP_NE>(int lhs1, int lhs2, int rhs1, int rhs2)
        bool operator()(int lhs1, int lhs2, int rhs1, int rhs2) const
    {
        return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 <= rhs2);
        return lhs1 > rhs1 || (lhs1 == rhs1 && lhs2 >= rhs2);
    }
    }
    };
template <unsigned int cmp_op>
CV_EXPORTS bool cv::gpu::checkPtxVersion(int major, int minor)
{
    template <typename Comparer>
    bool checkPtxVersion(int major, int minor, Comparer cmp)
    {
#ifdef OPENCV_GPU_CUDA_ARCH_10
        if (comparePairs<cmp_op>(1, 0, major, minor)) return true;
        if (cmp(1, 0, major, minor)) return true;
#endif
#ifdef OPENCV_GPU_CUDA_ARCH_11
        if (comparePairs<cmp_op>(1, 1, major, minor)) return true;
        if (cmp(1, 1, major, minor)) return true;
#endif
#ifdef OPENCV_GPU_CUDA_ARCH_12
        if (comparePairs<cmp_op>(1, 2, major, minor)) return true;
        if (cmp(1, 2, major, minor)) return true;
#endif
#ifdef OPENCV_GPU_CUDA_ARCH_13
        if (comparePairs<cmp_op>(1, 3, major, minor)) return true;
        if (cmp(1, 3, major, minor)) return true;
#endif
#ifdef OPENCV_GPU_CUDA_ARCH_20
        if (comparePairs<cmp_op>(2, 0, major, minor)) return true;
        if (cmp(2, 0, major, minor)) return true;
#endif
#ifdef OPENCV_GPU_CUDA_ARCH_21
        if (comparePairs<cmp_op>(2, 1, major, minor)) return true;
        if (cmp(2, 1, major, minor)) return true;
#endif
        return false;
    }
}
CV_EXPORTS bool cv::gpu::ptxVersionIs(int major, int minor)
{
    return checkPtxVersion(major, minor, ComparerEqual());
}
template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_EQ>(int major, int minor);
template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_GT>(int major, int minor);
template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_GE>(int major, int minor);
template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_LT>(int major, int minor);
template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_LE>(int major, int minor);
template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_NE>(int major, int minor);
CV_EXPORTS bool cv::gpu::ptxVersionIsLessOrEqual(int major, int minor)
{
    return checkPtxVersion(major, minor, ComparerLessOrEqual());
}
CV_EXPORTS bool cv::gpu::ptxVersionIsGreaterOrEqual(int major, int minor)
{
    return checkPtxVersion(major, minor, ComparerGreaterOrEqual());
}
CV_EXPORTS bool isCompatibleWith(int device)
...
...
@@ -223,7 +219,7 @@ CV_EXPORTS bool isCompatibleWith(int device)
    int major, minor;
    getComputeCapability(device, major, minor);
    return checkPtxVersion<CMP_LE>(major, minor);
    return ptxVersionIsLessOrEqual(major, minor);
}
#endif
...
...
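As a reading aid for the initialization.cpp change: the comparison-code template parameter of checkPtxVersion is replaced by small comparer functors, and each public query (ptxVersionIs, ptxVersionIsLessOrEqual, ptxVersionIsGreaterOrEqual) simply forwards the matching functor. A minimal sketch of that pattern, assuming hypothetical names versionSatisfies and GreaterOrEqual:

// Illustrative sketch only; versionSatisfies and GreaterOrEqual are hypothetical names.
#include <iostream>

struct GreaterOrEqual
{
    bool operator()(int lhs1, int lhs2, int rhs1, int rhs2) const
    {
        // (lhs1, lhs2) >= (rhs1, rhs2) in lexicographic (major, minor) order.
        return lhs1 > rhs1 || (lhs1 == rhs1 && lhs2 >= rhs2);
    }
};

template <typename Comparer>
bool versionSatisfies(int builtMajor, int builtMinor, int major, int minor, Comparer cmp)
{
    // True when the capability the module was built for satisfies the requested
    // comparison; the real helper repeats this per OPENCV_GPU_CUDA_ARCH_* macro.
    return cmp(builtMajor, builtMinor, major, minor);
}

int main()
{
    // e.g. "was a PTX target of compute capability >= 1.1 built in?"
    std::cout << versionSatisfies(2, 0, 1, 1, GreaterOrEqual()) << std::endl; // prints 1
    return 0;
}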
modules/gpu/src/matrix_reductions.cpp
View file @ 90ae1e3a
...
...
@@ -119,20 +119,20 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
namespace cv { namespace gpu { namespace mathfunc {
    template <typename T>
    void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
    void sumCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);
    template <typename T>
    void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
    void sumMultipassCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);
    template <typename T>
    void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
    void sqrSumCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);
    template <typename T>
    void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
    void sqrSumMultipassCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);
    namespace sum
    {
        void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows);
        void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows);
}
}}}
...
...
@@ -149,19 +149,27 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
    using namespace mathfunc;
    typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);
    static const Caller callers[2][7] =
    {
        { sum_multipass_caller<unsigned char>, sum_multipass_caller<char>, sum_multipass_caller<unsigned short>, sum_multipass_caller<short>, sum_multipass_caller<int>, sum_multipass_caller<float>, 0 },
        { sum_caller<unsigned char>, sum_caller<char>, sum_caller<unsigned short>, sum_caller<short>, sum_caller<int>, sum_caller<float>, 0 }
    };
    Size bufSize;
    sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height);
    ensureSizeIsEnough(bufSize, CV_8U, buf);
    Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()];
    static Caller multipass_callers[7] = { sumMultipassCaller<unsigned char>, sumMultipassCaller<char>, sumMultipassCaller<unsigned short>, sumMultipassCaller<short>, sumMultipassCaller<int>, sumMultipassCaller<float>, 0 };
    static Caller singlepass_callers[7] = { sumCaller<unsigned char>, sumCaller<char>, sumCaller<unsigned short>, sumCaller<short>, sumCaller<int>, sumCaller<float>, 0 };
    Size buf_size;
    sum::getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
    ensureSizeIsEnough(buf_size, CV_8U, buf);
    Caller* callers = multipass_callers;
    if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
        callers = singlepass_callers;
    Caller caller = callers[src.depth()];
    if (!caller) CV_Error(CV_StsBadArg, "sum: unsupported type");
    double result[4];
...
...
@@ -182,19 +190,27 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
    using namespace mathfunc;
    typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);
    static const Caller callers[2][7] =
    {
        { sqsum_multipass_caller<unsigned char>, sqsum_multipass_caller<char>, sqsum_multipass_caller<unsigned short>, sqsum_multipass_caller<short>, sqsum_multipass_caller<int>, sqsum_multipass_caller<float>, 0 },
        { sqsum_caller<unsigned char>, sqsum_caller<char>, sqsum_caller<unsigned short>, sqsum_caller<short>, sqsum_caller<int>, sqsum_caller<float>, 0 }
    };
    Size bufSize;
    sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height);
    ensureSizeIsEnough(bufSize, CV_8U, buf);
    Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()];
    static Caller multipass_callers[7] = { sqrSumMultipassCaller<unsigned char>, sqrSumMultipassCaller<char>, sqrSumMultipassCaller<unsigned short>, sqrSumMultipassCaller<short>, sqrSumMultipassCaller<int>, sqrSumMultipassCaller<float>, 0 };
    static Caller singlepass_callers[7] = { sqrSumCaller<unsigned char>, sqrSumCaller<char>, sqrSumCaller<unsigned short>, sqrSumCaller<short>, sqrSumCaller<int>, sqrSumCaller<float>, 0 };
    Caller* callers = multipass_callers;
    if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
        callers = singlepass_callers;
    Size buf_size;
    sum::getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
    ensureSizeIsEnough(buf_size, CV_8U, buf);
    Caller caller = callers[src.depth()];
    if (!caller) CV_Error(CV_StsBadArg, "sqrSum: unsupported type");
    double result[4];
...
...
@@ -207,19 +223,19 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
namespace cv { namespace gpu { namespace mathfunc { namespace minmax {
    void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
    void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
    template <typename T>
    void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
    void minMaxCaller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
    template <typename T>
    void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);
    void minMaxMaskCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);
    template <typename T>
    void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
    void minMaxMultipassCaller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
    template <typename T>
    void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);
    void minMaxMaskMultipassCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);
}}}}
...
...
@@ -238,23 +254,26 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
    typedef void (*Caller)(const DevMem2D, double*, double*, PtrStep);
    typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, PtrStep);
    static const Caller callers[2][7] =
    {
        { min_max_multipass_caller<unsigned char>, min_max_multipass_caller<char>, min_max_multipass_caller<unsigned short>, min_max_multipass_caller<short>, min_max_multipass_caller<int>, min_max_multipass_caller<float>, 0 },
        { min_max_caller<unsigned char>, min_max_caller<char>, min_max_caller<unsigned short>, min_max_caller<short>, min_max_caller<int>, min_max_caller<float>, min_max_caller<double> }
    };
    static Caller multipass_callers[7] = { minMaxMultipassCaller<unsigned char>, minMaxMultipassCaller<char>, minMaxMultipassCaller<unsigned short>, minMaxMultipassCaller<short>, minMaxMultipassCaller<int>, minMaxMultipassCaller<float>, 0 };
    static const MaskedCaller masked_callers[2][7] =
    {
        { min_max_mask_multipass_caller<unsigned char>, min_max_mask_multipass_caller<char>, min_max_mask_multipass_caller<unsigned short>, min_max_mask_multipass_caller<short>, min_max_mask_multipass_caller<int>, min_max_mask_multipass_caller<float>, 0 },
        { min_max_mask_caller<unsigned char>, min_max_mask_caller<char>, min_max_mask_caller<unsigned short>, min_max_mask_caller<short>, min_max_mask_caller<int>, min_max_mask_caller<float>, min_max_mask_caller<double> }
    };
    static Caller singlepass_callers[7] = { minMaxCaller<unsigned char>, minMaxCaller<char>, minMaxCaller<unsigned short>, minMaxCaller<short>, minMaxCaller<int>, minMaxCaller<float>, minMaxCaller<double> };
    static MaskedCaller masked_multipass_callers[7] = { minMaxMaskMultipassCaller<unsigned char>, minMaxMaskMultipassCaller<char>, minMaxMaskMultipassCaller<unsigned short>, minMaxMaskMultipassCaller<short>, minMaxMaskMultipassCaller<int>, minMaxMaskMultipassCaller<float>, 0 };
    static MaskedCaller masked_singlepass_callers[7] = { minMaxMaskCaller<unsigned char>, minMaxMaskCaller<char>, minMaxMaskCaller<unsigned short>, minMaxMaskCaller<short>, minMaxMaskCaller<int>, minMaxMaskCaller<float>, minMaxMaskCaller<double> };
    CV_Assert(src.channels() == 1);
    CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
...
...
@@ -263,19 +282,27 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
    double minVal_; if (!minVal) minVal = &minVal_;
    double maxVal_; if (!maxVal) maxVal = &maxVal_;
    Size bufSize;
    get_buf_size_required(src.cols, src.rows, src.elemSize(), bufSize.width, bufSize.height);
    ensureSizeIsEnough(bufSize, CV_8U, buf);
    Size buf_size;
    getBufSizeRequired(src.cols, src.rows, src.elemSize(), buf_size.width, buf_size.height);
    ensureSizeIsEnough(buf_size, CV_8U, buf);
    if (mask.empty())
    {
        Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
        Caller* callers = multipass_callers;
        if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
            callers = singlepass_callers;
        Caller caller = callers[src.type()];
        if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
        caller(src, minVal, maxVal, buf);
    }
    else
    {
        MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()];
        MaskedCaller* callers = masked_multipass_callers;
        if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
            callers = masked_singlepass_callers;
        MaskedCaller caller = callers[src.type()];
        if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
        caller(src, mask, minVal, maxVal, buf);
}
...
...
@@ -287,23 +314,23 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc {
    void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols,
    void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols,
                             int& b1rows, int& b2cols, int& b2rows);
    template <typename T>
    void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval,
    void minMaxLocCaller(const DevMem2D src, double* minval, double* maxval,
                            int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf);
    template <typename T>
    void min_max_loc_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
    void minMaxLocMaskCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
                                int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf);
    template <typename T>
    void min_max_loc_multipass_caller(const DevMem2D src, double* minval, double* maxval,
    void minMaxLocMultipassCaller(const DevMem2D src, double* minval, double* maxval,
                                      int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf);
    template <typename T>
    void min_max_loc_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
    void minMaxLocMaskMultipassCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
                                          int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf);
}}}}
...
...
@@ -323,21 +350,26 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
    typedef void (*Caller)(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
    typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
    static const Caller callers[2][7] =
    {
        { min_max_loc_multipass_caller<unsigned char>, min_max_loc_multipass_caller<char>, min_max_loc_multipass_caller<unsigned short>, min_max_loc_multipass_caller<short>, min_max_loc_multipass_caller<int>, min_max_loc_multipass_caller<float>, 0 },
        { min_max_loc_caller<unsigned char>, min_max_loc_caller<char>, min_max_loc_caller<unsigned short>, min_max_loc_caller<short>, min_max_loc_caller<int>, min_max_loc_caller<float>, min_max_loc_caller<double> }
    };
    static const MaskedCaller masked_callers[2][7] =
    {
        { min_max_loc_mask_multipass_caller<unsigned char>, min_max_loc_mask_multipass_caller<char>, min_max_loc_mask_multipass_caller<unsigned short>, min_max_loc_mask_multipass_caller<short>, min_max_loc_mask_multipass_caller<int>, min_max_loc_mask_multipass_caller<float>, 0 },
        { min_max_loc_mask_caller<unsigned char>, min_max_loc_mask_caller<char>, min_max_loc_mask_caller<unsigned short>, min_max_loc_mask_caller<short>, min_max_loc_mask_caller<int>, min_max_loc_mask_caller<float>, min_max_loc_mask_caller<double> }
    };
    static Caller multipass_callers[7] = { minMaxLocMultipassCaller<unsigned char>, minMaxLocMultipassCaller<char>, minMaxLocMultipassCaller<unsigned short>, minMaxLocMultipassCaller<short>, minMaxLocMultipassCaller<int>, minMaxLocMultipassCaller<float>, 0 };
    static Caller singlepass_callers[7] = { minMaxLocCaller<unsigned char>, minMaxLocCaller<char>, minMaxLocCaller<unsigned short>, minMaxLocCaller<short>, minMaxLocCaller<int>, minMaxLocCaller<float>, minMaxLocCaller<double> };
    static MaskedCaller masked_multipass_callers[7] = { minMaxLocMaskMultipassCaller<unsigned char>, minMaxLocMaskMultipassCaller<char>, minMaxLocMaskMultipassCaller<unsigned short>, minMaxLocMaskMultipassCaller<short>, minMaxLocMaskMultipassCaller<int>, minMaxLocMaskMultipassCaller<float>, 0 };
    static MaskedCaller masked_singlepass_callers[7] = { minMaxLocMaskCaller<unsigned char>, minMaxLocMaskCaller<char>, minMaxLocMaskCaller<unsigned short>, minMaxLocMaskCaller<short>, minMaxLocMaskCaller<int>, minMaxLocMaskCaller<float>, minMaxLocMaskCaller<double> };
    CV_Assert(src.channels() == 1);
    CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
...
...
@@ -348,21 +380,29 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
    int minLoc_[2];
    int maxLoc_[2];
    Size valBufSize, locBufSize;
    get_buf_size_required(src.cols, src.rows, src.elemSize(), valBufSize.width, valBufSize.height, locBufSize.width, locBufSize.height);
    ensureSizeIsEnough(valBufSize, CV_8U, valBuf);
    ensureSizeIsEnough(locBufSize, CV_8U, locBuf);
    Size valbuf_size, locbuf_size;
    getBufSizeRequired(src.cols, src.rows, src.elemSize(), valbuf_size.width, valbuf_size.height, locbuf_size.width, locbuf_size.height);
    ensureSizeIsEnough(valbuf_size, CV_8U, valBuf);
    ensureSizeIsEnough(locbuf_size, CV_8U, locBuf);
    if (mask.empty())
    {
        Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
        Caller* callers = multipass_callers;
        if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
            callers = singlepass_callers;
        Caller caller = callers[src.type()];
        if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
        caller(src, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
    }
    else
    {
        MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()];
        MaskedCaller* callers = masked_multipass_callers;
        if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
            callers = masked_singlepass_callers;
        MaskedCaller caller = callers[src.type()];
        if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
        caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
}
...
...
@@ -376,13 +416,13 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero {
    void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows);
    void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows);
    template <typename T>
    int count_non_zero_caller(const DevMem2D src, PtrStep buf);
    int countNonZeroCaller(const DevMem2D src, PtrStep buf);
    template <typename T>
    int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf);
    int countNonZeroMultipassCaller(const DevMem2D src, PtrStep buf);
}}}}
...
...
@@ -400,22 +440,29 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
    typedef int (*Caller)(const DevMem2D src, PtrStep buf);
    static const Caller callers[2][7] =
    {
        { count_non_zero_multipass_caller<unsigned char>, count_non_zero_multipass_caller<char>, count_non_zero_multipass_caller<unsigned short>, count_non_zero_multipass_caller<short>, count_non_zero_multipass_caller<int>, count_non_zero_multipass_caller<float>, 0 },
        { count_non_zero_caller<unsigned char>, count_non_zero_caller<char>, count_non_zero_caller<unsigned short>, count_non_zero_caller<short>, count_non_zero_caller<int>, count_non_zero_caller<float>, count_non_zero_caller<double> }
    };
    static Caller multipass_callers[7] = { countNonZeroMultipassCaller<unsigned char>, countNonZeroMultipassCaller<char>, countNonZeroMultipassCaller<unsigned short>, countNonZeroMultipassCaller<short>, countNonZeroMultipassCaller<int>, countNonZeroMultipassCaller<float>, 0 };
    static Caller singlepass_callers[7] = { countNonZeroCaller<unsigned char>, countNonZeroCaller<char>, countNonZeroCaller<unsigned short>, countNonZeroCaller<short>, countNonZeroCaller<int>, countNonZeroCaller<float>, countNonZeroCaller<double> };
    CV_Assert(src.channels() == 1);
    CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));
    Size bufSize;
    get_buf_size_required(src.cols, src.rows, bufSize.width, bufSize.height);
    ensureSizeIsEnough(bufSize, CV_8U, buf);
    Size buf_size;
    getBufSizeRequired(src.cols, src.rows, buf_size.width, buf_size.height);
    ensureSizeIsEnough(buf_size, CV_8U, buf);
    Caller* callers = multipass_callers;
    if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
        callers = singlepass_callers;
    Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
    Caller caller = callers[src.type()];
    if (!caller) CV_Error(CV_StsBadArg, "countNonZero: unsupported type");
    return caller(src, buf);
}
...
...
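As a reading aid for matrix_reductions.cpp: every reduction now uses the same dispatch idiom, starting from the multipass caller table and switching to the single-pass table only when the module was built with PTX for compute capability 1.1 or higher and the current device supports atomics. A minimal self-contained sketch, assuming hypothetical checks builtForAtomics and deviceHasAtomics in place of ptxVersionIsGreaterOrEqual and hasAtomicsSupport:

// Illustrative sketch of the caller-table selection; not library code.
#include <cstdio>

typedef int (*Caller)(int value);

static int multipassImpl(int v)  { return v * 2; } // stand-in for a two-kernel launch
static int singlepassImpl(int v) { return v * 2; } // stand-in for a one-kernel launch

static Caller multipass_callers[2]  = { multipassImpl, 0 };  // 0 marks an unsupported depth
static Caller singlepass_callers[2] = { singlepassImpl, singlepassImpl };

static bool builtForAtomics()  { return true; } // hypothetical capability checks
static bool deviceHasAtomics() { return true; }

static int dispatch(int depth, int value)
{
    Caller* callers = multipass_callers;
    if (builtForAtomics() && deviceHasAtomics())
        callers = singlepass_callers;

    Caller caller = callers[depth];
    if (!caller)
        return -1; // the real code raises CV_Error for unsupported types
    return caller(value);
}

int main()
{
    std::printf("%d\n", dispatch(1, 21)); // prints 42 via the single-pass table
    return 0;
}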