opencv / Commits

Commit 90ae1e3a authored Jan 19, 2011 by Alexey Spizhevoy
refactored gpu module
parent 8503f752
Showing 6 changed files with 472 additions and 411 deletions
  doc/gpu_initialization.tex                  +23   -12
  modules/gpu/include/opencv2/gpu/gpu.hpp      +3    -2
  modules/gpu/src/cuda/imgproc.cu              +2    -2
  modules/gpu/src/cuda/matrix_reductions.cu  +254  -248
  modules/gpu/src/initialization.cpp          +46   -50
  modules/gpu/src/matrix_reductions.cpp      +144   -97
doc/gpu_initialization.tex

@@ -69,22 +69,33 @@ Returns true, if the specified GPU has atomics support, otherwise false.
 \end{description}
 
-\cvCppFunc{gpu::checkPtxVersion}
+\cvCppFunc{gpu::ptxVersionIs}
 Returns true, if the GPU module was built with PTX support of the given compute capability, otherwise false.
 
-\cvdefCpp{template $<$unsigned int cmp\_op$>$\newline
-bool checkPtxVersion(int major, int minor);}
+\cvdefCpp{bool ptxVersionIs(int major, int minor);}
 \begin{description}
-\cvarg{cmp\_op}{Comparison operation:
-\begin{description}
-\cvarg{CMP\_EQ}{Return true, if at least one of GPU module PTX versions matches the given one, otherwise false}
-\cvarg{CMP\_LT}{Return true, if at least one of GPU module PTX versions is less than the given one, otherwise false}
-\cvarg{CMP\_LE}{Return true, if at least one of GPU module PTX versions is less or equal to the given one, otherwise false}
-\cvarg{CMP\_GT}{Return true, if at least one of GPU module PTX versions is greater than the given one, otherwise false}
-\cvarg{CMP\_GE}{Return true, if at least one of GPU module PTX versions is greater or equal to the given one, otherwise false}
-\end{description}}
-\cvarg{major}{Major CC version.}
-\cvarg{minor}{Minor CC version.}
+\cvarg{major}{Major compute capability version.}
+\cvarg{minor}{Minor compute capability version.}
+\end{description}
+
+\cvCppFunc{gpu::ptxVersionIsLessOrEqual}
+Returns true, if the GPU module was built with PTX support of the given compute capability or less, otherwise false.
+
+\cvdefCpp{bool ptxVersionIsLessOrEqual(int major, int minor);}
+\begin{description}
+\cvarg{major}{Major compute capability version.}
+\cvarg{minor}{Minor compute capability version.}
+\end{description}
+
+\cvCppFunc{gpu::ptxVersionIsGreaterOrEqual}
+Returns true, if the GPU module was built with PTX support of the given compute capability or greater, otherwise false.
+
+\cvdefCpp{bool ptxVersionIsGreaterOrEqual(int major, int minor);}
+\begin{description}
+\cvarg{major}{Major compute capability version.}
+\cvarg{minor}{Minor compute capability version.}
 \end{description}
modules/gpu/include/opencv2/gpu/gpu.hpp

@@ -72,8 +72,9 @@ namespace cv
         CV_EXPORTS bool hasNativeDoubleSupport(int device);
         CV_EXPORTS bool hasAtomicsSupport(int device);
 
-        template <unsigned int cmp_op>
-        CV_EXPORTS bool checkPtxVersion(int major, int minor);
+        CV_EXPORTS bool ptxVersionIs(int major, int minor);
+        CV_EXPORTS bool ptxVersionIsLessOrEqual(int major, int minor);
+        CV_EXPORTS bool ptxVersionIsGreaterOrEqual(int major, int minor);
 
         //! Checks if the GPU module is PTX compatible with the given NVIDIA device
         CV_EXPORTS bool isCompatibleWith(int device);
...
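The three new queries replace the single template entry point: the comparison is now spelled out in the function name instead of being passed as the cmp_op template argument. A minimal caller-side sketch of the new interface follows; the wrapper function and the device-index handling are illustrative assumptions, not code from this commit.

    #include "opencv2/gpu/gpu.hpp"

    // Illustrative sketch only: combine the build-time PTX query with a
    // runtime capability query before enabling an atomics-based code path.
    bool canUseAtomicsPath(int device)
    {
        // Old style (removed in this commit): checkPtxVersion<CMP_GE>(1, 1)
        // New style: one named function per comparison.
        return cv::gpu::ptxVersionIsGreaterOrEqual(1, 1)   // module carries sm_11 PTX or newer
            && cv::gpu::hasAtomicsSupport(device);          // and this device has global atomics
    }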
modules/gpu/src/cuda/imgproc.cu

@@ -719,7 +719,7 @@ namespace cv { namespace gpu { namespace imgproc
     ////////////////////////////// Column Sum //////////////////////////////////////
 
-    __global__ void column_sum_kernel_32F(int cols, int rows, const PtrStep src, const PtrStep dst)
+    __global__ void columnSumKernel_32F(int cols, int rows, const PtrStep src, const PtrStep dst)
     {
         int x = blockIdx.x * blockDim.x + threadIdx.x;

@@ -745,7 +745,7 @@ namespace cv { namespace gpu { namespace imgproc
         dim3 threads(256);
         dim3 grid(divUp(src.cols, threads.x));
 
-        column_sum_kernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
+        columnSumKernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
         cudaSafeCall(cudaThreadSynchronize());
     }
...
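To make the launch geometry concrete (the image size is invented for illustration): with 256-thread blocks and a 1000-column image, divUp(1000, 256) = 4 blocks are launched, i.e. 1024 threads with one column per thread; the threads whose x index lands past the last column are presumably stopped by a bounds check that lies outside the lines shown here.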
modules/gpu/src/cuda/matrix_reductions.cu

@@ -54,7 +54,7 @@ namespace cv { namespace gpu { namespace mathfunc
         // Performs reduction in shared memory
         template <int size, typename T>
-        __device__ void sum_in_smem(volatile T* data, const uint tid)
+        __device__ void sumInSmem(volatile T* data, const uint tid)
         {
             T sum = data[tid];

@@ -122,7 +122,7 @@ namespace cv { namespace gpu { namespace mathfunc
         // Estimates good thread configuration
         // - threads variable satisfies to threads.x * threads.y == 256
-        void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
+        void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid)
         {
             threads = dim3(32, 8);
             grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));

@@ -132,17 +132,17 @@ namespace cv { namespace gpu { namespace mathfunc
         // Returns required buffer sizes
-        void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows)
+        void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows)
         {
             dim3 threads, grid;
-            estimate_thread_cfg(cols, rows, threads, grid);
+            estimateThreadCfg(cols, rows, threads, grid);
             bufcols = grid.x * grid.y * elem_size;
             bufrows = 2;
         }

         // Estimates device constants which are used in the kernels using specified thread configuration
-        void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
+        void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)
         {
             int twidth = divUp(divUp(cols, grid.x), threads.x);
             int theight = divUp(divUp(rows, grid.y), threads.y);
...
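To make the sizing concrete (the image size is invented for illustration): each 32x8 block nominally covers a 256x256 tile, since every thread walks several columns and rows. For a 1920x1080 single-channel 8-bit image, grid.x = divUp(1920, 32*8) = 8 and grid.y = divUp(1080, 8*32) = 5, i.e. 40 blocks, so getBufSizeRequired reserves a buffer 40*elem_size bytes wide and 2 rows tall: one row of per-block minima and one of per-block maxima. setKernelConsts then gives each thread an 8x27 tile, since twidth = divUp(divUp(1920, 8), 32) = 8 and theight = divUp(divUp(1080, 5), 8) = 27.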
@@ -161,7 +161,7 @@ namespace cv { namespace gpu { namespace mathfunc
         template <int size, typename T>
-        __device__ void find_min_max_in_smem(volatile T* minval, volatile T* maxval, const uint tid)
+        __device__ void findMinMaxInSmem(volatile T* minval, volatile T* maxval, const uint tid)
         {
             if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval); } __syncthreads(); }
             if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval); } __syncthreads(); }

@@ -180,7 +180,7 @@ namespace cv { namespace gpu { namespace mathfunc
         template <int nthreads, typename T, typename Mask>
-        __global__ void min_max_kernel(const DevMem2D src, Mask mask, T* minval, T* maxval)
+        __global__ void minMaxKernel(const DevMem2D src, Mask mask, T* minval, T* maxval)
         {
             typedef typename MinMaxTypeTraits<T>::best_type best_type;
             __shared__ best_type sminval[nthreads];

@@ -212,7 +212,7 @@ namespace cv { namespace gpu { namespace mathfunc
             smaxval[tid] = mymax;
             __syncthreads();
-            find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
+            findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);
             if (tid == 0)
...
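Only the first lines of these helpers are visible in the hunks above, so the following is a sketch of the standard shared-memory tree reduction they implement rather than the file's exact body; the pairwise merge() used by findMinMaxInSmem is replaced by a plain addition here, and the unsynchronized tail relies on the warp-synchronous behaviour of the devices this module targeted.

    // Sketch of a sumInSmem-style reduction: each step folds the upper half of
    // the shared array into the lower half; only the first warp runs the tail.
    template <int size, typename T>
    __device__ void sumInSmemSketch(volatile T* data, const unsigned int tid)
    {
        T sum = data[tid];

        if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); }
        if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); }
        if (size >= 128) { if (tid <  64) { data[tid] = sum = sum + data[tid +  64]; } __syncthreads(); }

        if (tid < 32)  // last warp: lockstep execution, no __syncthreads() needed on these devices
        {
            if (size >= 64) data[tid] = sum = sum + data[tid + 32];
            if (size >= 32) data[tid] = sum = sum + data[tid + 16];
            if (size >= 16) data[tid] = sum = sum + data[tid +  8];
            if (size >=  8) data[tid] = sum = sum + data[tid +  4];
            if (size >=  4) data[tid] = sum = sum + data[tid +  2];
            if (size >=  2) data[tid] = sum = sum + data[tid +  1];
        }
        // After this returns, data[0] holds the block-wide sum.
    }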
@@ -243,7 +243,7 @@ namespace cv { namespace gpu { namespace mathfunc
             smaxval[tid] = maxval[idx];
             __syncthreads();
-            find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
+            findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);
             if (tid == 0)

@@ -263,16 +263,16 @@ namespace cv { namespace gpu { namespace mathfunc
         template <typename T>
-        void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
+        void minMaxMaskCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
         {
             dim3 threads, grid;
-            estimate_thread_cfg(src.cols, src.rows, threads, grid);
-            set_kernel_consts(src.cols, src.rows, threads, grid);
+            estimateThreadCfg(src.cols, src.rows, threads, grid);
+            setKernelConsts(src.cols, src.rows, threads, grid);
             T* minval_buf = (T*)buf.ptr(0);
             T* maxval_buf = (T*)buf.ptr(1);
-            min_max_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
+            minMaxKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
             cudaSafeCall(cudaThreadSynchronize());

@@ -282,26 +282,26 @@ namespace cv { namespace gpu { namespace mathfunc
-        template void min_max_mask_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
+        template void minMaxMaskCaller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
        (the char, ushort, short, int, float and double instantiations are renamed the same way)

         template <typename T>
-        void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
+        void minMaxCaller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
-            estimate_thread_cfg(src.cols, src.rows, threads, grid);
-            set_kernel_consts(src.cols, src.rows, threads, grid);
+            estimateThreadCfg(src.cols, src.rows, threads, grid);
+            setKernelConsts(src.cols, src.rows, threads, grid);
-            min_max_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
+            minMaxKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);

@@ -311,17 +311,17 @@ namespace cv { namespace gpu { namespace mathfunc
-        template void min_max_caller<uchar>(const DevMem2D, double*, double*, PtrStep);
+        template void minMaxCaller<uchar>(const DevMem2D, double*, double*, PtrStep);
        (likewise for char, ushort, short, int, float and double)

         template <int nthreads, typename T>
-        __global__ void min_max_pass2_kernel(T* minval, T* maxval, int size)
+        __global__ void minMaxPass2Kernel(T* minval, T* maxval, int size)

@@ -334,7 +334,7 @@ namespace cv { namespace gpu { namespace mathfunc
-            find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
+            findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);

@@ -345,17 +345,17 @@ namespace cv { namespace gpu { namespace mathfunc
         template <typename T>
-        void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
+        void minMaxMaskMultipassCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
-            estimate_thread_cfg(src.cols, src.rows, threads, grid);
-            set_kernel_consts(src.cols, src.rows, threads, grid);
+            estimateThreadCfg(src.cols, src.rows, threads, grid);
+            setKernelConsts(src.cols, src.rows, threads, grid);
-            min_max_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
-            min_max_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
+            minMaxKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
+            minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);

@@ -365,26 +365,26 @@ namespace cv { namespace gpu { namespace mathfunc
-        template void min_max_mask_multipass_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
+        template void minMaxMaskMultipassCaller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
        (likewise for char, ushort, short, int and float)

         template <typename T>
-        void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
+        void minMaxMultipassCaller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
-            estimate_thread_cfg(src.cols, src.rows, threads, grid);
-            set_kernel_consts(src.cols, src.rows, threads, grid);
+            estimateThreadCfg(src.cols, src.rows, threads, grid);
+            setKernelConsts(src.cols, src.rows, threads, grid);
-            min_max_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
-            min_max_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
+            minMaxKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
+            minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);

@@ -394,12 +394,12 @@ namespace cv { namespace gpu { namespace mathfunc
-        template void min_max_multipass_caller<uchar>(const DevMem2D, double*, double*, PtrStep);
+        template void minMaxMultipassCaller<uchar>(const DevMem2D, double*, double*, PtrStep);
        (likewise for char, ushort, short, int and float)

     } // namespace minmax
@@ -417,7 +417,7 @@ namespace cv { namespace gpu { namespace mathfunc
         // Estimates good thread configuration
         // - threads variable satisfies to threads.x * threads.y == 256
-        void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
+        void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid)
         {
             threads = dim3(32, 8);
             grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));

@@ -427,11 +427,11 @@ namespace cv { namespace gpu { namespace mathfunc
         // Returns required buffer sizes
-        void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols,
+        void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols,
                                    int& b1rows, int& b2cols, int& b2rows)
         {
             dim3 threads, grid;
-            estimate_thread_cfg(cols, rows, threads, grid);
+            estimateThreadCfg(cols, rows, threads, grid);
             b1cols = grid.x * grid.y * elem_size; // For values
             b1rows = 2;
             b2cols = grid.x * grid.y * sizeof(int); // For locations

@@ -440,7 +440,7 @@ namespace cv { namespace gpu { namespace mathfunc
         // Estimates device constants which are used in the kernels using specified thread configuration
-        void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
+        void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)
         {
             int twidth = divUp(divUp(cols, grid.x), threads.x);
             int theight = divUp(divUp(rows, grid.y), threads.y);

@@ -469,8 +469,8 @@ namespace cv { namespace gpu { namespace mathfunc
         template <int size, typename T>
-        __device__ void find_min_max_loc_in_smem(volatile T* minval, volatile T* maxval, volatile uint* minloc,
+        __device__ void findMinMaxLocInSmem(volatile T* minval, volatile T* maxval, volatile uint* minloc,
                                                  volatile uint* maxloc, const uint tid)
         {
             if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval, minloc, maxloc); } __syncthreads(); }
             if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval, minloc, maxloc); } __syncthreads(); }

@@ -489,8 +489,8 @@ namespace cv { namespace gpu { namespace mathfunc
         template <int nthreads, typename T, typename Mask>
-        __global__ void min_max_loc_kernel(const DevMem2D src, Mask mask, T* minval, T* maxval,
+        __global__ void minMaxLocKernel(const DevMem2D src, Mask mask, T* minval, T* maxval,
                                            uint* minloc, uint* maxloc)
         {
             typedef typename MinMaxTypeTraits<T>::best_type best_type;
             __shared__ best_type sminval[nthreads];

@@ -503,7 +503,8 @@ namespace cv { namespace gpu { namespace mathfunc
             uint tid = threadIdx.y * blockDim.x + threadIdx.x;
             T mymin = numeric_limits_gpu<T>::max();
-            T mymax = numeric_limits_gpu<T>::is_signed ? -numeric_limits_gpu<T>::max() : numeric_limits_gpu<T>::min();
+            T mymax = numeric_limits_gpu<T>::is_signed ? -numeric_limits_gpu<T>::max() :
+                                                         numeric_limits_gpu<T>::min();
             uint myminloc = 0;
             uint mymaxloc = 0;
             uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);

@@ -529,7 +530,7 @@ namespace cv { namespace gpu { namespace mathfunc
-            find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
+            findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
             __shared__ bool is_last;

@@ -558,7 +559,7 @@ namespace cv { namespace gpu { namespace mathfunc
-            find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
+            findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);

@@ -582,19 +583,20 @@ namespace cv { namespace gpu { namespace mathfunc
         template <typename T>
-        void min_max_loc_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
+        void minMaxLocMaskCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
                                      int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
-            estimate_thread_cfg(src.cols, src.rows, threads, grid);
-            set_kernel_consts(src.cols, src.rows, threads, grid);
+            estimateThreadCfg(src.cols, src.rows, threads, grid);
+            setKernelConsts(src.cols, src.rows, threads, grid);
             uint* minloc_buf = (uint*)locbuf.ptr(0);
             uint* maxloc_buf = (uint*)locbuf.ptr(1);
-            min_max_loc_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
+            minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf,
+                                                               minloc_buf, maxloc_buf);
             cudaSafeCall(cudaThreadSynchronize());

@@ -610,29 +612,30 @@ namespace cv { namespace gpu { namespace mathfunc
             maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
-        template void min_max_loc_mask_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
+        template void minMaxLocMaskCaller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
        (likewise for char, ushort, short, int, float and double)

         template <typename T>
-        void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval,
+        void minMaxLocCaller(const DevMem2D src, double* minval, double* maxval,
                                 int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
-            estimate_thread_cfg(src.cols, src.rows, threads, grid);
-            set_kernel_consts(src.cols, src.rows, threads, grid);
+            estimateThreadCfg(src.cols, src.rows, threads, grid);
+            setKernelConsts(src.cols, src.rows, threads, grid);
-            min_max_loc_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
+            minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf,
+                                                                 minloc_buf, maxloc_buf);

@@ -648,18 +651,18 @@ namespace cv { namespace gpu { namespace mathfunc
-        template void min_max_loc_caller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+        template void minMaxLocCaller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
        (likewise for char, ushort, short, int, float and double)

         // This kernel will be used only when compute capability is 1.0
         template <int nthreads, typename T>
-        __global__ void min_max_loc_pass2_kernel(T* minval, T* maxval, uint* minloc, uint* maxloc, int size)
+        __global__ void minMaxLocPass2Kernel(T* minval, T* maxval, uint* minloc, uint* maxloc, int size)

@@ -676,7 +679,7 @@ namespace cv { namespace gpu { namespace mathfunc
-            find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
+            findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);

@@ -689,20 +692,21 @@ namespace cv { namespace gpu { namespace mathfunc
         template <typename T>
-        void min_max_loc_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
+        void minMaxLocMaskMultipassCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
                                                int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
-            estimate_thread_cfg(src.cols, src.rows, threads, grid);
-            set_kernel_consts(src.cols, src.rows, threads, grid);
+            estimateThreadCfg(src.cols, src.rows, threads, grid);
+            setKernelConsts(src.cols, src.rows, threads, grid);
-            min_max_loc_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
-            min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
+            minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf,
+                                                               minloc_buf, maxloc_buf);
+            minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);

@@ -718,29 +722,30 @@ namespace cv { namespace gpu { namespace mathfunc
-        template void min_max_loc_mask_multipass_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
+        template void minMaxLocMaskMultipassCaller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
        (likewise for char, ushort, short, int and float)

         template <typename T>
-        void min_max_loc_multipass_caller(const DevMem2D src, double* minval, double* maxval,
+        void minMaxLocMultipassCaller(const DevMem2D src, double* minval, double* maxval,
                                           int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
-            estimate_thread_cfg(src.cols, src.rows, threads, grid);
-            set_kernel_consts(src.cols, src.rows, threads, grid);
+            estimateThreadCfg(src.cols, src.rows, threads, grid);
+            setKernelConsts(src.cols, src.rows, threads, grid);
-            min_max_loc_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
-            min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
+            minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf,
+                                                                 minloc_buf, maxloc_buf);
+            minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);

@@ -756,12 +761,12 @@ namespace cv { namespace gpu { namespace mathfunc
             maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
-        template void min_max_loc_multipass_caller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+        template void minMaxLocMultipassCaller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
        (likewise for char, ushort, short, int and float)

     } // namespace minmaxloc
...
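The callers above hand the winning position back as a flat index and unpack it with maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols. With invented numbers for illustration: for src.cols = 640 and a flat index of 1300, the row is 1300 / 640 = 2 and the column is 1300 - 2*640 = 20.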
@@ -776,7 +781,7 @@ namespace cv { namespace gpu { namespace mathfunc
         __device__ uint blocks_finished = 0;
-        void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
+        void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid)
         {
             threads = dim3(32, 8);
             grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));

@@ -785,16 +790,16 @@ namespace cv { namespace gpu { namespace mathfunc
-        void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows)
+        void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows)
         {
             dim3 threads, grid;
-            estimate_thread_cfg(cols, rows, threads, grid);
+            estimateThreadCfg(cols, rows, threads, grid);
             bufcols = grid.x * grid.y * sizeof(int);
             bufrows = 1;
         }
-        void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
+        void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)
         {
             int twidth = divUp(divUp(cols, grid.x), threads.x);
             int theight = divUp(divUp(rows, grid.y), threads.y);

@@ -804,7 +809,7 @@ namespace cv { namespace gpu { namespace mathfunc
         template <int nthreads, typename T>
-        __global__ void count_non_zero_kernel(const DevMem2D src, volatile uint* count)
+        __global__ void countNonZeroKernel(const DevMem2D src, volatile uint* count)
         {
             __shared__ uint scount[nthreads];

@@ -823,7 +828,7 @@ namespace cv { namespace gpu { namespace mathfunc
             scount[tid] = cnt;
             __syncthreads();
-            sum_in_smem<nthreads, uint>(scount, tid);
+            sumInSmem<nthreads, uint>(scount, tid);
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
             __shared__ bool is_last;

@@ -844,7 +849,7 @@ namespace cv { namespace gpu { namespace mathfunc
                 scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0;
                 __syncthreads();
-                sum_in_smem<nthreads, uint>(scount, tid);
+                sumInSmem<nthreads, uint>(scount, tid);
                 if (tid == 0)

@@ -859,15 +864,15 @@ namespace cv { namespace gpu { namespace mathfunc
         template <typename T>
-        int count_non_zero_caller(const DevMem2D src, PtrStep buf)
+        int countNonZeroCaller(const DevMem2D src, PtrStep buf)
         {
             dim3 threads, grid;
-            estimate_thread_cfg(src.cols, src.rows, threads, grid);
-            set_kernel_consts(src.cols, src.rows, threads, grid);
+            estimateThreadCfg(src.cols, src.rows, threads, grid);
+            setKernelConsts(src.cols, src.rows, threads, grid);
             uint* count_buf = (uint*)buf.ptr(0);
-            count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
+            countNonZeroKernel<256, T><<<grid, threads>>>(src, count_buf);
             cudaSafeCall(cudaThreadSynchronize());

@@ -876,17 +881,17 @@ namespace cv { namespace gpu { namespace mathfunc
-        template int count_non_zero_caller<uchar>(const DevMem2D, PtrStep);
+        template int countNonZeroCaller<uchar>(const DevMem2D, PtrStep);
        (likewise for char, ushort, short, int, float and double)

         template <int nthreads, typename T>
-        __global__ void count_non_zero_pass2_kernel(uint* count, int size)
+        __global__ void countNonZeroPass2Kernel(uint* count, int size)

@@ -894,7 +899,7 @@ namespace cv { namespace gpu { namespace mathfunc
             scount[tid] = tid < size ? count[tid] : 0;
             __syncthreads();
-            sum_in_smem<nthreads, uint>(scount, tid);
+            sumInSmem<nthreads, uint>(scount, tid);
             if (tid == 0)
                 count[0] = scount[0];

@@ -902,16 +907,16 @@ namespace cv { namespace gpu { namespace mathfunc
         template <typename T>
-        int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf)
+        int countNonZeroMultipassCaller(const DevMem2D src, PtrStep buf)
-            estimate_thread_cfg(src.cols, src.rows, threads, grid);
-            set_kernel_consts(src.cols, src.rows, threads, grid);
+            estimateThreadCfg(src.cols, src.rows, threads, grid);
+            setKernelConsts(src.cols, src.rows, threads, grid);
-            count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
-            count_non_zero_pass2_kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);
+            countNonZeroKernel<256, T><<<grid, threads>>>(src, count_buf);
+            countNonZeroPass2Kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);

@@ -920,12 +925,12 @@ namespace cv { namespace gpu { namespace mathfunc
-        template int count_non_zero_multipass_caller<uchar>(const DevMem2D, PtrStep);
+        template int countNonZeroMultipassCaller<uchar>(const DevMem2D, PtrStep);
        (likewise for char, ushort, short, int and float)

     } // namespace countnonzero
...
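The #if __CUDA_ARCH__ >= 110 branches above gate a single-pass scheme: each block publishes a partial count, and the last block to bump a global blocks_finished counter folds all the partials, which is why devices without global atomics fall back to the *MultipassCaller variants instead. The guarded bodies are not visible in this diff, so the kernel below is a generic, self-contained sketch of that pattern (flat-array based rather than DevMem2D based, reusing the sumInSmemSketch helper from the earlier sketch), not the file's code.

    __device__ unsigned int blocks_finished = 0;

    template <int nthreads>
    __global__ void countNonZeroSketch(const unsigned char* data, unsigned int n,
                                       unsigned int* partial,   // one slot per block
                                       unsigned int* result)    // final count
    {
        __shared__ unsigned int scount[nthreads];
        __shared__ bool is_last;

        const unsigned int tid = threadIdx.x;

        // Grid-stride loop: every block accumulates a private count.
        unsigned int cnt = 0;
        for (unsigned int i = blockIdx.x * nthreads + tid; i < n; i += gridDim.x * nthreads)
            cnt += data[i] != 0;

        scount[tid] = cnt;
        __syncthreads();
        sumInSmemSketch<nthreads, unsigned int>(scount, tid);   // block-wide reduction (see sketch above)

        if (tid == 0)
        {
            partial[blockIdx.x] = scount[0];
            __threadfence();                                    // make the partial visible to other blocks
            unsigned int ticket = atomicInc(&blocks_finished, gridDim.x);
            is_last = (ticket == gridDim.x - 1);
        }
        __syncthreads();

        if (is_last)  // exactly one block gets here last
        {
            // Assumes gridDim.x <= nthreads, as the small grids above guarantee.
            scount[tid] = tid < gridDim.x ? partial[tid] : 0;
            __syncthreads();
            sumInSmemSketch<nthreads, unsigned int>(scount, tid);

            if (tid == 0)
            {
                *result = scount[0];
                blocks_finished = 0;                            // reset for the next launch
            }
        }
    }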
@@ -958,7 +963,7 @@ namespace cv { namespace gpu { namespace mathfunc
         const int threads_x = 32;
         const int threads_y = 8;
-        void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
+        void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid)
         {
             threads = dim3(threads_x, threads_y);
             grid = dim3(divUp(cols, threads.x * threads.y),

@@ -968,16 +973,16 @@ namespace cv { namespace gpu { namespace mathfunc
-        void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows)
+        void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows)
         {
             dim3 threads, grid;
-            estimate_thread_cfg(cols, rows, threads, grid);
+            estimateThreadCfg(cols, rows, threads, grid);
             bufcols = grid.x * grid.y * sizeof(double) * cn;
             bufrows = 1;
         }
-        void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
+        void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)
         {
             int twidth = divUp(divUp(cols, grid.x), threads.x);
             int theight = divUp(divUp(rows, grid.y), threads.y);

@@ -986,7 +991,7 @@ namespace cv { namespace gpu { namespace mathfunc
         template <typename T, typename R, typename Op, int nthreads>
-        __global__ void sum_kernel(const DevMem2D src, R* result)
+        __global__ void sumKernel(const DevMem2D src, R* result)
         {
             __shared__ R smem[nthreads];

@@ -1006,7 +1011,7 @@ namespace cv { namespace gpu { namespace mathfunc
             smem[tid] = sum;
             __syncthreads();
-            sum_in_smem<nthreads, R>(smem, tid);
+            sumInSmem<nthreads, R>(smem, tid);
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
             __shared__ bool is_last;

@@ -1027,7 +1032,7 @@ namespace cv { namespace gpu { namespace mathfunc
                 smem[tid] = tid < gridDim.x * gridDim.y ? result[tid] : 0;
                 __syncthreads();
-                sum_in_smem<nthreads, R>(smem, tid);
+                sumInSmem<nthreads, R>(smem, tid);
                 if (tid == 0)

@@ -1042,7 +1047,7 @@ namespace cv { namespace gpu { namespace mathfunc
         template <typename T, typename R, int nthreads>
-        __global__ void sum_pass2_kernel(R* result, int size)
+        __global__ void sumPass2Kernel(R* result, int size)
         {
             __shared__ R smem[nthreads];
             int tid = threadIdx.y * blockDim.x + threadIdx.x;

@@ -1050,7 +1055,7 @@ namespace cv { namespace gpu { namespace mathfunc
             smem[tid] = tid < size ? result[tid] : 0;
             __syncthreads();
-            sum_in_smem<nthreads, R>(smem, tid);
+            sumInSmem<nthreads, R>(smem, tid);
             if (tid == 0)
                 result[0] = smem[0];

@@ -1058,7 +1063,7 @@ namespace cv { namespace gpu { namespace mathfunc
         template <typename T, typename R, typename Op, int nthreads>
-        __global__ void sum_kernel_C2(const DevMem2D src, typename TypeVec<R, 2>::vec_t* result)
+        __global__ void sumKernel_C2(const DevMem2D src, typename TypeVec<R, 2>::vec_t* result)
         {
             typedef typename TypeVec<T, 2>::vec_t SrcType;
             typedef typename TypeVec<R, 2>::vec_t DstType;

@@ -1086,8 +1091,8 @@ namespace cv { namespace gpu { namespace mathfunc
             smem[tid + nthreads] = sum.y;
             __syncthreads();
-            sum_in_smem<nthreads, R>(smem, tid);
-            sum_in_smem<nthreads, R>(smem + nthreads, tid);
+            sumInSmem<nthreads, R>(smem, tid);
+            sumInSmem<nthreads, R>(smem + nthreads, tid);
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
             __shared__ bool is_last;

@@ -1113,8 +1118,8 @@ namespace cv { namespace gpu { namespace mathfunc
                 smem[tid + nthreads] = res.y;
                 __syncthreads();
-                sum_in_smem<nthreads, R>(smem, tid);
-                sum_in_smem<nthreads, R>(smem + nthreads, tid);
+                sumInSmem<nthreads, R>(smem, tid);
+                sumInSmem<nthreads, R>(smem + nthreads, tid);
                 if (tid == 0)

@@ -1137,7 +1142,7 @@ namespace cv { namespace gpu { namespace mathfunc
         template <typename T, typename R, int nthreads>
-        __global__ void sum_pass2_kernel_C2(typename TypeVec<R, 2>::vec_t* result, int size)
+        __global__ void sumPass2Kernel_C2(typename TypeVec<R, 2>::vec_t* result, int size)
         {
             typedef typename TypeVec<R, 2>::vec_t DstType;

@@ -1150,8 +1155,8 @@ namespace cv { namespace gpu { namespace mathfunc
             smem[tid + nthreads] = res.y;
             __syncthreads();
-            sum_in_smem<nthreads, R>(smem, tid);
-            sum_in_smem<nthreads, R>(smem + nthreads, tid);
+            sumInSmem<nthreads, R>(smem, tid);
+            sumInSmem<nthreads, R>(smem + nthreads, tid);
             if (tid == 0)

@@ -1163,7 +1168,7 @@ namespace cv { namespace gpu { namespace mathfunc
         template <typename T, typename R, typename Op, int nthreads>
-        __global__ void sum_kernel_C3(const DevMem2D src, typename TypeVec<R, 3>::vec_t* result)
+        __global__ void sumKernel_C3(const DevMem2D src, typename TypeVec<R, 3>::vec_t* result)
         {
             typedef typename TypeVec<T, 3>::vec_t SrcType;
             typedef typename TypeVec<R, 3>::vec_t DstType;

@@ -1192,9 +1197,9 @@ namespace cv { namespace gpu { namespace mathfunc
             smem[tid + 2 * nthreads] = sum.z;
             __syncthreads();
-            sum_in_smem<nthreads, R>(smem, tid);
-            sum_in_smem<nthreads, R>(smem + nthreads, tid);
-            sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
+            sumInSmem<nthreads, R>(smem, tid);
+            sumInSmem<nthreads, R>(smem + nthreads, tid);
+            sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
             __shared__ bool is_last;

@@ -1222,9 +1227,9 @@ namespace cv { namespace gpu { namespace mathfunc
                 smem[tid + 2 * nthreads] = res.z;
                 __syncthreads();
-                sum_in_smem<nthreads, R>(smem, tid);
-                sum_in_smem<nthreads, R>(smem + nthreads, tid);
-                sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
+                sumInSmem<nthreads, R>(smem, tid);
+                sumInSmem<nthreads, R>(smem + nthreads, tid);
+                sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
                 if (tid == 0)

@@ -1249,7 +1254,7 @@ namespace cv { namespace gpu { namespace mathfunc
         template <typename T, typename R, int nthreads>
-        __global__ void sum_pass2_kernel_C3(typename TypeVec<R, 3>::vec_t* result, int size)
+        __global__ void sumPass2Kernel_C3(typename TypeVec<R, 3>::vec_t* result, int size)
         {
             typedef typename TypeVec<R, 3>::vec_t DstType;

@@ -1263,9 +1268,9 @@ namespace cv { namespace gpu { namespace mathfunc
             smem[tid + 2 * nthreads] = res.z;
             __syncthreads();
-            sum_in_smem<nthreads, R>(smem, tid);
-            sum_in_smem<nthreads, R>(smem + nthreads, tid);
-            sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
+            sumInSmem<nthreads, R>(smem, tid);
+            sumInSmem<nthreads, R>(smem + nthreads, tid);
+            sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
             if (tid == 0)

@@ -1277,7 +1282,7 @@ namespace cv { namespace gpu { namespace mathfunc
         template <typename T, typename R, typename Op, int nthreads>
-        __global__ void sum_kernel_C4(const DevMem2D src, typename TypeVec<R, 4>::vec_t* result)
+        __global__ void sumKernel_C4(const DevMem2D src, typename TypeVec<R, 4>::vec_t* result)
         {
             typedef typename TypeVec<T, 4>::vec_t SrcType;
             typedef typename TypeVec<R, 4>::vec_t DstType;

@@ -1308,10 +1313,10 @@ namespace cv { namespace gpu { namespace mathfunc
             smem[tid + 3 * nthreads] = sum.w;
             __syncthreads();
-            sum_in_smem<nthreads, R>(smem, tid);
-            sum_in_smem<nthreads, R>(smem + nthreads, tid);
-            sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
-            sum_in_smem<nthreads, R>(smem + 3 * nthreads, tid);
+            sumInSmem<nthreads, R>(smem, tid);
+            sumInSmem<nthreads, R>(smem + nthreads, tid);
+            sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
+            sumInSmem<nthreads, R>(smem + 3 * nthreads, tid);
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
             __shared__ bool is_last;

@@ -1341,10 +1346,10 @@ namespace cv { namespace gpu { namespace mathfunc
...
@@ -1341,10 +1346,10 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid + 3 * nthreads] = res.w;
smem[tid + 3 * nthreads] = res.w;
__syncthreads();
__syncthreads();
sum
_in_s
mem<nthreads, R>(smem, tid);
sum
InS
mem<nthreads, R>(smem, tid);
sum
_in_s
mem<nthreads, R>(smem + nthreads, tid);
sum
InS
mem<nthreads, R>(smem + nthreads, tid);
sum
_in_s
mem<nthreads, R>(smem + 2 * nthreads, tid);
sum
InS
mem<nthreads, R>(smem + 2 * nthreads, tid);
sum
_in_s
mem<nthreads, R>(smem + 3 * nthreads, tid);
sum
InS
mem<nthreads, R>(smem + 3 * nthreads, tid);
if (tid == 0)
if (tid == 0)
{
{
...
@@ -1371,7 +1376,7 @@ namespace cv { namespace gpu { namespace mathfunc
...
@@ -1371,7 +1376,7 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T, typename R, int nthreads>
template <typename T, typename R, int nthreads>
__global__ void sum
_pass2_k
ernel_C4(typename TypeVec<R, 4>::vec_t* result, int size)
__global__ void sum
Pass2K
ernel_C4(typename TypeVec<R, 4>::vec_t* result, int size)
{
{
typedef typename TypeVec<R, 4>::vec_t DstType;
typedef typename TypeVec<R, 4>::vec_t DstType;
...
@@ -1386,10 +1391,10 @@ namespace cv { namespace gpu { namespace mathfunc
...
@@ -1386,10 +1391,10 @@ namespace cv { namespace gpu { namespace mathfunc
smem[tid + 3 * nthreads] = res.z;
smem[tid + 3 * nthreads] = res.z;
__syncthreads();
__syncthreads();
sum
_in_s
mem<nthreads, R>(smem, tid);
sum
InS
mem<nthreads, R>(smem, tid);
sum
_in_s
mem<nthreads, R>(smem + nthreads, tid);
sum
InS
mem<nthreads, R>(smem + nthreads, tid);
sum
_in_s
mem<nthreads, R>(smem + 2 * nthreads, tid);
sum
InS
mem<nthreads, R>(smem + 2 * nthreads, tid);
sum
_in_s
mem<nthreads, R>(smem + 3 * nthreads, tid);
sum
InS
mem<nthreads, R>(smem + 3 * nthreads, tid);
if (tid == 0)
if (tid == 0)
{
{
...
@@ -1405,36 +1410,36 @@ namespace cv { namespace gpu { namespace mathfunc
...
@@ -1405,36 +1410,36 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T>
template <typename T>
void sum
_multipass_c
aller(const DevMem2D src, PtrStep buf, double* sum, int cn)
void sum
MultipassC
aller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
{
using namespace sum;
using namespace sum;
typedef typename SumType<T>::R R;
typedef typename SumType<T>::R R;
dim3 threads, grid;
dim3 threads, grid;
estimate
_thread_c
fg(src.cols, src.rows, threads, grid);
estimate
ThreadC
fg(src.cols, src.rows, threads, grid);
set
_kernel_c
onsts(src.cols, src.rows, threads, grid);
set
KernelC
onsts(src.cols, src.rows, threads, grid);
switch (cn)
switch (cn)
{
{
case 1:
case 1:
sum
_k
ernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
sum
_pass2_k
ernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
sum
Pass2K
ernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
case 2:
case 2:
sum
_k
ernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
sum
_pass2_k
ernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
sum
Pass2K
ernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
case 3:
case 3:
sum
_k
ernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
sum
_pass2_k
ernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
sum
Pass2K
ernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
case 4:
case 4:
sum
_k
ernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
sum
_pass2_k
ernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
sum
Pass2K
ernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
}
}
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall(cudaThreadSynchronize());
...
@@ -1448,40 +1453,40 @@ namespace cv { namespace gpu { namespace mathfunc
...
@@ -1448,40 +1453,40 @@ namespace cv { namespace gpu { namespace mathfunc
sum[3] = result[3];
sum[3] = result[3];
}
}
template void sum
_multipass_c
aller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sum
MultipassC
aller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sum
_multipass_c
aller<char>(const DevMem2D, PtrStep, double*, int);
template void sum
MultipassC
aller<char>(const DevMem2D, PtrStep, double*, int);
template void sum
_multipass_c
aller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sum
MultipassC
aller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sum
_multipass_c
aller<short>(const DevMem2D, PtrStep, double*, int);
template void sum
MultipassC
aller<short>(const DevMem2D, PtrStep, double*, int);
template void sum
_multipass_c
aller<int>(const DevMem2D, PtrStep, double*, int);
template void sum
MultipassC
aller<int>(const DevMem2D, PtrStep, double*, int);
template void sum
_multipass_c
aller<float>(const DevMem2D, PtrStep, double*, int);
template void sum
MultipassC
aller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T>
template <typename T>
void sum
_c
aller(const DevMem2D src, PtrStep buf, double* sum, int cn)
void sum
C
aller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
{
using namespace sum;
using namespace sum;
typedef typename SumType<T>::R R;
typedef typename SumType<T>::R R;
dim3 threads, grid;
dim3 threads, grid;
estimate
_thread_c
fg(src.cols, src.rows, threads, grid);
estimate
ThreadC
fg(src.cols, src.rows, threads, grid);
set
_kernel_c
onsts(src.cols, src.rows, threads, grid);
set
KernelC
onsts(src.cols, src.rows, threads, grid);
switch (cn)
switch (cn)
{
{
case 1:
case 1:
sum
_k
ernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
break;
break;
case 2:
case 2:
sum
_k
ernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
break;
break;
case 3:
case 3:
sum
_k
ernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
break;
break;
case 4:
case 4:
sum
_k
ernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
break;
break;
}
}
...
@@ -1496,48 +1501,48 @@ namespace cv { namespace gpu { namespace mathfunc
...
@@ -1496,48 +1501,48 @@ namespace cv { namespace gpu { namespace mathfunc
sum[3] = result[3];
sum[3] = result[3];
}
}
template void sum
_c
aller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sum
C
aller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sum
_c
aller<char>(const DevMem2D, PtrStep, double*, int);
template void sum
C
aller<char>(const DevMem2D, PtrStep, double*, int);
template void sum
_c
aller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sum
C
aller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sum
_c
aller<short>(const DevMem2D, PtrStep, double*, int);
template void sum
C
aller<short>(const DevMem2D, PtrStep, double*, int);
template void sum
_c
aller<int>(const DevMem2D, PtrStep, double*, int);
template void sum
C
aller<int>(const DevMem2D, PtrStep, double*, int);
template void sum
_c
aller<float>(const DevMem2D, PtrStep, double*, int);
template void sum
C
aller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T>
template <typename T>
void sq
sum_multipass_c
aller(const DevMem2D src, PtrStep buf, double* sum, int cn)
void sq
rSumMultipassC
aller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
{
using namespace sum;
using namespace sum;
typedef typename SumType<T>::R R;
typedef typename SumType<T>::R R;
dim3 threads, grid;
dim3 threads, grid;
estimate
_thread_c
fg(src.cols, src.rows, threads, grid);
estimate
ThreadC
fg(src.cols, src.rows, threads, grid);
set
_kernel_c
onsts(src.cols, src.rows, threads, grid);
set
KernelC
onsts(src.cols, src.rows, threads, grid);
switch (cn)
switch (cn)
{
{
case 1:
case 1:
sum
_k
ernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
sum
_pass2_k
ernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
sum
Pass2K
ernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
break;
case 2:
case 2:
sum
_k
ernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
sum
_pass2_k
ernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
sum
Pass2K
ernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
break;
case 3:
case 3:
sum
_k
ernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
sum
_pass2_k
ernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
sum
Pass2K
ernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
break;
case 4:
case 4:
sum
_k
ernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
sum
_pass2_k
ernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
sum
Pass2K
ernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
break;
}
}
...
@@ -1552,40 +1557,40 @@ namespace cv { namespace gpu { namespace mathfunc
...
@@ -1552,40 +1557,40 @@ namespace cv { namespace gpu { namespace mathfunc
sum[3] = result[3];
sum[3] = result[3];
}
}
template void sq
sum_multipass_c
aller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sq
rSumMultipassC
aller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sq
sum_multipass_c
aller<char>(const DevMem2D, PtrStep, double*, int);
template void sq
rSumMultipassC
aller<char>(const DevMem2D, PtrStep, double*, int);
template void sq
sum_multipass_c
aller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sq
rSumMultipassC
aller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sq
sum_multipass_c
aller<short>(const DevMem2D, PtrStep, double*, int);
template void sq
rSumMultipassC
aller<short>(const DevMem2D, PtrStep, double*, int);
template void sq
sum_multipass_c
aller<int>(const DevMem2D, PtrStep, double*, int);
template void sq
rSumMultipassC
aller<int>(const DevMem2D, PtrStep, double*, int);
template void sq
sum_multipass_c
aller<float>(const DevMem2D, PtrStep, double*, int);
template void sq
rSumMultipassC
aller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T>
template <typename T>
void sq
sum_c
aller(const DevMem2D src, PtrStep buf, double* sum, int cn)
void sq
rSumC
aller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
{
using namespace sum;
using namespace sum;
typedef typename SumType<T>::R R;
typedef typename SumType<T>::R R;
dim3 threads, grid;
dim3 threads, grid;
estimate
_thread_c
fg(src.cols, src.rows, threads, grid);
estimate
ThreadC
fg(src.cols, src.rows, threads, grid);
set
_kernel_c
onsts(src.cols, src.rows, threads, grid);
set
KernelC
onsts(src.cols, src.rows, threads, grid);
switch (cn)
switch (cn)
{
{
case 1:
case 1:
sum
_k
ernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
break;
break;
case 2:
case 2:
sum
_k
ernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
break;
break;
case 3:
case 3:
sum
_k
ernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
break;
break;
case 4:
case 4:
sum
_k
ernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
sum
K
ernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
break;
break;
}
}
...
@@ -1600,10 +1605,10 @@ namespace cv { namespace gpu { namespace mathfunc
...
@@ -1600,10 +1605,10 @@ namespace cv { namespace gpu { namespace mathfunc
sum[3] = result[3];
sum[3] = result[3];
}
}
template void sq
sum_c
aller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sq
rSumC
aller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sq
sum_c
aller<char>(const DevMem2D, PtrStep, double*, int);
template void sq
rSumC
aller<char>(const DevMem2D, PtrStep, double*, int);
template void sq
sum_c
aller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sq
rSumC
aller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sq
sum_c
aller<short>(const DevMem2D, PtrStep, double*, int);
template void sq
rSumC
aller<short>(const DevMem2D, PtrStep, double*, int);
template void sq
sum_c
aller<int>(const DevMem2D, PtrStep, double*, int);
template void sq
rSumC
aller<int>(const DevMem2D, PtrStep, double*, int);
template void sq
sum_c
aller<float>(const DevMem2D, PtrStep, double*, int);
template void sq
rSumC
aller<float>(const DevMem2D, PtrStep, double*, int);
}}}
}}}
\ No newline at end of file
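Note: the renamed sumInSmem helper (formerly sum_in_smem) is the per-block shared-memory reduction that every sumKernel_* variant above calls once per channel, each channel owning its own nthreads-wide slice of shared memory. The snippet below is only an illustrative sketch of that pattern under the assumption that nthreads is the power-of-two block size; the name blockReduce and its exact loop shape are assumptions for the example, not the module's actual implementation.

    // Sketch of a per-block shared-memory reduction in the style of
    // sumInSmem<nthreads, R>(smem, tid). Assumes blockDim.x * blockDim.y == nthreads
    // and nthreads is a power of two.
    template <int nthreads, typename R>
    __device__ void blockReduce(volatile R* smem, int tid)
    {
        // Fold the nthreads-wide slice in half until one value remains.
        for (int step = nthreads / 2; step > 0; step /= 2)
        {
            if (tid < step)
                smem[tid] = smem[tid] + smem[tid + step];
            __syncthreads();
        }
        // After the loop, smem[0] holds this block's partial sum for the slice.
    }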
modules/gpu/src/initialization.cpp
@@ -133,85 +133,81 @@ CV_EXPORTS bool cv::gpu::hasAtomicsSupport(int device)
 namespace
 {
-    template <unsigned int cmp_op>
-    bool comparePairs(int lhs1, int lhs2, int rhs1, int rhs2);
-
-    template <>
-    bool comparePairs<CMP_EQ>(int lhs1, int lhs2, int rhs1, int rhs2)
-    {
-        return lhs1 == rhs1 && lhs2 == rhs2;
-    }
-
-    template <>
-    bool comparePairs<CMP_GT>(int lhs1, int lhs2, int rhs1, int rhs2)
-    {
-        return lhs1 > rhs1 || (lhs1 == rhs1 && lhs2 > rhs2);
-    }
-
-    template <>
-    bool comparePairs<CMP_GE>(int lhs1, int lhs2, int rhs1, int rhs2)
-    {
-        return lhs1 > rhs1 || (lhs1 == rhs1 && lhs2 >= rhs2);
-    }
-
-    template <>
-    bool comparePairs<CMP_LT>(int lhs1, int lhs2, int rhs1, int rhs2)
-    {
-        return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 < rhs2);
-    }
-
-    template <>
-    bool comparePairs<CMP_LE>(int lhs1, int lhs2, int rhs1, int rhs2)
-    {
-        return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 <= rhs2);
-    }
-
-    template <>
-    bool comparePairs<CMP_NE>(int lhs1, int lhs2, int rhs1, int rhs2)
-    {
-        return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 <= rhs2);
-    }
-}
-
-template <unsigned int cmp_op>
-CV_EXPORTS bool cv::gpu::checkPtxVersion(int major, int minor)
-{
-#ifdef OPENCV_GPU_CUDA_ARCH_10
-    if (comparePairs<cmp_op>(1, 0, major, minor)) return true;
-#endif
-#ifdef OPENCV_GPU_CUDA_ARCH_11
-    if (comparePairs<cmp_op>(1, 1, major, minor)) return true;
-#endif
-#ifdef OPENCV_GPU_CUDA_ARCH_12
-    if (comparePairs<cmp_op>(1, 2, major, minor)) return true;
-#endif
-#ifdef OPENCV_GPU_CUDA_ARCH_13
-    if (comparePairs<cmp_op>(1, 3, major, minor)) return true;
-#endif
-#ifdef OPENCV_GPU_CUDA_ARCH_20
-    if (comparePairs<cmp_op>(2, 0, major, minor)) return true;
-#endif
-#ifdef OPENCV_GPU_CUDA_ARCH_21
-    if (comparePairs<cmp_op>(2, 1, major, minor)) return true;
-#endif
-
-    return false;
-}
-
-template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_EQ>(int major, int minor);
-template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_GT>(int major, int minor);
-template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_GE>(int major, int minor);
-template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_LT>(int major, int minor);
-template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_LE>(int major, int minor);
-template CV_EXPORTS bool cv::gpu::checkPtxVersion<CMP_NE>(int major, int minor);
+    struct ComparerEqual
+    {
+        bool operator()(int lhs1, int lhs2, int rhs1, int rhs2) const
+        {
+            return lhs1 == rhs1 && lhs2 == rhs2;
+        }
+    };
+
+    struct ComparerLessOrEqual
+    {
+        bool operator()(int lhs1, int lhs2, int rhs1, int rhs2) const
+        {
+            return lhs1 < rhs1 || (lhs1 == rhs1 && lhs2 <= rhs2);
+        }
+    };
+
+    struct ComparerGreaterOrEqual
+    {
+        bool operator()(int lhs1, int lhs2, int rhs1, int rhs2) const
+        {
+            return lhs1 > rhs1 || (lhs1 == rhs1 && lhs2 >= rhs2);
+        }
+    };
+
+    template <typename Comparer>
+    bool checkPtxVersion(int major, int minor, Comparer cmp)
+    {
+#ifdef OPENCV_GPU_CUDA_ARCH_10
+        if (cmp(1, 0, major, minor)) return true;
+#endif
+#ifdef OPENCV_GPU_CUDA_ARCH_11
+        if (cmp(1, 1, major, minor)) return true;
+#endif
+#ifdef OPENCV_GPU_CUDA_ARCH_12
+        if (cmp(1, 2, major, minor)) return true;
+#endif
+#ifdef OPENCV_GPU_CUDA_ARCH_13
+        if (cmp(1, 3, major, minor)) return true;
+#endif
+#ifdef OPENCV_GPU_CUDA_ARCH_20
+        if (cmp(2, 0, major, minor)) return true;
+#endif
+#ifdef OPENCV_GPU_CUDA_ARCH_21
+        if (cmp(2, 1, major, minor)) return true;
+#endif
+
+        return false;
+    }
+}
+
+CV_EXPORTS bool cv::gpu::ptxVersionIs(int major, int minor)
+{
+    return checkPtxVersion(major, minor, ComparerEqual());
+}
+
+CV_EXPORTS bool cv::gpu::ptxVersionIsLessOrEqual(int major, int minor)
+{
+    return checkPtxVersion(major, minor, ComparerLessOrEqual());
+}
+
+CV_EXPORTS bool cv::gpu::ptxVersionIsGreaterOrEqual(int major, int minor)
+{
+    return checkPtxVersion(major, minor, ComparerGreaterOrEqual());
+}
 
 CV_EXPORTS bool isCompatibleWith(int device)
@@ -223,7 +219,7 @@ CV_EXPORTS bool isCompatibleWith(int device)
     int major, minor;
     getComputeCapability(device, major, minor);
 
-    return checkPtxVersion<CMP_LE>(major, minor);
+    return ptxVersionIsLessOrEqual(major, minor);
 }
 
 #endif
...
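With this change, callers no longer instantiate checkPtxVersion<CMP_*>; they call one of the three named predicates exported from cv::gpu. A minimal caller-side sketch, assuming the gpu module headers are available and a CUDA device is selected, of the condition the reduction code below uses to decide between single-pass and multipass kernels:

    #include <opencv2/gpu/gpu.hpp>

    // Sketch: a single-pass reduction is usable only when the module was built with
    // PTX for compute capability >= 1.1 and the current device supports global atomics.
    // The helper name canUseSinglePass is hypothetical, for illustration only.
    bool canUseSinglePass()
    {
        int device = cv::gpu::getDevice();
        return cv::gpu::ptxVersionIsGreaterOrEqual(1, 1) &&
               cv::gpu::hasAtomicsSupport(device);
    }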
modules/gpu/src/matrix_reductions.cpp
@@ -119,20 +119,20 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
 namespace cv { namespace gpu { namespace mathfunc
 {
     template <typename T>
-    void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
+    void sumCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);
 
     template <typename T>
-    void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
+    void sumMultipassCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);
 
     template <typename T>
-    void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
+    void sqrSumCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);
 
     template <typename T>
-    void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);
+    void sqrSumMultipassCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);
 
     namespace sum
     {
-        void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows);
+        void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows);
     }
 }}}
@@ -149,19 +149,27 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
     using namespace mathfunc;
 
     typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);
 
-    static const Caller callers[2][7] =
-        { { sum_multipass_caller<unsigned char>, sum_multipass_caller<char>,
-            sum_multipass_caller<unsigned short>, sum_multipass_caller<short>,
-            sum_multipass_caller<int>, sum_multipass_caller<float>, 0 },
-          { sum_caller<unsigned char>, sum_caller<char>,
-            sum_caller<unsigned short>, sum_caller<short>,
-            sum_caller<int>, sum_caller<float>, 0 } };
-
-    Size bufSize;
-    sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height);
-    ensureSizeIsEnough(bufSize, CV_8U, buf);
-
-    Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()];
+    static Caller multipass_callers[7] = {
+            sumMultipassCaller<unsigned char>, sumMultipassCaller<char>,
+            sumMultipassCaller<unsigned short>, sumMultipassCaller<short>,
+            sumMultipassCaller<int>, sumMultipassCaller<float>, 0 };
+
+    static Caller singlepass_callers[7] = {
+            sumCaller<unsigned char>, sumCaller<char>,
+            sumCaller<unsigned short>, sumCaller<short>,
+            sumCaller<int>, sumCaller<float>, 0 };
+
+    Size buf_size;
+    sum::getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
+    ensureSizeIsEnough(buf_size, CV_8U, buf);
+
+    Caller* callers = multipass_callers;
+    if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
+        callers = singlepass_callers;
+
+    Caller caller = callers[src.depth()];
     if (!caller) CV_Error(CV_StsBadArg, "sum: unsupported type");
 
     double result[4];
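After this hunk, cv::gpu::sum(src, buf) defaults to the multipass caller table and switches to the single-pass table only when ptxVersionIsGreaterOrEqual(1, 1) and the device reports atomics support. A short caller-side sketch of the buffered overload (the helper name sumOnGpu and the input are illustrative assumptions):

    #include <opencv2/gpu/gpu.hpp>

    // Sums a single-channel 8-bit image on the GPU, reusing one scratch buffer so
    // repeated calls do not reallocate device memory.
    cv::Scalar sumOnGpu(const cv::Mat& host8u, cv::gpu::GpuMat& buf)
    {
        cv::gpu::GpuMat src(host8u);   // upload to the device
        return cv::gpu::sum(src, buf); // buf is grown internally via ensureSizeIsEnough
    }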
@@ -182,19 +190,27 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
     using namespace mathfunc;
 
     typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);
 
-    static const Caller callers[2][7] =
-        { { sqsum_multipass_caller<unsigned char>, sqsum_multipass_caller<char>,
-            sqsum_multipass_caller<unsigned short>, sqsum_multipass_caller<short>,
-            sqsum_multipass_caller<int>, sqsum_multipass_caller<float>, 0 },
-          { sqsum_caller<unsigned char>, sqsum_caller<char>,
-            sqsum_caller<unsigned short>, sqsum_caller<short>,
-            sqsum_caller<int>, sqsum_caller<float>, 0 } };
-
-    Size bufSize;
-    sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height);
-    ensureSizeIsEnough(bufSize, CV_8U, buf);
-
-    Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()];
+    static Caller multipass_callers[7] = {
+            sqrSumMultipassCaller<unsigned char>, sqrSumMultipassCaller<char>,
+            sqrSumMultipassCaller<unsigned short>, sqrSumMultipassCaller<short>,
+            sqrSumMultipassCaller<int>, sqrSumMultipassCaller<float>, 0 };
+
+    static Caller singlepass_callers[7] = {
+            sqrSumCaller<unsigned char>, sqrSumCaller<char>,
+            sqrSumCaller<unsigned short>, sqrSumCaller<short>,
+            sqrSumCaller<int>, sqrSumCaller<float>, 0 };
+
+    Caller* callers = multipass_callers;
+    if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
+        callers = singlepass_callers;
+
+    Size buf_size;
+    sum::getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
+    ensureSizeIsEnough(buf_size, CV_8U, buf);
+
+    Caller caller = callers[src.depth()];
     if (!caller) CV_Error(CV_StsBadArg, "sqrSum: unsupported type");
 
     double result[4];
@@ -207,19 +223,19 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
 namespace cv { namespace gpu { namespace mathfunc { namespace minmax {
-    void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
+    void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
 
     template <typename T>
-    void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
+    void minMaxCaller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
 
     template <typename T>
-    void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);
+    void minMaxMaskCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);
 
     template <typename T>
-    void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
+    void minMaxMultipassCaller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
 
     template <typename T>
-    void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);
+    void minMaxMaskMultipassCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);
 }}}}
@@ -238,23 +254,26 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
     typedef void (*Caller)(const DevMem2D, double*, double*, PtrStep);
     typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, PtrStep);
 
-    static const Caller callers[2][7] =
-        { { min_max_multipass_caller<unsigned char>, min_max_multipass_caller<char>,
-            min_max_multipass_caller<unsigned short>, min_max_multipass_caller<short>,
-            min_max_multipass_caller<int>, min_max_multipass_caller<float>, 0 },
-          { min_max_caller<unsigned char>, min_max_caller<char>,
-            min_max_caller<unsigned short>, min_max_caller<short>,
-            min_max_caller<int>, min_max_caller<float>, min_max_caller<double> } };
-
-    static const MaskedCaller masked_callers[2][7] =
-        { { min_max_mask_multipass_caller<unsigned char>, min_max_mask_multipass_caller<char>,
-            min_max_mask_multipass_caller<unsigned short>, min_max_mask_multipass_caller<short>,
-            min_max_mask_multipass_caller<int>, min_max_mask_multipass_caller<float>, 0 },
-          { min_max_mask_caller<unsigned char>, min_max_mask_caller<char>,
-            min_max_mask_caller<unsigned short>, min_max_mask_caller<short>,
-            min_max_mask_caller<int>, min_max_mask_caller<float>, min_max_mask_caller<double> } };
+    static Caller multipass_callers[7] = {
+            minMaxMultipassCaller<unsigned char>, minMaxMultipassCaller<char>,
+            minMaxMultipassCaller<unsigned short>, minMaxMultipassCaller<short>,
+            minMaxMultipassCaller<int>, minMaxMultipassCaller<float>, 0 };
+
+    static Caller singlepass_callers[7] = {
+            minMaxCaller<unsigned char>, minMaxCaller<char>,
+            minMaxCaller<unsigned short>, minMaxCaller<short>,
+            minMaxCaller<int>, minMaxCaller<float>, minMaxCaller<double> };
+
+    static MaskedCaller masked_multipass_callers[7] = {
+            minMaxMaskMultipassCaller<unsigned char>, minMaxMaskMultipassCaller<char>,
+            minMaxMaskMultipassCaller<unsigned short>, minMaxMaskMultipassCaller<short>,
+            minMaxMaskMultipassCaller<int>, minMaxMaskMultipassCaller<float>, 0 };
+
+    static MaskedCaller masked_singlepass_callers[7] = {
+            minMaxMaskCaller<unsigned char>, minMaxMaskCaller<char>,
+            minMaxMaskCaller<unsigned short>, minMaxMaskCaller<short>,
+            minMaxMaskCaller<int>, minMaxMaskCaller<float>, minMaxMaskCaller<double> };
 
     CV_Assert(src.channels() == 1);
     CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
@@ -263,19 +282,27 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
     double minVal_; if (!minVal) minVal = &minVal_;
     double maxVal_; if (!maxVal) maxVal = &maxVal_;
 
-    Size bufSize;
-    get_buf_size_required(src.cols, src.rows, src.elemSize(), bufSize.width, bufSize.height);
-    ensureSizeIsEnough(bufSize, CV_8U, buf);
+    Size buf_size;
+    getBufSizeRequired(src.cols, src.rows, src.elemSize(), buf_size.width, buf_size.height);
+    ensureSizeIsEnough(buf_size, CV_8U, buf);
 
     if (mask.empty())
     {
-        Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
+        Caller* callers = multipass_callers;
+        if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
+            callers = singlepass_callers;
+
+        Caller caller = callers[src.type()];
         if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
         caller(src, minVal, maxVal, buf);
     }
     else
     {
-        MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()];
+        MaskedCaller* callers = masked_multipass_callers;
+        if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
+            callers = masked_singlepass_callers;
+
+        MaskedCaller caller = callers[src.type()];
         if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
         caller(src, mask, minVal, maxVal, buf);
     }
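minMax applies the same single-pass versus multipass selection independently to the masked and unmasked caller tables. A minimal usage sketch of the buffered overload from the hunk header above (the helper name findRange is an assumption; an empty mask reduces over the whole single-channel image):

    #include <opencv2/gpu/gpu.hpp>

    // Finds the value range of a single-channel GPU image, reusing a scratch buffer.
    void findRange(const cv::gpu::GpuMat& src, double& lo, double& hi, cv::gpu::GpuMat& buf)
    {
        cv::gpu::minMax(src, &lo, &hi, cv::gpu::GpuMat(), buf);
    }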
@@ -287,23 +314,23 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
 namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc {
-    void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols,
-                               int& b1rows, int& b2cols, int& b2rows);
+    void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols,
+                            int& b1rows, int& b2cols, int& b2rows);
 
     template <typename T>
-    void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval,
-                            int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf);
+    void minMaxLocCaller(const DevMem2D src, double* minval, double* maxval,
+                         int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf);
 
     template <typename T>
-    void min_max_loc_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
-                                 int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf);
+    void minMaxLocMaskCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
+                             int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf);
 
     template <typename T>
-    void min_max_loc_multipass_caller(const DevMem2D src, double* minval, double* maxval,
-                                      int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf);
+    void minMaxLocMultipassCaller(const DevMem2D src, double* minval, double* maxval,
+                                  int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf);
 
     template <typename T>
-    void min_max_loc_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
-                                           int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf);
+    void minMaxLocMaskMultipassCaller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
+                                      int minloc[2], int maxloc[2], PtrStep valBuf, PtrStep locBuf);
 }}}}
@@ -323,21 +350,26 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
     typedef void (*Caller)(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
     typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
 
-    static const Caller callers[2][7] =
-        { { min_max_loc_multipass_caller<unsigned char>, min_max_loc_multipass_caller<char>,
-            min_max_loc_multipass_caller<unsigned short>, min_max_loc_multipass_caller<short>,
-            min_max_loc_multipass_caller<int>, min_max_loc_multipass_caller<float>, 0 },
-          { min_max_loc_caller<unsigned char>, min_max_loc_caller<char>,
-            min_max_loc_caller<unsigned short>, min_max_loc_caller<short>,
-            min_max_loc_caller<int>, min_max_loc_caller<float>, min_max_loc_caller<double> } };
-
-    static const MaskedCaller masked_callers[2][7] =
-        { { min_max_loc_mask_multipass_caller<unsigned char>, min_max_loc_mask_multipass_caller<char>,
-            min_max_loc_mask_multipass_caller<unsigned short>, min_max_loc_mask_multipass_caller<short>,
-            min_max_loc_mask_multipass_caller<int>, min_max_loc_mask_multipass_caller<float>, 0 },
-          { min_max_loc_mask_caller<unsigned char>, min_max_loc_mask_caller<char>,
-            min_max_loc_mask_caller<unsigned short>, min_max_loc_mask_caller<short>,
-            min_max_loc_mask_caller<int>, min_max_loc_mask_caller<float>, min_max_loc_mask_caller<double> } };
+    static Caller multipass_callers[7] = {
+            minMaxLocMultipassCaller<unsigned char>, minMaxLocMultipassCaller<char>,
+            minMaxLocMultipassCaller<unsigned short>, minMaxLocMultipassCaller<short>,
+            minMaxLocMultipassCaller<int>, minMaxLocMultipassCaller<float>, 0 };
+
+    static Caller singlepass_callers[7] = {
+            minMaxLocCaller<unsigned char>, minMaxLocCaller<char>,
+            minMaxLocCaller<unsigned short>, minMaxLocCaller<short>,
+            minMaxLocCaller<int>, minMaxLocCaller<float>, minMaxLocCaller<double> };
+
+    static MaskedCaller masked_multipass_callers[7] = {
+            minMaxLocMaskMultipassCaller<unsigned char>, minMaxLocMaskMultipassCaller<char>,
+            minMaxLocMaskMultipassCaller<unsigned short>, minMaxLocMaskMultipassCaller<short>,
+            minMaxLocMaskMultipassCaller<int>, minMaxLocMaskMultipassCaller<float>, 0 };
+
+    static MaskedCaller masked_singlepass_callers[7] = {
+            minMaxLocMaskCaller<unsigned char>, minMaxLocMaskCaller<char>,
+            minMaxLocMaskCaller<unsigned short>, minMaxLocMaskCaller<short>,
+            minMaxLocMaskCaller<int>, minMaxLocMaskCaller<float>, minMaxLocMaskCaller<double> };
 
     CV_Assert(src.channels() == 1);
     CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
@@ -348,21 +380,29 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
     int minLoc_[2];
     int maxLoc_[2];
 
-    Size valBufSize, locBufSize;
-    get_buf_size_required(src.cols, src.rows, src.elemSize(), valBufSize.width,
-                          valBufSize.height, locBufSize.width, locBufSize.height);
-    ensureSizeIsEnough(valBufSize, CV_8U, valBuf);
-    ensureSizeIsEnough(locBufSize, CV_8U, locBuf);
+    Size valbuf_size, locbuf_size;
+    getBufSizeRequired(src.cols, src.rows, src.elemSize(), valbuf_size.width,
+                       valbuf_size.height, locbuf_size.width, locbuf_size.height);
+    ensureSizeIsEnough(valbuf_size, CV_8U, valBuf);
+    ensureSizeIsEnough(locbuf_size, CV_8U, locBuf);
 
     if (mask.empty())
     {
-        Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
+        Caller* callers = multipass_callers;
+        if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
+            callers = singlepass_callers;
+
+        Caller caller = callers[src.type()];
         if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
         caller(src, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
     }
     else
     {
-        MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()];
+        MaskedCaller* callers = masked_multipass_callers;
+        if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
+            callers = masked_singlepass_callers;
+
+        MaskedCaller caller = callers[src.type()];
        if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
        caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
     }
@@ -376,13 +416,13 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
 namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero {
-    void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows);
+    void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows);
 
     template <typename T>
-    int count_non_zero_caller(const DevMem2D src, PtrStep buf);
+    int countNonZeroCaller(const DevMem2D src, PtrStep buf);
 
     template <typename T>
-    int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf);
+    int countNonZeroMultipassCaller(const DevMem2D src, PtrStep buf);
 }}}}
@@ -400,22 +440,29 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
     typedef int (*Caller)(const DevMem2D src, PtrStep buf);
 
-    static const Caller callers[2][7] =
-        { { count_non_zero_multipass_caller<unsigned char>, count_non_zero_multipass_caller<char>,
-            count_non_zero_multipass_caller<unsigned short>, count_non_zero_multipass_caller<short>,
-            count_non_zero_multipass_caller<int>, count_non_zero_multipass_caller<float>, 0 },
-          { count_non_zero_caller<unsigned char>, count_non_zero_caller<char>,
-            count_non_zero_caller<unsigned short>, count_non_zero_caller<short>,
-            count_non_zero_caller<int>, count_non_zero_caller<float>, count_non_zero_caller<double> } };
+    static Caller multipass_callers[7] = {
+            countNonZeroMultipassCaller<unsigned char>, countNonZeroMultipassCaller<char>,
+            countNonZeroMultipassCaller<unsigned short>, countNonZeroMultipassCaller<short>,
+            countNonZeroMultipassCaller<int>, countNonZeroMultipassCaller<float>, 0 };
+
+    static Caller singlepass_callers[7] = {
+            countNonZeroCaller<unsigned char>, countNonZeroCaller<char>,
+            countNonZeroCaller<unsigned short>, countNonZeroCaller<short>,
+            countNonZeroCaller<int>, countNonZeroCaller<float>, countNonZeroCaller<double> };
 
     CV_Assert(src.channels() == 1);
     CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));
 
-    Size bufSize;
-    get_buf_size_required(src.cols, src.rows, bufSize.width, bufSize.height);
-    ensureSizeIsEnough(bufSize, CV_8U, buf);
+    Size buf_size;
+    getBufSizeRequired(src.cols, src.rows, buf_size.width, buf_size.height);
+    ensureSizeIsEnough(buf_size, CV_8U, buf);
 
-    Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
+    Caller* callers = multipass_callers;
+    if (ptxVersionIsGreaterOrEqual(1, 1) && hasAtomicsSupport(getDevice()))
+        callers = singlepass_callers;
+
+    Caller caller = callers[src.type()];
     if (!caller) CV_Error(CV_StsBadArg, "countNonZero: unsupported type");
 
     return caller(src, buf);
 }
...
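countNonZero follows the same pattern as the other reductions: the buffered overload sizes buf via getBufSizeRequired, picks the single-pass table when the PTX and atomics conditions hold, and dispatches on src.type(). A minimal caller-side sketch (the helper name countSetPixels is an assumption for illustration):

    #include <opencv2/gpu/gpu.hpp>

    // Counts non-zero pixels of a single-channel GPU image, reusing a scratch buffer
    // across calls to avoid repeated device allocations.
    int countSetPixels(const cv::gpu::GpuMat& img, cv::gpu::GpuMat& buf)
    {
        return cv::gpu::countNonZero(img, buf);
    }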