opencv_contrib / Commit 7e569cec
Authored 6 years ago by Alexander Alekhin

Merge moved code from opencv

Parents: 796853e0 71f588bd
Showing 4 changed files with 81 additions and 237 deletions:

  modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu   +3   -90
  modules/cudalegacy/src/cuda/NPP_staging.cu              +5   -111
  modules/cudastereo/src/cuda/stereobm.cu                 +72  -35
  modules/cudev/include/opencv2/cudev/warp/scan.hpp       +1   -1

modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu
@@ -59,8 +59,7 @@
 #include <algorithm>
 #include <cstdio>
 
-#include "opencv2/core/cuda/warp.hpp"
-#include "opencv2/core/cuda/warp_shuffle.hpp"
+#include "opencv2/cudev.hpp"
 
 #include "opencv2/opencv_modules.hpp"
@@ -77,92 +76,6 @@
 #include "NCVAlg.hpp"
 
-//==============================================================================
-//
-// BlockScan file
-//
-//==============================================================================
-
-NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive
-
-//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
-//assuming size <= WARP_SIZE and size is power of 2
-__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u *s_Data)
-{
-#if __CUDA_ARCH__ >= 300
-    const unsigned int laneId = cv::cuda::device::Warp::laneId();
-
-    // scan on shuffl functions
-    #pragma unroll
-    for (int i = 1; i <= (K_WARP_SIZE / 2); i *= 2)
-    {
-        const Ncv32u n = cv::cuda::device::shfl_up(idata, i);
-        if (laneId >= i)
-            idata += n;
-    }
-
-    return idata;
-#else
-    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
-    s_Data[pos] = 0;
-    pos += K_WARP_SIZE;
-    s_Data[pos] = idata;
-
-    s_Data[pos] += s_Data[pos - 1];
-    s_Data[pos] += s_Data[pos - 2];
-    s_Data[pos] += s_Data[pos - 4];
-    s_Data[pos] += s_Data[pos - 8];
-    s_Data[pos] += s_Data[pos - 16];
-
-    return s_Data[pos];
-#endif
-}
-
-__device__ __forceinline__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data)
-{
-    return warpScanInclusive(idata, s_Data) - idata;
-}
-
-template <Ncv32u tiNumScanThreads>
-__device__ Ncv32u scan1Inclusive(Ncv32u idata, volatile Ncv32u *s_Data)
-{
-    if (tiNumScanThreads > K_WARP_SIZE)
-    {
-        //Bottom-level inclusive warp scan
-        Ncv32u warpResult = warpScanInclusive(idata, s_Data);
-
-        //Save top elements of each warp for exclusive warp scan
-        //sync to wait for warp scans to complete (because s_Data is being overwritten)
-        __syncthreads();
-        if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )
-        {
-            s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;
-        }
-
-        //wait for warp scans to complete
-        __syncthreads();
-
-        if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )
-        {
-            //grab top warp elements
-            Ncv32u val = s_Data[threadIdx.x];
-            //calculate exclusive scan and write back to shared memory
-            s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);
-        }
-
-        //return updated warp scans with exclusive scan results
-        __syncthreads();
-        return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE];
-    }
-    else
-    {
-        return warpScanInclusive(idata, s_Data);
-    }
-}
-
 //==============================================================================
 //
 // HaarClassifierCascade file
@@ -260,11 +173,11 @@ __device__ void compactBlockWriteOutAnchorParallel(Ncv32u threadPassFlag, Ncv32u
 {
 #if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
 
-    __shared__ Ncv32u shmem[NUM_THREADS_ANCHORSPARALLEL * 2];
+    __shared__ Ncv32u shmem[NUM_THREADS_ANCHORSPARALLEL];
 
     __shared__ Ncv32u numPassed;
     __shared__ Ncv32u outMaskOffset;
 
-    Ncv32u incScan = scan1Inclusive<NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem);
+    Ncv32u incScan = cv::cudev::blockScanInclusive<NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem, threadIdx.x);
 
     __syncthreads();
 
     if (threadIdx.x == NUM_THREADS_ANCHORSPARALLEL-1)
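Note: the call-site change above replaces the file-local scan1Inclusive helper with cv::cudev::blockScanInclusive, which takes the block size as a template parameter and the calling thread's index as an explicit argument. The following device-side sketch of that call pattern is not part of the commit; it assumes only the "opencv2/cudev.hpp" header this commit adds, and the kernel name, block size and buffers are illustrative.

#include "opencv2/cudev.hpp"

const int NUM_THREADS = 256; // assumed block size; one shared element per thread

__global__ void prefixSumSketch(const unsigned int* in, unsigned int* out, int n)
{
    // Shared buffer sized like the patched kernels: NUM_THREADS elements,
    // no longer NUM_THREADS * 2 as the removed scan1Inclusive required.
    __shared__ unsigned int smem[NUM_THREADS];

    const int tid = threadIdx.x;
    const int idx = blockIdx.x * NUM_THREADS + tid;
    unsigned int val = (idx < n) ? in[idx] : 0;

    // Same call shape as in the diff: <block size>(value, shared buffer, thread index).
    unsigned int inclusive = cv::cudev::blockScanInclusive<NUM_THREADS>(val, smem, tid);

    if (idx < n)
        out[idx] = inclusive;
}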
modules/cudalegacy/src/cuda/NPP_staging.cu
@@ -45,8 +45,7 @@
 #include <vector>
 #include <cuda_runtime.h>
 
-#include "opencv2/core/cuda/warp.hpp"
-#include "opencv2/core/cuda/warp_shuffle.hpp"
+#include "opencv2/cudev.hpp"
 
 #include "opencv2/cudalegacy/NPP_staging.hpp"
@@ -81,111 +80,6 @@ cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream)
 }
 
-//==============================================================================
-//
-// BlockScan.cuh
-//
-//==============================================================================
-
-NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive
-
-//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
-//assuming size <= WARP_SIZE and size is power of 2
-template <class T>
-inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
-{
-#if __CUDA_ARCH__ >= 300
-    const unsigned int laneId = cv::cuda::device::Warp::laneId();
-
-    // scan on shuffl functions
-    #pragma unroll
-    for (int i = 1; i <= (K_WARP_SIZE / 2); i *= 2)
-    {
-        const T n = cv::cuda::device::shfl_up(idata, i);
-        if (laneId >= i)
-            idata += n;
-    }
-
-    return idata;
-#else
-    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
-    s_Data[pos] = 0;
-    pos += K_WARP_SIZE;
-    s_Data[pos] = idata;
-
-    s_Data[pos] += s_Data[pos - 1];
-    s_Data[pos] += s_Data[pos - 2];
-    s_Data[pos] += s_Data[pos - 4];
-    s_Data[pos] += s_Data[pos - 8];
-    s_Data[pos] += s_Data[pos - 16];
-
-    return s_Data[pos];
-#endif
-}
-
-inline __device__ Ncv64u warpScanInclusive(Ncv64u idata, volatile Ncv64u *s_Data)
-{
-    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
-    s_Data[pos] = 0;
-    pos += K_WARP_SIZE;
-    s_Data[pos] = idata;
-
-    s_Data[pos] += s_Data[pos - 1];
-    s_Data[pos] += s_Data[pos - 2];
-    s_Data[pos] += s_Data[pos - 4];
-    s_Data[pos] += s_Data[pos - 8];
-    s_Data[pos] += s_Data[pos - 16];
-
-    return s_Data[pos];
-}
-
-template <class T>
-inline __device__ T warpScanExclusive(T idata, volatile T *s_Data)
-{
-    return warpScanInclusive(idata, s_Data) - idata;
-}
-
-template <class T, Ncv32u tiNumScanThreads>
-inline __device__ T blockScanInclusive(T idata, volatile T *s_Data)
-{
-    if (tiNumScanThreads > K_WARP_SIZE)
-    {
-        //Bottom-level inclusive warp scan
-        T warpResult = warpScanInclusive(idata, s_Data);
-
-        //Save top elements of each warp for exclusive warp scan
-        //sync to wait for warp scans to complete (because s_Data is being overwritten)
-        __syncthreads();
-        if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )
-        {
-            s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;
-        }
-
-        //wait for warp scans to complete
-        __syncthreads();
-
-        if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )
-        {
-            //grab top warp elements
-            T val = s_Data[threadIdx.x];
-            //calculate exclusive scan and write back to shared memory
-            s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);
-        }
-
-        //return updated warp scans with exclusive scan results
-        __syncthreads();
-        return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE];
-    }
-    else
-    {
-        return warpScanInclusive(idata, s_Data);
-    }
-}
-
 //==============================================================================
 //
 // IntegralImage.cu
@@ -280,7 +174,7 @@ __global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u sr
     Ncv32u numBuckets = (srcWidth + NUM_SCAN_THREADS - 1) >> LOG2_NUM_SCAN_THREADS;
     Ncv32u offsetX = 0;
 
-    __shared__ T_out shmem[NUM_SCAN_THREADS * 2];
+    __shared__ T_out shmem[NUM_SCAN_THREADS];
     __shared__ T_out carryElem;
     carryElem = 0;
     __syncthreads();
@@ -301,7 +195,7 @@ __global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u sr
         curElemMod = _scanElemOp<T_in, T_out>::scanElemOp<tbDoSqr>(curElem);
 
         //inclusive scan
-        curScanElem = blockScanInclusive<T_out, NUM_SCAN_THREADS>(curElemMod, shmem);
+        curScanElem = cv::cudev::blockScanInclusive<NUM_SCAN_THREADS>(curElemMod, shmem, threadIdx.x);
 
         if (curElemOffs <= srcWidth)
         {
@@ -1290,7 +1184,7 @@ __global__ void removePass1Scan(Ncv32u *d_src, Ncv32u srcLen,
         return;
     }
 
-    __shared__ Ncv32u shmem[NUM_REMOVE_THREADS * 2];
+    __shared__ Ncv32u shmem[NUM_REMOVE_THREADS];
 
     Ncv32u scanElem = 0;
     if (elemAddrIn < srcLen)
@@ -1305,7 +1199,7 @@ __global__ void removePass1Scan(Ncv32u *d_src, Ncv32u srcLen,
         }
     }
 
-    Ncv32u localScanInc = blockScanInclusive<Ncv32u, NUM_REMOVE_THREADS>(scanElem, shmem);
+    Ncv32u localScanInc = cv::cudev::blockScanInclusive<NUM_REMOVE_THREADS>(scanElem, shmem, threadIdx.x);
 
     __syncthreads();
 
     if (elemAddrIn < srcLen)
modules/cudastereo/src/cuda/stereobm.cu
@@ -71,48 +71,54 @@ namespace cv { namespace cuda { namespace device
         }
 
         template<int RADIUS>
-        __device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)
+        __device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X)
         {
             unsigned int cache = 0;
             unsigned int cache2 = 0;
 
-            for(int i = 1; i <= RADIUS; i++)
-                cache += col_ssd[i];
-
-            col_ssd_cache[0] = cache;
+            if (X < cwidth - RADIUS)
+            {
+                for(int i = 1; i <= RADIUS; i++)
+                    cache += col_ssd[i];
+
+                col_ssd_cache[0] = cache;
+            }
 
             __syncthreads();
 
-            if (threadIdx.x < BLOCK_W - RADIUS)
-                cache2 = col_ssd_cache[RADIUS];
-            else
-                for(int i = RADIUS + 1; i < (2 * RADIUS + 1); i++)
-                    cache2 += col_ssd[i];
+            if (X < cwidth - RADIUS)
+            {
+                if (threadIdx.x < BLOCK_W - RADIUS)
+                    cache2 = col_ssd_cache[RADIUS];
+                else
+                    for(int i = RADIUS + 1; i < (2 * RADIUS + 1); i++)
+                        cache2 += col_ssd[i];
+            }
 
             return col_ssd[0] + cache + cache2;
         }
 
         template<int RADIUS>
-        __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)
+        __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X)
         {
             unsigned int ssd[N_DISPARITIES];
 
             //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
-            ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS));
+            ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X);
             __syncthreads();
-            ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS));
+            ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X);
             __syncthreads();
-            ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS));
+            ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X);
             __syncthreads();
-            ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS));
+            ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X);
             __syncthreads();
-            ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS));
+            ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X);
             __syncthreads();
-            ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS));
+            ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X);
             __syncthreads();
-            ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS));
+            ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X);
             __syncthreads();
-            ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS));
+            ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X);
 
             int mssd = ::min(::min(::min(ssd[0], ssd[1]), ::min(ssd[4], ssd[5])), ::min(::min(ssd[2], ssd[3]), ::min(ssd[6], ssd[7])));
@@ -243,12 +249,12 @@ namespace cv { namespace cuda { namespace device
         unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
         unsigned char* disparImage = disp.data + X + Y * disp.step;
 
-        /* if (X < cwidth)
-        {
-            unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;
-            for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )
-                *ptr = 0xFFFFFFFF;
-        }*/
+        //if (X < cwidth)
+        //{
+        //    unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;
+        //    for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )
+        //        *ptr = 0xFFFFFFFF;
+        //}
 
         int end_row = ::min(ROWSperTHREAD, cheight - Y - RADIUS);
         int y_tex;
         int x_tex = X - RADIUS;
@@ -268,13 +274,27 @@ namespace cv { namespace cuda { namespace device
             __syncthreads(); //before MinSSD function
 
-            if (X < cwidth - RADIUS &&
-                Y < cheight - RADIUS)
+            if (Y < cheight - RADIUS)
             {
-                uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
-                if (minSSD.x < minSSDImage[0])
+                uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd, X);
+
+                // For threads that do not satisfy the if condition below("X < cwidth - RADIUS"), previously
+                // computed "minSSD" value, which is the result of "MinSSD" function call, is not used at all.
+                //
+                // However, since the "MinSSD" function has "__syncthreads" call in its body, those threads
+                // must also call "MinSSD" to avoid deadlock. (#13850)
+                //
+                // From CUDA 9, using "__syncwarp" with proper mask value instead of using "__syncthreads"
+                // could be an option, but the shared memory access pattern does not allow this option,
+                // resulting in race condition. (Checked via "cuda-memcheck --tool racecheck")
+                if (X < cwidth - RADIUS)
                 {
-                    disparImage[0] = (unsigned char)(d + minSSD.y);
-                    minSSDImage[0] = minSSD.x;
+                    if (minSSD.x < minSSDImage[0])
+                    {
+                        disparImage[0] = (unsigned char)(d + minSSD.y);
+                        minSSDImage[0] = minSSD.x;
+                    }
                 }
             }
@@ -295,17 +315,34 @@ namespace cv { namespace cuda { namespace device
                 __syncthreads(); //before MinSSD function
 
-                if (X < cwidth - RADIUS &&
-                    row < cheight - RADIUS - Y)
+                if (row < cheight - RADIUS - Y)
                 {
-                    int idx = row * cminSSD_step;
-                    uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
-                    if (minSSD.x < minSSDImage[idx])
+                    uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd, X);
+
+                    // For threads that do not satisfy the if condition below("X < cwidth - RADIUS"), previously
+                    // computed "minSSD" value, which is the result of "MinSSD" function call, is not used at all.
+                    //
+                    // However, since the "MinSSD" function has "__syncthreads" call in its body, those threads
+                    // must also call "MinSSD" to avoid deadlock. (#13850)
+                    //
+                    // From CUDA 9, using "__syncwarp" with proper mask value instead of using "__syncthreads"
+                    // could be an option, but the shared memory access pattern does not allow this option,
+                    // resulting in race condition. (Checked via "cuda-memcheck --tool racecheck")
+                    if (X < cwidth - RADIUS)
                     {
-                        disparImage[disp.step * row] = (unsigned char)(d + minSSD.y);
-                        minSSDImage[idx] = minSSD.x;
+                        int idx = row * cminSSD_step;
+                        if (minSSD.x < minSSDImage[idx])
+                        {
+                            disparImage[disp.step * row] = (unsigned char)(d + minSSD.y);
+                            minSSDImage[idx] = minSSD.x;
+                        }
                     }
                 }
             } // for row loop
 
+            __syncthreads(); // before initializing shared memory at the beginning of next loop
         } // for d loop
     }
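Note: the comments introduced in stereobm.cu describe a general CUDA constraint: __syncthreads() is a block-wide barrier, so a helper such as MinSSD that contains barriers must be called by every thread of the block, even by threads whose result is later discarded; only the final store may stay guarded. The sketch below illustrates that pattern and is not taken from the commit; the helper and kernel names are illustrative only.

#include <cuda_runtime.h>

// Helper with internal barriers, standing in for MinSSD. Every thread of the
// block must call it; threads that skip it never reach __syncthreads() and the
// rest of the block hangs at the barrier.
__device__ unsigned int reduceWithBarrier(volatile unsigned int* smem, unsigned int v)
{
    smem[threadIdx.x] = v;
    __syncthreads();                         // all threads of the block must arrive here
    const unsigned int result = smem[0] + smem[threadIdx.x];
    __syncthreads();                         // and here, before smem is reused
    return result;
}

__global__ void guardedStoreKernel(unsigned int* out, int width)  // launch with 256 threads per block
{
    __shared__ unsigned int smem[256];
    const int x = blockIdx.x * blockDim.x + threadIdx.x;

    // Pre-fix pattern (wrong): calling the helper only when x < width lets
    // out-of-range threads skip its barriers:
    //     if (x < width) out[x] = reduceWithBarrier(smem, x);
    //
    // Post-fix pattern (what the patched kernel does): every thread calls the
    // helper; only the store is guarded, and unused results are discarded.
    const unsigned int result = reduceWithBarrier(smem, x < width ? (unsigned int)x : 0u);
    if (x < width)
        out[x] = result;
}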
modules/cudev/include/opencv2/cudev/warp/scan.hpp
@@ -98,7 +98,7 @@ __device__ T warpScanInclusive(T data, volatile T* smem, uint tid)
     #pragma unroll
     for (int i = 1; i <= (WARP_SIZE / 2); i *= 2)
     {
-        const T val = shfl_up(data, i);
+        const T val = __shfl_up(data, i, WARP_SIZE);
         if (laneId >= i)
             data += val;
     }
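Note: the one-line change above keeps the same doubling-step ("Kogge-Stone" style) inclusive warp scan; only the shuffle primitive changes. Below is a standalone sketch of that scan, not part of the commit; it uses the CUDA 9+ __shfl_up_sync intrinsic with a full-warp mask instead of the __shfl_up call used in the header, and all names are illustrative.

#include <cstdio>
#include <cuda_runtime.h>

__global__ void warpInclusiveScanDemo(const int* in, int* out)
{
    const unsigned int laneId = threadIdx.x & 31;  // lane index within the warp
    int val = in[threadIdx.x];

    // After the step of size i, each lane holds the sum of its own value and up
    // to the previous (2*i - 1) lanes; after log2(32) = 5 steps this is the full
    // inclusive prefix sum over the warp.
    #pragma unroll
    for (int i = 1; i <= 16; i *= 2)
    {
        const int n = __shfl_up_sync(0xffffffffu, val, i);
        if (laneId >= i)
            val += n;
    }

    out[threadIdx.x] = val;
}

int main()
{
    int h_in[32], h_out[32];
    for (int i = 0; i < 32; ++i) h_in[i] = 1;      // all ones -> out[k] == k + 1

    int *d_in = nullptr, *d_out = nullptr;
    cudaMalloc(&d_in, sizeof(h_in));
    cudaMalloc(&d_out, sizeof(h_out));
    cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);

    warpInclusiveScanDemo<<<1, 32>>>(d_in, d_out);
    cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);

    printf("out[31] = %d\n", h_out[31]);           // expect 32
    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}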