Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
12dc52c2
Commit
12dc52c2
authored
Jul 27, 2010
by
Andrey Morozov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
implemented asynchronous call for StereoBM()
parent
dc0f3139
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
115 additions
and
99 deletions
+115
-99
gpu.hpp
modules/gpu/include/opencv2/gpu/gpu.hpp
+1
-1
stereobm.cu
modules/gpu/src/cuda/stereobm.cu
+82
-74
precomp.hpp
modules/gpu/src/precomp.hpp
+6
-5
stereobm_gpu.cpp
modules/gpu/src/stereobm_gpu.cpp
+26
-19
No files found.
modules/gpu/include/opencv2/gpu/gpu.hpp
View file @
12dc52c2
...
@@ -349,7 +349,7 @@ namespace cv
...
@@ -349,7 +349,7 @@ namespace cv
void
operator
()
(
const
GpuMat
&
left
,
const
GpuMat
&
right
,
GpuMat
&
disparity
);
void
operator
()
(
const
GpuMat
&
left
,
const
GpuMat
&
right
,
GpuMat
&
disparity
);
//! Acync version
//! Acync version
void
operator
()
(
const
GpuMat
&
left
,
const
GpuMat
&
right
,
GpuMat
&
disparity
,
const
CudaStream
&
stream
);
void
operator
()
(
const
GpuMat
&
left
,
const
GpuMat
&
right
,
GpuMat
&
disparity
,
const
CudaStream
&
stream
);
//! Some heuristics that tries to estmate
//! Some heuristics that tries to estmate
// if current GPU will be faster then CPU in this algorithm.
// if current GPU will be faster then CPU in this algorithm.
...
...
modules/gpu/src/cuda/stereobm.cu
View file @
12dc52c2
...
@@ -55,13 +55,13 @@ using namespace cv::gpu;
...
@@ -55,13 +55,13 @@ using namespace cv::gpu;
#define ROWSperTHREAD 21 // the number of rows a thread will process
#define ROWSperTHREAD 21 // the number of rows a thread will process
namespace stereobm_gpu
namespace stereobm_gpu
{
{
#define BLOCK_W 128 // the thread block width (464)
#define BLOCK_W 128 // the thread block width (464)
#define N_DISPARITIES 8
#define N_DISPARITIES 8
#define STEREO_MIND 0 // The minimum d range to check
#define STEREO_MIND 0 // The minimum d range to check
#define STEREO_DISP_STEP N_DISPARITIES // the d step, must be <= 1 to avoid aliasing
#define STEREO_DISP_STEP N_DISPARITIES // the d step, must be <= 1 to avoid aliasing
__constant__ unsigned int* cminSSDImage;
__constant__ unsigned int* cminSSDImage;
...
@@ -71,7 +71,7 @@ __constant__ int cheight;
...
@@ -71,7 +71,7 @@ __constant__ int cheight;
__device__ int SQ(int a)
__device__ int SQ(int a)
{
{
return a * a;
return a * a;
}
}
template<int RADIUS>
template<int RADIUS>
...
@@ -82,7 +82,7 @@ __device__ unsigned int CalcSSD(unsigned int *col_ssd_cache, unsigned int *col_s
...
@@ -82,7 +82,7 @@ __device__ unsigned int CalcSSD(unsigned int *col_ssd_cache, unsigned int *col_s
for(int i = 1; i <= RADIUS; i++)
for(int i = 1; i <= RADIUS; i++)
cache += col_ssd[i];
cache += col_ssd[i];
col_ssd_cache[0] = cache;
col_ssd_cache[0] = cache;
__syncthreads();
__syncthreads();
...
@@ -101,7 +101,7 @@ __device__ uint2 MinSSD(unsigned int *col_ssd_cache, unsigned int *col_ssd)
...
@@ -101,7 +101,7 @@ __device__ uint2 MinSSD(unsigned int *col_ssd_cache, unsigned int *col_ssd)
{
{
unsigned int ssd[N_DISPARITIES];
unsigned int ssd[N_DISPARITIES];
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS));
ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS));
ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS));
ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS));
ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS));
ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS));
...
@@ -146,7 +146,7 @@ __device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned cha
...
@@ -146,7 +146,7 @@ __device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned cha
rightPixel1[4] = imageR[idx1 - 4];
rightPixel1[4] = imageR[idx1 - 4];
rightPixel1[5] = imageR[idx1 - 5];
rightPixel1[5] = imageR[idx1 - 5];
rightPixel1[6] = imageR[idx1 - 6];
rightPixel1[6] = imageR[idx1 - 6];
rightPixel2[7] = imageR[idx2 - 7];
rightPixel2[7] = imageR[idx2 - 7];
rightPixel2[0] = imageR[idx2 - 0];
rightPixel2[0] = imageR[idx2 - 0];
rightPixel2[1] = imageR[idx2 - 1];
rightPixel2[1] = imageR[idx2 - 1];
...
@@ -155,16 +155,16 @@ __device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned cha
...
@@ -155,16 +155,16 @@ __device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned cha
rightPixel2[4] = imageR[idx2 - 4];
rightPixel2[4] = imageR[idx2 - 4];
rightPixel2[5] = imageR[idx2 - 5];
rightPixel2[5] = imageR[idx2 - 5];
rightPixel2[6] = imageR[idx2 - 6];
rightPixel2[6] = imageR[idx2 - 6];
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
diff1 = leftPixel1 - rightPixel1[0];
diff1 = leftPixel1 - rightPixel1[0];
diff2 = leftPixel2 - rightPixel2[0];
diff2 = leftPixel2 - rightPixel2[0];
col_ssd[0 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
col_ssd[0 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[1];
diff1 = leftPixel1 - rightPixel1[1];
diff2 = leftPixel2 - rightPixel2[1];
diff2 = leftPixel2 - rightPixel2[1];
col_ssd[1 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
col_ssd[1 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[2];
diff1 = leftPixel1 - rightPixel1[2];
diff2 = leftPixel2 - rightPixel2[2];
diff2 = leftPixel2 - rightPixel2[2];
col_ssd[2 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
col_ssd[2 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
...
@@ -172,19 +172,19 @@ __device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned cha
...
@@ -172,19 +172,19 @@ __device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned cha
diff1 = leftPixel1 - rightPixel1[3];
diff1 = leftPixel1 - rightPixel1[3];
diff2 = leftPixel2 - rightPixel2[3];
diff2 = leftPixel2 - rightPixel2[3];
col_ssd[3 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
col_ssd[3 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[4];
diff1 = leftPixel1 - rightPixel1[4];
diff2 = leftPixel2 - rightPixel2[4];
diff2 = leftPixel2 - rightPixel2[4];
col_ssd[4 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
col_ssd[4 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[5];
diff1 = leftPixel1 - rightPixel1[5];
diff2 = leftPixel2 - rightPixel2[5];
diff2 = leftPixel2 - rightPixel2[5];
col_ssd[5 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
col_ssd[5 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[6];
diff1 = leftPixel1 - rightPixel1[6];
diff2 = leftPixel2 - rightPixel2[6];
diff2 = leftPixel2 - rightPixel2[6];
col_ssd[6 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
col_ssd[6 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[7];
diff1 = leftPixel1 - rightPixel1[7];
diff2 = leftPixel2 - rightPixel2[7];
diff2 = leftPixel2 - rightPixel2[7];
col_ssd[7 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
col_ssd[7 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
...
@@ -203,7 +203,7 @@ __device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* im
...
@@ -203,7 +203,7 @@ __device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* im
leftPixel1 = imageL[idx];
leftPixel1 = imageL[idx];
idx = idx - d;
idx = idx - d;
diffa[0] += SQ(leftPixel1 - imageR[idx - 0]);
diffa[0] += SQ(leftPixel1 - imageR[idx - 0]);
diffa[1] += SQ(leftPixel1 - imageR[idx - 1]);
diffa[1] += SQ(leftPixel1 - imageR[idx - 1]);
diffa[2] += SQ(leftPixel1 - imageR[idx - 2]);
diffa[2] += SQ(leftPixel1 - imageR[idx - 2]);
diffa[3] += SQ(leftPixel1 - imageR[idx - 3]);
diffa[3] += SQ(leftPixel1 - imageR[idx - 3]);
...
@@ -213,7 +213,7 @@ __device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* im
...
@@ -213,7 +213,7 @@ __device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* im
diffa[7] += SQ(leftPixel1 - imageR[idx - 7]);
diffa[7] += SQ(leftPixel1 - imageR[idx - 7]);
y_tex += 1;
y_tex += 1;
}
}
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
col_ssd[0 * (BLOCK_W + 2 * RADIUS)] = diffa[0];
col_ssd[0 * (BLOCK_W + 2 * RADIUS)] = diffa[0];
col_ssd[1 * (BLOCK_W + 2 * RADIUS)] = diffa[1];
col_ssd[1 * (BLOCK_W + 2 * RADIUS)] = diffa[1];
...
@@ -225,11 +225,11 @@ __device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* im
...
@@ -225,11 +225,11 @@ __device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* im
col_ssd[7 * (BLOCK_W + 2 * RADIUS)] = diffa[7];
col_ssd[7 * (BLOCK_W + 2 * RADIUS)] = diffa[7];
}
}
template<int RADIUS>
template<int RADIUS>
__global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, unsigned char* disp, size_t disp_pitch, int maxdisp)
__global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, unsigned char* disp, size_t disp_pitch, int maxdisp)
{
{
extern __shared__ unsigned int col_ssd_cache[];
extern __shared__ unsigned int col_ssd_cache[];
unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x;
unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x;
unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0; //#define N_DIRTY_PIXELS (2 * RADIUS)
unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0; //#define N_DIRTY_PIXELS (2 * RADIUS)
//#define X (blockIdx.x * BLOCK_W + threadIdx.x + STEREO_MAXD)
//#define X (blockIdx.x * BLOCK_W + threadIdx.x + STEREO_MAXD)
...
@@ -241,13 +241,13 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i
...
@@ -241,13 +241,13 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i
unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
unsigned char* disparImage = disp + X + Y * disp_pitch;
unsigned char* disparImage = disp + X + Y * disp_pitch;
/* if (X < cwidth)
/* if (X < cwidth)
{
{
unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;
unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;
for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )
for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )
*ptr = 0xFFFFFFFF;
*ptr = 0xFFFFFFFF;
}*/
}*/
int end_row = min(ROWSperTHREAD, cheight - Y);
int end_row = min(ROWSperTHREAD, cheight - Y);
int y_tex;
int y_tex;
int x_tex = X - RADIUS;
int x_tex = X - RADIUS;
if (x_tex >= cwidth)
if (x_tex >= cwidth)
...
@@ -257,7 +257,7 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i
...
@@ -257,7 +257,7 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i
{
{
y_tex = Y - RADIUS;
y_tex = Y - RADIUS;
InitColSSD<RADIUS>(x_tex, y_tex, img_step, left, right, d, col_ssd);
InitColSSD<RADIUS>(x_tex, y_tex, img_step, left, right, d, col_ssd);
if (col_ssd_extra > 0)
if (col_ssd_extra > 0)
if (x_tex + BLOCK_W < cwidth)
if (x_tex + BLOCK_W < cwidth)
...
@@ -289,13 +289,13 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i
...
@@ -289,13 +289,13 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i
StepDown<RADIUS>(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra);
StepDown<RADIUS>(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra);
y_tex += 1;
y_tex += 1;
__syncthreads(); //before MinSSD function
__syncthreads(); //before MinSSD function
if (X < cwidth - RADIUS && row < cheight - RADIUS - Y)
if (X < cwidth - RADIUS && row < cheight - RADIUS - Y)
{
{
int idx = row * cminSSD_step;
int idx = row * cminSSD_step;
uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
if (minSSD.x < minSSDImage[idx])
if (minSSD.x < minSSDImage[idx])
{
{
disparImage[disp_pitch * row] = (unsigned char)(d + minSSD.y);
disparImage[disp_pitch * row] = (unsigned char)(d + minSSD.y);
...
@@ -310,49 +310,57 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i
...
@@ -310,49 +310,57 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i
namespace cv { namespace gpu { namespace impl
namespace cv { namespace gpu { namespace impl
{
{
template<int RADIUS> void kernel_caller(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp)
template<int RADIUS> void kernel_caller(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp
, const cudaStream_t & stream
)
{
{
dim3 grid(1,1,1);
dim3 grid(1,1,1);
dim3 threads(BLOCK_W, 1, 1);
dim3 threads(BLOCK_W, 1, 1);
grid.x = divUp(left.cols - maxdisp - 2 * RADIUS, BLOCK_W);
grid.x = divUp(left.cols - maxdisp - 2 * RADIUS, BLOCK_W);
grid.y = divUp(left.rows - 2 * RADIUS, ROWSperTHREAD);
grid.y = divUp(left.rows - 2 * RADIUS, ROWSperTHREAD);
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);
size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);
if (stream == 0)
{
stereobm_gpu::stereoKernel<RADIUS><<<grid, threads, smem_size>>>(left.ptr, right.ptr, left.step, disp.ptr, disp.step, maxdisp);
cudaSafeCall( cudaThreadSynchronize() );
}
else
{
stereobm_gpu::stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.ptr, right.ptr, left.step, disp.ptr, disp.step, maxdisp);
}
stereobm_gpu::stereoKernel<RADIUS><<<grid, threads, smem_size>>>(left.ptr, right.ptr, left.step, disp.ptr, disp.step, maxdisp);
cudaSafeCall( cudaThreadSynchronize() );
};
};
typedef void (*kernel_caller_t)(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp);
typedef void (*kernel_caller_t)(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp
, const cudaStream_t & stream
);
const static kernel_caller_t callers[] =
const static kernel_caller_t callers[] =
{
{
0,
0,
kernel_caller< 1>, kernel_caller< 2>, kernel_caller< 3>, kernel_caller< 4>, kernel_caller< 5>,
kernel_caller< 1>, kernel_caller< 2>, kernel_caller< 3>, kernel_caller< 4>, kernel_caller< 5>,
kernel_caller< 6>, kernel_caller< 7>, kernel_caller< 8>, kernel_caller< 9>, kernel_caller<10>,
kernel_caller< 6>, kernel_caller< 7>, kernel_caller< 8>, kernel_caller< 9>, kernel_caller<10>,
kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<15>, kernel_caller<15>,
kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<15>, kernel_caller<15>,
kernel_caller<16>, kernel_caller<17>, kernel_caller<18>, kernel_caller<19>, kernel_caller<20>,
kernel_caller<16>, kernel_caller<17>, kernel_caller<18>, kernel_caller<19>, kernel_caller<20>,
kernel_caller<21>, kernel_caller<22>, kernel_caller<23>, kernel_caller<24>, kernel_caller<25>
kernel_caller<21>, kernel_caller<22>, kernel_caller<23>, kernel_caller<24>, kernel_caller<25>
//0,0,0, 0,0,0, 0,0,kernel_caller<9>
//0,0,0, 0,0,0, 0,0,kernel_caller<9>
};
};
const int calles_num = sizeof(callers)/sizeof(callers[0]);
const int calles_num = sizeof(callers)/sizeof(callers[0]);
extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf)
extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf
, const cudaStream_t & stream
)
{
{
int winsz2 = winsz >> 1;
int winsz2 = winsz >> 1;
if (winsz2 == 0 || winsz2 >= calles_num)
if (winsz2 == 0 || winsz2 >= calles_num)
cv::gpu::error("Unsupported window size", __FILE__, __LINE__);
cv::gpu::error("Unsupported window size", __FILE__, __LINE__);
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );
cudaSafeCall( cudaMemset2D(disp.ptr, disp.step, 0, disp.cols, disp.rows) );
cudaSafeCall( cudaMemset2D(disp.ptr, disp.step, 0, disp.cols, disp.rows) );
cudaSafeCall( cudaMemset2D(minSSD_buf.ptr, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );
cudaSafeCall( cudaMemset2D(minSSD_buf.ptr, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );
cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::cwidth, &left.cols, sizeof(left.cols) ) );
cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::cwidth, &left.cols, sizeof(left.cols) ) );
cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::cheight, &left.rows, sizeof(left.rows) ) );
cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::cheight, &left.rows, sizeof(left.rows) ) );
...
@@ -361,7 +369,7 @@ namespace cv { namespace gpu { namespace impl
...
@@ -361,7 +369,7 @@ namespace cv { namespace gpu { namespace impl
size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize();
size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize();
cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::cminSSD_step, &minssd_step, sizeof(minssd_step) ) );
cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::cminSSD_step, &minssd_step, sizeof(minssd_step) ) );
callers[winsz2](left, right, disp, maxdisp
);
callers[winsz2](left, right, disp, maxdisp
, stream);
}
}
}}}
}}}
...
@@ -381,7 +389,7 @@ extern "C" __global__ void prefilter_kernel(unsigned char *output, size_t step,
...
@@ -381,7 +389,7 @@ extern "C" __global__ void prefilter_kernel(unsigned char *output, size_t step,
if (x < width && y < height)
if (x < width && y < height)
{
{
int conv = (int)tex2D(texForSobel, x - 1, y - 1) * (-1) + (int)tex2D(texForSobel, x + 1, y - 1) * (1) +
int conv = (int)tex2D(texForSobel, x - 1, y - 1) * (-1) + (int)tex2D(texForSobel, x + 1, y - 1) * (1) +
(int)tex2D(texForSobel, x - 1, y ) * (-2) + (int)tex2D(texForSobel, x + 1, y ) * (2) +
(int)tex2D(texForSobel, x - 1, y ) * (-2) + (int)tex2D(texForSobel, x + 1, y ) * (2) +
(int)tex2D(texForSobel, x - 1, y + 1) * (-1) + (int)tex2D(texForSobel, x + 1, y + 1) * (1);
(int)tex2D(texForSobel, x - 1, y + 1) * (-1) + (int)tex2D(texForSobel, x + 1, y + 1) * (1);
...
@@ -398,18 +406,18 @@ namespace cv { namespace gpu { namespace impl
...
@@ -398,18 +406,18 @@ namespace cv { namespace gpu { namespace impl
extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output, int prefilterCap)
extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output, int prefilterCap)
{
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaSafeCall( cudaBindTexture2D( 0, stereobm_gpu::texForSobel, input.ptr, desc, input.cols, input.rows, input.step ) );
cudaSafeCall( cudaBindTexture2D( 0, stereobm_gpu::texForSobel, input.ptr, desc, input.cols, input.rows, input.step ) );
dim3 threads(16, 16, 1);
dim3 threads(16, 16, 1);
dim3 grid(1, 1, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(input.cols, threads.x);
grid.x = divUp(input.cols, threads.x);
grid.y = divUp(input.rows, threads.y);
grid.y = divUp(input.rows, threads.y);
stereobm_gpu::prefilter_kernel<<<grid, threads>>>(output.ptr, output.step, output.cols, output.rows, prefilterCap);
stereobm_gpu::prefilter_kernel<<<grid, threads>>>(output.ptr, output.step, output.cols, output.rows, prefilterCap);
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaUnbindTexture (stereobm_gpu::texForSobel ) );
cudaSafeCall( cudaUnbindTexture (stereobm_gpu::texForSobel ) );
}
}
}}}
}}}
...
@@ -424,8 +432,8 @@ namespace stereobm_gpu
...
@@ -424,8 +432,8 @@ namespace stereobm_gpu
texture<unsigned char, 2, cudaReadModeNormalizedFloat> texForTF;
texture<unsigned char, 2, cudaReadModeNormalizedFloat> texForTF;
__device__ float sobel(int x, int y)
__device__ float sobel(int x, int y)
{
{
float conv = tex2D(texForTF, x - 1, y - 1) * (-1) + tex2D(texForTF, x + 1, y - 1) * (1) +
float conv = tex2D(texForTF, x - 1, y - 1) * (-1) + tex2D(texForTF, x + 1, y - 1) * (1) +
tex2D(texForTF, x - 1, y ) * (-2) + tex2D(texForTF, x + 1, y ) * (2) +
tex2D(texForTF, x - 1, y ) * (-2) + tex2D(texForTF, x + 1, y ) * (2) +
tex2D(texForTF, x - 1, y + 1) * (-1) + tex2D(texForTF, x + 1, y + 1) * (1);
tex2D(texForTF, x - 1, y + 1) * (-1) + tex2D(texForTF, x + 1, y + 1) * (1);
return fabs(conv);
return fabs(conv);
...
@@ -453,28 +461,28 @@ __device__ float CalcSums(float *cols, float *cols_cache, int winsz)
...
@@ -453,28 +461,28 @@ __device__ float CalcSums(float *cols, float *cols_cache, int winsz)
return cols[0] + cache + cache2;
return cols[0] + cache + cache2;
}
}
#define RpT (2 * ROWSperTHREAD) // got experimentally
#define RpT (2 * ROWSperTHREAD) // got experimentally
extern "C" __global__ void textureness_kernel(unsigned char *disp, size_t disp_step, int winsz, float threshold, int width, int height)
extern "C" __global__ void textureness_kernel(unsigned char *disp, size_t disp_step, int winsz, float threshold, int width, int height)
{
{
int winsz2 = winsz/2;
int winsz2 = winsz/2;
int n_dirty_pixels = (winsz2) * 2;
int n_dirty_pixels = (winsz2) * 2;
extern __shared__ float cols_cache[];
extern __shared__ float cols_cache[];
float *cols = cols_cache + blockDim.x + threadIdx.x;
float *cols = cols_cache + blockDim.x + threadIdx.x;
float *cols_extra = threadIdx.x < n_dirty_pixels ? cols + blockDim.x : 0;
float *cols_extra = threadIdx.x < n_dirty_pixels ? cols + blockDim.x : 0;
int x = blockIdx.x * blockDim.x + threadIdx.x;
int x = blockIdx.x * blockDim.x + threadIdx.x;
int beg_row = blockIdx.y * RpT;
int beg_row = blockIdx.y * RpT;
int end_row = min(beg_row + RpT, height);
int end_row = min(beg_row + RpT, height);
if (x < width)
if (x < width)
{
{
int y = beg_row;
int y = beg_row;
float sum = 0;
float sum = 0;
float sum_extra = 0;
float sum_extra = 0;
for(int i = y - winsz2; i <= y + winsz2; ++i)
for(int i = y - winsz2; i <= y + winsz2; ++i)
{
{
sum += sobel(x - winsz2, i);
sum += sobel(x - winsz2, i);
...
@@ -486,11 +494,11 @@ extern "C" __global__ void textureness_kernel(unsigned char *disp, size_t disp_s
...
@@ -486,11 +494,11 @@ extern "C" __global__ void textureness_kernel(unsigned char *disp, size_t disp_s
*cols_extra = sum_extra;
*cols_extra = sum_extra;
__syncthreads();
__syncthreads();
float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
if (sum_win < threshold)
if (sum_win < threshold)
disp[y * disp_step + x] = 0;
disp[y * disp_step + x] = 0;
__syncthreads();
__syncthreads();
for(int y = beg_row + 1; y < end_row; ++y)
for(int y = beg_row + 1; y < end_row; ++y)
...
@@ -505,12 +513,12 @@ extern "C" __global__ void textureness_kernel(unsigned char *disp, size_t disp_s
...
@@ -505,12 +513,12 @@ extern "C" __global__ void textureness_kernel(unsigned char *disp, size_t disp_s
}
}
__syncthreads();
__syncthreads();
float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
if (sum_win < threshold)
if (sum_win < threshold)
disp[y * disp_step + x] = 0;
disp[y * disp_step + x] = 0;
__syncthreads();
__syncthreads();
}
}
}
}
}
}
}
}
...
@@ -521,21 +529,21 @@ namespace cv { namespace gpu { namespace impl
...
@@ -521,21 +529,21 @@ namespace cv { namespace gpu { namespace impl
{
{
avgTexturenessThreshold *= winsz * winsz;
avgTexturenessThreshold *= winsz * winsz;
stereobm_gpu::texForTF.filterMode = cudaFilterModeLinear;
stereobm_gpu::texForTF.filterMode = cudaFilterModeLinear;
stereobm_gpu::texForTF.addressMode[0] = cudaAddressModeWrap;
stereobm_gpu::texForTF.addressMode[0] = cudaAddressModeWrap;
stereobm_gpu::texForTF.addressMode[1] = cudaAddressModeWrap;
stereobm_gpu::texForTF.addressMode[1] = cudaAddressModeWrap;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaSafeCall( cudaBindTexture2D( 0, stereobm_gpu::texForTF, input.ptr, desc, input.cols, input.rows, input.step ) );
cudaSafeCall( cudaBindTexture2D( 0, stereobm_gpu::texForTF, input.ptr, desc, input.cols, input.rows, input.step ) );
dim3 threads(128, 1, 1);
dim3 threads(128, 1, 1);
dim3 grid(1, 1, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(input.cols, threads.x);
grid.x = divUp(input.cols, threads.x);
grid.y = divUp(input.rows, RpT);
grid.y = divUp(input.rows, RpT);
size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);
size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);
stereobm_gpu::textureness_kernel<<<grid, threads, smem_size>>>(disp.ptr, disp.step, winsz, avgTexturenessThreshold, disp.cols, disp.rows);
stereobm_gpu::textureness_kernel<<<grid, threads, smem_size>>>(disp.ptr, disp.step, winsz, avgTexturenessThreshold, disp.cols, disp.rows);
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaThreadSynchronize() );
...
...
modules/gpu/src/precomp.hpp
View file @
12dc52c2
...
@@ -46,25 +46,26 @@
...
@@ -46,25 +46,26 @@
#pragma warning( disable: 4251 4710 4711 4514 4996 )
#pragma warning( disable: 4251 4710 4711 4514 4996 )
#endif
#endif
#ifdef HAVE_CONFIG_H
#ifdef HAVE_CONFIG_H
#include <cvconfig.h>
#include <cvconfig.h>
#endif
#endif
#include <iostream>
#include <iostream>
#include <limits>
#include <limits>
#include "opencv2/gpu/gpu.hpp"
#include "opencv2/gpu/gpu.hpp"
#include "opencv2/gpu/stream_accessor.hpp"
#if defined(HAVE_CUDA)
#if defined(HAVE_CUDA)
#include "cuda_shared.hpp"
#include "cuda_shared.hpp"
#include "cuda_runtime_api.h"
#include "cuda_runtime_api.h"
#else
/* defined(HAVE_CUDA) */
#else
/* defined(HAVE_CUDA) */
static
inline
void
throw_nogpu
()
{
CV_Error
(
CV_GpuNotFound
,
"The library is compilled with no GPU support"
);
}
static
inline
void
throw_nogpu
()
{
CV_Error
(
CV_GpuNotFound
,
"The library is compilled with no GPU support"
);
}
#endif
/* defined(HAVE_CUDA) */
#endif
/* defined(HAVE_CUDA) */
#endif
/* __OPENCV_PRECOMP_H__ */
#endif
/* __OPENCV_PRECOMP_H__ */
modules/gpu/src/stereobm_gpu.cpp
View file @
12dc52c2
...
@@ -56,25 +56,26 @@ void cv::gpu::StereoBM_GPU::operator() ( const GpuMat&, const GpuMat&, GpuMat&,
...
@@ -56,25 +56,26 @@ void cv::gpu::StereoBM_GPU::operator() ( const GpuMat&, const GpuMat&, GpuMat&,
#else
/* !defined (HAVE_CUDA) */
#else
/* !defined (HAVE_CUDA) */
namespace
cv
{
namespace
gpu
namespace
cv
{
namespace
gpu
{
{
namespace
impl
namespace
impl
{
{
extern
"C"
void
stereoBM_GPU
(
const
DevMem2D
&
left
,
const
DevMem2D
&
right
,
const
DevMem2D
&
disp
,
int
ndisp
,
int
winsz
,
const
DevMem2D_
<
uint
>&
minSSD_buf
);
//extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int ndisp, int winsz, const DevMem2D_<uint>& minSSD_buf);
extern
"C"
void
stereoBM_GPU
(
const
DevMem2D
&
left
,
const
DevMem2D
&
right
,
const
DevMem2D
&
disp
,
int
ndisp
,
int
winsz
,
const
DevMem2D_
<
uint
>&
minSSD_buf
,
const
cudaStream_t
&
stream
);
extern
"C"
void
prefilter_xsobel
(
const
DevMem2D
&
input
,
const
DevMem2D
&
output
,
int
prefilterCap
=
31
);
extern
"C"
void
prefilter_xsobel
(
const
DevMem2D
&
input
,
const
DevMem2D
&
output
,
int
prefilterCap
=
31
);
extern
"C"
void
postfilter_textureness
(
const
DevMem2D
&
input
,
int
winsz
,
float
avergeTexThreshold
,
const
DevMem2D
&
disp
);
extern
"C"
void
postfilter_textureness
(
const
DevMem2D
&
input
,
int
winsz
,
float
avergeTexThreshold
,
const
DevMem2D
&
disp
);
}
}
}}
}}
const
float
defaultAvgTexThreshold
=
3
;
const
float
defaultAvgTexThreshold
=
3
;
cv
::
gpu
::
StereoBM_GPU
::
StereoBM_GPU
()
cv
::
gpu
::
StereoBM_GPU
::
StereoBM_GPU
()
:
preset
(
BASIC_PRESET
),
ndisp
(
DEFAULT_NDISP
),
winSize
(
DEFAULT_WINSZ
),
avergeTexThreshold
(
defaultAvgTexThreshold
)
{}
:
preset
(
BASIC_PRESET
),
ndisp
(
DEFAULT_NDISP
),
winSize
(
DEFAULT_WINSZ
),
avergeTexThreshold
(
defaultAvgTexThreshold
)
{}
cv
::
gpu
::
StereoBM_GPU
::
StereoBM_GPU
(
int
preset_
,
int
ndisparities_
,
int
winSize_
)
cv
::
gpu
::
StereoBM_GPU
::
StereoBM_GPU
(
int
preset_
,
int
ndisparities_
,
int
winSize_
)
:
preset
(
preset_
),
ndisp
(
ndisparities_
),
winSize
(
winSize_
),
avergeTexThreshold
(
defaultAvgTexThreshold
)
:
preset
(
preset_
),
ndisp
(
ndisparities_
),
winSize
(
winSize_
),
avergeTexThreshold
(
defaultAvgTexThreshold
)
{
{
const
int
max_supported_ndisp
=
1
<<
(
sizeof
(
unsigned
char
)
*
8
);
const
int
max_supported_ndisp
=
1
<<
(
sizeof
(
unsigned
char
)
*
8
);
CV_Assert
(
0
<
ndisp
&&
ndisp
<=
max_supported_ndisp
);
CV_Assert
(
0
<
ndisp
&&
ndisp
<=
max_supported_ndisp
);
CV_Assert
(
ndisp
%
8
==
0
);
CV_Assert
(
ndisp
%
8
==
0
);
CV_Assert
(
winSize
%
2
==
1
);
CV_Assert
(
winSize
%
2
==
1
);
...
@@ -92,12 +93,12 @@ bool cv::gpu::StereoBM_GPU::checkIfGpuCallReasonable()
...
@@ -92,12 +93,12 @@ bool cv::gpu::StereoBM_GPU::checkIfGpuCallReasonable()
int
numSM
=
getNumberOfSMs
(
device
);
int
numSM
=
getNumberOfSMs
(
device
);
if
(
major
>
1
||
numSM
>
16
)
if
(
major
>
1
||
numSM
>
16
)
return
true
;
return
true
;
return
false
;
return
false
;
}
}
void
cv
::
gpu
::
StereoBM_GPU
::
operator
()
(
const
GpuMat
&
left
,
const
GpuMat
&
right
,
GpuMat
&
disparity
)
void
stereo_gpu_operator
(
GpuMat
&
minSSD
,
GpuMat
&
leBuf
,
GpuMat
&
riBuf
,
int
preset
,
int
ndisp
,
int
winSize
,
float
avergeTexThreshold
,
const
GpuMat
&
left
,
const
GpuMat
&
right
,
GpuMat
&
disparity
,
const
cudaStream_t
&
stream
)
{
{
CV_DbgAssert
(
left
.
rows
==
right
.
rows
&&
left
.
cols
==
right
.
cols
);
CV_DbgAssert
(
left
.
rows
==
right
.
rows
&&
left
.
cols
==
right
.
cols
);
CV_DbgAssert
(
left
.
type
()
==
CV_8UC1
);
CV_DbgAssert
(
left
.
type
()
==
CV_8UC1
);
...
@@ -109,26 +110,33 @@ void cv::gpu::StereoBM_GPU::operator() ( const GpuMat& left, const GpuMat& right
...
@@ -109,26 +110,33 @@ void cv::gpu::StereoBM_GPU::operator() ( const GpuMat& left, const GpuMat& right
GpuMat
le_for_bm
=
left
;
GpuMat
le_for_bm
=
left
;
GpuMat
ri_for_bm
=
right
;
GpuMat
ri_for_bm
=
right
;
if
(
preset
==
PREFILTER_XSOBEL
)
if
(
preset
==
StereoBM_GPU
::
PREFILTER_XSOBEL
)
{
{
leBuf
.
create
(
left
.
size
(),
left
.
type
());
leBuf
.
create
(
left
.
size
(),
left
.
type
());
riBuf
.
create
(
right
.
size
(),
right
.
type
());
riBuf
.
create
(
right
.
size
(),
right
.
type
());
impl
::
prefilter_xsobel
(
left
,
leBuf
);
impl
::
prefilter_xsobel
(
left
,
leBuf
);
impl
::
prefilter_xsobel
(
right
,
riBuf
);
impl
::
prefilter_xsobel
(
right
,
riBuf
);
le_for_bm
=
leBuf
;
le_for_bm
=
leBuf
;
ri_for_bm
=
riBuf
;
ri_for_bm
=
riBuf
;
}
}
impl
::
stereoBM_GPU
(
le_for_bm
,
ri_for_bm
,
disparity
,
ndisp
,
winSize
,
minSSD
);
impl
::
stereoBM_GPU
(
le_for_bm
,
ri_for_bm
,
disparity
,
ndisp
,
winSize
,
minSSD
,
stream
);
if
(
avergeTexThreshold
)
if
(
avergeTexThreshold
)
impl
::
postfilter_textureness
(
le_for_bm
,
winSize
,
avergeTexThreshold
,
disparity
);
impl
::
postfilter_textureness
(
le_for_bm
,
winSize
,
avergeTexThreshold
,
disparity
);
}
}
void
cv
::
gpu
::
StereoBM_GPU
::
operator
()
(
const
GpuMat
&
left
,
const
GpuMat
&
right
,
GpuMat
&
disparity
)
{
::
stereo_gpu_operator
(
minSSD
,
leBuf
,
riBuf
,
preset
,
ndisp
,
winSize
,
avergeTexThreshold
,
left
,
right
,
disparity
,
0
);
}
void
cv
::
gpu
::
StereoBM_GPU
::
operator
()
(
const
GpuMat
&
left
,
const
GpuMat
&
right
,
GpuMat
&
disparity
,
const
CudaStream
&
stream
)
void
cv
::
gpu
::
StereoBM_GPU
::
operator
()
(
const
GpuMat
&
left
,
const
GpuMat
&
right
,
GpuMat
&
disparity
,
const
CudaStream
&
stream
)
{
{
CV_Assert
(
!
"Not implemented"
);
::
stereo_gpu_operator
(
minSSD
,
leBuf
,
riBuf
,
preset
,
ndisp
,
winSize
,
avergeTexThreshold
,
left
,
right
,
disparity
,
StreamAccessor
::
getStream
(
stream
)
);
}
}
#endif
/* !defined (HAVE_CUDA) */
#endif
/* !defined (HAVE_CUDA) */
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment