Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
17f7b12a
Commit
17f7b12a
authored
Jul 22, 2010
by
Anatoly Baksheev
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Variable winSize for StereoBP_GPU
Fixed StereoBM_GPU kernel crash Textureness threshold added
parent
26c48596
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
268 additions
and
83 deletions
+268
-83
gpu.hpp
modules/gpu/include/opencv2/gpu/gpu.hpp
+7
-1
cuda_shared.hpp
modules/gpu/src/cuda/cuda_shared.hpp
+1
-1
stereobm.cu
modules/gpu/src/cuda/stereobm.cu
+240
-70
stereobm_gpu.cpp
modules/gpu/src/stereobm_gpu.cpp
+20
-11
No files found.
modules/gpu/include/opencv2/gpu/gpu.hpp
View file @
17f7b12a
...
...
@@ -335,6 +335,7 @@ namespace cv
//! the full constructor taking the camera-specific preset, number of disparities and the SAD window size
//! ndisparities should be multiple of 8. SSD WindowsSize is fixed to 19 now
StereoBM_GPU
(
int
preset
,
int
ndisparities
=
DEFAULT_NDISP
,
int
winSize
=
DEFAULT_WINSZ
);
//! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair
//! Output disparity has CV_8U type.
void
operator
()
(
const
GpuMat
&
left
,
const
GpuMat
&
right
,
GpuMat
&
disparity
);
...
...
@@ -350,9 +351,14 @@ namespace cv
int
ndisp
;
int
winSize
;
int
preset
;
// If avergeTexThreshold == 0 => post procesing is disabled
// If avergeTexThreshold != 0 then disparity is set 0 in each point (x,y) where for left image
// SumOfHorizontalGradiensInWindow(x, y, winSize) < (winSize * winSize) * avergeTexThreshold
// i.e. input left image is low textured.
float
avergeTexThreshold
;
private
:
GpuMat
minSSD
,
leBuf
,
riBuf
;
};
}
}
...
...
modules/gpu/src/cuda/cuda_shared.hpp
View file @
17f7b12a
...
...
@@ -80,7 +80,7 @@ namespace cv
static
inline
void
___cudaSafeCall
(
cudaError_t
err
,
const
char
*
file
,
const
int
line
,
const
char
*
func
=
""
)
{
if
(
cudaSuccess
!=
err
)
cv
::
gpu
::
error
(
cudaGetErrorString
(
err
),
__FILE__
,
__LINE__
,
func
);
cv
::
gpu
::
error
(
cudaGetErrorString
(
err
),
file
,
line
,
func
);
}
#endif
/* __OPENCV_CUDA_SHARED_HPP__ */
modules/gpu/src/cuda/stereobm.cu
View file @
17f7b12a
...
...
@@ -42,21 +42,22 @@
#include "cuda_shared.hpp"
using namespace cv::gpu;
//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////// Streeo BM ////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
#define ROWSperTHREAD 21 // the number of rows a thread will process
namespace stereobm_gpu
{
#define BLOCK_W 128 // the thread block width (464)
#define N_DISPARITIES 8
#define STEREO_MIND 0 // The minimum d range to check
#define STEREO_DISP_STEP N_DISPARITIES // the d step, must be <= 1 to avoid aliasing
#define RADIUS 9 // Kernel Radius 5V & 5H = 11x11 kernel
#define WINSZ (2 * RADIUS + 1)
#define N_DIRTY_PIXELS (2 * RADIUS)
#define COL_SSD_SIZE (BLOCK_W + N_DIRTY_PIXELS)
#define SHARED_MEM_SIZE (COL_SSD_SIZE) // amount of shared memory used
namespace stereobm_gpu
{
__constant__ unsigned int* cminSSDImage;
__constant__ size_t cminSSD_step;
...
...
@@ -68,6 +69,7 @@ __device__ int SQ(int a)
return a * a;
}
template<int RADIUS>
__device__ unsigned int CalcSSD(unsigned int *col_ssd_cache, unsigned int *col_ssd)
{
unsigned int cache = 0;
...
...
@@ -83,24 +85,26 @@ __device__ unsigned int CalcSSD(unsigned int *col_ssd_cache, unsigned int *col_s
if (threadIdx.x < BLOCK_W - RADIUS)
cache2 = col_ssd_cache[RADIUS];
else
for(int i = RADIUS + 1; i <
WINSZ
; i++)
for(int i = RADIUS + 1; i <
(2 * RADIUS + 1)
; i++)
cache2 += col_ssd[i];
return col_ssd[0] + cache + cache2;
}
template<int RADIUS>
__device__ uint2 MinSSD(unsigned int *col_ssd_cache, unsigned int *col_ssd)
{
unsigned int ssd[N_DISPARITIES];
ssd[0] = CalcSSD(col_ssd_cache, col_ssd + 0 * SHARED_MEM_SIZE);
ssd[1] = CalcSSD(col_ssd_cache, col_ssd + 1 * SHARED_MEM_SIZE);
ssd[2] = CalcSSD(col_ssd_cache, col_ssd + 2 * SHARED_MEM_SIZE);
ssd[3] = CalcSSD(col_ssd_cache, col_ssd + 3 * SHARED_MEM_SIZE);
ssd[4] = CalcSSD(col_ssd_cache, col_ssd + 4 * SHARED_MEM_SIZE);
ssd[5] = CalcSSD(col_ssd_cache, col_ssd + 5 * SHARED_MEM_SIZE);
ssd[6] = CalcSSD(col_ssd_cache, col_ssd + 6 * SHARED_MEM_SIZE);
ssd[7] = CalcSSD(col_ssd_cache, col_ssd + 7 * SHARED_MEM_SIZE);
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS));
ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS));
ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS));
ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS));
ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS));
ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS));
ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS));
ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS));
int mssd = min(min(min(ssd[0], ssd[1]), min(ssd[4], ssd[5])), min(min(ssd[2], ssd[3]), min(ssd[6], ssd[7])));
...
...
@@ -114,6 +118,7 @@ __device__ uint2 MinSSD(unsigned int *col_ssd_cache, unsigned int *col_ssd)
return make_uint2(mssd, bestIdx);
}
template<int RADIUS>
__device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned char* imageR, int d, unsigned int *col_ssd)
{
unsigned char leftPixel1;
...
...
@@ -146,47 +151,48 @@ __device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned cha
rightPixel2[5] = imageR[idx2 - 5];
rightPixel2[6] = imageR[idx2 - 6];
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
diff1 = leftPixel1 - rightPixel1[0];
diff2 = leftPixel2 - rightPixel2[0];
col_ssd[0 *
SHARED_MEM_SIZE
] += SQ(diff2) - SQ(diff1);
col_ssd[0 *
(BLOCK_W + 2 * RADIUS)
] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[1];
diff2 = leftPixel2 - rightPixel2[1];
col_ssd[1 *
SHARED_MEM_SIZE
] += SQ(diff2) - SQ(diff1);
col_ssd[1 *
(BLOCK_W + 2 * RADIUS)
] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[2];
diff2 = leftPixel2 - rightPixel2[2];
col_ssd[2 *
SHARED_MEM_SIZE
] += SQ(diff2) - SQ(diff1);
col_ssd[2 *
(BLOCK_W + 2 * RADIUS)
] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[3];
diff2 = leftPixel2 - rightPixel2[3];
col_ssd[3 *
SHARED_MEM_SIZE
] += SQ(diff2) - SQ(diff1);
col_ssd[3 *
(BLOCK_W + 2 * RADIUS)
] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[4];
diff2 = leftPixel2 - rightPixel2[4];
col_ssd[4 *
SHARED_MEM_SIZE
] += SQ(diff2) - SQ(diff1);
col_ssd[4 *
(BLOCK_W + 2 * RADIUS)
] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[5];
diff2 = leftPixel2 - rightPixel2[5];
col_ssd[5 *
SHARED_MEM_SIZE
] += SQ(diff2) - SQ(diff1);
col_ssd[5 *
(BLOCK_W + 2 * RADIUS)
] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[6];
diff2 = leftPixel2 - rightPixel2[6];
col_ssd[6 *
SHARED_MEM_SIZE
] += SQ(diff2) - SQ(diff1);
col_ssd[6 *
(BLOCK_W + 2 * RADIUS)
] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[7];
diff2 = leftPixel2 - rightPixel2[7];
col_ssd[7 *
SHARED_MEM_SIZE
] += SQ(diff2) - SQ(diff1);
col_ssd[7 *
(BLOCK_W + 2 * RADIUS)
] += SQ(diff2) - SQ(diff1);
}
template<int RADIUS>
__device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* imageL, unsigned char* imageR, int d, unsigned int *col_ssd)
{
unsigned char leftPixel1;
int idx;
unsigned int diffa[] = {0, 0, 0, 0, 0, 0, 0, 0};
for(int i = 0; i <
WINSZ
; i++)
for(int i = 0; i <
(2 * RADIUS + 1)
; i++)
{
idx = y_tex * im_pitch + x_tex;
leftPixel1 = imageL[idx];
...
...
@@ -202,23 +208,24 @@ __device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* im
diffa[7] += SQ(leftPixel1 - imageR[idx - 7]);
y_tex += 1;
}
col_ssd[0 *
SHARED_MEM_SIZE
] = diffa[0];
col_ssd[1 *
SHARED_MEM_SIZE
] = diffa[1];
col_ssd[2 *
SHARED_MEM_SIZE
] = diffa[2];
col_ssd[3 *
SHARED_MEM_SIZE
] = diffa[3];
col_ssd[4 *
SHARED_MEM_SIZE
] = diffa[4];
col_ssd[5 *
SHARED_MEM_SIZE
] = diffa[5];
col_ssd[6 *
SHARED_MEM_SIZE
] = diffa[6];
col_ssd[7 *
SHARED_MEM_SIZE
] = diffa[7];
}
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
col_ssd[0 *
(BLOCK_W + 2 * RADIUS)
] = diffa[0];
col_ssd[1 *
(BLOCK_W + 2 * RADIUS)
] = diffa[1];
col_ssd[2 *
(BLOCK_W + 2 * RADIUS)
] = diffa[2];
col_ssd[3 *
(BLOCK_W + 2 * RADIUS)
] = diffa[3];
col_ssd[4 *
(BLOCK_W + 2 * RADIUS)
] = diffa[4];
col_ssd[5 *
(BLOCK_W + 2 * RADIUS)
] = diffa[5];
col_ssd[6 *
(BLOCK_W + 2 * RADIUS)
] = diffa[6];
col_ssd[7 *
(BLOCK_W + 2 * RADIUS)
] = diffa[7];
}
extern "C" __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, unsigned char* disp, size_t disp_pitch, int maxdisp)
template<int RADIUS>
__global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, unsigned char* disp, size_t disp_pitch, int maxdisp)
{
extern __shared__ unsigned int col_ssd_cache[];
unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x;
unsigned int *col_ssd_extra = threadIdx.x <
N_DIRTY_PIXELS ? col_ssd + BLOCK_W : 0;
unsigned int *col_ssd_extra = threadIdx.x <
(2 * RADIUS) ? col_ssd + BLOCK_W : 0; //#define N_DIRTY_PIXELS (2 * RADIUS)
//#define X (blockIdx.x * BLOCK_W + threadIdx.x + STEREO_MAXD)
int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp);
...
...
@@ -237,20 +244,25 @@ extern "C" __global__ void stereoKernel(unsigned char *left, unsigned char *righ
int end_row = min(ROWSperTHREAD, cheight - Y);
int y_tex;
int x_tex = X - RADIUS;
if (x_tex >= cwidth)
return;
for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP)
{
y_tex = Y - RADIUS;
InitColSSD(x_tex, y_tex, img_step, left, right, d, col_ssd);
InitColSSD
<RADIUS>
(x_tex, y_tex, img_step, left, right, d, col_ssd);
if (col_ssd_extra > 0)
InitColSSD(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra);
if (x_tex + BLOCK_W < cwidth)
InitColSSD<RADIUS>(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra);
__syncthreads(); //before MinSSD function
if (X < cwidth - RADIUS && Y < cheight - RADIUS)
{
uint2 minSSD = MinSSD(col_ssd_cache + threadIdx.x, col_ssd);
uint2 minSSD = MinSSD
<RADIUS>
(col_ssd_cache + threadIdx.x, col_ssd);
if (minSSD.x < minSSDImage[0])
{
disparImage[0] = (unsigned char)(d + minSSD.y);
...
...
@@ -261,14 +273,15 @@ extern "C" __global__ void stereoKernel(unsigned char *left, unsigned char *righ
for(int row = 1; row < end_row; row++)
{
int idx1 = y_tex * img_step + x_tex;
int idx2 = (y_tex +
WINSZ
) * img_step + x_tex;
int idx2 = (y_tex +
(2 * RADIUS + 1)
) * img_step + x_tex;
__syncthreads();
StepDown(idx1, idx2, left, right, d, col_ssd);
StepDown
<RADIUS>
(idx1, idx2, left, right, d, col_ssd);
if (col_ssd_extra)
StepDown(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra);
if (x_tex + BLOCK_W < cwidth)
StepDown<RADIUS>(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra);
y_tex += 1;
...
...
@@ -277,7 +290,7 @@ extern "C" __global__ void stereoKernel(unsigned char *left, unsigned char *righ
if (X < cwidth - RADIUS && row < cheight - RADIUS - Y)
{
int idx = row * cminSSD_step;
uint2 minSSD = MinSSD(col_ssd_cache + threadIdx.x, col_ssd);
uint2 minSSD = MinSSD
<RADIUS>
(col_ssd_cache + threadIdx.x, col_ssd);
if (minSSD.x < minSSDImage[idx])
{
disparImage[disp_pitch * row] = (unsigned char)(d + minSSD.y);
...
...
@@ -290,24 +303,52 @@ extern "C" __global__ void stereoKernel(unsigned char *left, unsigned char *righ
}
namespace cv { namespace gpu { namespace impl
{
extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf)
{
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );
size_t smem_size = (BLOCK_W + N_DISPARITIES * SHARED_MEM_SIZE) * sizeof(unsigned int);
cudaSafeCall( cudaMemset2D(disp.ptr, disp.step, 0, disp.cols, disp. rows) );
cudaSafeCall( cudaMemset2D(minSSD_buf.ptr, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp. rows) );
namespace cv { namespace gpu { namespace impl
{
template<int RADIUS> void kernel_caller(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp)
{
dim3 grid(1,1,1);
dim3 threads(BLOCK_W, 1, 1);
dim3 threads(BLOCK_W, 1, 1);
grid.x = divUp(left.cols - maxdisp - 2 * RADIUS, BLOCK_W);
grid.y = divUp(left.rows - 2 * RADIUS, ROWSperTHREAD);
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);
stereobm_gpu::stereoKernel<RADIUS><<<grid, threads, smem_size>>>(left.ptr, right.ptr, left.step, disp.ptr, disp.step, maxdisp);
cudaSafeCall( cudaThreadSynchronize() );
};
typedef void (*kernel_caller_t)(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp);
const static kernel_caller_t callers[] =
{
0,
kernel_caller< 1>, kernel_caller< 2>, kernel_caller< 3>, kernel_caller< 4>, kernel_caller< 5>,
kernel_caller< 6>, kernel_caller< 7>, kernel_caller< 8>, kernel_caller< 9>, kernel_caller<10>,
kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<15>, kernel_caller<15>,
kernel_caller<16>, kernel_caller<17>, kernel_caller<18>, kernel_caller<19>, kernel_caller<20>,
kernel_caller<21>, kernel_caller<22>, kernel_caller<23>, kernel_caller<24>, kernel_caller<25>
//0,0,0, 0,0,0, 0,0,kernel_caller<9>
};
const int calles_num = sizeof(callers)/sizeof(callers[0]);
extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf)
{
int winsz2 = winsz >> 1;
if (winsz2 == 0 || winsz2 >= calles_num)
cv::gpu::error("Unsupported window size", __FILE__, __LINE__);
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );
cudaSafeCall( cudaMemset2D(disp.ptr, disp.step, 0, disp.cols, disp.rows) );
cudaSafeCall( cudaMemset2D(minSSD_buf.ptr, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );
cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::cwidth, &left.cols, sizeof(left.cols) ) );
cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::cheight, &left.rows, sizeof(left.rows) ) );
cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::cminSSDImage, &minSSD_buf.ptr, sizeof(minSSD_buf.ptr) ) );
...
...
@@ -315,8 +356,7 @@ namespace cv { namespace gpu { namespace impl
size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize();
cudaSafeCall( cudaMemcpyToSymbol( stereobm_gpu::cminSSD_step, &minssd_step, sizeof(minssd_step) ) );
stereobm_gpu::stereoKernel<<<grid, threads, smem_size>>>(left.ptr, right.ptr, left.step, disp.ptr, disp.step, maxdisp);
cudaSafeCall( cudaThreadSynchronize() );
callers[winsz2](left, right, disp, maxdisp);
}
}}}
...
...
@@ -327,18 +367,18 @@ namespace cv { namespace gpu { namespace impl
namespace stereobm_gpu
{
texture<unsigned char, 2, cudaReadModeElementType> tex;
texture<unsigned char, 2, cudaReadModeElementType> tex
ForSobel
;
extern "C" __global__ void prefil
ert
_kernel(unsigned char *output, size_t step, int width, int height, int prefilterCap)
extern "C" __global__ void prefil
ter
_kernel(unsigned char *output, size_t step, int width, int height, int prefilterCap)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height)
{
int conv = (int)tex2D(tex
, x - 1, y - 1) * (-1) + (int)tex2D(tex
, x + 1, y - 1) * (1) +
(int)tex2D(tex
, x - 1, y ) * (-2) + (int)tex2D(tex
, x + 1, y ) * (2) +
(int)tex2D(tex
, x - 1, y + 1) * (-1) + (int)tex2D(tex
, x + 1, y + 1) * (1);
int conv = (int)tex2D(tex
ForSobel, x - 1, y - 1) * (-1) + (int)tex2D(texForSobel
, x + 1, y - 1) * (1) +
(int)tex2D(tex
ForSobel, x - 1, y ) * (-2) + (int)tex2D(texForSobel
, x + 1, y ) * (2) +
(int)tex2D(tex
ForSobel, x - 1, y + 1) * (-1) + (int)tex2D(texForSobel
, x + 1, y + 1) * (1);
conv = min(min(max(-prefilterCap, conv), prefilterCap) + prefilterCap, 255);
...
...
@@ -353,7 +393,7 @@ namespace cv { namespace gpu { namespace impl
extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output, int prefilterCap)
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar>();
cudaSafeCall( cudaBindTexture2D( 0, stereobm_gpu::tex, input.ptr, desc, input.cols, input.rows, input.step ) );
cudaSafeCall( cudaBindTexture2D( 0, stereobm_gpu::tex
ForSobel
, input.ptr, desc, input.cols, input.rows, input.step ) );
dim3 threads(16, 16, 1);
dim3 grid(1, 1, 1);
...
...
@@ -361,8 +401,139 @@ namespace cv { namespace gpu { namespace impl
grid.x = divUp(input.cols, threads.x);
grid.y = divUp(input.rows, threads.y);
stereobm_gpu::prefil
ert
_kernel<<<grid, threads>>>(output.ptr, output.step, output.cols, output.rows, prefilterCap);
stereobm_gpu::prefil
ter
_kernel<<<grid, threads>>>(output.ptr, output.step, output.cols, output.rows, prefilterCap);
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaUnbindTexture (stereobm_gpu::texForSobel ) );
}
}}}
\ No newline at end of file
}}}
//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////// Textureness filtering ////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
namespace stereobm_gpu
{
texture<unsigned char, 2, cudaReadModeNormalizedFloat> texForTF;
__device__ float sobel(int x, int y)
{
float conv = tex2D(texForTF, x - 1, y - 1) * (-1) + tex2D(texForTF, x + 1, y - 1) * (1) +
tex2D(texForTF, x - 1, y ) * (-2) + tex2D(texForTF, x + 1, y ) * (2) +
tex2D(texForTF, x - 1, y + 1) * (-1) + tex2D(texForTF, x + 1, y + 1) * (1);
return fabs(conv);
}
__device__ float CalcSums(float *cols, float *cols_cache, int winsz)
{
float cache = 0;
float cache2 = 0;
int winsz2 = winsz/2;
for(int i = 1; i <= winsz2; i++)
cache += cols[i];
cols_cache[0] = cache;
__syncthreads();
if (threadIdx.x < blockDim.x - winsz2)
cache2 = cols_cache[winsz2];
else
for(int i = winsz2 + 1; i < winsz; i++)
cache2 += cols[i];
return cols[0] + cache + cache2;
}
#define RpT (2 * ROWSperTHREAD) // got experimentally
extern "C" __global__ void textureness_kernel(unsigned char *disp, size_t disp_step, int winsz, float threshold, int width, int height)
{
int winsz2 = winsz/2;
int n_dirty_pixels = (winsz2) * 2;
extern __shared__ float cols_cache[];
float *cols = cols_cache + blockDim.x + threadIdx.x;
float *cols_extra = threadIdx.x < n_dirty_pixels ? cols + blockDim.x : 0;
int x = blockIdx.x * blockDim.x + threadIdx.x;
int beg_row = blockIdx.y * RpT;
int end_row = min(beg_row + RpT, height);
if (x < width)
{
int y = beg_row;
float sum = 0;
float sum_extra = 0;
for(int i = y - winsz2; i <= y + winsz2; ++i)
{
sum += sobel(x - winsz2, i);
if (cols_extra)
sum_extra += sobel(x + blockDim.x - winsz2, i);
}
*cols = sum;
if (cols_extra)
*cols_extra = sum_extra;
__syncthreads();
float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
if (sum_win < threshold)
disp[y * disp_step + x] = 0;
__syncthreads();
for(int y = beg_row + 1; y < end_row; ++y)
{
sum = sum - sobel(x - winsz2, y - winsz2 - 1) + sobel(x - winsz2, y + winsz2);
*cols = sum;
if (cols_extra)
{
sum_extra = sum_extra - sobel(x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(x + blockDim.x - winsz2, y + winsz2);
*cols_extra = sum_extra;
}
__syncthreads();
float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
if (sum_win < threshold)
disp[y * disp_step + x] = 0;
__syncthreads();
}
}
}
}
namespace cv { namespace gpu { namespace impl
{
extern "C" void postfilter_textureness(const DevMem2D& input, int winsz, float avgTexturenessThreshold, const DevMem2D& disp)
{
avgTexturenessThreshold *= winsz * winsz;
stereobm_gpu::texForTF.filterMode = cudaFilterModeLinear;
stereobm_gpu::texForTF.addressMode[0] = cudaAddressModeWrap;
stereobm_gpu::texForTF.addressMode[1] = cudaAddressModeWrap;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar>();
cudaSafeCall( cudaBindTexture2D( 0, stereobm_gpu::texForTF, input.ptr, desc, input.cols, input.rows, input.step ) );
dim3 threads(128, 1, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(input.cols, threads.x);
grid.y = divUp(input.rows, RpT);
size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);
stereobm_gpu::textureness_kernel<<<grid, threads, smem_size>>>(disp.ptr, disp.step, winsz, avgTexturenessThreshold, disp.cols, disp.rows);
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaUnbindTexture (stereobm_gpu::texForTF) );
}
}}}
modules/gpu/src/stereobm_gpu.cpp
View file @
17f7b12a
...
...
@@ -48,12 +48,11 @@ using namespace cv::gpu;
#if !defined (HAVE_CUDA)
cv
::
gpu
::
StereoBM_GPU
::
StereoBM_GPU
()
{
throw_nogpu
();
}
cv
::
gpu
::
StereoBM_GPU
::
StereoBM_GPU
(
int
preset_
,
int
ndisparities_
)
{
throw_nogpu
();
}
cv
::
gpu
::
StereoBM_GPU
::
StereoBM_GPU
(
int
,
int
,
int
)
{
throw_nogpu
();
}
bool
cv
::
gpu
::
StereoBM_GPU
::
checkIfGpuCallReasonable
()
{
throw_nogpu
();
return
false
;
}
void
cv
::
gpu
::
StereoBM_GPU
::
operator
()
(
const
GpuMat
&
left
,
const
GpuMat
&
right
,
GpuMat
&
disparity
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
StereoBM_GPU
::
operator
()
(
const
GpuMat
&
left
,
const
GpuMat
&
right
,
GpuMat
&
disparity
,
const
CudaStream
&
stream
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
StereoBM_GPU
::
operator
()
(
const
GpuMat
&
,
const
GpuMat
&
,
GpuMat
&
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
StereoBM_GPU
::
operator
()
(
const
GpuMat
&
,
const
GpuMat
&
,
GpuMat
&
,
const
CudaStream
&
)
{
throw_nogpu
();
}
#else
/* !defined (HAVE_CUDA) */
...
...
@@ -61,17 +60,24 @@ namespace cv { namespace gpu
{
namespace
impl
{
extern
"C"
void
stereoBM_GPU
(
const
DevMem2D
&
left
,
const
DevMem2D
&
right
,
const
DevMem2D
&
disp
,
int
max
disp
,
int
winsz
,
const
DevMem2D_
<
uint
>&
minSSD_buf
);
extern
"C"
void
stereoBM_GPU
(
const
DevMem2D
&
left
,
const
DevMem2D
&
right
,
const
DevMem2D
&
disp
,
int
n
disp
,
int
winsz
,
const
DevMem2D_
<
uint
>&
minSSD_buf
);
extern
"C"
void
prefilter_xsobel
(
const
DevMem2D
&
input
,
const
DevMem2D
&
output
,
int
prefilterCap
=
31
);
extern
"C"
void
postfilter_textureness
(
const
DevMem2D
&
input
,
int
winsz
,
float
avergeTexThreshold
,
const
DevMem2D
&
disp
);
}
}}
const
float
defaultAvgTexThreshold
=
3
;
cv
::
gpu
::
StereoBM_GPU
::
StereoBM_GPU
()
:
preset
(
BASIC_PRESET
),
ndisp
(
DEFAULT_NDISP
),
winSize
(
DEFAULT_WINSZ
)
{}
cv
::
gpu
::
StereoBM_GPU
::
StereoBM_GPU
(
int
preset_
,
int
ndisparities_
,
int
winSize_
)
:
preset
(
preset_
),
ndisp
(
ndisparities_
),
winSize
(
winSize_
)
cv
::
gpu
::
StereoBM_GPU
::
StereoBM_GPU
()
:
preset
(
BASIC_PRESET
),
ndisp
(
DEFAULT_NDISP
),
winSize
(
DEFAULT_WINSZ
),
avergeTexThreshold
(
defaultAvgTexThreshold
)
{}
cv
::
gpu
::
StereoBM_GPU
::
StereoBM_GPU
(
int
preset_
,
int
ndisparities_
,
int
winSize_
)
:
preset
(
preset_
),
ndisp
(
ndisparities_
),
winSize
(
winSize_
),
avergeTexThreshold
(
defaultAvgTexThreshold
)
{
const
int
max_supported_ndisp
=
1
<<
(
sizeof
(
unsigned
char
)
*
8
);
CV_Assert
(
0
<
ndisp
&&
ndisp
<=
max_supported_ndisp
);
CV_Assert
(
ndisp
%
8
==
0
);
CV_Assert
(
winSize
%
2
==
1
);
}
bool
cv
::
gpu
::
StereoBM_GPU
::
checkIfGpuCallReasonable
()
...
...
@@ -87,7 +93,7 @@ bool cv::gpu::StereoBM_GPU::checkIfGpuCallReasonable()
if
(
major
>
1
||
numSM
>
16
)
return
true
;
return
false
;
}
...
...
@@ -102,19 +108,22 @@ void cv::gpu::StereoBM_GPU::operator() ( const GpuMat& left, const GpuMat& right
GpuMat
le_for_bm
=
left
;
GpuMat
ri_for_bm
=
right
;
if
(
preset
==
PREFILTER_XSOBEL
)
{
leBuf
.
create
(
left
.
size
(),
left
.
type
());
riBuf
.
create
(
right
.
size
(),
right
.
type
());
impl
::
prefilter_xsobel
(
left
,
leBuf
);
impl
::
prefilter_xsobel
(
right
,
riBuf
);
le_for_bm
=
leBuf
;
ri_for_bm
=
riBuf
;
}
impl
::
stereoBM_GPU
(
le_for_bm
,
ri_for_bm
,
disparity
,
ndisp
,
winSize
,
minSSD
);
impl
::
stereoBM_GPU
(
le_for_bm
,
ri_for_bm
,
disparity
,
ndisp
,
winSize
,
minSSD
);
if
(
avergeTexThreshold
)
impl
::
postfilter_textureness
(
le_for_bm
,
winSize
,
avergeTexThreshold
,
disparity
);
}
void
cv
::
gpu
::
StereoBM_GPU
::
operator
()
(
const
GpuMat
&
left
,
const
GpuMat
&
right
,
GpuMat
&
disparity
,
const
CudaStream
&
stream
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment