Commit 6a9f9342 authored by Alexey Spizhevoy

Fixed bug #1759

parent 40d8d11f
@@ -1750,6 +1750,7 @@ public:
         useInitialFlow = false;
         minEigThreshold = 1e-4f;
         getMinEigenVals = false;
+        isDeviceArch11_ = !DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
     }

     void sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts,
@@ -1796,6 +1797,8 @@ private:
     vector<GpuMat> uPyr_;
     vector<GpuMat> vPyr_;
+
+    bool isDeviceArch11_;
 };
@@ -1812,6 +1815,7 @@ public:
         polyN = 5;
         polySigma = 1.1;
         flags = 0;
+        isDeviceArch11_ = !DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
     }

     int numLevels;
@@ -1859,6 +1863,8 @@ private:
     GpuMat frames_[2];
     GpuMat pyrLevel_[2], M_, bufM_, R_[2], blurredFrame_[2];
     std::vector<GpuMat> pyramid0_, pyramid1_;
+
+    bool isDeviceArch11_;
 };
...
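Note: the new isDeviceArch11_ member caches, at construction time, whether the current CUDA device lacks the compute 1.2 feature set, so later calls can cheaply route to CC 1.1-safe kernels. A minimal standalone sketch of the same probe, using the public cv::gpu::DeviceInfo API of this OpenCV generation (the variable name is only illustrative):

    // Standalone sketch of the capability probe the constructors now perform.
    // Requires a CUDA-enabled OpenCV 2.x build.
    #include <opencv2/gpu/gpu.hpp>
    #include <iostream>

    int main()
    {
        cv::gpu::DeviceInfo info;  // describes the currently active CUDA device
        // Devices below the compute 1.2 feature set take the CC 1.1 code path.
        bool isDeviceArch11 = !info.supports(cv::gpu::FEATURE_SET_COMPUTE_12);
        std::cout << (isDeviceArch11 ? "CC 1.1 kernels" : "default kernels") << std::endl;
        return 0;
    }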
@@ -433,6 +433,25 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
     }

+    void boxFilter5Gpu_CC11(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream)
+    {
+        int height = src.rows / 5;
+        int width = src.cols;
+
+        dim3 block(128);
+        dim3 grid(divUp(width, block.x), divUp(height, block.y));
+        int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
+        float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
+
+        boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);
+        cudaSafeCall(cudaGetLastError());
+
+        if (stream == 0)
+            cudaSafeCall(cudaDeviceSynchronize());
+    }

     __constant__ float c_gKer[MAX_KSIZE_HALF + 1];

     template <typename Border>
@@ -575,14 +594,14 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
     }

-    template <typename Border>
+    template <typename Border, int blockDimX>
     void gaussianBlur5Caller(
         const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream)
     {
         int height = src.rows / 5;
         int width = src.cols;

-        dim3 block(256);
+        dim3 block(blockDimX);
         dim3 grid(divUp(width, block.x), divUp(height, block.y));
         int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
         Border b(height, width);
@@ -603,12 +622,26 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
         static const caller_t callers[] =
         {
-            gaussianBlur5Caller<BrdReflect101<float> >,
-            gaussianBlur5Caller<BrdReplicate<float> >,
+            gaussianBlur5Caller<BrdReflect101<float>,256>,
+            gaussianBlur5Caller<BrdReplicate<float>,256>,
         };

         callers[borderMode](src, ksizeHalf, dst, stream);
     }

+    void gaussianBlur5Gpu_CC11(
+        const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderMode, cudaStream_t stream)
+    {
+        typedef void (*caller_t)(const DevMem2Df, int, DevMem2Df, cudaStream_t);
+
+        static const caller_t callers[] =
+        {
+            gaussianBlur5Caller<BrdReflect101<float>,128>,
+            gaussianBlur5Caller<BrdReplicate<float>,128>,
+        };
+
+        callers[borderMode](src, ksizeHalf, dst, stream);
+    }

 }}}} // namespace cv { namespace gpu { namespace device { namespace optflow_farneback
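Note: both CC 1.1 fallbacks change only the launch configuration, 128 threads per block instead of 256. The commit does not state which per-block resource overflows on compute 1.1 parts (the register file is a plausible candidate); the dynamic shared-memory request shrinks too but fits either way, as this worked example of the patch's smem formula shows (the ksizeHalf value is illustrative):

    // Worked example of smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float).
    #include <cstdio>

    int main()
    {
        const int ksizeHalf = 16;            // hypothetical blur radius
        const int blockY = 1;                // both callers launch one-row blocks
        const int blockXs[] = { 256, 128 };  // default vs. CC 1.1 launch
        for (int i = 0; i < 2; ++i)
        {
            int blockX = blockXs[i];
            int smem = (blockX + 2 * ksizeHalf) * 5 * blockY * (int)sizeof(float);
            std::printf("block.x = %3d -> %4d bytes of dynamic shared memory\n", blockX, smem);
        }
        // Prints 5760 bytes for 256 threads and 3200 bytes for 128 threads;
        // both fit the 16 KB per multiprocessor of CC 1.x devices, which is
        // why shared memory alone does not explain the failing launches.
        return 0;
    }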
@@ -181,6 +181,7 @@ namespace cv { namespace gpu { namespace device
         smem3[tid] = val3;
         __syncthreads();

+#if __CUDA_ARCH__ > 110
         if (tid < 128)
         {
             smem1[tid] = val1 += smem1[tid + 128];
@@ -188,6 +189,7 @@ namespace cv { namespace gpu { namespace device
             smem3[tid] = val3 += smem3[tid + 128];
         }
         __syncthreads();
+#endif

         if (tid < 64)
         {
@@ -235,12 +237,14 @@ namespace cv { namespace gpu { namespace device
         smem2[tid] = val2;
         __syncthreads();

+#if __CUDA_ARCH__ > 110
         if (tid < 128)
         {
             smem1[tid] = val1 += smem1[tid + 128];
             smem2[tid] = val2 += smem2[tid + 128];
         }
         __syncthreads();
+#endif

         if (tid < 64)
         {
@@ -279,11 +283,13 @@ namespace cv { namespace gpu { namespace device
         smem1[tid] = val1;
         __syncthreads();

+#if __CUDA_ARCH__ > 110
         if (tid < 128)
         {
             smem1[tid] = val1 += smem1[tid + 128];
         }
         __syncthreads();
+#endif

         if (tid < 64)
         {
@@ -310,9 +316,15 @@ namespace cv { namespace gpu { namespace device
     __global__ void lkSparse(const PtrStepb I, const PtrStepb J, const PtrStep<short> dIdx, const PtrStep<short> dIdy,
                              const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
     {
+#if __CUDA_ARCH__ <= 110
+        __shared__ float smem1[128];
+        __shared__ float smem2[128];
+        __shared__ float smem3[128];
+#else
         __shared__ float smem1[256];
         __shared__ float smem2[256];
         __shared__ float smem3[256];
+#endif

         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
...
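Note: with the pyrlk launch shrunk to 128 threads on CC 1.1 (see the calcPatchSize change further down), the 256-to-128 folding step of the block reduction would read past the valid shared-memory entries, so it is compiled out and the shared arrays are halved. A self-contained sketch of this arch-guarded reduction pattern (BLOCK and blockSum are illustrative names, not from the patch):

    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ <= 110
        #define BLOCK 128   // CC 1.1 launches use half-size blocks
    #else
        #define BLOCK 256
    #endif

    __global__ void blockSum(const float* in, float* out)
    {
        __shared__ float smem[BLOCK];
        const int tid = threadIdx.x;

        float val = in[blockIdx.x * BLOCK + tid];
        smem[tid] = val;
        __syncthreads();

    #if __CUDA_ARCH__ > 110
        // Only meaningful when the block really has 256 threads; on CC 1.1
        // this fold would read beyond the 128 valid entries.
        if (tid < 128)
            smem[tid] = val += smem[tid + 128];
        __syncthreads();
    #endif

        if (tid < 64)
            smem[tid] = val += smem[tid + 64];
        __syncthreads();

        if (tid < 32)  // final folds stay inside one warp
        {
            volatile float* vsmem = smem;
            vsmem[tid] = val += vsmem[tid + 32];
            vsmem[tid] = val += vsmem[tid + 16];
            vsmem[tid] = val += vsmem[tid + 8];
            vsmem[tid] = val += vsmem[tid + 4];
            vsmem[tid] = val += vsmem[tid + 2];
            vsmem[tid] = val += vsmem[tid + 1];
        }

        if (tid == 0)
            out[blockIdx.x] = smem[0];
    }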
@@ -81,6 +81,8 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
     void boxFilter5Gpu(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream);

+    void boxFilter5Gpu_CC11(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream);
+
     void setGaussianBlurKernel(const float *gKer, int ksizeHalf);

     void gaussianBlurGpu(
@@ -89,6 +91,9 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
     void gaussianBlur5Gpu(
         const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderType, cudaStream_t stream);

+    void gaussianBlur5Gpu_CC11(
+        const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderType, cudaStream_t stream);
+
 }}}} // namespace cv { namespace gpu { namespace device { namespace optflow_farneback
@@ -167,7 +172,10 @@ void cv::gpu::FarnebackOpticalFlow::updateFlow_boxFilter(
         const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
         GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
 {
-    device::optflow_farneback::boxFilter5Gpu(M, blockSize/2, bufM, S(streams[0]));
+    if (!isDeviceArch11_)
+        device::optflow_farneback::boxFilter5Gpu(M, blockSize/2, bufM, S(streams[0]));
+    else
+        device::optflow_farneback::boxFilter5Gpu_CC11(M, blockSize/2, bufM, S(streams[0]));
     swap(M, bufM);

     for (int i = 1; i < 5; ++i)
@@ -183,8 +191,12 @@ void cv::gpu::FarnebackOpticalFlow::updateFlow_gaussianBlur(
         const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
         GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
 {
-    device::optflow_farneback::gaussianBlur5Gpu(
-            M, blockSize/2, bufM, BORDER_REPLICATE_GPU, S(streams[0]));
+    if (!isDeviceArch11_)
+        device::optflow_farneback::gaussianBlur5Gpu(
+                M, blockSize/2, bufM, BORDER_REPLICATE_GPU, S(streams[0]));
+    else
+        device::optflow_farneback::gaussianBlur5Gpu_CC11(
+                M, blockSize/2, bufM, BORDER_REPLICATE_GPU, S(streams[0]));
     swap(M, bufM);

     device::optflow_farneback::updateFlowGpu(M, flowx, flowy, S(streams[0]));
...
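Note: the public FarnebackOpticalFlow interface is untouched; the fallback is selected inside the private updateFlow_* helpers. A usage sketch (input preparation and matrix types are assumptions, not shown in the patch):

    #include <opencv2/gpu/gpu.hpp>

    // Callers are unaffected by the CC 1.1 fallback: construct and invoke as before.
    void computeFlow(const cv::gpu::GpuMat& frame0, const cv::gpu::GpuMat& frame1,
                     cv::gpu::GpuMat& flowx, cv::gpu::GpuMat& flowy)
    {
        cv::gpu::FarnebackOpticalFlow flow;  // constructor caches isDeviceArch11_
        flow(frame0, frame1, flowx, flowy);  // updateFlow_* picks *_CC11 kernels if needed
    }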
@@ -126,18 +126,19 @@ void cv::gpu::PyrLKOpticalFlow::buildImagePyramid(const GpuMat& img0, vector<Gpu
 namespace
 {
-    void calcPatchSize(cv::Size winSize, int cn, dim3& block, dim3& patch)
+    void calcPatchSize(cv::Size winSize, int cn, dim3& block, dim3& patch, bool isDeviceArch11)
     {
         winSize.width *= cn;

         if (winSize.width > 32 && winSize.width > 2 * winSize.height)
         {
-            block.x = 32;
+            block.x = isDeviceArch11 ? 16 : 32;
             block.y = 8;
         }
         else
         {
-            block.x = block.y = 16;
+            block.x = 16;
+            block.y = isDeviceArch11 ? 8 : 16;
         }

         patch.x = (winSize.width + block.x - 1) / block.x;
@@ -166,7 +167,7 @@ void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& next
     const int cn = prevImg.channels();

     dim3 block, patch;
-    calcPatchSize(winSize, cn, block, patch);
+    calcPatchSize(winSize, cn, block, patch, isDeviceArch11_);

     CV_Assert(derivLambda >= 0);
     CV_Assert(maxLevel >= 0 && winSize.width > 2 && winSize.height > 2);
...
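Note: halving one block dimension on CC 1.1 still covers the whole tracking window, because the per-thread patch grows to compensate. A worked example of the new arithmetic (all values illustrative):

    #include <cstdio>

    int main()
    {
        int winW = 21, winH = 21;   // 21x21 window; winSize.width already multiplied by cn (= 1)
        bool isDeviceArch11 = true;

        int blockX, blockY;
        if (winW > 32 && winW > 2 * winH)
        {
            blockX = isDeviceArch11 ? 16 : 32;
            blockY = 8;
        }
        else
        {
            blockX = 16;
            blockY = isDeviceArch11 ? 8 : 16;
        }

        int patchX = (winW + blockX - 1) / blockX;  // ceil(winW / blockX)
        int patchY = (winH + blockY - 1) / blockY;

        std::printf("block %dx%d (%d threads), patch %dx%d per thread\n",
                    blockX, blockY, blockX * blockY, patchX, patchY);
        // Prints: block 16x8 (128 threads), patch 2x3 per thread -- the smaller
        // block still covers the window because each thread handles more pixels.
        return 0;
    }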