Commit 0239c195 authored by Juan María Gómez López, committed by Alexander Alekhin

Merge pull request #11060 from juanecito:2.4

* Thread-safe version of the sparse function in the cv::gpu::PyrLKOpticalFlow
class; the new function is named sparse_multi

* Tests for the thread-safe sparse function in cv::gpu::PyrLKOpticalFlow

* Thread-safe sparse function in the cv::gpu::PyrLKOpticalFlow class:
add Intel TBB conditional compilation
parent a32aec5b
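
A minimal host-side usage sketch (illustrative, not part of the diff; assumes a TBB-enabled build, and the frame0/frame1/pts names are placeholders). Each worker thread uses its own PyrLKOpticalFlow instance and cv::gpu::Stream; sparse_multi internally claims one of five execution lines so the calls can run concurrently, mirroring the perf test below:

// Illustrative only: one worker thread tracking points on its own stream.
// frame0/frame1 are cv::Mat input frames; pts is a 1xN CV_32FC2 cv::Mat of seed points.
cv::gpu::GpuMat d_frame0(frame0), d_frame1(frame1), d_pts(pts);
cv::gpu::GpuMat d_nextPts, d_status;
cv::gpu::PyrLKOpticalFlow pyrLK;   // per-thread instance
cv::gpu::Stream stream;            // per-thread CUDA stream
pyrLK.sparse_multi(d_frame0, d_frame1, d_pts, d_nextPts, d_status, stream);
stream.waitForCompletion();        // d_nextPts/d_status are valid after this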
@@ -60,6 +60,10 @@
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
#if !defined(HAVE_TBB)
#define throw_notbb() CV_Error(CV_StsNotImplemented, "The library is compiled without TBB support")
#endif
namespace cv { namespace gpu {

//////////////////////////////// CudaMem ////////////////////////////////

@@ -1824,6 +1828,14 @@ public:
void sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts,
GpuMat& status, GpuMat* err = 0);
#if !defined(HAVE_TBB)
void sparse_multi(const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&,
GpuMat&, Stream&, GpuMat* = 0) { throw_notbb(); }
#else
void sparse_multi(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts,
GpuMat& status, Stream& stream, GpuMat* err = 0);
#endif
void dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err = 0);
void releaseMemory();
...

@@ -303,6 +303,88 @@ PERF_TEST_P(ImagePair_Gray_NPts_WinSz_Levels_Iters, Video_PyrLKOpticalFlowSparse
}
}
//////////////////////////////////////////////////////
// PyrLKOpticalFlowSparseMulti
#ifdef HAVE_TBB
DEF_PARAM_TEST(ImagePair_Gray_NPts_WinSz_Levels_Iters, pair_string, bool, int, int, int, int);
PERF_TEST_P(ImagePair_Gray_NPts_WinSz_Levels_Iters, Video_PyrLKOpticalFlowSparseMulti,
Combine(Values<pair_string>(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png")),
Bool(),
Values(8000),
Values(21),
Values(1, 3),
Values(1, 30)))
{
declare.time(20.0);
const pair_string imagePair = GET_PARAM(0);
const bool useGray = GET_PARAM(1);
const int points = GET_PARAM(2);
const int winSize = GET_PARAM(3);
const int levels = GET_PARAM(4);
const int iters = GET_PARAM(5);
const cv::Mat frame0 = readImage(imagePair.first, useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
ASSERT_FALSE(frame0.empty());
const cv::Mat frame1 = readImage(imagePair.second, useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
ASSERT_FALSE(frame1.empty());
cv::Mat gray_frame;
if (useGray)
gray_frame = frame0;
else
cv::cvtColor(frame0, gray_frame, cv::COLOR_BGR2GRAY);
cv::Mat pts;
cv::goodFeaturesToTrack(gray_frame, pts, points, 0.01, 0.0);
if (PERF_RUN_GPU())
{
const cv::gpu::GpuMat d_pts(pts.reshape(2, 1));
cv::gpu::PyrLKOpticalFlow d_pyrLK;
d_pyrLK.winSize = cv::Size(winSize, winSize);
d_pyrLK.maxLevel = levels - 1;
d_pyrLK.iters = iters;
const cv::gpu::GpuMat d_frame0(frame0);
const cv::gpu::GpuMat d_frame1(frame1);
cv::gpu::GpuMat nextPts;
cv::gpu::GpuMat status;
cv::gpu::Stream stream;
TEST_CYCLE()
{
d_pyrLK.sparse_multi(d_frame0, d_frame1, d_pts, nextPts, status, stream);
stream.waitForCompletion();
}
GPU_SANITY_CHECK(nextPts);
GPU_SANITY_CHECK(status);
}
else
{
cv::Mat nextPts;
cv::Mat status;
TEST_CYCLE()
{
cv::calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, cv::noArray(),
cv::Size(winSize, winSize), levels - 1,
cv::TermCriteria(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, iters, 0.01));
}
CPU_SANITY_CHECK(nextPts);
CPU_SANITY_CHECK(status);
}
}
#endif // HAVE_TBB
//////////////////////////////////////////////////////
// PyrLKOpticalFlowDense
...

@@ -49,6 +49,8 @@
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/core/core.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;

@@ -60,12 +62,54 @@ namespace pyrlk
__constant__ int c_halfWin_y;
__constant__ int c_iters;
#define CUDA_CONSTANTS(index) \
__constant__ int c_winSize_x##index; \
__constant__ int c_winSize_y##index; \
__constant__ int c_halfWin_x##index; \
__constant__ int c_halfWin_y##index; \
__constant__ int c_iters##index;
CUDA_CONSTANTS(0)
CUDA_CONSTANTS(1)
CUDA_CONSTANTS(2)
CUDA_CONSTANTS(3)
CUDA_CONSTANTS(4)
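// Five replicated sets of the window/iteration constants, one per execution line,
// so concurrent sparse_multi calls do not overwrite each other's parameters.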
template <int index> struct c_multi_winSize_x;
template <int index> struct c_multi_winSize_y;
template <int index> struct c_multi_halfWin_x;
template <int index> struct c_multi_halfWin_y;
template <int index> struct c_multi_iters;
#define CUDA_CONSTANTS_ACCESSOR(index) \
template <> struct c_multi_winSize_x<index> \
{ static __device__ __forceinline__ int get(void){ return c_winSize_x##index;} }; \
template <> struct c_multi_winSize_y<index> \
{ static __device__ __forceinline__ int get(void){ return c_winSize_y##index;} }; \
template <> struct c_multi_halfWin_x<index> \
{ static __device__ __forceinline__ int get(void){ return c_halfWin_x##index;} }; \
template <> struct c_multi_halfWin_y<index> \
{ static __device__ __forceinline__ int get(void){ return c_halfWin_y##index;} }; \
template <> struct c_multi_iters<index> \
{ static __device__ __forceinline__ int get(void){ return c_iters##index;} };
CUDA_CONSTANTS_ACCESSOR(0)
CUDA_CONSTANTS_ACCESSOR(1)
CUDA_CONSTANTS_ACCESSOR(2)
CUDA_CONSTANTS_ACCESSOR(3)
CUDA_CONSTANTS_ACCESSOR(4)
texture<float, cudaTextureType2D, cudaReadModeElementType>
tex_If(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType>
tex_If4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<uchar, cudaTextureType2D, cudaReadModeElementType>
tex_Ib(false, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType>
tex_Jf(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType>
tex_Jf4(false, cudaFilterModeLinear, cudaAddressModeClamp);
template <int cn> struct Tex_I;
template <> struct Tex_I<1>

@@ -99,6 +143,57 @@ namespace pyrlk
}
};
//--------------------------------------------------------------------------
#define CUDA_DECL_TEX_MULTI(texname, type, filtermode) \
texture<type, cudaTextureType2D, cudaReadModeElementType> \
texname##_multi0(false, filtermode, cudaAddressModeClamp); \
texture<type, cudaTextureType2D, cudaReadModeElementType> \
texname##_multi1(false, filtermode, cudaAddressModeClamp); \
texture<type, cudaTextureType2D, cudaReadModeElementType> \
texname##_multi2(false, filtermode, cudaAddressModeClamp); \
texture<type, cudaTextureType2D, cudaReadModeElementType> \
texname##_multi3(false, filtermode, cudaAddressModeClamp); \
texture<type, cudaTextureType2D, cudaReadModeElementType> \
texname##_multi4(false, filtermode, cudaAddressModeClamp);
CUDA_DECL_TEX_MULTI(tex_If1, float, cudaFilterModeLinear)
CUDA_DECL_TEX_MULTI(tex_If4, float4, cudaFilterModeLinear)
CUDA_DECL_TEX_MULTI(tex_Ib1, uchar, cudaFilterModePoint)
CUDA_DECL_TEX_MULTI(tex_Jf1, float, cudaFilterModeLinear)
CUDA_DECL_TEX_MULTI(tex_Jf4, float4, cudaFilterModeLinear)
template <int cn, int index> struct Tex_I_multi;
template <int cn, int index> struct Tex_J_multi;
template <int cn, int index> struct Tex_B_multi;
#define CUDA_DECL_TEX_MULTI_ACCESS(accessorname, texname, cn, returntype) \
template <> struct accessorname##_multi<cn, 0> \
{ static __device__ __forceinline__ returntype read(float x, float y) \
{ return tex2D(texname##cn##_multi0, x, y); } }; \
template <> struct accessorname##_multi<cn, 1> \
{ static __device__ __forceinline__ returntype read(float x, float y) \
{ return tex2D(texname##cn##_multi1, x, y); } }; \
template <> struct accessorname##_multi<cn, 2> \
{ static __device__ __forceinline__ returntype read(float x, float y) \
{ return tex2D(texname##cn##_multi2, x, y); } }; \
template <> struct accessorname##_multi<cn, 3> \
{ static __device__ __forceinline__ returntype read(float x, float y) \
{ return tex2D(texname##cn##_multi3, x, y); } }; \
template <> struct accessorname##_multi<cn, 4> \
{ static __device__ __forceinline__ returntype read(float x, float y) \
{ return tex2D(texname##cn##_multi4, x, y); } };
CUDA_DECL_TEX_MULTI_ACCESS(Tex_I, tex_If, 1, float)
CUDA_DECL_TEX_MULTI_ACCESS(Tex_I, tex_If, 4, float4)
CUDA_DECL_TEX_MULTI_ACCESS(Tex_B, tex_Ib, 1, uchar)
CUDA_DECL_TEX_MULTI_ACCESS(Tex_J, tex_Jf, 1, float)
CUDA_DECL_TEX_MULTI_ACCESS(Tex_J, tex_Jf, 4, float4)
//--------------------------------------------------------------------------
__device__ __forceinline__ void accum(float& dst, float val)
{
dst += val;

@@ -309,6 +404,200 @@ namespace pyrlk
}
}
#if defined(HAVE_TBB)
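// Same algorithm as sparseKernel above; the extra 'index' template parameter selects
// one of the five dedicated constant/texture sets so independent host threads can
// launch this kernel concurrently.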
template <int cn, int index, int PATCH_X, int PATCH_Y, bool calcErr>
__global__ void sparseKernel_multi(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
{
#if __CUDA_ARCH__ <= 110
const int BLOCK_SIZE = 128;
#else
const int BLOCK_SIZE = 256;
#endif
__shared__ float smem1[BLOCK_SIZE];
__shared__ float smem2[BLOCK_SIZE];
__shared__ float smem3[BLOCK_SIZE];
const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
float2 prevPt = prevPts[blockIdx.x];
prevPt.x *= (1.0f / (1 << level));
prevPt.y *= (1.0f / (1 << level));
if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)
{
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
return;
}
prevPt.x -= c_multi_halfWin_x<index>::get();
prevPt.y -= c_multi_halfWin_y<index>::get();
// extract the patch from the first image, compute covariance matrix of derivatives
float A11 = 0;
float A12 = 0;
float A22 = 0;
typedef typename TypeVec<float, cn>::vec_type work_type;
work_type I_patch [PATCH_Y][PATCH_X];
work_type dIdx_patch[PATCH_Y][PATCH_X];
work_type dIdy_patch[PATCH_Y][PATCH_X];
for (int yBase = threadIdx.y, i = 0; yBase < c_multi_winSize_y<index>::get(); yBase += blockDim.y, ++i)
{
for (int xBase = threadIdx.x, j = 0; xBase < c_multi_winSize_x<index>::get(); xBase += blockDim.x, ++j)
{
float x = prevPt.x + xBase + 0.5f;
float y = prevPt.y + yBase + 0.5f;
I_patch[i][j] = Tex_I_multi<cn,index>::read(x, y);
// Scharr derivative
work_type dIdx = 3.0f * Tex_I_multi<cn,index>::read(x+1, y-1) + 10.0f * Tex_I_multi<cn,index>::read(x+1, y) + 3.0f * Tex_I_multi<cn,index>::read(x+1, y+1) -
(3.0f * Tex_I_multi<cn,index>::read(x-1, y-1) + 10.0f * Tex_I_multi<cn,index>::read(x-1, y) + 3.0f * Tex_I_multi<cn,index>::read(x-1, y+1));
work_type dIdy = 3.0f * Tex_I_multi<cn,index>::read(x-1, y+1) + 10.0f * Tex_I_multi<cn,index>::read(x, y+1) + 3.0f * Tex_I_multi<cn,index>::read(x+1, y+1) -
(3.0f * Tex_I_multi<cn,index>::read(x-1, y-1) + 10.0f * Tex_I_multi<cn,index>::read(x, y-1) + 3.0f * Tex_I_multi<cn,index>::read(x+1, y-1));
dIdx_patch[i][j] = dIdx;
dIdy_patch[i][j] = dIdy;
accum(A11, dIdx * dIdx);
accum(A12, dIdx * dIdy);
accum(A22, dIdy * dIdy);
}
}
reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2, smem3), thrust::tie(A11, A12, A22), tid, thrust::make_tuple(plus<float>(), plus<float>(), plus<float>()));
#if __CUDA_ARCH__ >= 300
if (tid == 0)
{
smem1[0] = A11;
smem2[0] = A12;
smem3[0] = A22;
}
#endif
__syncthreads();
A11 = smem1[0];
A12 = smem2[0];
A22 = smem3[0];
float D = A11 * A22 - A12 * A12;
if (abs_(D) < numeric_limits<float>::epsilon())
{
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
return;
}
D = 1.f / D;
A11 *= D;
A12 *= D;
A22 *= D;
float2 nextPt = nextPts[blockIdx.x];
nextPt.x *= 2.f;
nextPt.y *= 2.f;
nextPt.x -= c_multi_halfWin_x<index>::get();
nextPt.y -= c_multi_halfWin_y<index>::get();
for (int k = 0; k < c_multi_iters<index>::get(); ++k)
{
if (nextPt.x < -c_multi_halfWin_x<index>::get() || nextPt.x >= cols || nextPt.y < -c_multi_halfWin_y<index>::get() || nextPt.y >= rows)
{
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
return;
}
float b1 = 0;
float b2 = 0;
for (int y = threadIdx.y, i = 0; y < c_multi_winSize_y<index>::get(); y += blockDim.y, ++i)
{
for (int x = threadIdx.x, j = 0; x < c_multi_winSize_x<index>::get(); x += blockDim.x, ++j)
{
work_type I_val = I_patch[i][j];
work_type J_val = Tex_J_multi<cn,index>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
work_type diff = (J_val - I_val) * 32.0f;
accum(b1, diff * dIdx_patch[i][j]);
accum(b2, diff * dIdy_patch[i][j]);
}
}
reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2), thrust::tie(b1, b2), tid, thrust::make_tuple(plus<float>(), plus<float>()));
#if __CUDA_ARCH__ >= 300
if (tid == 0)
{
smem1[0] = b1;
smem2[0] = b2;
}
#endif
__syncthreads();
b1 = smem1[0];
b2 = smem2[0];
float2 delta;
delta.x = A12 * b2 - A22 * b1;
delta.y = A12 * b1 - A11 * b2;
nextPt.x += delta.x;
nextPt.y += delta.y;
if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f)
break;
}
float errval = 0;
if (calcErr)
{
for (int y = threadIdx.y, i = 0; y < c_multi_winSize_y<index>::get(); y += blockDim.y, ++i)
{
for (int x = threadIdx.x, j = 0; x < c_multi_winSize_x<index>::get(); x += blockDim.x, ++j)
{
work_type I_val = I_patch[i][j];
work_type J_val = Tex_J_multi<cn,index>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
work_type diff = J_val - I_val;
accum(errval, abs_(diff));
}
}
reduce<BLOCK_SIZE>(smem1, errval, tid, plus<float>());
}
if (tid == 0)
{
nextPt.x += c_multi_halfWin_x<index>::get();
nextPt.y += c_multi_halfWin_y<index>::get();
nextPts[blockIdx.x] = nextPt;
if (calcErr)
err[blockIdx.x] = static_cast<float>(errval) / (cn * c_multi_winSize_x<index>::get() * c_multi_winSize_y<index>::get());
}
}
#endif // defined(HAVE_TBB)
template <int cn, int PATCH_X, int PATCH_Y>
void sparse_caller(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream)

@@ -326,6 +615,26 @@ namespace pyrlk
cudaSafeCall( cudaDeviceSynchronize() );
}
#if defined(HAVE_TBB)
template <int cn, int index, int PATCH_X, int PATCH_Y>
void sparse_caller_multi(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream)
{
dim3 grid(ptcount);
if (level == 0 && err)
sparseKernel_multi<cn, index, PATCH_X, PATCH_Y, true><<<grid, block, 0, stream>>>(prevPts, nextPts, status, err, level, rows, cols);
else
sparseKernel_multi<cn, index, PATCH_X, PATCH_Y, false><<<grid, block, 0, stream>>>(prevPts, nextPts, status, err, level, rows, cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
#endif // defined(HAVE_TBB)
template <bool calcErr>
__global__ void denseKernel(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
{

@@ -484,6 +793,30 @@ namespace pyrlk
cudaSafeCall( cudaMemcpyToSymbol(c_iters, &iters, sizeof(int)) );
}
#if defined(HAVE_TBB)
void loadConstants_multi(int2 winSize, int iters, int index, cudaStream_t stream = 0)
{
int2 halfWin;
#define COPY_TO_SYMBOL_CALL(index) \
cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_x##index, &winSize.x, sizeof(int), 0, cudaMemcpyHostToDevice, stream) ); \
cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_y##index, &winSize.y, sizeof(int), 0, cudaMemcpyHostToDevice, stream) ); \
halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2); \
cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_x##index, &halfWin.x, sizeof(int), 0, cudaMemcpyHostToDevice, stream) ); \
cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_y##index, &halfWin.y, sizeof(int), 0, cudaMemcpyHostToDevice, stream) ); \
cudaSafeCall( cudaMemcpyToSymbolAsync(c_iters##index, &iters, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
switch(index)
{
case 0: COPY_TO_SYMBOL_CALL(0) break;
case 1: COPY_TO_SYMBOL_CALL(1) break;
case 2: COPY_TO_SYMBOL_CALL(2) break;
case 3: COPY_TO_SYMBOL_CALL(3) break;
case 4: COPY_TO_SYMBOL_CALL(4) break;
default: CV_Error(CV_StsBadArg, "invalid execution line index"); break;
}
}
#endif // defined(HAVE_TBB)
void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream)
{

@@ -528,6 +861,161 @@ namespace pyrlk
level, block, stream);
}
#if defined(HAVE_TBB)
void sparse1_multi(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream, int index)
{
typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream);
static const func_t funcs[5][5][5] =
{
{ // index 0
{sparse_caller_multi<1, 0, 1, 1>, sparse_caller_multi<1, 0, 2, 1>, sparse_caller_multi<1, 0, 3, 1>, sparse_caller_multi<1, 0, 4, 1>, sparse_caller_multi<1, 0, 5, 1>},
{sparse_caller_multi<1, 0, 1, 2>, sparse_caller_multi<1, 0, 2, 2>, sparse_caller_multi<1, 0, 3, 2>, sparse_caller_multi<1, 0, 4, 2>, sparse_caller_multi<1, 0, 5, 2>},
{sparse_caller_multi<1, 0, 1, 3>, sparse_caller_multi<1, 0, 2, 3>, sparse_caller_multi<1, 0, 3, 3>, sparse_caller_multi<1, 0, 4, 3>, sparse_caller_multi<1, 0, 5, 3>},
{sparse_caller_multi<1, 0, 1, 4>, sparse_caller_multi<1, 0, 2, 4>, sparse_caller_multi<1, 0, 3, 4>, sparse_caller_multi<1, 0, 4, 4>, sparse_caller_multi<1, 0, 5, 4>},
{sparse_caller_multi<1, 0, 1, 5>, sparse_caller_multi<1, 0, 2, 5>, sparse_caller_multi<1, 0, 3, 5>, sparse_caller_multi<1, 0, 4, 5>, sparse_caller_multi<1, 0, 5, 5>}
},
{ // index 1
{sparse_caller_multi<1, 1, 1, 1>, sparse_caller_multi<1, 1, 2, 1>, sparse_caller_multi<1, 1, 3, 1>, sparse_caller_multi<1, 1, 4, 1>, sparse_caller_multi<1, 1, 5, 1>},
{sparse_caller_multi<1, 1, 1, 2>, sparse_caller_multi<1, 1, 2, 2>, sparse_caller_multi<1, 1, 3, 2>, sparse_caller_multi<1, 1, 4, 2>, sparse_caller_multi<1, 1, 5, 2>},
{sparse_caller_multi<1, 1, 1, 3>, sparse_caller_multi<1, 1, 2, 3>, sparse_caller_multi<1, 1, 3, 3>, sparse_caller_multi<1, 1, 4, 3>, sparse_caller_multi<1, 1, 5, 3>},
{sparse_caller_multi<1, 1, 1, 4>, sparse_caller_multi<1, 1, 2, 4>, sparse_caller_multi<1, 1, 3, 4>, sparse_caller_multi<1, 1, 4, 4>, sparse_caller_multi<1, 1, 5, 4>},
{sparse_caller_multi<1, 1, 1, 5>, sparse_caller_multi<1, 1, 2, 5>, sparse_caller_multi<1, 1, 3, 5>, sparse_caller_multi<1, 1, 4, 5>, sparse_caller_multi<1, 1, 5, 5>}
},
{ // index 2
{sparse_caller_multi<1, 2, 1, 1>, sparse_caller_multi<1, 2, 2, 1>, sparse_caller_multi<1, 2, 3, 1>, sparse_caller_multi<1, 2, 4, 1>, sparse_caller_multi<1, 2, 5, 1>},
{sparse_caller_multi<1, 2, 1, 2>, sparse_caller_multi<1, 2, 2, 2>, sparse_caller_multi<1, 2, 3, 2>, sparse_caller_multi<1, 2, 4, 2>, sparse_caller_multi<1, 2, 5, 2>},
{sparse_caller_multi<1, 2, 1, 3>, sparse_caller_multi<1, 2, 2, 3>, sparse_caller_multi<1, 2, 3, 3>, sparse_caller_multi<1, 2, 4, 3>, sparse_caller_multi<1, 2, 5, 3>},
{sparse_caller_multi<1, 2, 1, 4>, sparse_caller_multi<1, 2, 2, 4>, sparse_caller_multi<1, 2, 3, 4>, sparse_caller_multi<1, 2, 4, 4>, sparse_caller_multi<1, 2, 5, 4>},
{sparse_caller_multi<1, 2, 1, 5>, sparse_caller_multi<1, 2, 2, 5>, sparse_caller_multi<1, 2, 3, 5>, sparse_caller_multi<1, 2, 4, 5>, sparse_caller_multi<1, 2, 5, 5>}
},
{ // index 3
{sparse_caller_multi<1, 3, 1, 1>, sparse_caller_multi<1, 3, 2, 1>, sparse_caller_multi<1, 3, 3, 1>, sparse_caller_multi<1, 3, 4, 1>, sparse_caller_multi<1, 3, 5, 1>},
{sparse_caller_multi<1, 3, 1, 2>, sparse_caller_multi<1, 3, 2, 2>, sparse_caller_multi<1, 3, 3, 2>, sparse_caller_multi<1, 3, 4, 2>, sparse_caller_multi<1, 3, 5, 2>},
{sparse_caller_multi<1, 3, 1, 3>, sparse_caller_multi<1, 3, 2, 3>, sparse_caller_multi<1, 3, 3, 3>, sparse_caller_multi<1, 3, 4, 3>, sparse_caller_multi<1, 3, 5, 3>},
{sparse_caller_multi<1, 3, 1, 4>, sparse_caller_multi<1, 3, 2, 4>, sparse_caller_multi<1, 3, 3, 4>, sparse_caller_multi<1, 3, 4, 4>, sparse_caller_multi<1, 3, 5, 4>},
{sparse_caller_multi<1, 3, 1, 5>, sparse_caller_multi<1, 3, 2, 5>, sparse_caller_multi<1, 3, 3, 5>, sparse_caller_multi<1, 3, 4, 5>, sparse_caller_multi<1, 3, 5, 5>}
},
{ // index 4
{sparse_caller_multi<1, 4, 1, 1>, sparse_caller_multi<1, 4, 2, 1>, sparse_caller_multi<1, 4, 3, 1>, sparse_caller_multi<1, 4, 4, 1>, sparse_caller_multi<1, 4, 5, 1>},
{sparse_caller_multi<1, 4, 1, 2>, sparse_caller_multi<1, 4, 2, 2>, sparse_caller_multi<1, 4, 3, 2>, sparse_caller_multi<1, 4, 4, 2>, sparse_caller_multi<1, 4, 5, 2>},
{sparse_caller_multi<1, 4, 1, 3>, sparse_caller_multi<1, 4, 2, 3>, sparse_caller_multi<1, 4, 3, 3>, sparse_caller_multi<1, 4, 4, 3>, sparse_caller_multi<1, 4, 5, 3>},
{sparse_caller_multi<1, 4, 1, 4>, sparse_caller_multi<1, 4, 2, 4>, sparse_caller_multi<1, 4, 3, 4>, sparse_caller_multi<1, 4, 4, 4>, sparse_caller_multi<1, 4, 5, 4>},
{sparse_caller_multi<1, 4, 1, 5>, sparse_caller_multi<1, 4, 2, 5>, sparse_caller_multi<1, 4, 3, 5>, sparse_caller_multi<1, 4, 4, 5>, sparse_caller_multi<1, 4, 5, 5>}
}
};
switch(index)
{
case 0:
bindTexture(&tex_If1_multi0, I);
bindTexture(&tex_Jf1_multi0, J);
break;
case 1:
bindTexture(&tex_If1_multi1, I);
bindTexture(&tex_Jf1_multi1, J);
break;
case 2:
bindTexture(&tex_If1_multi2, I);
bindTexture(&tex_Jf1_multi2, J);
break;
case 3:
bindTexture(&tex_If1_multi3, I);
bindTexture(&tex_Jf1_multi3, J);
break;
case 4:
bindTexture(&tex_If1_multi4, I);
bindTexture(&tex_Jf1_multi4, J);
break;
default:
CV_Error(CV_StsBadArg, "invalid execution line index");
break;
}
funcs[index][patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
level, block, stream);
}
void sparse4_multi(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream, int index)
{
typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream);
static const func_t funcs[5][5][5] =
{
{ // index 0
{sparse_caller_multi<4, 0, 1, 1>, sparse_caller_multi<4, 0, 2, 1>, sparse_caller_multi<4, 0, 3, 1>, sparse_caller_multi<4, 0, 4, 1>, sparse_caller_multi<4, 0, 5, 1>},
{sparse_caller_multi<4, 0, 1, 2>, sparse_caller_multi<4, 0, 2, 2>, sparse_caller_multi<4, 0, 3, 2>, sparse_caller_multi<4, 0, 4, 2>, sparse_caller_multi<4, 0, 5, 2>},
{sparse_caller_multi<4, 0, 1, 3>, sparse_caller_multi<4, 0, 2, 3>, sparse_caller_multi<4, 0, 3, 3>, sparse_caller_multi<4, 0, 4, 3>, sparse_caller_multi<4, 0, 5, 3>},
{sparse_caller_multi<4, 0, 1, 4>, sparse_caller_multi<4, 0, 2, 4>, sparse_caller_multi<4, 0, 3, 4>, sparse_caller_multi<4, 0, 4, 4>, sparse_caller_multi<4, 0, 5, 4>},
{sparse_caller_multi<4, 0, 1, 5>, sparse_caller_multi<4, 0, 2, 5>, sparse_caller_multi<4, 0, 3, 5>, sparse_caller_multi<4, 0, 4, 5>, sparse_caller_multi<4, 0, 5, 5>}
},
{ // index 1
{sparse_caller_multi<4, 1, 1, 1>, sparse_caller_multi<4, 1, 2, 1>, sparse_caller_multi<4, 1, 3, 1>, sparse_caller_multi<4, 1, 4, 1>, sparse_caller_multi<4, 1, 5, 1>},
{sparse_caller_multi<4, 1, 1, 2>, sparse_caller_multi<4, 1, 2, 2>, sparse_caller_multi<4, 1, 3, 2>, sparse_caller_multi<4, 1, 4, 2>, sparse_caller_multi<4, 1, 5, 2>},
{sparse_caller_multi<4, 1, 1, 3>, sparse_caller_multi<4, 1, 2, 3>, sparse_caller_multi<4, 1, 3, 3>, sparse_caller_multi<4, 1, 4, 3>, sparse_caller_multi<4, 1, 5, 3>},
{sparse_caller_multi<4, 1, 1, 4>, sparse_caller_multi<4, 1, 2, 4>, sparse_caller_multi<4, 1, 3, 4>, sparse_caller_multi<4, 1, 4, 4>, sparse_caller_multi<4, 1, 5, 4>},
{sparse_caller_multi<4, 1, 1, 5>, sparse_caller_multi<4, 1, 2, 5>, sparse_caller_multi<4, 1, 3, 5>, sparse_caller_multi<4, 1, 4, 5>, sparse_caller_multi<4, 1, 5, 5>}
},
{ // index 2
{sparse_caller_multi<4, 2, 1, 1>, sparse_caller_multi<4, 2, 2, 1>, sparse_caller_multi<4, 2, 3, 1>, sparse_caller_multi<4, 2, 4, 1>, sparse_caller_multi<4, 2, 5, 1>},
{sparse_caller_multi<4, 2, 1, 2>, sparse_caller_multi<4, 2, 2, 2>, sparse_caller_multi<4, 2, 3, 2>, sparse_caller_multi<4, 2, 4, 2>, sparse_caller_multi<4, 2, 5, 2>},
{sparse_caller_multi<4, 2, 1, 3>, sparse_caller_multi<4, 2, 2, 3>, sparse_caller_multi<4, 2, 3, 3>, sparse_caller_multi<4, 2, 4, 3>, sparse_caller_multi<4, 2, 5, 3>},
{sparse_caller_multi<4, 2, 1, 4>, sparse_caller_multi<4, 2, 2, 4>, sparse_caller_multi<4, 2, 3, 4>, sparse_caller_multi<4, 2, 4, 4>, sparse_caller_multi<4, 2, 5, 4>},
{sparse_caller_multi<4, 2, 1, 5>, sparse_caller_multi<4, 2, 2, 5>, sparse_caller_multi<4, 2, 3, 5>, sparse_caller_multi<4, 2, 4, 5>, sparse_caller_multi<4, 2, 5, 5>}
},
{ // index 3
{sparse_caller_multi<4, 3, 1, 1>, sparse_caller_multi<4, 3, 2, 1>, sparse_caller_multi<4, 3, 3, 1>, sparse_caller_multi<4, 3, 4, 1>, sparse_caller_multi<4, 3, 5, 1>},
{sparse_caller_multi<4, 3, 1, 2>, sparse_caller_multi<4, 3, 2, 2>, sparse_caller_multi<4, 3, 3, 2>, sparse_caller_multi<4, 3, 4, 2>, sparse_caller_multi<4, 3, 5, 2>},
{sparse_caller_multi<4, 3, 1, 3>, sparse_caller_multi<4, 3, 2, 3>, sparse_caller_multi<4, 3, 3, 3>, sparse_caller_multi<4, 3, 4, 3>, sparse_caller_multi<4, 3, 5, 3>},
{sparse_caller_multi<4, 3, 1, 4>, sparse_caller_multi<4, 3, 2, 4>, sparse_caller_multi<4, 3, 3, 4>, sparse_caller_multi<4, 3, 4, 4>, sparse_caller_multi<4, 3, 5, 4>},
{sparse_caller_multi<4, 3, 1, 5>, sparse_caller_multi<4, 3, 2, 5>, sparse_caller_multi<4, 3, 3, 5>, sparse_caller_multi<4, 3, 4, 5>, sparse_caller_multi<4, 3, 5, 5>}
},
{ // index 4
{sparse_caller_multi<4, 4, 1, 1>, sparse_caller_multi<4, 4, 2, 1>, sparse_caller_multi<4, 4, 3, 1>, sparse_caller_multi<4, 4, 4, 1>, sparse_caller_multi<4, 4, 5, 1>},
{sparse_caller_multi<4, 4, 1, 2>, sparse_caller_multi<4, 4, 2, 2>, sparse_caller_multi<4, 4, 3, 2>, sparse_caller_multi<4, 4, 4, 2>, sparse_caller_multi<4, 4, 5, 2>},
{sparse_caller_multi<4, 4, 1, 3>, sparse_caller_multi<4, 4, 2, 3>, sparse_caller_multi<4, 4, 3, 3>, sparse_caller_multi<4, 4, 4, 3>, sparse_caller_multi<4, 4, 5, 3>},
{sparse_caller_multi<4, 4, 1, 4>, sparse_caller_multi<4, 4, 2, 4>, sparse_caller_multi<4, 4, 3, 4>, sparse_caller_multi<4, 4, 4, 4>, sparse_caller_multi<4, 4, 5, 4>},
{sparse_caller_multi<4, 4, 1, 5>, sparse_caller_multi<4, 4, 2, 5>, sparse_caller_multi<4, 4, 3, 5>, sparse_caller_multi<4, 4, 4, 5>, sparse_caller_multi<4, 4, 5, 5>}
}
};
switch(index)
{
case 0:
bindTexture(&tex_If4_multi0, I);
bindTexture(&tex_Jf4_multi0, J);
break;
case 1:
bindTexture(&tex_If4_multi1, I);
bindTexture(&tex_Jf4_multi1, J);
break;
case 2:
bindTexture(&tex_If4_multi2, I);
bindTexture(&tex_Jf4_multi2, J);
break;
case 3:
bindTexture(&tex_If4_multi3, I);
bindTexture(&tex_Jf4_multi3, J);
break;
case 4:
bindTexture(&tex_If4_multi4, I);
bindTexture(&tex_Jf4_multi4, J);
break;
default:
CV_Error(CV_StsBadArg, "invalid execution line index");
break;
}
funcs[index][patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
level, block, stream);
}
#endif // defined(HAVE_TBB)
void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV, PtrStepSzf err, int2 winSize, cudaStream_t stream)
{
dim3 block(16, 16);
...

@@ -42,6 +42,11 @@
#include "precomp.hpp"
#ifdef HAVE_TBB
#include <tbb/compat/condition_variable>
#include <tbb/mutex.h>
#endif
using namespace std;
using namespace cv;
using namespace cv::gpu;

@@ -64,6 +69,22 @@ namespace pyrlk
void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
#if !defined(HAVE_TBB)
void loadConstants_multi(int2, int, int, cudaStream_t) { throw_notbb(); }
void sparse1_multi(PtrStepSzf, PtrStepSzf, const float2*, float2*, uchar*, float*, int,
int, dim3, dim3, cudaStream_t, int) { throw_notbb(); }
void sparse4_multi(PtrStepSz<float4>, PtrStepSz<float4>, const float2*, float2*, uchar*, float*, int,
int, dim3, dim3, cudaStream_t, int) { throw_notbb(); }
#else
void loadConstants_multi(int2 winSize, int iters, int index = 0, cudaStream_t stream = 0);
void sparse1_multi(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream = 0, int index = 0);
void sparse4_multi(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream = 0, int index = 0);
#endif
void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
PtrStepSzf err, int2 winSize, cudaStream_t stream = 0);
}

@@ -98,7 +119,9 @@ namespace
}
}

void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg,
const GpuMat& nextImg, const GpuMat& prevPts,
GpuMat& nextPts, GpuMat& status, GpuMat* err)
{
if (prevPts.empty())
{

@@ -181,6 +204,130 @@ void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& next
}
}
#ifdef HAVE_TBB
//--------------------------------------------------------------------------
// Multi-threading support
static bool index_vector_use[5] = {true, true, true, true, true}; // all free
static tbb::mutex s_PyrLKOpticalFlow_Mutex;
static condition_variable s_PyrLKOpticalFlow_ConditionVariable;
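// Pool of five execution lines: a caller claims a free index under the mutex and
// waits on the condition variable while all five are busy.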
void cv::gpu::PyrLKOpticalFlow::sparse_multi(const GpuMat& prevImg,
const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts,
GpuMat& status, Stream& stream, GpuMat* err)
{
if (prevPts.empty())
{
nextPts.release();
status.release();
if (err) err->release();
return;
}
dim3 block, patch;
calcPatchSize(winSize, block, patch);
CV_Assert(prevImg.channels() == 1 || prevImg.channels() == 3 || prevImg.channels() == 4);
CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
CV_Assert(maxLevel >= 0);
CV_Assert(winSize.width > 2 && winSize.height > 2);
CV_Assert(patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6);
CV_Assert(prevPts.rows == 1 && prevPts.type() == CV_32FC2);
if (useInitialFlow)
CV_Assert(nextPts.size() == prevPts.size() && nextPts.type() == CV_32FC2);
else
ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts);
GpuMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
GpuMat temp2 = nextPts.reshape(1);
multiply(temp1, Scalar::all(1.0 / (1 << maxLevel) / 2.0), temp2);
ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
status.setTo(Scalar::all(1));
if (err)
ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err);
// build the image pyramids.
prevPyr_.resize(maxLevel + 1);
nextPyr_.resize(maxLevel + 1);
int cn = prevImg.channels();
if (cn == 1 || cn == 4)
{
prevImg.convertTo(prevPyr_[0], CV_32F);
nextImg.convertTo(nextPyr_[0], CV_32F);
}
else
{
buf_.resize(1);
cvtColor(prevImg, buf_[0], COLOR_BGR2BGRA);
buf_[0].convertTo(prevPyr_[0], CV_32F);
cvtColor(nextImg, buf_[0], COLOR_BGR2BGRA);
buf_[0].convertTo(nextPyr_[0], CV_32F);
}
for (int level = 1; level <= maxLevel; ++level)
{
pyrDown(prevPyr_[level - 1], prevPyr_[level]);
pyrDown(nextPyr_[level - 1], nextPyr_[level]);
}
//--------------------------------------------------------------------------
// Multithreading support
int index = -1;
do
{
unique_lock<tbb::mutex> ul(s_PyrLKOpticalFlow_Mutex);
for (unsigned int uiI = 0; uiI < 5; ++uiI)
{
if (index_vector_use[uiI])
{
index = uiI;
index_vector_use[uiI] = false;
break;
}
}
if (index < 0)
s_PyrLKOpticalFlow_ConditionVariable.wait(ul);
ul.unlock();
}while (index < 0);
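// This thread now owns execution line 'index'; it is returned to the pool
// (and one waiter woken) after the pyramid sweep below.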
//--------------------------------------------------------------------------
pyrlk::loadConstants_multi(make_int2(winSize.width, winSize.height), iters, index, StreamAccessor::getStream(stream));
for (int level = maxLevel; level >= 0; level--)
{
if (cn == 1)
{
pyrlk::sparse1_multi(prevPyr_[level], nextPyr_[level],
prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(),
level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
level, block, patch, StreamAccessor::getStream(stream), index);
}
else
{
pyrlk::sparse4_multi(prevPyr_[level], nextPyr_[level],
prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(),
level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
level, block, patch, StreamAccessor::getStream(stream), index);
}
}
unique_lock<tbb::mutex> ul(s_PyrLKOpticalFlow_Mutex);
index_vector_use[index] = true;
s_PyrLKOpticalFlow_ConditionVariable.notify_one();
}
#endif
void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err)
{
CV_Assert(prevImg.type() == CV_8UC1);
...

@@ -44,6 +44,10 @@
#ifdef HAVE_CUDA
#ifdef HAVE_TBB
#include <tbb/tbb.h>
#endif
using namespace cvtest;

//////////////////////////////////////////////////////

@@ -322,6 +326,134 @@ GPU_TEST_P(PyrLKOpticalFlow, Sparse)
ASSERT_LE(bad_ratio, 0.01);
}
#ifdef HAVE_TBB
struct Sparse_Multi_Functor
{
explicit Sparse_Multi_Functor(const cv::Mat& in_frame0, const cv::Mat& in_frame1,
const cv::Mat& in_pts_mat,
cv::gpu::GpuMat* in_d_pts,
cv::gpu::GpuMat* in_d_nextPts,
cv::gpu::GpuMat* in_d_status,
cv::gpu::Stream* in_streams):
m_frame0(in_frame0), m_frame1(in_frame1),
m_pts_mat(in_pts_mat),
m_d_pts(in_d_pts), m_d_nextPts(in_d_nextPts),
m_d_status(in_d_status), m_streams(in_streams){}
void operator()( const tbb::blocked_range<size_t>& r ) const
{
for( size_t i = r.begin(); i != r.end(); ++i )
{
m_d_pts[i].upload(m_pts_mat);
cv::gpu::PyrLKOpticalFlow pyrLK;
pyrLK.sparse_multi(loadMat(m_frame0), loadMat(m_frame1), m_d_pts[i],
m_d_nextPts[i], m_d_status[i], m_streams[i]);
m_streams[i].waitForCompletion();
}
}
const cv::Mat& m_frame0;
const cv::Mat& m_frame1;
const cv::Mat& m_pts_mat;
cv::gpu::GpuMat* m_d_pts;
cv::gpu::GpuMat* m_d_nextPts;
cv::gpu::GpuMat* m_d_status;
cv::gpu::Stream* m_streams;
};
GPU_TEST_P(PyrLKOpticalFlow, Sparse_Multi)
{
cv::Mat frame0 = readImage("opticalflow/frame0.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
ASSERT_FALSE(frame0.empty());
cv::Mat frame1 = readImage("opticalflow/frame1.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
ASSERT_FALSE(frame1.empty());
cv::Mat gray_frame;
if (useGray)
gray_frame = frame0;
else
cv::cvtColor(frame0, gray_frame, cv::COLOR_BGR2GRAY);
std::vector<cv::Point2f> pts;
cv::goodFeaturesToTrack(gray_frame, pts, 1000, 0.01, 0.0);
//--------------------------------------------------------------------------
// GPU
const unsigned int NB_EXEC_LINES = 27;
cv::gpu::GpuMat d_pts[NB_EXEC_LINES];
cv::gpu::GpuMat d_nextPts[NB_EXEC_LINES];
cv::gpu::GpuMat d_status[NB_EXEC_LINES];
cv::gpu::Stream streams[NB_EXEC_LINES];
cv::Mat pts_mat(1, (int) pts.size(), CV_32FC2, (void*) &pts[0]);
tbb::parallel_for(tbb::blocked_range<size_t>(0, NB_EXEC_LINES),
Sparse_Multi_Functor(frame0, frame1, pts_mat,
d_pts, d_nextPts, d_status, streams));
std::vector<cv::Point2f> nextPts[NB_EXEC_LINES];
std::vector<unsigned char> status[NB_EXEC_LINES];
for (unsigned int i = 0; i < NB_EXEC_LINES; ++i)
{
nextPts[i].resize(d_nextPts[i].cols);
cv::Mat nextPts_mat(1, d_nextPts[i].cols, CV_32FC2, (void*) &(nextPts[i][0]));
d_nextPts[i].download(nextPts_mat);
status[i].resize(d_status[i].cols);
cv::Mat status_mat(1, d_status[i].cols, CV_8UC1, (void*) &(status[i][0]));
d_status[i].download(status_mat);
}
//--------------------------------------------------------------------------
// CPU
std::vector<cv::Point2f> nextPts_gold;
std::vector<unsigned char> status_gold;
cv::calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts_gold, status_gold, cv::noArray());
//--------------------------------------------------------------------------
// CHECKS
for (unsigned int uiI = 0; uiI < NB_EXEC_LINES; ++uiI)
{
ASSERT_EQ(nextPts_gold.size(), nextPts[uiI].size());
ASSERT_EQ(status_gold.size(), status[uiI].size());
}
size_t mismatch = 0;
for (unsigned int uiI = 0; uiI < NB_EXEC_LINES; ++uiI)
{
for (size_t i = 0; i < nextPts[uiI].size(); ++i)
{
cv::Point2i a = nextPts[uiI][i];
cv::Point2i b = nextPts_gold[i];
if (status[uiI][i] != status_gold[i])
{
++mismatch;
continue;
}
if (status[uiI][i])
{
bool eq = std::abs(a.x - b.x) <= 1 && std::abs(a.y - b.y) <= 1;
if (!eq)
++mismatch;
}
}
}
double bad_ratio = static_cast<double>(mismatch) / (nextPts[0].size() * NB_EXEC_LINES);
ASSERT_LE(bad_ratio, 0.01);
}
#endif // HAVE_TBB
INSTANTIATE_TEST_CASE_P(GPU_Video, PyrLKOpticalFlow, testing::Combine(
ALL_DEVICES,
testing::Values(UseGray(true), UseGray(false))));
...