Commit 66738d74 authored by Dan Moodie's avatar Dan Moodie

Modified sparse PyrLK optical flow to accept a pre-built image pyramid as input, which allows image pyramids to be cached and reused across successive calls.
Added unsigned char support for 1-, 3-, and 4-channel images.
parent 8d79285d
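For reference, a minimal usage sketch of what this change enables (the helper and function names below are illustrative, not part of the patch): with the new pyramid path, the caller can build the GpuMat pyramids once, pass them to calc() as std::vector<cv::cuda::GpuMat>, and reuse a frame's pyramid on the next call instead of rebuilding it; 8-bit 1/3/4-channel inputs no longer need a CV_32F conversion first.

```cpp
// Minimal sketch (assumes this commit is applied; helper names are illustrative).
#include <vector>
#include "opencv2/core/cuda.hpp"
#include "opencv2/cudaoptflow.hpp"
#include "opencv2/cudawarping.hpp"   // cv::cuda::pyrDown

// Build a (maxLevel + 1)-entry pyramid once so it can be reused on later calls.
static void buildPyramid(const cv::cuda::GpuMat& img, std::vector<cv::cuda::GpuMat>& pyr, int maxLevel)
{
    pyr.resize(maxLevel + 1);
    pyr[0] = img;                                   // level 0: original image (e.g. CV_8UC1/3/4)
    for (int level = 1; level <= maxLevel; ++level)
        cv::cuda::pyrDown(pyr[level - 1], pyr[level]);
}

void trackWithCachedPyramids(const cv::cuda::GpuMat& prevImg, const cv::cuda::GpuMat& nextImg,
                             const cv::cuda::GpuMat& prevPts,   // 1 x N, CV_32FC2
                             cv::cuda::GpuMat& nextPts, cv::cuda::GpuMat& status)
{
    const int maxLevel = 3;
    cv::Ptr<cv::cuda::SparsePyrLKOpticalFlow> lk =
        cv::cuda::SparsePyrLKOpticalFlow::create(cv::Size(21, 21), maxLevel);

    std::vector<cv::cuda::GpuMat> prevPyr, nextPyr;
    buildPyramid(prevImg, prevPyr, maxLevel);
    buildPyramid(nextImg, nextPyr, maxLevel);

    // Vectors of GpuMat take the new pyramid path in calc(); on the next frame the
    // caller can hand nextPyr back in as prevPyr instead of rebuilding it.
    lk->calc(prevPyr, nextPyr, prevPts, nextPts, status);
}
```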
@@ -116,10 +116,10 @@ PERF_TEST_P(ImagePair_Gray_NPts_WinSz_Levels_Iters, PyrLKOpticalFlowSparse,
const int levels = GET_PARAM(4);
const int iters = GET_PARAM(5);
cv::Mat frame0 = readImage(imagePair.first, useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
ASSERT_FALSE(frame0.empty());
cv::Mat frame1 = readImage(imagePair.second, useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
ASSERT_FALSE(frame1.empty());
cv::Mat gray_frame;
@@ -131,6 +131,14 @@ PERF_TEST_P(ImagePair_Gray_NPts_WinSz_Levels_Iters, PyrLKOpticalFlowSparse,
cv::Mat pts;
cv::goodFeaturesToTrack(gray_frame, pts, points, 0.01, 0.0);
frame0.convertTo(frame0, CV_32F);
frame1.convertTo(frame1, CV_32F);
if(!useGray)
{
cv::cvtColor(frame0, frame0, cv::COLOR_BGR2BGRA);
cv::cvtColor(frame1, frame1, cv::COLOR_BGR2BGRA);
}
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_pts(pts.reshape(2, 1));
@@ -318,4 +326,4 @@ PERF_TEST_P(ImagePair, OpticalFlowDual_TVL1,
CPU_SANITY_CHECK(flow);
}
}
\ No newline at end of file
@@ -48,6 +48,8 @@
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/filters.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
@@ -60,53 +62,240 @@ namespace pyrlk
__constant__ int c_halfWin_y;
__constant__ int c_iters;
texture<uchar, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_I8U(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_I8UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<ushort4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_I16UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_If(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_If4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_Ib(false, cudaFilterModePoint, cudaAddressModeClamp);
texture<uchar, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_J8U(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_J8UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<ushort4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_J16UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_Jf(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_Jf4(false, cudaFilterModeLinear, cudaAddressModeClamp);
template <int cn, typename T> struct Tex_I
{
static __host__ __forceinline__ void bindTexture_(PtrStepSz<typename TypeVec<T, cn>::vec_type> I)
{
(void)I;
}
};
template <> struct Tex_I<1, uchar>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_I8U, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar>& I)
{
bindTexture(&tex_I8U, I);
}
};
template <> struct Tex_I<1, ushort>
{
static __device__ __forceinline__ float read(float x, float y)
{
return 0.0;
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort>& I)
{
(void)I;
}
};
template <> struct Tex_I<1, int>
{
static __device__ __forceinline__ float read(float x, float y)
{
return 0.0;
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<int>& I)
{
(void)I;
}
};
template <> struct Tex_I<1, float>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_If, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float>& I)
{
bindTexture(&tex_If, I);
}
};
// ****************** 3 channel specializations ************************
template <> struct Tex_I<3, uchar>
{
static __device__ __forceinline__ float3 read(float x, float y)
{
return make_float3(0,0,0);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar3> I)
{
(void)I;
}
};
template <> struct Tex_I<3, ushort>
{
static __device__ __forceinline__ float3 read(float x, float y)
{
return make_float3(0, 0, 0);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort3> I)
{
(void)I;
}
};
template <> struct Tex_I<3, int>
{
static __device__ __forceinline__ float3 read(float x, float y)
{
return make_float3(0, 0, 0);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<int3> I)
{
(void)I;
}
};
template <> struct Tex_I<3, float>
{
static __device__ __forceinline__ float3 read(float x, float y)
{
return make_float3(0, 0, 0);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float3> I)
{
(void)I;
}
};
// ****************** 4 channel specializations ************************
template <> struct Tex_I<4, uchar>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_I8UC4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar4>& I)
{
bindTexture(&tex_I8UC4, I);
}
};
template <> struct Tex_I<4, ushort>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_I16UC4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort4>& I)
{
bindTexture(&tex_I16UC4, I);
}
};
template <> struct Tex_I<4, float>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_If4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float4>& I)
{
bindTexture(&tex_If4, I);
}
};
// ************* J ***************
template <int cn, typename T> struct Tex_J
{
static __host__ __forceinline__ void bindTexture_(PtrStepSz<typename TypeVec<T,cn>::vec_type>& J)
{
(void)J;
}
};
template <> struct Tex_J<1, uchar>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_J8U, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar>& J)
{
bindTexture(&tex_J8U, J);
}
};
template <> struct Tex_J<1, float>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_Jf, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float>& J)
{
bindTexture(&tex_Jf, J);
}
};
// ************* 4 channel specializations ***************
template <> struct Tex_J<4, uchar>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_J8UC4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar4>& J)
{
bindTexture(&tex_J8UC4, J);
}
};
template <> struct Tex_J<4, ushort>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_J16UC4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort4>& J)
{
bindTexture(&tex_J16UC4, J);
}
};
template <> struct Tex_J<4, float>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_Jf4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float4>& J)
{
bindTexture(&tex_Jf4, J);
}
};
__device__ __forceinline__ void accum(float& dst, const float& val)
{
dst += val;
}
__device__ __forceinline__ void accum(float& dst, const float2& val)
{
dst += val.x + val.y;
}
__device__ __forceinline__ void accum(float& dst, const float3& val)
{
dst += val.x + val.y + val.z;
}
__device__ __forceinline__ void accum(float& dst, const float4& val)
{
dst += val.x + val.y + val.z + val.w;
}
__device__ __forceinline__ float abs_(float a)
{
@@ -116,8 +305,46 @@ namespace pyrlk
{
return abs(a);
}
__device__ __forceinline__ float2 abs_(const float2& a)
{
return abs(a);
}
__device__ __forceinline__ float3 abs_(const float3& a)
{
return abs(a);
}
template<typename T> __device__ __forceinline__ typename TypeVec<float, 1>::vec_type ToFloat(const typename TypeVec<T, 1>::vec_type& other)
{
return other;
}
template<typename T> __device__ __forceinline__ typename TypeVec<float, 2>::vec_type ToFloat(const typename TypeVec<T, 2>::vec_type& other)
{
typename TypeVec<float, 2>::vec_type ret;
ret.x = other.x;
ret.y = other.y;
return ret;
}
template<typename T> __device__ __forceinline__ typename TypeVec<float, 3>::vec_type ToFloat(const typename TypeVec<T, 3>::vec_type& other)
{
typename TypeVec<float, 3>::vec_type ret;
ret.x = other.x;
ret.y = other.y;
ret.z = other.z;
return ret;
}
template<typename T> __device__ __forceinline__ typename TypeVec<float, 4>::vec_type ToFloat(const typename TypeVec<T, 4>::vec_type& other)
{
typename TypeVec<float, 4>::vec_type ret;
ret.x = other.x;
ret.y = other.y;
ret.z = other.z;
ret.w = other.w;
return ret;
}
template <int cn, int PATCH_X, int PATCH_Y, bool calcErr, typename T>
__global__ void sparseKernel(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
{
#if __CUDA_ARCH__ <= 110
@@ -166,15 +393,15 @@ namespace pyrlk
float x = prevPt.x + xBase + 0.5f;
float y = prevPt.y + yBase + 0.5f;
I_patch[i][j] = Tex_I<cn, T>::read(x, y);
// Sharr Deriv
work_type dIdx = 3.0f * Tex_I<cn,T>::read(x+1, y-1) + 10.0f * Tex_I<cn, T>::read(x+1, y) + 3.0f * Tex_I<cn,T>::read(x+1, y+1) -
(3.0f * Tex_I<cn,T>::read(x-1, y-1) + 10.0f * Tex_I<cn, T>::read(x-1, y) + 3.0f * Tex_I<cn,T>::read(x-1, y+1));
work_type dIdy = 3.0f * Tex_I<cn,T>::read(x-1, y+1) + 10.0f * Tex_I<cn, T>::read(x, y+1) + 3.0f * Tex_I<cn,T>::read(x+1, y+1) -
(3.0f * Tex_I<cn,T>::read(x-1, y-1) + 10.0f * Tex_I<cn, T>::read(x, y-1) + 3.0f * Tex_I<cn,T>::read(x+1, y-1));
dIdx_patch[i][j] = dIdx;
dIdy_patch[i][j] = dIdy;
@@ -243,7 +470,7 @@ namespace pyrlk
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{
work_type I_val = I_patch[i][j];
work_type J_val = Tex_J<cn, T>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
work_type diff = (J_val - I_val) * 32.0f;
@@ -286,7 +513,7 @@ namespace pyrlk
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{
work_type I_val = I_patch[i][j];
work_type J_val = Tex_J<cn, T>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
work_type diff = J_val - I_val;
@@ -309,22 +536,352 @@ namespace pyrlk
}
}
// Kernel, uses non texture fetches
template <int PATCH_X, int PATCH_Y, bool calcErr, int cn, typename T, typename Ptr2D>
__global__ void sparseKernel_(Ptr2D I, Ptr2D J, const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
{
#if __CUDA_ARCH__ <= 110
const int BLOCK_SIZE = 128;
#else
const int BLOCK_SIZE = 256;
#endif
__shared__ float smem1[BLOCK_SIZE];
__shared__ float smem2[BLOCK_SIZE];
__shared__ float smem3[BLOCK_SIZE];
const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
float2 prevPt = prevPts[blockIdx.x];
prevPt.x *= (1.0f / (1 << level));
prevPt.y *= (1.0f / (1 << level));
if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)
{
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
return;
}
prevPt.x -= c_halfWin_x;
prevPt.y -= c_halfWin_y;
// extract the patch from the first image, compute covariation matrix of derivatives
float A11 = 0;
float A12 = 0;
float A22 = 0;
typedef typename TypeVec<float, cn>::vec_type work_type;
work_type I_patch[PATCH_Y][PATCH_X];
work_type dIdx_patch[PATCH_Y][PATCH_X];
work_type dIdy_patch[PATCH_Y][PATCH_X];
for (int yBase = threadIdx.y, i = 0; yBase < c_winSize_y; yBase += blockDim.y, ++i)
{
for (int xBase = threadIdx.x, j = 0; xBase < c_winSize_x; xBase += blockDim.x, ++j)
{
float x = prevPt.x + xBase + 0.5f;
float y = prevPt.y + yBase + 0.5f;
I_patch[i][j] = ToFloat<T>(I(y, x));
// Sharr Deriv
work_type dIdx = 3.0f * I(y - 1, x + 1) + 10.0f * I(y, x + 1) + 3.0f * I(y + 1, x + 1) -
(3.0f * I(y - 1, x - 1) + 10.0f * I(y, x - 1) + 3.0f * I(y + 1 , x - 1));
work_type dIdy = 3.0f * I(y + 1, x - 1) + 10.0f * I(y + 1, x) + 3.0f * I(y+1, x + 1) -
(3.0f * I(y - 1, x - 1) + 10.0f * I(y-1, x) + 3.0f * I(y - 1, x + 1));
dIdx_patch[i][j] = dIdx;
dIdy_patch[i][j] = dIdy;
accum(A11, dIdx * dIdx);
accum(A12, dIdx * dIdy);
accum(A22, dIdy * dIdy);
}
}
reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2, smem3), thrust::tie(A11, A12, A22), tid, thrust::make_tuple(plus<float>(), plus<float>(), plus<float>()));
#if __CUDA_ARCH__ >= 300
if (tid == 0)
{
smem1[0] = A11;
smem2[0] = A12;
smem3[0] = A22;
}
#endif
__syncthreads();
A11 = smem1[0];
A12 = smem2[0];
A22 = smem3[0];
float D = A11 * A22 - A12 * A12;
if (D < numeric_limits<float>::epsilon())
{
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
return;
}
D = 1.f / D;
A11 *= D;
A12 *= D;
A22 *= D;
float2 nextPt = nextPts[blockIdx.x];
nextPt.x *= 2.f;
nextPt.y *= 2.f;
nextPt.x -= c_halfWin_x;
nextPt.y -= c_halfWin_y;
for (int k = 0; k < c_iters; ++k)
{
if (nextPt.x < -c_halfWin_x || nextPt.x >= cols || nextPt.y < -c_halfWin_y || nextPt.y >= rows)
{
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
return;
}
float b1 = 0;
float b2 = 0;
for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
{
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{
work_type I_val = I_patch[i][j];
work_type J_val = ToFloat<T>(J(nextPt.y + y + 0.5f, nextPt.x + x + 0.5f));
work_type diff = (J_val - I_val) * 32.0f;
accum(b1, diff * dIdx_patch[i][j]);
accum(b2, diff * dIdy_patch[i][j]);
}
}
reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2), thrust::tie(b1, b2), tid, thrust::make_tuple(plus<float>(), plus<float>()));
#if __CUDA_ARCH__ >= 300
if (tid == 0)
{
smem1[0] = b1;
smem2[0] = b2;
}
#endif
__syncthreads();
b1 = smem1[0];
b2 = smem2[0];
float2 delta;
delta.x = A12 * b2 - A22 * b1;
delta.y = A12 * b1 - A11 * b2;
nextPt.x += delta.x;
nextPt.y += delta.y;
if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f)
break;
}
float errval = 0;
if (calcErr)
{
for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
{
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{
work_type I_val = I_patch[i][j];
work_type J_val = ToFloat<T>(J(nextPt.y + y + 0.5f, nextPt.x + x + 0.5f));
work_type diff = J_val - I_val;
accum(errval, abs_(diff));
}
}
reduce<BLOCK_SIZE>(smem1, errval, tid, plus<float>());
}
if (tid == 0)
{
nextPt.x += c_halfWin_x;
nextPt.y += c_halfWin_y;
nextPts[blockIdx.x] = nextPt;
if (calcErr)
err[blockIdx.x] = static_cast<float>(errval) / (3 * c_winSize_x * c_winSize_y);
}
} // __global__ void sparseKernel_
template <int cn, int PATCH_X, int PATCH_Y, typename T> class sparse_caller
{
public:
static void call(PtrStepSz<typename TypeVec<T, cn>::vec_type> I, PtrStepSz<typename TypeVec<T, cn>::vec_type> J, int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream)
{
dim3 grid(ptcount);
(void)I;
(void)J;
if (level == 0 && err)
sparseKernel<cn, PATCH_X, PATCH_Y, true, T> <<<grid, block, 0, stream >>>(prevPts, nextPts, status, err, level, rows, cols);
else
sparseKernel<cn, PATCH_X, PATCH_Y, false, T> <<<grid, block, 0, stream >>>(prevPts, nextPts, status, err, level, rows, cols);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
};
// Specialization to use non texture path because for some reason the texture path keeps failing accuracy tests
template<int PATCH_X, int PATCH_Y> class sparse_caller<1, PATCH_X, PATCH_Y, unsigned short>
{
public:
typedef typename TypeVec<unsigned short, 1>::vec_type work_type;
typedef PtrStepSz<work_type> Ptr2D;
typedef BrdConstant<work_type> BrdType;
typedef BorderReader<Ptr2D, BrdType> Reader;
typedef LinearFilter<Reader> Filter;
static void call(Ptr2D I, Ptr2D J, int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream)
{
dim3 grid(ptcount);
if (level == 0 && err)
{
sparseKernel_<PATCH_X, PATCH_Y, true, 1, unsigned short> <<<grid, block, 0, stream >>>(
Filter(Reader(I, BrdType(rows, cols))),
Filter(Reader(J, BrdType(rows, cols))),
prevPts, nextPts, status, err, level, rows, cols);
}
else
{
sparseKernel_<PATCH_X, PATCH_Y, false, 1, unsigned short> <<<grid, block, 0, stream >>>(
Filter(Reader(I, BrdType(rows, cols))),
Filter(Reader(J, BrdType(rows, cols))),
prevPts, nextPts, status, err, level, rows, cols);
}
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
};
// Specialization for int because the texture path keeps failing
template<int PATCH_X, int PATCH_Y> class sparse_caller<1, PATCH_X, PATCH_Y, int>
{
public:
typedef typename TypeVec<int, 1>::vec_type work_type;
typedef PtrStepSz<work_type> Ptr2D;
typedef BrdConstant<work_type> BrdType;
typedef BorderReader<Ptr2D, BrdType> Reader;
typedef LinearFilter<Reader> Filter;
static void call(Ptr2D I, Ptr2D J, int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream)
{
dim3 grid(ptcount);
if (level == 0 && err)
{
sparseKernel_<PATCH_X, PATCH_Y, true, 1, int> <<<grid, block, 0, stream >>>(
Filter(Reader(I, BrdType(rows, cols))),
Filter(Reader(J, BrdType(rows, cols))),
prevPts, nextPts, status, err, level, rows, cols);
}
else
{
sparseKernel_<PATCH_X, PATCH_Y, false, 1, int> <<<grid, block, 0, stream >>>(
Filter(Reader(I, BrdType(rows, cols))),
Filter(Reader(J, BrdType(rows, cols))),
prevPts, nextPts, status, err, level, rows, cols);
}
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
};
template<int PATCH_X, int PATCH_Y> class sparse_caller<4, PATCH_X, PATCH_Y, int>
{
public:
typedef typename TypeVec<int, 4>::vec_type work_type;
typedef PtrStepSz<work_type> Ptr2D;
typedef BrdConstant<work_type> BrdType;
typedef BorderReader<Ptr2D, BrdType> Reader;
typedef LinearFilter<Reader> Filter;
static void call(Ptr2D I, Ptr2D J, int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream)
{
dim3 grid(ptcount);
if (level == 0 && err)
{
sparseKernel_<PATCH_X, PATCH_Y, true, 4, int> <<<grid, block, 0, stream >>>(
Filter(Reader(I, BrdType(rows, cols))),
Filter(Reader(J, BrdType(rows, cols))),
prevPts, nextPts, status, err, level, rows, cols);
}
else
{
sparseKernel_<PATCH_X, PATCH_Y, false, 4, int> <<<grid, block, 0, stream >>>(
Filter(Reader(I, BrdType(rows, cols))),
Filter(Reader(J, BrdType(rows, cols))),
prevPts, nextPts, status, err, level, rows, cols);
}
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
};
using namespace cv::cuda::device;
template <int PATCH_X, int PATCH_Y, typename T> class sparse_caller<3, PATCH_X, PATCH_Y, T>
{
public:
typedef typename TypeVec<T, 3>::vec_type work_type;
typedef PtrStepSz<work_type> Ptr2D;
typedef BrdConstant<work_type> BrdType;
typedef BorderReader<Ptr2D, BrdType> Reader;
typedef LinearFilter<Reader> Filter;
static void call(Ptr2D I, Ptr2D J, int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream)
{
dim3 grid(ptcount);
if (level == 0 && err)
{
sparseKernel_<PATCH_X, PATCH_Y, true, 3, T> <<<grid, block, 0, stream >>>(
Filter(Reader(I, BrdType(rows, cols))),
Filter(Reader(J, BrdType(rows, cols))),
prevPts, nextPts, status, err, level, rows, cols);
}
else
{
sparseKernel_<PATCH_X, PATCH_Y, false, 3, T> <<<grid, block, 0, stream >>>(
Filter(Reader(I, BrdType(rows, cols))),
Filter(Reader(J, BrdType(rows, cols))),
prevPts, nextPts, status, err, level, rows, cols);
}
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
};
template <bool calcErr>
__global__ void denseKernel(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
@@ -484,77 +1041,72 @@ namespace pyrlk
cudaSafeCall( cudaMemcpyToSymbolAsync(c_iters, &iters, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
}
template<typename T, int cn> struct pyrLK_caller
{
static void sparse(PtrStepSz<typename TypeVec<T, cn>::vec_type> I, PtrStepSz<typename TypeVec<T, cn>::vec_type> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream)
{
typedef void(*func_t)(PtrStepSz<typename TypeVec<T, cn>::vec_type> I, PtrStepSz<typename TypeVec<T, cn>::vec_type> J,
int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream);
static const func_t funcs[5][5] =
{
{ sparse_caller<cn, 1, 1,T>::call, sparse_caller<cn, 2, 1,T>::call, sparse_caller<cn, 3, 1,T>::call, sparse_caller<cn, 4, 1,T>::call, sparse_caller<cn, 5, 1,T>::call },
{ sparse_caller<cn, 1, 2,T>::call, sparse_caller<cn, 2, 2,T>::call, sparse_caller<cn, 3, 2,T>::call, sparse_caller<cn, 4, 2,T>::call, sparse_caller<cn, 5, 2,T>::call },
{ sparse_caller<cn, 1, 3,T>::call, sparse_caller<cn, 2, 3,T>::call, sparse_caller<cn, 3, 3,T>::call, sparse_caller<cn, 4, 3,T>::call, sparse_caller<cn, 5, 3,T>::call },
{ sparse_caller<cn, 1, 4,T>::call, sparse_caller<cn, 2, 4,T>::call, sparse_caller<cn, 3, 4,T>::call, sparse_caller<cn, 4, 4,T>::call, sparse_caller<cn, 5, 4,T>::call },
{ sparse_caller<cn, 1, 5,T>::call, sparse_caller<cn, 2, 5,T>::call, sparse_caller<cn, 3, 5,T>::call, sparse_caller<cn, 4, 5,T>::call, sparse_caller<cn, 5, 5,T>::call }
};
Tex_I<cn, T>::bindTexture_(I);
Tex_J<cn, T>::bindTexture_(J);
funcs[patch.y - 1][patch.x - 1](I, J, I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
level, block, stream);
}
static void dense(PtrStepSzb I, PtrStepSz<T> J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV, PtrStepSzf err, int2 winSize, cudaStream_t stream)
{
dim3 block(16, 16);
dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y));
Tex_I<1, uchar>::bindTexture_(I);
Tex_J<1, T>::bindTexture_(J);
int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
const int patchWidth = block.x + 2 * halfWin.x;
const int patchHeight = block.y + 2 * halfWin.y;
size_t smem_size = 3 * patchWidth * patchHeight * sizeof(int);
if (err.data)
{
denseKernel<true> << <grid, block, smem_size, stream >> >(u, v, prevU, prevV, err, I.rows, I.cols);
cudaSafeCall(cudaGetLastError());
}
else
{
denseKernel<false> << <grid, block, smem_size, stream >> >(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);
cudaSafeCall(cudaGetLastError());
}
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
};
template class pyrLK_caller<unsigned char,1>;
template class pyrLK_caller<unsigned short,1>;
template class pyrLK_caller<int,1>;
template class pyrLK_caller<float,1>;
template class pyrLK_caller<unsigned char, 3>;
template class pyrLK_caller<unsigned short, 3>;
template class pyrLK_caller<int, 3>;
template class pyrLK_caller<float, 3>;
template class pyrLK_caller<unsigned char, 4>;
template class pyrLK_caller<unsigned short, 4>;
template class pyrLK_caller<int, 4>;
template class pyrLK_caller<float, 4>;
}
#endif /* CUDA_DISABLER */
\ No newline at end of file
@@ -52,7 +52,7 @@
#include "opencv2/video.hpp"
#include "opencv2/core/private.cuda.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/opencv_modules.hpp"
#ifdef HAVE_OPENCV_CUDALEGACY
...
@@ -56,14 +56,20 @@ Ptr<DensePyrLKOpticalFlow> cv::cuda::DensePyrLKOpticalFlow::create(Size, int, in
namespace pyrlk
{
void loadConstants(int2 winSize, int iters, cudaStream_t stream);
template<typename T, int cn> struct pyrLK_caller
{
static void sparse(PtrStepSz<typename device::TypeVec<T, cn>::vec_type> I, PtrStepSz<typename device::TypeVec<T, cn>::vec_type> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream);
static void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
PtrStepSzf err, int2 winSize, cudaStream_t stream);
};
template<typename T, int cn> void dispatcher(GpuMat I, GpuMat J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream)
{
pyrLK_caller<T, cn>::sparse(I, J, prevPts, nextPts, status, err, ptcount, level, block, patch, stream);
}
}
namespace
@@ -76,6 +82,9 @@ namespace
void sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts,
GpuMat& status, GpuMat* err, Stream& stream);
void sparse(std::vector<GpuMat>& prevPyr, std::vector<GpuMat>& nextPyr, const GpuMat& prevPts, GpuMat& nextPts,
GpuMat& status, GpuMat* err, Stream& stream);
void dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, Stream& stream);
protected:
@@ -83,8 +92,9 @@ namespace
int maxLevel_;
int iters_;
bool useInitialFlow_;
void buildImagePyramid(const GpuMat& prevImg, std::vector<GpuMat>& prevPyr, const GpuMat& nextImg, std::vector<GpuMat>& nextPyr, Stream stream);
private:
friend class SparsePyrLKOpticalFlowImpl;
std::vector<GpuMat> prevPyr_;
std::vector<GpuMat> nextPyr_;
};
@@ -113,28 +123,34 @@ namespace
block.z = patch.z = 1;
}
void PyrLKOpticalFlowBase::buildImagePyramid(const GpuMat& prevImg, std::vector<GpuMat>& prevPyr, const GpuMat& nextImg, std::vector<GpuMat>& nextPyr, Stream stream)
{
prevPyr.resize(maxLevel_ + 1);
nextPyr.resize(maxLevel_ + 1);
int cn = prevImg.channels();
CV_Assert(cn == 1 || cn == 3 || cn == 4);
prevPyr[0] = prevImg;
nextPyr[0] = nextImg;
for (int level = 1; level <= maxLevel_; ++level)
{
cuda::pyrDown(prevPyr[level - 1], prevPyr[level], stream);
cuda::pyrDown(nextPyr[level - 1], nextPyr[level], stream);
}
}
void PyrLKOpticalFlowBase::sparse(std::vector<GpuMat>& prevPyr, std::vector<GpuMat>& nextPyr, const GpuMat& prevPts, GpuMat& nextPts,
GpuMat& status, GpuMat* err, Stream& stream)
{
CV_Assert(prevPyr.size() && nextPyr.size() && "Pyramid needs to at least contain the original matrix as the first element");
CV_Assert(prevPyr[0].size() == nextPyr[0].size());
CV_Assert(prevPts.rows == 1 && prevPts.type() == CV_32FC2);
CV_Assert(maxLevel_ >= 0);
CV_Assert(winSize_.width > 2 && winSize_.height > 2);
if (useInitialFlow_)
CV_Assert(nextPts.size() == prevPts.size() && nextPts.type() == prevPts.type());
else
ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts);
@@ -142,66 +158,70 @@ namespace
GpuMat temp2 = nextPts.reshape(1);
cuda::multiply(temp1, Scalar::all(1.0 / (1 << maxLevel_) / 2.0), temp2, 1, -1, stream);
ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
status.setTo(Scalar::all(1), stream);
if (err)
ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err);
if (prevPyr.size() != size_t(maxLevel_ + 1) || nextPyr.size() != size_t(maxLevel_ + 1))
{
buildImagePyramid(prevPyr[0], prevPyr, nextPyr[0], nextPyr, stream);
}
dim3 block, patch;
calcPatchSize(winSize_, block, patch);
CV_Assert(patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6);
pyrlk::loadConstants(make_int2(winSize_.width, winSize_.height), iters_, StreamAccessor::getStream(stream));
const int cn = prevPyr[0].channels();
const int type = prevPyr[0].depth();
typedef void(*func_t)(GpuMat I, GpuMat J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream);
// Current int datatype is disabled due to pyrDown not implementing it
// while ushort does work, it has significantly worse performance, and thus doesn't pass accuracy tests.
static const func_t funcs[6][4] =
{
{ pyrlk::dispatcher<uchar, 1> , /*pyrlk::dispatcher<uchar, 2>*/ 0, pyrlk::dispatcher<uchar, 3> , pyrlk::dispatcher<uchar, 4> },
{ /*pyrlk::dispatcher<char, 1>*/ 0, /*pyrlk::dispatcher<char, 2>*/ 0, /*pyrlk::dispatcher<char, 3>*/ 0, /*pyrlk::dispatcher<char, 4>*/ 0 },
{ pyrlk::dispatcher<ushort, 1> , /*pyrlk::dispatcher<ushort, 2>*/0, pyrlk::dispatcher<ushort, 3> , pyrlk::dispatcher<ushort, 4> },
{ /*pyrlk::dispatcher<short, 1>*/ 0, /*pyrlk::dispatcher<short, 2>*/ 0, /*pyrlk::dispatcher<short, 3>*/ 0, /*pyrlk::dispatcher<short, 4>*/0 },
{ pyrlk::dispatcher<int, 1> , /*pyrlk::dispatcher<int, 2>*/ 0, pyrlk::dispatcher<int, 3> , pyrlk::dispatcher<int, 4> },
{ pyrlk::dispatcher<float, 1> , /*pyrlk::dispatcher<float, 2>*/ 0, pyrlk::dispatcher<float, 3> , pyrlk::dispatcher<float, 4> }
};
func_t func = funcs[type][cn-1];
CV_Assert(func != NULL && "Datatype not implemented");
for (int level = maxLevel_; level >= 0; level--)
{
func(prevPyr[level], nextPyr[level],
prevPts.ptr<float2>(), nextPts.ptr<float2>(),
status.ptr(), level == 0 && err ? err->ptr<float>() : 0,
prevPts.cols, level, block, patch,
StreamAccessor::getStream(stream));
}
}
void PyrLKOpticalFlowBase::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err, Stream& stream)
{
if (prevPts.empty())
{
nextPts.release();
status.release();
if (err) err->release();
return;
}
CV_Assert( prevImg.channels() == 1 || prevImg.channels() == 3 || prevImg.channels() == 4 );
CV_Assert( prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type() );
// build the image pyramids.
buildImagePyramid(prevImg, prevPyr_, nextImg, nextPyr_, stream);
sparse(prevPyr_, nextPyr_, prevPts, nextPts, status, err, stream);
}
void PyrLKOpticalFlowBase::dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, Stream& stream)
@@ -250,7 +270,7 @@ namespace
{
int idx2 = (idx + 1) & 1;
pyrlk::pyrLK_caller<float,1>::dense(prevPyr_[level], nextPyr_[level],
uPyr[idx], vPyr[idx], uPyr[idx2], vPyr[idx2],
PtrStepSzf(), winSize2i,
StreamAccessor::getStream(stream));
@@ -289,14 +309,23 @@ namespace
OutputArray _err,
Stream& stream)
{
const GpuMat prevPts = _prevPts.getGpuMat();
GpuMat& nextPts = _nextPts.getGpuMatRef();
GpuMat& status = _status.getGpuMatRef();
GpuMat* err = _err.needed() ? &(_err.getGpuMatRef()) : NULL;
if (_prevImg.kind() == _InputArray::STD_VECTOR_CUDA_GPU_MAT && _nextImg.kind() == _InputArray::STD_VECTOR_CUDA_GPU_MAT)
{
std::vector<GpuMat> prevPyr, nextPyr;
_prevImg.getGpuMatVector(prevPyr);
_nextImg.getGpuMatVector(nextPyr);
sparse(prevPyr, nextPyr, prevPts, nextPts, status, err, stream);
}
else
{
const GpuMat prevImg = _prevImg.getGpuMat();
const GpuMat nextImg = _nextImg.getGpuMat();
sparse(prevImg, nextImg, prevPts, nextPts, status, err, stream);
}
}
};
@@ -347,4 +376,4 @@ Ptr<DensePyrLKOpticalFlow> cv::cuda::DensePyrLKOpticalFlow::create(Size winSize,
return makePtr<DensePyrLKOpticalFlowImpl>(winSize, maxLevel, iters, useInitialFlow);
}
#endif /* !defined (HAVE_CUDA) */
\ No newline at end of file
@@ -167,33 +167,34 @@ INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, BroxOpticalFlow, ALL_DEVICES);
namespace
{
IMPLEMENT_PARAM_CLASS(Chan, int)
IMPLEMENT_PARAM_CLASS(DataType, int)
}
PARAM_TEST_CASE(PyrLKOpticalFlow, cv::cuda::DeviceInfo, Chan, DataType)
{
cv::cuda::DeviceInfo devInfo;
int channels;
int dataType;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
channels = GET_PARAM(1);
dataType = GET_PARAM(2);
cv::cuda::setDevice(devInfo.deviceID());
}
};
CUDA_TEST_P(PyrLKOpticalFlow, Sparse)
{
cv::Mat frame0 = readImage("opticalflow/frame0.png", channels == 1 ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
ASSERT_FALSE(frame0.empty());
cv::Mat frame1 = readImage("opticalflow/frame1.png", channels == 1 ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
ASSERT_FALSE(frame1.empty());
cv::Mat gray_frame;
if (channels == 1)
gray_frame = frame0;
else
cv::cvtColor(frame0, gray_frame, cv::COLOR_BGR2GRAY);
@@ -208,22 +209,32 @@ CUDA_TEST_P(PyrLKOpticalFlow, Sparse)
cv::Ptr<cv::cuda::SparsePyrLKOpticalFlow> pyrLK =
cv::cuda::SparsePyrLKOpticalFlow::create();
std::vector<cv::Point2f> nextPts_gold;
std::vector<unsigned char> status_gold;
cv::calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts_gold, status_gold, cv::noArray());
cv::cuda::GpuMat d_nextPts;
cv::cuda::GpuMat d_status;
cv::Mat converted0, converted1;
if(channels == 4)
{
cv::cvtColor(frame0, frame0, cv::COLOR_BGR2BGRA);
cv::cvtColor(frame1, frame1, cv::COLOR_BGR2BGRA);
}
frame0.convertTo(converted0, dataType);
frame1.convertTo(converted1, dataType);
pyrLK->calc(loadMat(converted0), loadMat(converted1), d_pts, d_nextPts, d_status);
std::vector<cv::Point2f> nextPts(d_nextPts.cols);
cv::Mat nextPts_mat(1, d_nextPts.cols, CV_32FC2, (void*)&nextPts[0]);
d_nextPts.download(nextPts_mat);
std::vector<unsigned char> status(d_status.cols);
cv::Mat status_mat(1, d_status.cols, CV_8UC1, (void*)&status[0]);
d_status.download(status_mat);
ASSERT_EQ(nextPts_gold.size(), nextPts.size());
ASSERT_EQ(status_gold.size(), status.size());
@@ -251,11 +262,16 @@ CUDA_TEST_P(PyrLKOpticalFlow, Sparse)
double bad_ratio = static_cast<double>(mistmatch) / nextPts.size();
ASSERT_LE(bad_ratio, 0.01);
}
INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, PyrLKOpticalFlow, testing::Combine(
ALL_DEVICES,
testing::Values(Chan(1), Chan(3), Chan(4)),
testing::Values(DataType(CV_8U), DataType(CV_16U), DataType(CV_32S), DataType(CV_32F))));
//////////////////////////////////////////////////////
// FarnebackOpticalFlow
@@ -385,4 +401,4 @@ INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, OpticalFlowDual_TVL1, testing::Combine(
ALL_DEVICES,
testing::Values(Gamma(0.0), Gamma(1.0))));
#endif // HAVE_CUDA
\ No newline at end of file
@@ -212,10 +212,10 @@ namespace cv { namespace cuda { namespace device
template void pyrDown_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
@@ -225,4 +225,4 @@ namespace cv { namespace cuda { namespace device
}}} // namespace cv { namespace cuda { namespace cudev
#endif /* CUDA_DISABLER */
\ No newline at end of file
@@ -74,7 +74,7 @@ void cv::cuda::pyrDown(InputArray _src, OutputArray _dst, Stream& stream)
{0 /*pyrDown_gpu<schar>*/, 0 /*pyrDown_gpu<schar2>*/ , 0 /*pyrDown_gpu<schar3>*/, 0 /*pyrDown_gpu<schar4>*/},
{pyrDown_gpu<ushort> , 0 /*pyrDown_gpu<ushort2>*/, pyrDown_gpu<ushort3> , pyrDown_gpu<ushort4> },
{pyrDown_gpu<short> , 0 /*pyrDown_gpu<short2>*/ , pyrDown_gpu<short3> , pyrDown_gpu<short4> },
{pyrDown_gpu<int> , 0 /*pyrDown_gpu<int2>*/ , pyrDown_gpu<int3> , pyrDown_gpu<int4> },
{pyrDown_gpu<float> , 0 /*pyrDown_gpu<float2>*/ , pyrDown_gpu<float3> , pyrDown_gpu<float4> }
};
@@ -131,4 +131,4 @@ void cv::cuda::pyrUp(InputArray _src, OutputArray _dst, Stream& stream)
func(src, dst, StreamAccessor::getStream(stream));
}
#endif
\ No newline at end of file