Commit 1f1e24be authored Nov 12, 2012 by Vladislav Vinogradov
PyrLKOpticalFlow

parent 1b571bde
Showing 2 changed files with 379 additions and 502 deletions (+379 −502)

modules/gpu/src/cuda/pyrlk.cu  +365 −481
modules/gpu/src/pyrlk.cpp      +14 −21
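For context, the two entry points touched here are driven from the host-side cv::gpu::PyrLKOpticalFlow class. A minimal usage sketch of the sparse path, assuming OpenCV 2.4-era gpu module headers and images already uploaded to GpuMat (variable names are illustrative):

#include <opencv2/gpu/gpu.hpp>

// Sketch: sparse pyramidal LK on the GPU. Assumes prevImg/nextImg are
// single-channel 8-bit GpuMats and prevPts is a 1xN CV_32FC2 GpuMat of
// seed points (e.g. from GoodFeaturesToTrackDetector_GPU).
void trackPointsSketch(const cv::gpu::GpuMat& prevImg, const cv::gpu::GpuMat& nextImg,
                       const cv::gpu::GpuMat& prevPts)
{
    cv::gpu::PyrLKOpticalFlow lk;
    lk.winSize  = cv::Size(21, 21); // window, uploaded to c_winSize_x/y by loadConstants
    lk.maxLevel = 3;                // pyramid depth; coarsest level is solved first
    lk.iters    = 30;               // per-level LK iterations (c_iters)

    cv::gpu::GpuMat nextPts, status, err;
    lk.sparse(prevImg, nextImg, prevPts, nextPts, status, &err);
    // nextPts: tracked positions, status: 1 where tracking succeeded,
    // err: average residual per point (filled only when a non-null err is passed)
}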
modules/gpu/src/cuda/pyrlk.cu
...
...
@@ -52,244 +52,187 @@
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/reduce.hpp"
namespace cv { namespace gpu { namespace device
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace
{
namespace pyrlk
{
__constant__ int c_winSize_x;
__constant__ int c_winSize_y;
__constant__ int c_winSize_x;
__constant__ int c_winSize_y;
__constant__ int c_halfWin_x;
__constant__ int c_halfWin_y;
__constant__ int c_iters;
__constant__ int c_halfWin_x;
__constant__ int c_halfWin_y;
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_If(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_If4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_Ib(false, cudaFilterModePoint, cudaAddressModeClamp);
__constant__ int c_iters;
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_Jf(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_Jf4(false, cudaFilterModeLinear, cudaAddressModeClamp);
void loadConstants(int2 winSize, int iters)
template <int cn> struct Tex_I;
template <> struct Tex_I<1>
{
static __device__ __forceinline__ float read(float x, float y)
{
cudaSafeCall( cudaMemcpyToSymbol(c_winSize_x, &winSize.x, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_winSize_y, &winSize.y, sizeof(int)) );
int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_x, &halfWin.x, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_y, &halfWin.y, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_iters, &iters, sizeof(int)) );
return tex2D(tex_If, x, y);
}
__device__ void reduce(float& val1, float& val2, float& val3, float* smem1, float* smem2, float* smem3, int tid)
};
template <> struct Tex_I<4>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
smem1[tid] = val1;
smem2[tid] = val2;
smem3[tid] = val3;
__syncthreads();
return tex2D(tex_If4, x, y);
}
};
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 110)
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
smem2[tid] = val2 += smem2[tid + 128];
smem3[tid] = val3 += smem3[tid + 128];
}
__syncthreads();
#endif
template <int cn> struct Tex_J;
template <> struct Tex_J<1>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_Jf, x, y);
}
};
template <> struct Tex_J<4>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_Jf4, x, y);
}
};
if (tid < 64)
{
smem1[tid] = val1 += smem1[tid + 64];
smem2[tid] = val2 += smem2[tid + 64];
smem3[tid] = val3 += smem3[tid + 64];
}
__syncthreads();
__device__ __forceinline__ void accum(float& dst, float val)
{
dst += val;
}
__device__ __forceinline__ void accum(float& dst, const float4& val)
{
dst += val.x + val.y + val.z;
}
if (tid < 32)
{
volatile float* vmem1 = smem1;
volatile float* vmem2 = smem2;
volatile float* vmem3 = smem3;
__device__ __forceinline__ float abs_(float a)
{
return ::fabsf(a);
}
__device__ __forceinline__ float4 abs_(const float4& a)
{
return abs(a);
}
vmem1[tid] = val1 += vmem1[tid + 32];
vmem2[tid] = val2 += vmem2[tid + 32];
vmem3[tid] = val3 += vmem3[tid + 32];
template <int cn, int PATCH_X, int PATCH_Y, bool calcErr>
__global__ void sparse(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
{
#if __CUDA_ARCH__ <= 110
const int BLOCK_SIZE = 128;
#else
const int BLOCK_SIZE = 256;
#endif
vmem1[tid] = val1 += vmem1[tid + 16];
vmem2[tid] = val2 += vmem2[tid + 16];
vmem3[tid] = val3 += vmem3[tid + 16];
__shared__ float smem1[BLOCK_SIZE];
__shared__ float smem2[BLOCK_SIZE];
__shared__ float smem3[BLOCK_SIZE];
vmem1[tid] = val1 += vmem1[tid + 8];
vmem2[tid] = val2 += vmem2[tid + 8];
vmem3[tid] = val3 += vmem3[tid + 8];
const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
vmem1[tid] = val1 += vmem1[tid + 4];
vmem2[tid] = val2 += vmem2[tid + 4];
vmem3[tid] = val3 += vmem3[tid + 4];
float2 prevPt = prevPts[blockIdx.x];
prevPt.x *= (1.0f / (1 << level));
prevPt.y *= (1.0f / (1 << level));
vmem1[tid] = val1 += vmem1[tid + 2];
vmem2[tid] = val2 += vmem2[tid + 2];
vmem3[tid] = val3 += vmem3[tid + 2];
if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)
{
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
vmem1[tid] = val1 += vmem1[tid + 1];
vmem2[tid] = val2 += vmem2[tid + 1];
vmem3[tid] = val3 += vmem3[tid + 1];
}
return;
}
__device__ void reduce(float& val1, float& val2, float* smem1, float* smem2, int tid)
{
smem1[tid] = val1;
smem2[tid] = val2;
__syncthreads();
prevPt.x -= c_halfWin_x;
prevPt.y -= c_halfWin_y;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 110)
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
smem2[tid] = val2 += smem2[tid + 128];
}
__syncthreads();
#endif
// extract the patch from the first image, compute covariation matrix of derivatives
if (tid < 64)
{
smem1[tid] = val1 += smem1[tid + 64];
smem2[tid] = val2 += smem2[tid + 64];
}
__syncthreads();
float A11 = 0;
float A12 = 0;
float A22 = 0;
typedef typename TypeVec<float, cn>::vec_type work_type;
if (tid < 32)
work_type I_patch [PATCH_Y][PATCH_X];
work_type dIdx_patch[PATCH_Y][PATCH_X];
work_type dIdy_patch[PATCH_Y][PATCH_X];
for (int yBase = threadIdx.y, i = 0; yBase < c_winSize_y; yBase += blockDim.y, ++i)
{
for (int xBase = threadIdx.x, j = 0; xBase < c_winSize_x; xBase += blockDim.x, ++j)
{
volatile float* vmem1 = smem1;
volatile float* vmem2 = smem2;
float x = prevPt.x + xBase + 0.5f;
float y = prevPt.y + yBase + 0.5f;
vmem1[tid] = val1 += vmem1[tid + 32];
vmem2[tid] = val2 += vmem2[tid + 32];
I_patch[i][j] = Tex_I<cn>::read(x, y);
vmem1[tid] = val1 += vmem1[tid + 16];
vmem2[tid] = val2 += vmem2[tid + 16];
// Sharr Deriv
vmem1[tid] = val1 += vmem1[tid + 8];
vmem2[tid] = val2 += vmem2[tid + 8];
work_type dIdx = 3.0f * Tex_I<cn>::read(x+1, y-1) + 10.0f * Tex_I<cn>::read(x+1, y) + 3.0f * Tex_I<cn>::read(x+1, y+1) -
(3.0f * Tex_I<cn>::read(x-1, y-1) + 10.0f * Tex_I<cn>::read(x-1, y) + 3.0f * Tex_I<cn>::read(x-1, y+1));
vmem1[tid] = val1 += vmem1[tid + 4];
vmem2[tid] = val2 += vmem2[tid + 4];
work_type dIdy = 3.0f * Tex_I<cn>::read(x-1, y+1) + 10.0f * Tex_I<cn>::read(x, y+1) + 3.0f * Tex_I<cn>::read(x+1, y+1) -
(3.0f * Tex_I<cn>::read(x-1, y-1) + 10.0f * Tex_I<cn>::read(x, y-1) + 3.0f * Tex_I<cn>::read(x+1, y-1));
vmem1[tid] = val1 += vmem1[tid + 2];
vmem2[tid] = val2 += vmem2[tid + 2];
dIdx_patch[i][j] = dIdx;
dIdy_patch[i][j] = dIdy;
vmem1[tid] = val1 += vmem1[tid + 1];
vmem2[tid] = val2 += vmem2[tid + 1];
accum(A11, dIdx * dIdx);
accum(A12, dIdx * dIdy);
accum(A22, dIdy * dIdy);
}
}
__device__ void reduce(float& val1, float* smem1, int tid)
{
smem1[tid] = val1;
__syncthreads();
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 110)
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
}
__syncthreads();
#endif
reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2, smem3), thrust::tie(A11, A12, A22), tid, thrust::make_tuple(plus<float>(), plus<float>(), plus<float>()));
if (tid < 64)
{
smem1[tid] = val1 += smem1[tid + 64];
}
__syncthreads();
if (tid < 32)
{
volatile float* vmem1 = smem1;
vmem1[tid] = val1 += vmem1[tid + 32];
vmem1[tid] = val1 += vmem1[tid + 16];
vmem1[tid] = val1 += vmem1[tid + 8];
vmem1[tid] = val1 += vmem1[tid + 4];
vmem1[tid] = val1 += vmem1[tid + 2];
vmem1[tid] = val1 += vmem1[tid + 1];
}
#if __CUDA_ARCH__ >= 300
if (tid == 0)
{
smem1[0] = A11;
smem2[0] = A12;
smem3[0] = A22;
}
#endif
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_If(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_If4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_Ib(false, cudaFilterModePoint, cudaAddressModeClamp);
__syncthreads();
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_Jf(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_Jf4(false, cudaFilterModeLinear, cudaAddressModeClamp);
A11 = smem1[0];
A12 = smem2[0];
A22 = smem3[0];
template <int cn> struct Tex_I;
template <> struct Tex_I<1>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_If, x, y);
}
};
template <> struct Tex_I<4>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_If4, x, y);
}
};
float D = A11 * A22 - A12 * A12;
template <int cn> struct Tex_J;
template <> struct Tex_J<1>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_Jf, x, y);
}
};
template <> struct Tex_J<4>
if (D < numeric_limits<float>::epsilon())
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_Jf4, x, y);
}
};
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
__device__ __forceinline__ void accum(float& dst, float val)
{
dst += val;
}
__device__ __forceinline__ void accum(float& dst, const float4& val)
{
dst += val.x + val.y + val.z;
return;
}
__device__ __forceinline__ float abs_(float a)
{
return ::fabs(a);
}
__device__ __forceinline__ float4 abs_(const float4& a)
{
return abs(a);
}
D = 1.f / D;
A11 *= D;
A12 *= D;
A22 *= D;
template <int cn, int PATCH_X, int PATCH_Y, bool calcErr>
__global__ void lkSparse(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
float2 nextPt = nextPts[blockIdx.x];
nextPt.x *= 2.f;
nextPt.y *= 2.f;
nextPt.x -= c_halfWin_x;
nextPt.y -= c_halfWin_y;
for (int k = 0; k < c_iters; ++k)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ <= 110)
__shared__ float smem1[128];
__shared__ float smem2[128];
__shared__ float smem3[128];
#else
__shared__ float smem1[256];
__shared__ float smem2[256];
__shared__ float smem3[256];
#endif
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
float2 prevPt = prevPts[blockIdx.x];
prevPt.x *= (1.0f / (1 << level));
prevPt.y *= (1.0f / (1 << level));
if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)
if (nextPt.x < -c_halfWin_x || nextPt.x >= cols || nextPt.y < -c_halfWin_y || nextPt.y >= rows)
{
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
...
...
@@ -297,388 +240,329 @@ namespace cv { namespace gpu { namespace device
return;
}
prevPt.x -= c_halfWin_x;
prevPt.y -= c_halfWin_y;
// extract the patch from the first image, compute covariation matrix of derivatives
float b1 = 0;
float b2 = 0;
float A11 = 0;
float A12 = 0;
float A22 = 0;
typedef typename TypeVec<float, cn>::vec_type work_type;
work_type I_patch [PATCH_Y][PATCH_X];
work_type dIdx_patch[PATCH_Y][PATCH_X];
work_type dIdy_patch[PATCH_Y][PATCH_X];
for (int yBase = threadIdx.y, i = 0; yBase < c_winSize_y; yBase += blockDim.y, ++i)
for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
{
for (int xBase = threadIdx.x, j = 0; xBase < c_winSize_x; xBase += blockDim.x, ++j)
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{
float x = prevPt.x + xBase + 0.5f;
float y = prevPt.y + yBase + 0.5f;
I_patch[i][j] = Tex_I<cn>::read(x, y);
// Sharr Deriv
work_type dIdx = 3.0f * Tex_I<cn>::read(x+1, y-1) + 10.0f * Tex_I<cn>::read(x+1, y) + 3.0f * Tex_I<cn>::read(x+1, y+1) -
(3.0f * Tex_I<cn>::read(x-1, y-1) + 10.0f * Tex_I<cn>::read(x-1, y) + 3.0f * Tex_I<cn>::read(x-1, y+1));
work_type I_val = I_patch[i][j];
work_type J_val = Tex_J<cn>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
work_type dIdy = 3.0f * Tex_I<cn>::read(x-1, y+1) + 10.0f * Tex_I<cn>::read(x, y+1) + 3.0f * Tex_I<cn>::read(x+1, y+1) -
(3.0f * Tex_I<cn>::read(x-1, y-1) + 10.0f * Tex_I<cn>::read(x, y-1) + 3.0f * Tex_I<cn>::read(x+1, y-1));
work_type diff = (J_val - I_val) * 32.0f;
dIdx_patch[i][j] = dIdx;
dIdy_patch[i][j] = dIdy;
accum(A11, dIdx * dIdx);
accum(A12, dIdx * dIdy);
accum(A22, dIdy * dIdy);
accum(b1, diff * dIdx_patch[i][j]);
accum(b2, diff * dIdy_patch[i][j]);
}
}
reduce(A11, A12, A22, smem1, smem2, smem3, tid);
__syncthreads();
A11 = smem1[0];
A12 = smem2[0];
A22 = smem3[0];
reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2), thrust::tie(b1, b2), tid, thrust::make_tuple(plus<float>(), plus<float>()));
float D = A11 * A22 - A12 * A12;
if (D < numeric_limits<float>::epsilon())
#if __CUDA_ARCH__ >= 300
if (tid == 0)
{
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
return;
smem1[0] = b1;
smem2[0] = b2;
}
#endif
D = 1.f / D;
__syncthreads();
b1 = smem1[0];
b2 = smem2[0];
A11 *= D;
A12 *= D;
A22 *= D;
float2 delta;
delta.x = A12 * b2 - A22 * b1;
delta.y = A12 * b1 - A11 * b2;
float2 nextPt = nextPts[blockIdx.x];
nextPt.x *= 2.f;
nextPt.y *= 2.f;
nextPt.x += delta.x;
nextPt.y += delta.y;
nextPt.x -= c_halfWin_x;
nextPt.y -= c_halfWin_y;
if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f)
break;
}
for (int k = 0; k < c_iters; ++k)
float errval = 0;
if (calcErr)
{
for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
{
if (nextPt.x < -c_halfWin_x || nextPt.x >= cols || nextPt.y < -c_halfWin_y || nextPt.y >= rows)
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
work_type I_val = I_patch[i][j];
work_type J_val = Tex_J<cn>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
work_type diff = J_val - I_val;
return;
accum(errval, abs_(diff));
}
}
float b1 = 0;
float b2 = 0;
reduce<BLOCK_SIZE>(smem1, errval, tid, plus<float>());
}
for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
{
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{
work_type I_val = I_patch[i][j];
work_type J_val = Tex_J<cn>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
if (tid == 0)
{
nextPt.x += c_halfWin_x;
nextPt.y += c_halfWin_y;
work_type diff = (J_val - I_val) * 32.0f;
nextPts[blockIdx.x] = nextPt;
accum(b1, diff * dIdx_patch[i][j]);
accum(b2, diff * dIdy_patch[i][j]);
}
}
reduce(b1, b2, smem1, smem2, tid);
__syncthreads();
if (calcErr)
err[blockIdx.x] = static_cast<float>(errval) / (cn * c_winSize_x * c_winSize_y);
}
}
b1 = smem1[0];
b2 = smem2[0];
template <int cn, int PATCH_X, int PATCH_Y>
void sparse_caller(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream)
{
dim3 grid(ptcount);
float2 delta;
delta.x = A12 * b2 - A22 * b1;
delta.y = A12 * b1 - A11 * b2;
if (level == 0 && err)
sparse<cn, PATCH_X, PATCH_Y, true><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
else
sparse<cn, PATCH_X, PATCH_Y, false><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
nextPt.x += delta.x;
nextPt.y += delta.y;
cudaSafeCall( cudaGetLastError() );
if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f)
break;
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
float errval = 0;
if (calcErr)
{
for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
{
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{
work_type I_val = I_patch[i][j];
work_type J_val = Tex_J<cn>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
template <bool calcErr>
__global__ void dense(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
{
extern __shared__ int smem[];
work_type diff = J_val - I_val;
const int patchWidth = blockDim.x + 2 * c_halfWin_x;
const int patchHeight = blockDim.y + 2 * c_halfWin_y;
accum(errval, abs_(diff));
}
}
int* I_patch = smem;
int* dIdx_patch = I_patch + patchWidth * patchHeight;
int* dIdy_patch = dIdx_patch + patchWidth * patchHeight;
reduce(errval, smem1, tid);
}
const int xBase = blockIdx.x * blockDim.x;
const int yBase = blockIdx.y * blockDim.y;
if (tid == 0)
for (int i = threadIdx.y; i < patchHeight; i += blockDim.y)
{
for (int j = threadIdx.x; j < patchWidth; j += blockDim.x)
{
nextPt.x += c_halfWin_x;
nextPt.y += c_halfWin_y;
float x = xBase - c_halfWin_x + j + 0.5f;
float y = yBase - c_halfWin_y + i + 0.5f;
nextPts[blockIdx.x] = nextPt;
I_patch[i * patchWidth + j] = tex2D(tex_Ib, x, y);
if (calcErr)
err[blockIdx.x] = static_cast<float>(errval) / (cn * c_winSize_x * c_winSize_y);
// Sharr Deriv
dIdx_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x+1, y-1) + 10 * tex2D(tex_Ib, x+1, y) + 3 * tex2D(tex_Ib, x+1, y+1) -
(3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x-1, y) + 3 * tex2D(tex_Ib, x-1, y+1));
dIdy_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x-1, y+1) + 10 * tex2D(tex_Ib, x, y+1) + 3 * tex2D(tex_Ib, x+1, y+1) -
(3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x, y-1) + 3 * tex2D(tex_Ib, x+1, y-1));
}
}
template <int cn, int PATCH_X, int PATCH_Y>
void lkSparse_caller(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream)
{
dim3 grid(ptcount);
__syncthreads();
if (level == 0 && err)
lkSparse<cn, PATCH_X, PATCH_Y, true><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
else
lkSparse<cn, PATCH_X, PATCH_Y, false><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
const int x = xBase + threadIdx.x;
const int y = yBase + threadIdx.y;
cudaSafeCall( cudaGetLastError() );
if (x >= cols || y >= rows)
return;
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
int A11i = 0;
int A12i = 0;
int A22i = 0;
void lkSparse1_gpu(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream)
for (int i = 0; i < c_winSize_y; ++i)
{
typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream);
static const func_t funcs[5][5] =
for (int j = 0; j < c_winSize_x; ++j)
{
{lkSparse_caller<1, 1, 1>, lkSparse_caller<1, 2, 1>, lkSparse_caller<1, 3, 1>, lkSparse_caller<1, 4, 1>, lkSparse_caller<1, 5, 1>},
{lkSparse_caller<1, 1, 2>, lkSparse_caller<1, 2, 2>, lkSparse_caller<1, 3, 2>, lkSparse_caller<1, 4, 2>, lkSparse_caller<1, 5, 2>},
{lkSparse_caller<1, 1, 3>, lkSparse_caller<1, 2, 3>, lkSparse_caller<1, 3, 3>, lkSparse_caller<1, 4, 3>, lkSparse_caller<1, 5, 3>},
{lkSparse_caller<1, 1, 4>, lkSparse_caller<1, 2, 4>, lkSparse_caller<1, 3, 4>, lkSparse_caller<1, 4, 4>, lkSparse_caller<1, 5, 4>},
{lkSparse_caller<1, 1, 5>, lkSparse_caller<1, 2, 5>, lkSparse_caller<1, 3, 5>, lkSparse_caller<1, 4, 5>, lkSparse_caller<1, 5, 5>}
};
bindTexture(&tex_If, I);
bindTexture(&tex_Jf, J);
funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
level, block, stream);
}
int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
void lkSparse4_gpu(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream)
{
typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream);
static const func_t funcs[5][5] =
{
{lkSparse_caller<4, 1, 1>, lkSparse_caller<4, 2, 1>, lkSparse_caller<4, 3, 1>, lkSparse_caller<4, 4, 1>, lkSparse_caller<4, 5, 1>},
{lkSparse_caller<4, 1, 2>, lkSparse_caller<4, 2, 2>, lkSparse_caller<4, 3, 2>, lkSparse_caller<4, 4, 2>, lkSparse_caller<4, 5, 2>},
{lkSparse_caller<4, 1, 3>, lkSparse_caller<4, 2, 3>, lkSparse_caller<4, 3, 3>, lkSparse_caller<4, 4, 3>, lkSparse_caller<4, 5, 3>},
{lkSparse_caller<4, 1, 4>, lkSparse_caller<4, 2, 4>, lkSparse_caller<4, 3, 4>, lkSparse_caller<4, 4, 4>, lkSparse_caller<4, 5, 4>},
{lkSparse_caller<4, 1, 5>, lkSparse_caller<4, 2, 5>, lkSparse_caller<4, 3, 5>, lkSparse_caller<4, 4, 5>, lkSparse_caller<4, 5, 5>}
};
bindTexture(&tex_If4, I);
bindTexture(&tex_Jf4, J);
funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
level, block, stream);
A11i += dIdx * dIdx;
A12i += dIdx * dIdy;
A22i += dIdy * dIdy;
}
}
template <bool calcErr>
__global__ void lkDense(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
{
extern __shared__ int smem[];
const int patchWidth = blockDim.x + 2 * c_halfWin_x;
const int patchHeight = blockDim.y + 2 * c_halfWin_y;
float A11 = A11i;
float A12 = A12i;
float A22 = A22i;
int* I_patch = smem;
int* dIdx_patch = I_patch + patchWidth * patchHeight;
int* dIdy_patch = dIdx_patch + patchWidth * patchHeight;
float D = A11 * A22 - A12 * A12;
const int xBase = blockIdx.x * blockDim.x;
const int yBase = blockIdx.y * blockDim.y;
for (int i = threadIdx.y; i < patchHeight; i += blockDim.y)
{
for (int j = threadIdx.x; j < patchWidth; j += blockDim.x)
{
float x = xBase - c_halfWin_x + j + 0.5f;
float y = yBase - c_halfWin_y + i + 0.5f;
I_patch[i * patchWidth + j] = tex2D(tex_Ib, x, y);
if (D < numeric_limits<float>::epsilon())
{
if (calcErr)
err(y, x) = numeric_limits<float>::max();
// Sharr Deriv
return;
}
dIdx_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x+1, y-1) + 10 * tex2D(tex_Ib, x+1, y) + 3 * tex2D(tex_Ib, x+1, y+1) -
(3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x-1, y) + 3 * tex2D(tex_Ib, x-1, y+1));
D = 1.f / D;
dIdy_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x-1, y+1) + 10 * tex2D(tex_Ib, x, y+1) + 3 * tex2D(tex_Ib, x+1, y+1) -
(3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x, y-1) + 3 * tex2D(tex_Ib, x+1, y-1));
}
}
A11 *= D;
A12 *= D;
A22 *= D;
__syncthreads();
float2 nextPt;
nextPt.x = x + prevU(y/2, x/2) * 2.0f;
nextPt.y = y + prevV(y/2, x/2) * 2.0f;
const int x = xBase + threadIdx.x;
const int y = yBase + threadIdx.y;
for (int k = 0; k < c_iters; ++k)
{
if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows)
{
if (calcErr)
err(y, x) = numeric_limits<float>::max();
if (x >= cols || y >= rows)
return;
}
int A11i = 0;
int A12i = 0;
int A22i = 0;
int b1 = 0;
int b2 = 0;
for (int i = 0; i < c_winSize_y; ++i)
{
for (int j = 0; j < c_winSize_x; ++j)
{
int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
int diff = (J - I) * 32;
int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
A11i += dIdx * dIdx;
A12i += dIdx * dIdy;
A22i += dIdy * dIdy;
b1 += diff * dIdx;
b2 += diff * dIdy;
}
}
float A11 = A11i;
float A12 = A12i;
float A22 = A22i;
float2 delta;
delta.x = A12 * b2 - A22 * b1;
delta.y = A12 * b1 - A11 * b2;
float D = A11 * A22 - A12 * A12;
nextPt.x += delta.x;
nextPt.y += delta.y;
if (D < numeric_limits<float>::epsilon())
{
if (calcErr)
err(y, x) = numeric_limits<float>::max();
return;
}
D = 1.f / D;
if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f)
break;
}
A11 *= D;
A12 *= D;
A22 *= D;
u(y, x) = nextPt.x - x;
v(y, x) = nextPt.y - y;
float2 nextPt;
nextPt.x = x + prevU(y/2, x/2) * 2.0f;
nextPt.y = y + prevV(y/2, x/2) * 2.0f;
if (calcErr)
{
int errval = 0;
for (int k = 0; k < c_iters; ++k)
for (int i = 0; i < c_winSize_y; ++i)
{
if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows)
for (int j = 0; j < c_winSize_x; ++j)
{
if (calcErr)
err(y, x) = numeric_limits<float>::max();
int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
return;
errval += ::abs(J - I);
}
}
int b1 = 0;
int b2 = 0;
for (int i = 0; i < c_winSize_y; ++i)
{
for (int j = 0; j < c_winSize_x; ++j)
{
int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
int diff = (J - I) * 32;
err(y, x) = static_cast<float>(errval) / (c_winSize_x * c_winSize_y);
}
}
}
int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
namespace pyrlk
{
void loadConstants(int2 winSize, int iters)
{
cudaSafeCall( cudaMemcpyToSymbol(c_winSize_x, &winSize.x, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_winSize_y, &winSize.y, sizeof(int)) );
b1 += diff * dIdx;
b2 += diff * dIdy;
}
}
int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_x, &halfWin.x, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_y, &halfWin.y, sizeof(int)) );
float2 delta;
delta.x = A12 * b2 - A22 * b1;
delta.y = A12 * b1 - A11 * b2;
cudaSafeCall( cudaMemcpyToSymbol(c_iters, &iters, sizeof(int)) );
}
nextPt.x += delta.x;
nextPt.y += delta.y;
void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream)
{
typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream);
if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f)
break;
}
static const func_t funcs[5][5] =
{
{::sparse_caller<1, 1, 1>, ::sparse_caller<1, 2, 1>, ::sparse_caller<1, 3, 1>, ::sparse_caller<1, 4, 1>, ::sparse_caller<1, 5, 1>},
{::sparse_caller<1, 1, 2>, ::sparse_caller<1, 2, 2>, ::sparse_caller<1, 3, 2>, ::sparse_caller<1, 4, 2>, ::sparse_caller<1, 5, 2>},
{::sparse_caller<1, 1, 3>, ::sparse_caller<1, 2, 3>, ::sparse_caller<1, 3, 3>, ::sparse_caller<1, 4, 3>, ::sparse_caller<1, 5, 3>},
{::sparse_caller<1, 1, 4>, ::sparse_caller<1, 2, 4>, ::sparse_caller<1, 3, 4>, ::sparse_caller<1, 4, 4>, ::sparse_caller<1, 5, 4>},
{::sparse_caller<1, 1, 5>, ::sparse_caller<1, 2, 5>, ::sparse_caller<1, 3, 5>, ::sparse_caller<1, 4, 5>, ::sparse_caller<1, 5, 5>}
};
u(y, x) = nextPt.x - x;
v(y, x) = nextPt.y - y;
bindTexture(&tex_If, I);
bindTexture(&tex_Jf, J);
if (calcErr)
{
int errval = 0;
funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
level, block, stream);
}
for (int i = 0; i < c_winSize_y; ++i)
{
for (int j = 0; j < c_winSize_x; ++j)
{
int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream)
{
typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream);
errval += ::abs(J - I);
}
}
static const func_t funcs[5][5] =
{
{::sparse_caller<4, 1, 1>, ::sparse_caller<4, 2, 1>, ::sparse_caller<4, 3, 1>, ::sparse_caller<4, 4, 1>, ::sparse_caller<4, 5, 1>},
{::sparse_caller<4, 1, 2>, ::sparse_caller<4, 2, 2>, ::sparse_caller<4, 3, 2>, ::sparse_caller<4, 4, 2>, ::sparse_caller<4, 5, 2>},
{::sparse_caller<4, 1, 3>, ::sparse_caller<4, 2, 3>, ::sparse_caller<4, 3, 3>, ::sparse_caller<4, 4, 3>, ::sparse_caller<4, 5, 3>},
{::sparse_caller<4, 1, 4>, ::sparse_caller<4, 2, 4>, ::sparse_caller<4, 3, 4>, ::sparse_caller<4, 4, 4>, ::sparse_caller<4, 5, 4>},
{::sparse_caller<4, 1, 5>, ::sparse_caller<4, 2, 5>, ::sparse_caller<4, 3, 5>, ::sparse_caller<4, 4, 5>, ::sparse_caller<4, 5, 5>}
};
err(y, x) = static_cast<float>(errval) / (c_winSize_x * c_winSize_y);
}
}
bindTexture(&tex_If4, I);
bindTexture(&tex_Jf4, J);
void lkDense_gpu(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
PtrStepSzf err, int2 winSize, cudaStream_t stream)
{
dim3 block(16, 16);
dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y));
funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
level, block, stream);
}
bindTexture(&tex_Ib, I);
bindTexture(&tex_Jf, J);
void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV, PtrStepSzf err, int2 winSize, cudaStream_t stream)
{
dim3 block(16, 16);
dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y));
int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
const int patchWidth = block.x + 2 * halfWin.x;
const int patchHeight = block.y + 2 * halfWin.y;
size_t smem_size = 3 * patchWidth * patchHeight * sizeof(int);
bindTexture(&tex_Ib, I);
bindTexture(&tex_Jf, J);
if (err.data)
{
lkDense<true><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, err, I.rows, I.cols);
cudaSafeCall( cudaGetLastError() );
}
else
{
lkDense<false><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);
cudaSafeCall( cudaGetLastError() );
}
int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
const int patchWidth = block.x + 2 * halfWin.x;
const int patchHeight = block.y + 2 * halfWin.y;
size_t smem_size = 3 * patchWidth * patchHeight * sizeof(int);
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
if (err.data)
{
::dense<true><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, err, I.rows, I.cols);
cudaSafeCall( cudaGetLastError() );
}
else
{
::dense<false><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
}}
}
#endif /* CUDA_DISABLER */
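The rewritten sparse kernel above drops the hand-rolled shared-memory reductions in favour of the generic block-wide reduce from opencv2/gpu/device/reduce.hpp (reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2, smem3), thrust::tie(A11, A12, A22), tid, ...)). As a rough standalone sketch, without the OpenCV device headers and assuming the launch uses exactly BLOCK_SIZE threads with BLOCK_SIZE a power of two, the same three-accumulator sum pattern looks like this:

// Standalone sketch of a block-wide sum reduction for three accumulators,
// similar in spirit to reduce<BLOCK_SIZE>(smem_tuple(s1, s2, s3), thrust::tie(a, b, c), tid, ...).
// Assumes blockDim.x * blockDim.y == BLOCK_SIZE and BLOCK_SIZE is a power of two.
template <int BLOCK_SIZE>
__device__ void blockReduceSum3(float& a, float& b, float& c,
                                float* s1, float* s2, float* s3, int tid)
{
    s1[tid] = a; s2[tid] = b; s3[tid] = c;
    __syncthreads();

    for (int offset = BLOCK_SIZE / 2; offset > 0; offset /= 2)
    {
        if (tid < offset)
        {
            s1[tid] = a = a + s1[tid + offset];
            s2[tid] = b = b + s2[tid + offset];
            s3[tid] = c = c + s3[tid + offset];
        }
        __syncthreads(); // conservative: synchronize every step instead of warp-level tricks
    }
    // s1[0], s2[0], s3[0] now hold the block-wide sums
}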
modules/gpu/src/pyrlk.cpp
...
...
@@ -55,21 +55,18 @@ void cv::gpu::PyrLKOpticalFlow::releaseMemory() {}
#else
/* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace device
namespace pyrlk
{
namespace pyrlk
{
void loadConstants(int2 winSize, int iters);
void loadConstants(int2 winSize, int iters);
void lkSparse1_gpu(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
void lkSparse4_gpu(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
void lkDense_gpu(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
PtrStepSzf err, int2 winSize, cudaStream_t stream = 0);
}
}}}
void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV, PtrStepSzf err, int2 winSize, cudaStream_t stream = 0);
}
cv::gpu::PyrLKOpticalFlow::PyrLKOpticalFlow()
{
...
...
@@ -104,8 +101,6 @@ namespace
void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err)
{
using namespace cv::gpu::device::pyrlk;
if (prevPts.empty())
{
nextPts.release();
...
...
@@ -166,19 +161,19 @@ void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& next
pyrDown(nextPyr_[level - 1], nextPyr_[level]);
}
loadConstants(make_int2(winSize.width, winSize.height), iters);
pyrlk::loadConstants(make_int2(winSize.width, winSize.height), iters);
for (int level = maxLevel; level >= 0; level--)
{
if (cn == 1)
{
lkSparse1_gpu(prevPyr_[level], nextPyr_[level],
pyrlk::sparse1(prevPyr_[level], nextPyr_[level],
prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0,
prevPts.cols, level, block, patch);
}
else
{
lkSparse4_gpu(prevPyr_[level], nextPyr_[level],
pyrlk::sparse4(prevPyr_[level], nextPyr_[level],
prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0,
prevPts.cols, level, block, patch);
}
...
...
@@ -187,8 +182,6 @@ void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& next
void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err)
{
using namespace cv::gpu::device::pyrlk;
CV_Assert(prevImg.type() == CV_8UC1);
CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
CV_Assert(maxLevel >= 0);
...
...
@@ -219,7 +212,7 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextI
vPyr_[1].setTo(Scalar::all(0));
int2 winSize2i = make_int2(winSize.width, winSize.height);
loadConstants(winSize2i, iters);
pyrlk::loadConstants(winSize2i, iters);
PtrStepSzf derr = err ? *err : PtrStepSzf();
...
...
@@ -229,7 +222,7 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextI
{
int idx2 = (idx + 1) & 1;
lkDense_gpu(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2],
pyrlk::dense(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2],
level == 0 ? derr : PtrStepSzf(), winSize2i);
if (level > 0)
...
...
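For reference, the per-iteration update that both the sparse and dense kernels perform after their reductions is the closed-form solve of the 2x2 Lucas-Kanade normal equations. A scalar host-side sketch of that single step (names mirror the kernel variables; the function name is illustrative):

#include <limits>

// One LK iteration's solve: given the accumulated gradient products (A11, A12, A22)
// and mismatch products (b1, b2), compute the flow increment delta = -A^-1 * b,
// exactly as the kernels do after folding 1/D into A11, A12, A22.
bool lkSolveStepSketch(float A11, float A12, float A22, float b1, float b2, float& dx, float& dy)
{
    float D = A11 * A22 - A12 * A12;              // determinant of the 2x2 system
    if (D < std::numeric_limits<float>::epsilon())
        return false;                             // degenerate patch: tracking fails

    D = 1.f / D;
    A11 *= D; A12 *= D; A22 *= D;                 // fold the inverse determinant in

    dx = A12 * b2 - A22 * b1;                     // delta.x in the kernels
    dy = A12 * b1 - A11 * b2;                     // delta.y in the kernels
    return true;                                  // caller adds (dx, dy) to nextPt and re-iterates
}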