opencv / Commits / 42c7aece

Commit 42c7aece authored Sep 29, 2011 by Anton Obukhov
[+] Added Brox optical flow (implementation courtesy of Michael Smirnov)
parent f838db92
Showing 7 changed files with 3076 additions and 0 deletions (+3076 / -0):

modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu        +1136  -0
modules/gpu/src/nvidia/NCVBroxOpticalFlow.hpp        +103  -0
modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu    +949  -0
modules/gpu/src/nvidia/NPP_staging/NPP_staging.hpp   +249  -0
samples/gpu/opticalflow_nvidia_api.cpp               +639  -0
samples/gpu/rubberwhale1.png                           +0  -0
samples/gpu/rubberwhale2.png                           +0  -0
modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu    0 → 100644
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
////////////////////////////////////////////////////////////////////////////////
//
// NVIDIA CUDA implementation of the Brox et al. optical flow algorithm
//
// The algorithm is described in the original paper:
// T. Brox, A. Bruhn, N. Papenberg, J. Weickert:
// High accuracy optical flow estimation based on a theory for warping.
// ECCV 2004.
//
// Implementation by Mikhail Smirnov
// email: msmirnov@nvidia.com, devsupport@nvidia.com
//
// Credits for help with the code to:
// Alexey Mendelenko, Anton Obukhov, and Alexander Kharlamov.
//
////////////////////////////////////////////////////////////////////////////////
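//
// For reference, the functional minimized here (following the paper) is
//
//   E(u,v) = Int Psi( |I(x+w) - I(x)|^2 + gamma * |grad I(x+w) - grad I(x)|^2 ) dx
//          + alpha * Int Psi( |grad u|^2 + |grad v|^2 ) dx
//
// with w = (u,v) and the robust penalty Psi(s^2) = sqrt(s^2 + eps^2).
// The outer (warping) iterations linearise around the current flow, the inner
// iterations freeze the nonlinearity Psi', and the remaining linear system is
// solved by red-black SOR.
//
////////////////////////////////////////////////////////////////////////////////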
#include <iostream>
#include <vector>
#include <memory>
#include "NPP_staging\NPP_staging.hpp"
#include "NCVBroxOpticalFlow.hpp"
using std::tr1::shared_ptr;
typedef NCVVectorAlloc<Ncv32f> FloatVector;
/////////////////////////////////////////////////////////////////////////////////////////
// Implementation specific constants
/////////////////////////////////////////////////////////////////////////////////////////
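// eps2 is the epsilon^2 term of the robust penalty Psi(s^2) = sqrt(s^2 + eps2);
// it keeps the diffusivity and data-term weights finite where gradients vanish.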
__device__ const float eps2 = 1e-6f;
/////////////////////////////////////////////////////////////////////////////////////////
// Additional defines
/////////////////////////////////////////////////////////////////////////////////////////
// rounded up division
inline int iDivUp(int a, int b)
{
return (a + b - 1)/b;
}
/////////////////////////////////////////////////////////////////////////////////////////
// Texture references
/////////////////////////////////////////////////////////////////////////////////////////
texture<float, 2, cudaReadModeElementType> tex_coarse;
texture<float, 2, cudaReadModeElementType> tex_fine;
texture<float, 2, cudaReadModeElementType> tex_I1;
texture<float, 2, cudaReadModeElementType> tex_I0;
texture<float, 2, cudaReadModeElementType> tex_Ix;
texture<float, 2, cudaReadModeElementType> tex_Ixx;
texture<float, 2, cudaReadModeElementType> tex_Ix0;
texture<float, 2, cudaReadModeElementType> tex_Iy;
texture<float, 2, cudaReadModeElementType> tex_Iyy;
texture<float, 2, cudaReadModeElementType> tex_Iy0;
texture<float, 2, cudaReadModeElementType> tex_Ixy;
texture<float, 1, cudaReadModeElementType> tex_u;
texture<float, 1, cudaReadModeElementType> tex_v;
texture<float, 1, cudaReadModeElementType> tex_du;
texture<float, 1, cudaReadModeElementType> tex_dv;
texture<float, 1, cudaReadModeElementType> tex_numerator_dudv;
texture<float, 1, cudaReadModeElementType> tex_numerator_u;
texture<float, 1, cudaReadModeElementType> tex_numerator_v;
texture<float, 1, cudaReadModeElementType> tex_inv_denominator_u;
texture<float, 1, cudaReadModeElementType> tex_inv_denominator_v;
texture<float, 1, cudaReadModeElementType> tex_diffusivity_x;
texture<float, 1, cudaReadModeElementType> tex_diffusivity_y;
/////////////////////////////////////////////////////////////////////////////////////////
// SUPPLEMENTARY FUNCTIONS
/////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/// \brief performs pointwise summation of two vectors stored in device memory
/// \param d_res - pointer to resulting vector (device memory)
/// \param d_op1 - term #1 (device memory)
/// \param d_op2 - term #2 (device memory)
/// \param len - vector size
///////////////////////////////////////////////////////////////////////////////
__global__ void pointwise_add(float *d_res, const float *d_op1, const float *d_op2, const int len)
{
const int pos = blockIdx.x*blockDim.x + threadIdx.x;
if(pos >= len) return;
d_res[pos] = d_op1[pos] + d_op2[pos];
}
///////////////////////////////////////////////////////////////////////////////
/// \brief wrapper for summation kernel.
/// Computes \b op1 + \b op2 and stores result to \b res
/// \param res array, containing op1 + op2 (device memory)
/// \param op1 term #1 (device memory)
/// \param op2 term #2 (device memory)
/// \param count vector size
///////////////////////////////////////////////////////////////////////////////
static void add(float *res, const float *op1, const float *op2, const int count, cudaStream_t stream)
{
dim3 threads(256);
dim3 blocks(iDivUp(count, threads.x));
pointwise_add<<<blocks, threads, 0, stream>>>(res, op1, op2, count);
}
///////////////////////////////////////////////////////////////////////////////
/// \brief wrapper for summation kernel.
/// Increments \b res by \b rhs
/// \param res initial vector, will be replaced with result (device memory)
/// \param rhs increment (device memory)
/// \param count vector size
///////////////////////////////////////////////////////////////////////////////
static void add(float *res, const float *rhs, const int count, cudaStream_t stream)
{
add(res, res, rhs, count, stream);
}
///////////////////////////////////////////////////////////////////////////////
/// \brief kernel for scaling vector by scalar
/// \param d_res scaled vector (device memory)
/// \param d_src source vector (device memory)
/// \param scale scalar to scale by
/// \param len vector size (number of elements)
///////////////////////////////////////////////////////////////////////////////
__global__ void scaleVector(float *d_res, const float *d_src, float scale, const int len)
{
const int pos = blockIdx.x * blockDim.x + threadIdx.x;
if (pos >= len) return;
d_res[pos] = d_src[pos] * scale;
}
///////////////////////////////////////////////////////////////////////////////
/// \brief scale vector by scalar
///
/// kernel wrapper
/// \param d_res scaled vector (device memory)
/// \param d_src source vector (device memory)
/// \param scale scalar to scale by
/// \param len vector size (number of elements)
/// \param stream CUDA stream
///////////////////////////////////////////////////////////////////////////////
static void ScaleVector(float *d_res, const float *d_src, float scale, const int len, cudaStream_t stream)
{
dim3 threads(256);
dim3 blocks(iDivUp(len, threads.x));
scaleVector<<<blocks, threads, 0, stream>>>(d_res, d_src, scale, len);
}
const int SOR_TILE_WIDTH = 32;
const int SOR_TILE_HEIGHT = 6;
const int PSOR_TILE_WIDTH = 32;
const int PSOR_TILE_HEIGHT = 6;
const int PSOR_PITCH = PSOR_TILE_WIDTH + 4;
const int PSOR_HEIGHT = PSOR_TILE_HEIGHT + 4;
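// Each PSOR tile carries a two-cell apron on every side (hence the "+4" in
// PSOR_PITCH and PSOR_HEIGHT); load_array below fills these "shadow" cells
// with mirror-reflected out-of-range values so the diffusivity stencils can
// read past the tile edge without branching.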
///////////////////////////////////////////////////////////////////////////////
///\brief Utility function. Compute smooth term diffusivity along x axis
///\param s (out) pointer to memory location for result (diffusivity)
///\param pos (in) position within shared memory array containing \b u
///\param u (in) shared memory array containing \b u
///\param v (in) shared memory array containing \b v
///\param du (in) shared memory array containing \b du
///\param dv (in) shared memory array containing \b dv
///////////////////////////////////////////////////////////////////////////////
__forceinline__ __device__ void diffusivity_along_x(float *s, int pos, const float *u, const float *v, const float *du, const float *dv)
{
//x derivative between pixels (i,j) and (i-1,j)
const int left = pos-1;
float u_x = u[pos] + du[pos] - u[left] - du[left];
float v_x = v[pos] + dv[pos] - v[left] - dv[left];
const int up = pos + PSOR_PITCH;
const int down = pos - PSOR_PITCH;
const int up_left = up - 1;
const int down_left = down-1;
//y derivative between pixels (i,j) and (i-1,j)
float u_y = 0.25f*(u[up] + du[up] + u[up_left] + du[up_left] - u[down] - du[down] - u[down_left] - du[down_left]);
float v_y = 0.25f*(v[up] + dv[up] + v[up_left] + dv[up_left] - v[down] - dv[down] - v[down_left] - dv[down_left]);
*s = 0.5f / sqrtf(u_x*u_x + v_x*v_x + u_y*u_y + v_y*v_y + eps2);
}
///////////////////////////////////////////////////////////////////////////////
///\brief Utility function. Compute smooth term diffusivity along y axis
///\param s (out) pointer to memory location for result (diffusivity)
///\param pos (in) position within shared memory array containing \b u
///\param u (in) shared memory array containing \b u
///\param v (in) shared memory array containing \b v
///\param du (in) shared memory array containing \b du
///\param dv (in) shared memory array containing \b dv
///////////////////////////////////////////////////////////////////////////////
__forceinline__ __device__ void diffusivity_along_y(float *s, int pos, const float *u, const float *v, const float *du, const float *dv)
{
//y derivative between pixels (i,j) and (i,j-1)
const int down = pos-PSOR_PITCH;
float u_y = u[pos] + du[pos] - u[down] - du[down];
float v_y = v[pos] + dv[pos] - v[down] - dv[down];
const int right = pos + 1;
const int left = pos - 1;
const int down_right = down + 1;
const int down_left = down - 1;
//x derivative between pixels (i,j) and (i,j-1);
float u_x = 0.25f*(u[right] + u[down_right] + du[right] + du[down_right] - u[left] - u[down_left] - du[left] - du[down_left]);
float v_x = 0.25f*(v[right] + v[down_right] + dv[right] + dv[down_right] - v[left] - v[down_left] - dv[left] - dv[down_left]);
*s = 0.5f/sqrtf(u_x*u_x + v_x*v_x + u_y*u_y + v_y*v_y + eps2);
}
///////////////////////////////////////////////////////////////////////////////
///\brief Utility function. Load an element of a 2D global memory array into shared memory
///\param smem pointer to shared memory array
///\param is shared memory array column
///\param js shared memory array row
///\param i global memory array column (mirror-reflected if out of range)
///\param j global memory array row (mirror-reflected if out of range)
///\param w number of columns in global memory array
///\param h number of rows in global memory array
///\param p global memory array pitch in floats
///////////////////////////////////////////////////////////////////////////////
template<int tex_id>
__forceinline__ __device__ void load_array_element(float *smem, int is, int js, int i, int j, int w, int h, int p)
{
//position within shared memory array
const int ijs = js * PSOR_PITCH + is;
//mirror reflection across borders
i = max(i, -i-1);
i = min(i, w-i+w-1);
j = max(j, -j-1);
j = min(j, h-j+h-1);
const int pos = j * p + i;
switch(tex_id){
case 0:
smem[ijs] = tex1Dfetch(tex_u, pos);
break;
case 1:
smem[ijs] = tex1Dfetch(tex_v, pos);
break;
case 2:
smem[ijs] = tex1Dfetch(tex_du, pos);
break;
case 3:
smem[ijs] = tex1Dfetch(tex_dv, pos);
break;
}
}
///////////////////////////////////////////////////////////////////////////////
///\brief Utility function. Load part (tile) of 2D global memory to shared memory
///\param smem pointer to target shared memory array
///\param ig column number within source
///\param jg row number within source
///\param w number of columns in global memory array
///\param h number of rows in global memory array
///\param p global memory array pitch in floats
///////////////////////////////////////////////////////////////////////////////
template<int tex>
__forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, int h, int p)
{
const int i = threadIdx.x + 2;
const int j = threadIdx.y + 2;
load_array_element<tex>(smem, i, j, ig, jg, w, h, p);//load current pixel
__syncthreads();
if(threadIdx.y < 2)
{
//load bottom shadow elements
load_array_element<tex>(smem, i, j-2, ig, jg-2, w, h, p);
if(threadIdx.x < 2)
{
//load bottom right shadow elements
load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j-2, ig+PSOR_TILE_WIDTH, jg-2, w, h, p);
//load middle right shadow elements
load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
}
else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
{
//load bottom left shadow elements
load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j-2, ig-PSOR_TILE_WIDTH, jg-2, w, h, p);
//load middle left shadow elements
load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
}
}
else if(threadIdx.y >= PSOR_TILE_HEIGHT-2)
{
//load upper shadow elements
load_array_element<tex>(smem, i, j+2, ig, jg+2, w, h, p);
if(threadIdx.x < 2)
{
//load upper right shadow elements
load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j+2, ig+PSOR_TILE_WIDTH, jg+2, w, h, p);
//load middle right shadow elements
load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
}
else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
{
//load upper left shadow elements
load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j+2, ig-PSOR_TILE_WIDTH, jg+2, w, h, p);
//load middle left shadow elements
load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
}
}
else
{
//load middle shadow elements
if(threadIdx.x < 2)
{
//load middle right shadow elements
load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
}
else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
{
//load middle left shadow elements
load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
}
}
__syncthreads();
}
///////////////////////////////////////////////////////////////////////////////
/// \brief computes matrix of linearised system for \c du, \c dv
/// Computed values reside in GPU memory. \n
/// Matrix computation is divided into two steps. This kernel performs the first step:\n
/// - compute smoothness term diffusivity between pixels - psi dash smooth
/// - compute robustness factor in the data term - psi dash data
/// \param diffusivity_x (in/out) diffusivity between pixels along x axis in smoothness term
/// \param diffusivity_y (in/out) diffusivity between pixels along y axis in smoothness term
/// \param denominator_u (in/out) precomputed part of expression for new du value in SOR iteration
/// \param denominator_v (in/out) precomputed part of expression for new dv value in SOR iteration
/// \param numerator_dudv (in/out) precomputed part of expression for new du and dv value in SOR iteration
/// \param numerator_u (in/out) precomputed part of expression for new du value in SOR iteration
/// \param numerator_v (in/out) precomputed part of expression for new dv value in SOR iteration
/// \param w (in) frame width
/// \param h (in) frame height
/// \param s (in) pitch in floats
/// \param alpha (in) alpha in Brox model (flow smoothness)
/// \param gamma (in) gamma in Brox model (edge importance)
///////////////////////////////////////////////////////////////////////////////
__global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity_y,
float *denominator_u, float *denominator_v,
float *numerator_dudv,
float *numerator_u, float *numerator_v,
int w, int h, int s,
float alpha, float gamma)
{
__shared__ float u[PSOR_PITCH * PSOR_HEIGHT];
__shared__ float v[PSOR_PITCH * PSOR_HEIGHT];
__shared__ float du[PSOR_PITCH * PSOR_HEIGHT];
__shared__ float dv[PSOR_PITCH * PSOR_HEIGHT];
//position within tile
const int i = threadIdx.x;
const int j = threadIdx.y;
//position within smem arrays
const int ijs = (j+2) * PSOR_PITCH + i + 2;
//position within global memory
const int ig = blockIdx.x * blockDim.x + threadIdx.x;
const int jg = blockIdx.y * blockDim.y + threadIdx.y;
const int ijg = jg * s + ig;
//position within texture
float x = (float)ig + 0.5f;
float y = (float)jg + 0.5f;
//load u and v to smem
load_array<0>(u, ig, jg, w, h, s);
load_array<1>(v, ig, jg, w, h, s);
load_array<2>(du, ig, jg, w, h, s);
load_array<3>(dv, ig, jg, w, h, s);
//warped position
float wx = (x + u[ijs])/(float)w;
float wy = (y + v[ijs])/(float)h;
x /= (float)w;
y /= (float)h;
//compute image derivatives
const float Iz = tex2D(tex_I1, wx, wy) - tex2D(tex_I0, x, y);
const float Ix = tex2D(tex_Ix, wx, wy);
const float Ixz = Ix - tex2D(tex_Ix0, x, y);
const float Ixy = tex2D(tex_Ixy, wx, wy);
const float Ixx = tex2D(tex_Ixx, wx, wy);
const float Iy = tex2D(tex_Iy, wx, wy);
const float Iyz = Iy - tex2D(tex_Iy0, x, y);
const float Iyy = tex2D(tex_Iyy, wx, wy);
//compute data term
float q0, q1, q2;
q0 = Iz + Ix * du[ijs] + Iy * dv[ijs];
q1 = Ixz + Ixx * du[ijs] + Ixy * dv[ijs];
q2 = Iyz + Ixy * du[ijs] + Iyy * dv[ijs];
float data_term = 0.5f * rsqrtf(q0*q0 + gamma*(q1*q1 + q2*q2) + eps2);
//scale data term by 1/alpha
data_term /= alpha;
//compute smoothness term (diffusivity)
float sx, sy;
if(ig >= w || jg >= h) return;
diffusivity_along_x(&sx, ijs, u, v, du, dv);
diffusivity_along_y(&sy, ijs, u, v, du, dv);
if(ig == 0) sx = 0.0f;
if(jg == 0) sy = 0.0f;
numerator_dudv[ijg] = data_term * (Ix*Iy + gamma * Ixy*(Ixx + Iyy));
numerator_u[ijg] = data_term * (Ix*Iz + gamma * (Ixx*Ixz + Ixy*Iyz));
numerator_v[ijg] = data_term * (Iy*Iz + gamma * (Iyy*Iyz + Ixy*Ixz));
denominator_u[ijg] = data_term * (Ix*Ix + gamma * (Ixy*Ixy + Ixx*Ixx));
denominator_v[ijg] = data_term * (Iy*Iy + gamma * (Ixy*Ixy + Iyy*Iyy));
diffusivity_x[ijg] = sx;
diffusivity_y[ijg] = sy;
}
///////////////////////////////////////////////////////////////////////////////
///\brief computes matrix of linearised system for \c du, \c dv
///\param inv_denominator_u (in/out) data-term denominator for du on input; replaced by its reciprocal including the diffusivity sum
///\param inv_denominator_v (in/out) data-term denominator for dv on input; replaced by its reciprocal including the diffusivity sum
///\param w (in) frame width
///\param h (in) frame height
///\param s (in) stride in floats
///////////////////////////////////////////////////////////////////////////////
__global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denominator_v,
int w, int h, int s)
{
__shared__ float sx[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)];
__shared__ float sy[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)];
//position within tile
const int i = threadIdx.x;
const int j = threadIdx.y;
//position within smem arrays
const int ijs = j*(PSOR_TILE_WIDTH+1) + i;
//position within global memory
const int ig = blockIdx.x * blockDim.x + threadIdx.x;
const int jg = blockIdx.y * blockDim.y + threadIdx.y;
const int ijg = jg*s + ig;
int inside = ig < w && jg < h;
float denom_u;
float denom_v;
if(inside)
{
denom_u = inv_denominator_u[ijg];
denom_v = inv_denominator_v[ijg];
}
if(inside)
{
sx[ijs] = tex1Dfetch(tex_diffusivity_x, ijg);
sy[ijs] = tex1Dfetch(tex_diffusivity_y, ijg);
}
else
{
sx[ijs] = 0.0f;
sy[ijs] = 0.0f;
}
int up = ijs+PSOR_TILE_WIDTH+1;
if(j == PSOR_TILE_HEIGHT-1)
{
if(jg < h-1 && inside)
{
sy[up] = tex1Dfetch(tex_diffusivity_y, ijg + s);
}
else
{
sy[up] = 0.0f;
}
}
int right = ijs + 1;
if(threadIdx.x == PSOR_TILE_WIDTH-1)
{
if(ig < w-1 && inside)
{
sx[right] = tex1Dfetch(tex_diffusivity_x, ijg + 1);
}
else
{
sx[right] = 0.0f;
}
}
__syncthreads();
float diffusivity_sum;
diffusivity_sum = sx[ijs] + sx[ijs+1] + sy[ijs] + sy[ijs+PSOR_TILE_WIDTH+1];
if(inside)
{
denom_u += diffusivity_sum;
denom_v += diffusivity_sum;
inv_denominator_u[ijg] = 1.0f/denom_u;
inv_denominator_v[ijg] = 1.0f/denom_v;
}
}
/////////////////////////////////////////////////////////////////////////////////////////
// Red-Black SOR
/////////////////////////////////////////////////////////////////////////////////////////
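// Pixels are coloured like a checkerboard by the parity of (i+j); each
// sor_pass<isBlack> instantiation updates one colour only, reading the other
// colour's latest du/dv through the textures (rebound between passes), so the
// two alternating passes realize a Gauss-Seidel sweep in parallel.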
template<int isBlack> __global__ void sor_pass(float *new_du,
float *new_dv,
const float *g_inv_denominator_u,
const float *g_inv_denominator_v,
const float *g_numerator_u,
const float *g_numerator_v,
const float *g_numerator_dudv,
float omega,
int width,
int height,
int stride)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if(i >= width || j >= height)
return;
const int pos = j * stride + i;
const int pos_r = pos + 1;
const int pos_u = pos + stride;
const int pos_d = pos - stride;
const int pos_l = pos - 1;
//load smooth term
float s_up, s_left, s_right, s_down;
s_left = tex1Dfetch(tex_diffusivity_x, pos);
s_down = tex1Dfetch(tex_diffusivity_y, pos);
if(i < width-1)
s_right = tex1Dfetch(tex_diffusivity_x, pos_r);
else
s_right = 0.0f; //Neumann BC
if(j < height-1)
s_up = tex1Dfetch(tex_diffusivity_y, pos_u);
else
s_up = 0.0f; //Neumann BC
//load u, v and du, dv
float u_up, u_left, u_right, u_down, u;
float v_up, v_left, v_right, v_down, v;
float du_up, du_left, du_right, du_down, du;
float dv_up, dv_left, dv_right, dv_down, dv;
u_left = tex1Dfetch(tex_u, pos_l);
u_right = tex1Dfetch(tex_u, pos_r);
u_down = tex1Dfetch(tex_u, pos_d);
u_up = tex1Dfetch(tex_u, pos_u);
u = tex1Dfetch(tex_u, pos);
v_left = tex1Dfetch(tex_v, pos_l);
v_right = tex1Dfetch(tex_v, pos_r);
v_down = tex1Dfetch(tex_v, pos_d);
v = tex1Dfetch(tex_v, pos);
v_up = tex1Dfetch(tex_v, pos_u);
du = tex1Dfetch(tex_du, pos);
du_left = tex1Dfetch(tex_du, pos_l);
du_right = tex1Dfetch(tex_du, pos_r);
du_down = tex1Dfetch(tex_du, pos_d);
du_up = tex1Dfetch(tex_du, pos_u);
dv = tex1Dfetch(tex_dv, pos);
dv_left = tex1Dfetch(tex_dv, pos_l);
dv_right = tex1Dfetch(tex_dv, pos_r);
dv_down = tex1Dfetch(tex_dv, pos_d);
dv_up = tex1Dfetch(tex_dv, pos_u);
float numerator_dudv = g_numerator_dudv[pos];
if((i+j)%2 == isBlack)
{
// update du
float numerator_u = (s_left*(u_left + du_left) + s_up*(u_up + du_up) + s_right*(u_right + du_right) + s_down*(u_down + du_down) -
u * (s_left + s_right + s_up + s_down) - g_numerator_u[pos] - numerator_dudv*dv);
du = (1.0f - omega) * du + omega * g_inv_denominator_u[pos] * numerator_u;
// update dv
float numerator_v = (s_left*(v_left + dv_left) + s_up*(v_up + dv_up) + s_right*(v_right + dv_right) + s_down*(v_down + dv_down) -
v * (s_left + s_right + s_up + s_down) - g_numerator_v[pos] - numerator_dudv*du);
dv = (1.0f - omega) * dv + omega * g_inv_denominator_v[pos] * numerator_v;
}
new_du[pos] = du;
new_dv[pos] = dv;
}
///////////////////////////////////////////////////////////////////////////////
// utility functions
///////////////////////////////////////////////////////////////////////////////
void initTexture1D(texture<float, 1, cudaReadModeElementType> &tex)
{
tex.addressMode[0] = cudaAddressModeClamp;
tex.filterMode = cudaFilterModePoint;
tex.normalized = false;
}
void initTexture2D(texture<float, 2, cudaReadModeElementType> &tex)
{
tex.addressMode[0] = cudaAddressModeMirror;
tex.addressMode[1] = cudaAddressModeMirror;
tex.filterMode = cudaFilterModeLinear;
tex.normalized = true;
}
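// 2D image textures use normalized coordinates, mirror addressing and bilinear
// filtering, so warped lookups such as tex2D(tex_I1, (x + u)/w, (y + v)/h) in
// prepare_sor_stage_1_tex are interpolated and remain valid slightly outside
// the frame; 1D textures are plain views of linear memory for exact fetches.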
void InitTextures()
{
initTexture2D(tex_I0);
initTexture2D(tex_I1);
initTexture2D(tex_fine); // for downsampling
initTexture2D(tex_coarse); // for prolongation
initTexture2D(tex_Ix);
initTexture2D(tex_Ixx);
initTexture2D(tex_Ix0);
initTexture2D(tex_Iy);
initTexture2D(tex_Iyy);
initTexture2D(tex_Iy0);
initTexture2D(tex_Ixy);
initTexture1D(tex_u);
initTexture1D(tex_v);
initTexture1D(tex_du);
initTexture1D(tex_dv);
initTexture1D(tex_diffusivity_x);
initTexture1D(tex_diffusivity_y);
initTexture1D(tex_inv_denominator_u);
initTexture1D(tex_inv_denominator_v);
initTexture1D(tex_numerator_dudv);
initTexture1D(tex_numerator_u);
initTexture1D(tex_numerator_v);
}
/////////////////////////////////////////////////////////////////////////////////////////
// MAIN FUNCTION
/////////////////////////////////////////////////////////////////////////////////////////
NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
INCVMemAllocator &gpu_mem_allocator,
const NCVMatrix<Ncv32f> &frame0,
const NCVMatrix<Ncv32f> &frame1,
NCVMatrix<Ncv32f> &uOut,
NCVMatrix<Ncv32f> &vOut,
cudaStream_t stream)
{
ncvAssertPrintReturn(desc.alpha > 0.0f , "Invalid alpha" , NCV_INCONSISTENT_INPUT);
ncvAssertPrintReturn(desc.gamma >= 0.0f , "Invalid gamma" , NCV_INCONSISTENT_INPUT);
ncvAssertPrintReturn(desc.number_of_inner_iterations > 0 , "Invalid number of inner iterations" , NCV_INCONSISTENT_INPUT);
ncvAssertPrintReturn(desc.number_of_outer_iterations > 0 , "Invalid number of outer iterations" , NCV_INCONSISTENT_INPUT);
ncvAssertPrintReturn(desc.number_of_solver_iterations > 0, "Invalid number of solver iterations", NCV_INCONSISTENT_INPUT);
const Ncv32u kSourceWidth = frame0.width();
const Ncv32u kSourceHeight = frame0.height();
ncvAssertPrintReturn(frame1.width() == kSourceWidth && frame1.height() == kSourceHeight, "Frame dims do not match", NCV_INCONSISTENT_INPUT);
ncvAssertReturn(uOut.width() == kSourceWidth && vOut.width() == kSourceWidth &&
uOut.height() == kSourceHeight && vOut.height() == kSourceHeight, NCV_INCONSISTENT_INPUT);
ncvAssertReturn(gpu_mem_allocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);
bool kSkipProcessing = gpu_mem_allocator.isCounting();
cudaDeviceProp device_props;
int cuda_device;
ncvAssertCUDAReturn(cudaGetDevice(&cuda_device), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaGetDeviceProperties(&device_props, cuda_device), NCV_CUDA_ERROR);
Ncv32u alignmentValue = gpu_mem_allocator.alignment ();
const Ncv32u kStrideAlignmentFloat = alignmentValue / sizeof(float);
const Ncv32u kSourcePitch = alignUp(kSourceWidth, kStrideAlignmentFloat) * sizeof(float);
const Ncv32f scale_factor = desc.scale_factor;
const Ncv32f alpha = desc.alpha;
const Ncv32f gamma = desc.gamma;
const Ncv32u kSizeInPixelsAligned = alignUp(kSourceWidth, kStrideAlignmentFloat)*kSourceHeight;
#if defined SAFE_VECTOR_DECL
#undef SAFE_VECTOR_DECL
#endif
#define SAFE_VECTOR_DECL(name, allocator, size) \
FloatVector name((allocator), (size)); \
ncvAssertReturn(name.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
// matrix elements
SAFE_VECTOR_DECL(diffusivity_x, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(diffusivity_y, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(denom_u, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(denom_v, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(num_dudv, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(num_u, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(num_v, gpu_mem_allocator, kSizeInPixelsAligned);
// flow components
SAFE_VECTOR_DECL(u, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(v, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(u_new, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(v_new, gpu_mem_allocator, kSizeInPixelsAligned);
// flow increments
SAFE_VECTOR_DECL(du, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(dv, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(du_new, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(dv_new, gpu_mem_allocator, kSizeInPixelsAligned);
// temporary storage
SAFE_VECTOR_DECL(device_buffer, gpu_mem_allocator,
alignUp(kSourceWidth, kStrideAlignmentFloat)
* alignUp(kSourceHeight, kStrideAlignmentFloat));
// image derivatives
SAFE_VECTOR_DECL(Ix, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(Ixx, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(Ix0, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(Iy, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(Iyy, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(Iy0, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(Ixy, gpu_mem_allocator, kSizeInPixelsAligned);
// spatial derivative filter size
const int kDFilterSize = 5;
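// five-point derivative stencil; with the 1.0f/12.0f multiplier passed to the
// filter calls below, this is the fourth-order central difference
// f'(x) ~= ( f(x-2) - 8 f(x-1) + 8 f(x+1) - f(x+2) ) / 12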
const float derivativeFilterHost[kDFilterSize] = {1.0f, -8.0f, 0.0f, 8.0f, -1.0f};
SAFE_VECTOR_DECL(derivativeFilter, gpu_mem_allocator, kDFilterSize);
ncvAssertCUDAReturn(
cudaMemcpy(derivativeFilter.ptr(),
derivativeFilterHost,
sizeof(float) * kDFilterSize,
cudaMemcpyHostToDevice),
NCV_CUDA_ERROR);
InitTextures();
//prepare image pyramid
std::vector< shared_ptr<FloatVector> > img0_pyramid;
std::vector< shared_ptr<FloatVector> > img1_pyramid;
std::vector<Ncv32u> w_pyramid;
std::vector<Ncv32u> h_pyramid;
cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<float>();
float scale = 1.0f;
//cuda arrays for frames
shared_ptr<FloatVector> I0(new FloatVector(gpu_mem_allocator, kSizeInPixelsAligned));
ncvAssertReturn(I0->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
shared_ptr<FloatVector> I1(new FloatVector(gpu_mem_allocator, kSizeInPixelsAligned));
ncvAssertReturn(I1->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
if (!kSkipProcessing)
{
//copy frame data to device
size_t dst_width_in_bytes = alignUp(kSourceWidth, kStrideAlignmentFloat) * sizeof(float);
size_t src_width_in_bytes = kSourceWidth * sizeof(float);
size_t src_pitch_in_bytes = frame0.pitch();
ncvAssertCUDAReturn( cudaMemcpy2DAsync(I0->ptr(), dst_width_in_bytes, frame0.ptr(),
src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
ncvAssertCUDAReturn( cudaMemcpy2DAsync(I1->ptr(), dst_width_in_bytes, frame1.ptr(),
src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
}
//prepare pyramid
img0_pyramid.push_back(I0);
img1_pyramid.push_back(I1);
w_pyramid.push_back(kSourceWidth);
h_pyramid.push_back(kSourceHeight);
scale *= scale_factor;
Ncv32u prev_level_width = kSourceWidth;
Ncv32u prev_level_height = kSourceHeight;
while((prev_level_width > 15) && (prev_level_height > 15) && (static_cast<Ncv32u>(img0_pyramid.size()) < desc.number_of_outer_iterations))
{
//current resolution
Ncv32u level_width = static_cast<Ncv32u>(ceilf(kSourceWidth * scale));
Ncv32u level_height = static_cast<Ncv32u>(ceilf(kSourceHeight * scale));
Ncv32u level_width_aligned = alignUp(level_width, kStrideAlignmentFloat);
Ncv32u buffer_size = alignUp(level_width, kStrideAlignmentFloat) * level_height; // buffer size in floats
Ncv32u prev_level_pitch = alignUp(prev_level_width, kStrideAlignmentFloat) * sizeof(float);
shared_ptr<FloatVector> level_frame0(new FloatVector(gpu_mem_allocator, buffer_size));
ncvAssertReturn(level_frame0->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
shared_ptr<FloatVector> level_frame1(new FloatVector(gpu_mem_allocator, buffer_size));
ncvAssertReturn(level_frame1->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
if (!kSkipProcessing)
{
NcvSize32u srcSize (prev_level_width, prev_level_height);
NcvSize32u dstSize (level_width, level_height);
NcvRect32u srcROI (0, 0, prev_level_width, prev_level_height);
NcvRect32u dstROI (0, 0, level_width, level_height);
// frame 0
nppiStResize_32f_C1R (I0->ptr(), srcSize, prev_level_pitch, srcROI,
level_frame0->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample);
// frame 1
nppiStResize_32f_C1R (I1->ptr(), srcSize, prev_level_pitch, srcROI,
level_frame1->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample);
}
//store pointers
img0_pyramid.push_back(level_frame0);
img1_pyramid.push_back(level_frame1);
w_pyramid.push_back(level_width);
h_pyramid.push_back(level_height);
scale *= scale_factor;
prev_level_width = level_width;
prev_level_height = level_height;
I0 = level_frame0;
I1 = level_frame1;
}
if (!kSkipProcessing)
{
//initial values for the flow are 0
ncvAssertCUDAReturn(cudaMemsetAsync(u.ptr(), 0, kSizeInPixelsAligned * sizeof(float), stream), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaMemsetAsync(v.ptr(), 0, kSizeInPixelsAligned * sizeof(float), stream), NCV_CUDA_ERROR);
//select images with lowest resolution
size_t pitch = alignUp(w_pyramid.back(), kStrideAlignmentFloat) * sizeof(float);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, img0_pyramid.back()->ptr(), channel_desc, w_pyramid.back(), h_pyramid.back(), pitch), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, img1_pyramid.back()->ptr(), channel_desc, w_pyramid.back(), h_pyramid.back(), pitch), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
}
FloatVector* ptrU = &u;
FloatVector* ptrV = &v;
FloatVector* ptrUNew = &u_new;
FloatVector* ptrVNew = &v_new;
std::vector< shared_ptr<FloatVector> >::const_reverse_iterator img0Iter = img0_pyramid.rbegin();
std::vector< shared_ptr<FloatVector> >::const_reverse_iterator img1Iter = img1_pyramid.rbegin();
//outer loop
//warping fixed point iteration
while(!w_pyramid.empty())
{
//current grid dimensions
const Ncv32u kLevelWidth = w_pyramid.back();
const Ncv32u kLevelHeight = h_pyramid.back();
const Ncv32u kLevelStride = alignUp(kLevelWidth, kStrideAlignmentFloat);
//size of current image in bytes
const int kLevelSizeInBytes = kLevelStride * kLevelHeight * sizeof(float);
//number of points at current resolution
const int kLevelSizeInPixels = kLevelStride * kLevelHeight;
if (!kSkipProcessing)
{
//initial guess for du and dv
ncvAssertCUDAReturn(cudaMemsetAsync(du.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaMemsetAsync(dv.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);
}
//texture format descriptor
cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<float>();
I0 = *img0Iter;
I1 = *img1Iter;
++img0Iter;
++img1Iter;
if (!kSkipProcessing)
{
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, I0->ptr(), channel_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, I1->ptr(), channel_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);
}
//compute derivatives
dim3 dBlocks(iDivUp(kLevelWidth, 32), iDivUp(kLevelHeight, 6));
dim3 dThreads(32, 6);
const int kPitchTex = kLevelStride * sizeof(float);
if (!kSkipProcessing)
{
NcvSize32u srcSize(kLevelWidth, kLevelHeight);
Ncv32u nSrcStep = kLevelStride * sizeof(float);
NcvRect32u oROI(0, 0, kLevelWidth, kLevelHeight);
// Ix0
nppiStFilterRowBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Ix0.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f);
// Iy0
nppiStFilterColumnBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Iy0.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f);
// Ix
nppiStFilterRowBorder_32f_C1R (I1->ptr(), srcSize, nSrcStep, Ix.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f);
// Iy
nppiStFilterColumnBorder_32f_C1R (I1->ptr(), srcSize, nSrcStep, Iy.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f);
// Ixx
nppiStFilterRowBorder_32f_C1R (Ix.ptr(), srcSize, nSrcStep, Ixx.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f);
// Iyy
nppiStFilterColumnBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Iyy.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f);
// Ixy
nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f);
}
if (!kSkipProcessing)
{
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix, Ix.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixx, Ixx.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix0, Ix0.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy, Iy.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iyy, Iyy.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy0, Iy0.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixy, Ixy.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
// flow
ncvAssertCUDAReturn(cudaBindTexture(0, tex_u, ptrU->ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_v, ptrV->ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
// flow increments
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
}
dim3 psor_blocks(iDivUp(kLevelWidth, PSOR_TILE_WIDTH), iDivUp(kLevelHeight, PSOR_TILE_HEIGHT));
dim3 psor_threads(PSOR_TILE_WIDTH, PSOR_TILE_HEIGHT);
dim3 sor_blocks(iDivUp(kLevelWidth, SOR_TILE_WIDTH), iDivUp(kLevelHeight, SOR_TILE_HEIGHT));
dim3 sor_threads(SOR_TILE_WIDTH, SOR_TILE_HEIGHT);
// inner loop
// lagged nonlinearity fixed point iteration
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
for (Ncv32u current_inner_iteration = 0; current_inner_iteration < desc.number_of_inner_iterations; ++current_inner_iteration)
{
//compute coefficients
if (!kSkipProcessing)
{
prepare_sor_stage_1_tex<<<psor_blocks, psor_threads, 0, stream>>>
(diffusivity_x.ptr(),
diffusivity_y.ptr(),
denom_u.ptr(),
denom_v.ptr(),
num_dudv.ptr(),
num_u.ptr(),
num_v.ptr(),
kLevelWidth,
kLevelHeight,
kLevelStride,
alpha,
gamma);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
prepare_sor_stage_2<<<psor_blocks, psor_threads, 0, stream>>>(denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride);
// linear system coefficients
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_u, denom_u.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_v, denom_v.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
}
//solve linear system
for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration)
{
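// relaxation factor: SOR on a symmetric positive definite system converges
// for omega in (0, 2); a value near 2 over-relaxes aggressively for speed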
float omega = 1.99f;
if (!kSkipProcessing)
{
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
sor_pass<0><<<sor_blocks, sor_threads, 0, stream>>>
(du_new.ptr(),
dv_new.ptr(),
denom_u.ptr(),
denom_v.ptr(),
num_u.ptr(),
num_v.ptr(),
num_dudv.ptr(),
omega,
kLevelWidth,
kLevelHeight,
kLevelStride);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du_new.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv_new.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
sor_pass<1><<<sor_blocks, sor_threads, 0, stream>>>
(du.ptr(),
dv.ptr(),
denom_u.ptr(),
denom_v.ptr(),
num_u.ptr(),
num_v.ptr(),
num_dudv.ptr(),
omega,
kLevelWidth,
kLevelHeight,
kLevelStride);
}
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
}//end of solver loop
}// end of inner loop
//update u and v
if (!kSkipProcessing)
{
add(ptrU->ptr(), du.ptr(), kLevelSizeInPixels, stream);
add(ptrV->ptr(), dv.ptr(), kLevelSizeInPixels, stream);
}
//prolongate using texture
w_pyramid.pop_back();
h_pyramid.pop_back();
if (!w_pyramid.empty())
{
//compute new image size
Ncv32u nw = w_pyramid.back();
Ncv32u nh = h_pyramid.back();
Ncv32u ns = alignUp(nw, kStrideAlignmentFloat);
dim3 p_blocks(iDivUp(nw, 32), iDivUp(nh, 8));
dim3 p_threads(32, 8);
if (!kSkipProcessing)
{
NcvSize32u srcSize (kLevelWidth, kLevelHeight);
NcvSize32u dstSize (nw, nh);
NcvRect32u srcROI (0, 0, kLevelWidth, kLevelHeight);
NcvRect32u dstROI (0, 0, nw, nh);
nppiStResize_32f_C1R (ptrU->ptr(), srcSize, kLevelStride * sizeof (float), srcROI,
ptrUNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic);
ScaleVector(ptrUNew->ptr(), ptrUNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
nppiStResize_32f_C1R (ptrV->ptr(), srcSize, kLevelStride * sizeof (float), srcROI,
ptrVNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic);
ScaleVector(ptrVNew->ptr(), ptrVNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
}
std::swap<FloatVector*>(ptrU, ptrUNew);
std::swap<FloatVector*>(ptrV, ptrVNew);
}
scale /= scale_factor;
}
// end of warping iterations
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
ncvAssertCUDAReturn( cudaMemcpy2DAsync
(uOut.ptr(), uOut.pitch(), ptrU->ptr(),
kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
ncvAssertCUDAReturn( cudaMemcpy2DAsync
(vOut.ptr(), vOut.pitch(), ptrV->ptr(),
kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
return NCV_SUCCESS;
}
modules/gpu/src/nvidia/NCVBroxOpticalFlow.hpp    0 → 100644
// (license header and algorithm description identical to NCVBroxOpticalFlow.cu above)
#ifndef _ncv_optical_flow_h_
#define _ncv_optical_flow_h_
#include "NCV.hpp"
/// \brief Model and solver parameters
struct NCVBroxOpticalFlowDescriptor
{
    /// flow smoothness
    Ncv32f alpha;
    /// gradient constancy importance
    Ncv32f gamma;
    /// pyramid scale factor
    Ncv32f scale_factor;
    /// number of lagged non-linearity iterations (inner loop)
    Ncv32u number_of_inner_iterations;
    /// number of warping iterations (number of pyramid levels)
    Ncv32u number_of_outer_iterations;
    /// number of linear system solver iterations
    Ncv32u number_of_solver_iterations;
};
/////////////////////////////////////////////////////////////////////////////////////////
/// \brief Compute optical flow
///
/// Based on method by Brox et al [2004]
/// \param [in] desc model and solver parameters
/// \param [in] gpu_mem_allocator GPU memory allocator
/// \param [in] frame0 source frame
/// \param [in] frame1 frame to track
/// \param [out] u flow horizontal component (along \b x axis)
/// \param [out] v flow vertical component (along \b y axis)
/// \return computation status
/////////////////////////////////////////////////////////////////////////////////////////
NCV_EXPORTS NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
                                         INCVMemAllocator &gpu_mem_allocator,
                                         const NCVMatrix<Ncv32f> &frame0,
                                         const NCVMatrix<Ncv32f> &frame1,
                                         NCVMatrix<Ncv32f> &u,
                                         NCVMatrix<Ncv32f> &v,
                                         cudaStream_t stream);
#endif
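For orientation, a minimal call sequence against this header might look as follows. This is a sketch, not part of the commit: the descriptor values are merely illustrative, the allocator/matrix helpers (NCVMemNativeAllocator, NCVMatrixAlloc) are assumed from NCV.hpp, and error checks are omitted.

// Sketch only -- illustrative values; assumes NCV.hpp allocator/matrix helpers
// and that width/height (frame dimensions) are defined by the caller.
cudaDeviceProp devProp;
cudaGetDeviceProperties(&devProp, 0);

NCVBroxOpticalFlowDescriptor desc;
desc.alpha        = 0.197f;   // smoothness weight (illustrative)
desc.gamma        = 50.0f;    // gradient constancy weight (illustrative)
desc.scale_factor = 0.8f;     // pyramid downscale per level
desc.number_of_inner_iterations  = 10;
desc.number_of_outer_iterations  = 77;
desc.number_of_solver_iterations = 10;

NCVMemNativeAllocator alloc(NCVMemoryTypeDevice, static_cast<Ncv32u>(devProp.textureAlignment));
NCVMatrixAlloc<Ncv32f> frame0(alloc, width, height);   // fill with grayscale frame data
NCVMatrixAlloc<Ncv32f> frame1(alloc, width, height);
NCVMatrixAlloc<Ncv32f> u(alloc, width, height);        // receives horizontal flow
NCVMatrixAlloc<Ncv32f> v(alloc, width, height);        // receives vertical flow

NCVStatus status = NCVBroxOpticalFlow(desc, alloc, frame0, frame1, u, v, 0);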
modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
@@ -1610,3 +1610,952 @@ NCVStatus nppsStCompact_32f_host(Ncv32f *h_src, Ncv32u srcLen,
{
return nppsStCompact_32u_host((Ncv32u *)h_src, srcLen, (Ncv32u *)h_dst, dstLen, *(Ncv32u *)&elemRemove);
}
//==============================================================================
//
// Filter.cu
//
//==============================================================================
texture <float, 1, cudaReadModeElementType> texSrc;
texture <float, 1, cudaReadModeElementType> texKernel;
__forceinline__ __device__ float getValueMirrorRow(const int rowOffset,
int i,
int w)
{
if (i < 0) i = 1 - i;
if (i >= w) i = w + w - i - 1;
return tex1Dfetch (texSrc, rowOffset + i);
}
__forceinline__ __device__ float getValueMirrorColumn(const int offset,
const int rowStep,
int j,
int h)
{
if (j < 0) j = 1 - j;
if (j >= h) j = h + h - j - 1;
return tex1Dfetch (texSrc, offset + j * rowStep);
}
__global__ void FilterRowBorderMirror_32f_C1R(Ncv32u srcStep,
Ncv32f *pDst,
NcvSize32u dstSize,
Ncv32u dstStep,
NcvRect32u roi,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier)
{
// position within ROI
const int ix = blockDim.x * blockIdx.x + threadIdx.x;
const int iy = blockDim.y * blockIdx.y + threadIdx.y;
if (ix >= roi.width || iy >= roi.height)
{
return;
}
const int p = nKernelSize - nAnchor - 1;
const int j = roi.y + iy;
const int rowOffset = j * srcStep + roi.x;
float sum = 0.0f;
for (int m = 0; m < nKernelSize; ++m)
{
sum += getValueMirrorRow (rowOffset, ix + m - p, roi.width)
* tex1Dfetch (texKernel, m);
}
pDst[iy * dstStep + ix] = sum * multiplier;
}
__global__ void FilterColumnBorderMirror_32f_C1R(Ncv32u srcStep,
Ncv32f *pDst,
NcvSize32u dstSize,
Ncv32u dstStep,
NcvRect32u roi,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier)
{
const int ix = blockDim.x * blockIdx.x + threadIdx.x;
const int iy = blockDim.y * blockIdx.y + threadIdx.y;
if (ix >= roi.width || iy >= roi.height)
{
return;
}
const int p = nKernelSize - nAnchor - 1;
const int i = roi.x + ix;
const int offset = i + roi.y * srcStep;
float sum = 0.0f;
for (int m = 0; m < nKernelSize; ++m)
{
sum += getValueMirrorColumn (offset, srcStep, iy + m - p, roi.height)
* tex1Dfetch (texKernel, m);
}
pDst[ix + iy * dstStep] = sum * multiplier;
}
NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
Ncv32f *pDst,
NcvSize32u dstSize,
Ncv32u nDstStep,
NcvRect32u oROI,
NppStBorderType borderType,
const Ncv32f *pKernel,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier)
{
ncvAssertReturn (pSrc != NULL &&
pDst != NULL &&
pKernel != NULL, NCV_NULL_PTR);
ncvAssertReturn (oROI.width > 0 && oROI.height > 0, NPPST_INVALID_ROI);
ncvAssertReturn (srcSize.width * sizeof (Ncv32f) <= nSrcStep &&
dstSize.width * sizeof (Ncv32f) <= nDstStep &&
oROI.width * sizeof (Ncv32f) <= nSrcStep &&
oROI.width * sizeof (Ncv32f) <= nDstStep &&
nSrcStep % sizeof (Ncv32f) == 0 &&
nDstStep % sizeof (Ncv32f) == 0, NPPST_INVALID_STEP);
Ncv32u srcStep = nSrcStep / sizeof (Ncv32f);
Ncv32u dstStep = nDstStep / sizeof (Ncv32f);
// adjust ROI size to be within source image
if (oROI.x + oROI.width > srcSize.width)
{
oROI.width = srcSize.width - oROI.x;
}
if (oROI.y + oROI.height > srcSize.height)
{
oROI.height = srcSize.height - oROI.y;
}
cudaChannelFormatDesc floatChannel = cudaCreateChannelDesc <float> ();
texSrc.normalized = false;
texKernel.normalized = false;
cudaBindTexture (0, texSrc, pSrc, floatChannel, srcSize.height * nSrcStep);
cudaBindTexture (0, texKernel, pKernel, floatChannel, nKernelSize * sizeof (Ncv32f));
dim3 ctaSize (32, 6);
dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x,
(oROI.height + ctaSize.y - 1) / ctaSize.y);
switch (borderType)
{
case nppStBorderNone:
return NPPST_ERROR;
case nppStBorderClamp:
return NPPST_ERROR;
case nppStBorderWrap:
return NPPST_ERROR;
case nppStBorderMirror:
FilterRowBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
(srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
break;
default:
return NPPST_ERROR;
}
return NPPST_SUCCESS;
}
NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
Ncv32f *pDst,
NcvSize32u dstSize,
Ncv32u nDstStep,
NcvRect32u oROI,
NppStBorderType borderType,
const Ncv32f *pKernel,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier)
{
ncvAssertReturn (pSrc != NULL &&
pDst != NULL &&
pKernel != NULL, NCV_NULL_PTR);
ncvAssertReturn (oROI.width > 0 && oROI.height > 0, NPPST_INVALID_ROI);
ncvAssertReturn (srcSize.width * sizeof (Ncv32f) <= nSrcStep &&
dstSize.width * sizeof (Ncv32f) <= nDstStep &&
oROI.width * sizeof (Ncv32f) <= nSrcStep &&
oROI.width * sizeof (Ncv32f) <= nDstStep &&
nSrcStep % sizeof (Ncv32f) == 0 &&
nDstStep % sizeof (Ncv32f) == 0, NPPST_INVALID_STEP);
Ncv32u srcStep = nSrcStep / sizeof (Ncv32f);
Ncv32u dstStep = nDstStep / sizeof (Ncv32f);
// adjust ROI size to be within source image
if (oROI.x + oROI.width > srcSize.width)
{
oROI.width = srcSize.width - oROI.x;
}
if (oROI.y + oROI.height > srcSize.height)
{
oROI.height = srcSize.height - oROI.y;
}
cudaChannelFormatDesc floatChannel = cudaCreateChannelDesc <float> ();
texSrc.normalized = false;
texKernel.normalized = false;
cudaBindTexture (0, texSrc, pSrc, floatChannel, srcSize.height * nSrcStep);
cudaBindTexture (0, texKernel, pKernel, floatChannel, nKernelSize * sizeof (Ncv32f));
dim3 ctaSize (32, 6);
dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x,
(oROI.height + ctaSize.y - 1) / ctaSize.y);
switch (borderType)
{
case nppStBorderClamp:
return NPPST_ERROR;
case nppStBorderWrap:
return NPPST_ERROR;
case nppStBorderMirror:
FilterColumnBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
(srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
break;
default:
return NPPST_ERROR;
}
return NPPST_SUCCESS;
}
//==============================================================================
//
// FrameInterpolate.cu
//
//==============================================================================
inline Ncv32u iDivUp(Ncv32u num, Ncv32u denom)
{
return (num + denom - 1)/denom;
}
texture<float, 2, cudaReadModeElementType> tex_src1;
texture<float, 2, cudaReadModeElementType> tex_src0;
__global__ void BlendFramesKernel(const float *u, const float *v, // forward flow
const float *ur, const float *vr, // backward flow
const float *o0, const float *o1, // coverage masks
int w, int h, int s,
float theta, float *out)
{
const int ix = threadIdx.x + blockDim.x * blockIdx.x;
const int iy = threadIdx.y + blockDim.y * blockIdx.y;
const int pos = ix + s * iy;
if (ix >= w || iy >= h) return;
float _u = u[pos];
float _v = v[pos];
float _ur = ur[pos];
float _vr = vr[pos];
float x = (float)ix + 0.5f;
float y = (float)iy + 0.5f;
bool b0 = o0[pos] > 1e-4f;
bool b1 = o1[pos] > 1e-4f;
if (b0 && b1)
{
// pixel is visible on both frames
out[pos] = tex2D(tex_src0, x - _u * theta, y - _v * theta) * (1.0f - theta) +
tex2D(tex_src1, x + _u * (1.0f - theta), y + _v * (1.0f - theta)) * theta;
}
else if (b0)
{
// visible on the first frame only
out[pos] = tex2D(tex_src0, x - _u * theta, y - _v * theta);
}
else
{
// visible on the second frame only
out[pos] = tex2D(tex_src1, x - _ur * (1.0f - theta), y - _vr * (1.0f - theta));
}
}
NCVStatus BlendFrames(const Ncv32f *src0,
const Ncv32f *src1,
const Ncv32f *ufi,
const Ncv32f *vfi,
const Ncv32f *ubi,
const Ncv32f *vbi,
const Ncv32f *o1,
const Ncv32f *o2,
Ncv32u width,
Ncv32u height,
Ncv32u stride,
Ncv32f theta,
Ncv32f *out)
{
tex_src1.addressMode[0] = cudaAddressModeClamp;
tex_src1.addressMode[1] = cudaAddressModeClamp;
tex_src1.filterMode = cudaFilterModeLinear;
tex_src1.normalized = false;
tex_src0.addressMode[0] = cudaAddressModeClamp;
tex_src0.addressMode[1] = cudaAddressModeClamp;
tex_src0.filterMode = cudaFilterModeLinear;
tex_src0.normalized = false;
cudaChannelFormatDesc desc = cudaCreateChannelDesc <float> ();
const Ncv32u pitch = stride * sizeof (float);
ncvAssertCUDAReturn (cudaBindTexture2D (0, tex_src1, src1, desc, width, height, pitch), NPPST_TEXTURE_BIND_ERROR);
ncvAssertCUDAReturn (cudaBindTexture2D (0, tex_src0, src0, desc, width, height, pitch), NPPST_TEXTURE_BIND_ERROR);
dim3 threads (32, 4);
dim3 blocks (iDivUp (width, threads.x), iDivUp (height, threads.y));
BlendFramesKernel<<<blocks, threads, 0, nppStGetActiveCUDAstream ()>>>
(ufi, vfi, ubi, vbi, o0, o1, width, height, stride, theta, out);
ncvAssertCUDAReturn (cudaGetLastError (), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
return NPPST_SUCCESS;
}
NCVStatus nppiStGetInterpolationBufferSize(NcvSize32u srcSize,
Ncv32u nStep,
Ncv32u *hpSize)
{
NCVStatus status = NPPST_ERROR;
status = nppiStVectorWarpGetBufferSize(srcSize, nStep, hpSize);
return status;
}
NCVStatus nppiStInterpolateFrames(const NppStInterpolationState *pState)
{
// check state validity
ncvAssertReturn (pState->pSrcFrame0 != 0 &&
pState->pSrcFrame1 != 0 &&
pState->pFU != 0 &&
pState->pFV != 0 &&
pState->pBU != 0 &&
pState->pBV != 0 &&
pState->pNewFrame != 0 &&
pState->ppBuffers[0] != 0 &&
pState->ppBuffers[1] != 0 &&
pState->ppBuffers[2] != 0 &&
pState->ppBuffers[3] != 0 &&
pState->ppBuffers[4] != 0 &&
pState->ppBuffers[5] != 0, NPPST_NULL_POINTER_ERROR);
ncvAssertReturn (pState->size.width > 0 &&
pState->size.height > 0, NPPST_ERROR);
ncvAssertReturn (pState->nStep >= pState->size.width * sizeof (Ncv32f) &&
pState->nStep > 0 &&
pState->nStep % sizeof (Ncv32f) == 0,
NPPST_INVALID_STEP);
// change notation
Ncv32f *cov0 = pState->ppBuffers[0];
Ncv32f *cov1 = pState->ppBuffers[1];
Ncv32f *fwdU = pState->ppBuffers[2]; // forward u
Ncv32f *fwdV = pState->ppBuffers[3]; // forward v
Ncv32f *bwdU = pState->ppBuffers[4]; // backward u
Ncv32f *bwdV = pState->ppBuffers[5]; // backward v
// warp flow
ncvAssertReturnNcvStat (
nppiStVectorWarp_PSF2x2_32f_C1 (pState->pFU,
pState->size,
pState->nStep,
pState->pFU,
pState->pFV,
pState->nStep,
cov0,
pState->pos,
fwdU) );
ncvAssertReturnNcvStat (
nppiStVectorWarp_PSF2x2_32f_C1 (pState->pFV,
pState->size,
pState->nStep,
pState->pFU,
pState->pFV,
pState->nStep,
cov0,
pState->pos,
fwdV) );
// warp backward flow
ncvAssertReturnNcvStat (
nppiStVectorWarp_PSF2x2_32f_C1 (pState->pBU,
pState->size,
pState->nStep,
pState->pBU,
pState->pBV,
pState->nStep,
cov1,
1.0f - pState->pos,
bwdU) );
ncvAssertReturnNcvStat (
nppiStVectorWarp_PSF2x2_32f_C1 (pState->pBV,
pState->size,
pState->nStep,
pState->pBU,
pState->pBV,
pState->nStep,
cov1,
1.0f - pState->pos,
bwdV) );
// interpolate frame
ncvAssertReturnNcvStat (
BlendFrames (pState->pSrcFrame0,
pState->pSrcFrame1,
fwdU,
fwdV,
bwdU,
bwdV,
cov0,
cov1,
pState->size.width,
pState->size.height,
pState->nStep / sizeof (Ncv32f),
pState->pos,
pState->pNewFrame) );
return NPPST_SUCCESS;
}
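// A minimal host-side sketch of driving nppiStInterpolateFrames, kept in an
// inactive block because it is illustrative only: the device pointers
// (d_frame0, d_frame1, d_fu, d_fv, d_bu, d_bv, d_out), the common pitch
// `step` and the frame `size` are assumed to exist, and error handling is elided.
#if 0
    Ncv32u bufBytes = 0;
    nppiStGetInterpolationBufferSize (size, step, &bufBytes);

    Ncv32f *buf[6];
    for (int i = 0; i < 6; ++i)
    {
        cudaMalloc ((void **)&buf[i], bufBytes);
        cudaMemset (buf[i], 0, bufBytes); // the interpolator expects zeroed scratch buffers
    }

    NppStInterpolationState state;
    state.size       = size;
    state.nStep      = step;
    state.pSrcFrame0 = d_frame0;
    state.pSrcFrame1 = d_frame1;
    state.pFU = d_fu; state.pFV = d_fv; // forward flow (frame 0 -> frame 1)
    state.pBU = d_bu; state.pBV = d_bv; // backward flow (frame 1 -> frame 0)
    state.pos        = 0.5f;            // synthesize the frame halfway in time
    state.pNewFrame  = d_out;
    for (int i = 0; i < 6; ++i)
    {
        state.ppBuffers[i] = buf[i];
    }
    nppiStInterpolateFrames (&state);
#endif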
//==============================================================================
//
// VectorWarpFrame.cu
//
//==============================================================================
#if __CUDA_ARCH__ < 200
// FP32 atomic add
static __forceinline__ __device__ float _atomicAdd(float *addr, float val)
{
float old = *addr, assumed;
do {
assumed = old;
old = int_as_float(__iAtomicCAS((int*)addr,
float_as_int(assumed),
float_as_int(val+assumed)));
} while( assumed!=old );
return old;
}
#else
#define _atomicAdd atomicAdd
#endif
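// The pre-Fermi branch above emulates a floating-point atomic add with a
// compare-and-swap retry loop. An illustrative host-side analog of the same
// pattern in portable C++11 (not part of this module, inactive block):
#if 0
#include <atomic>
float atomic_add_float (std::atomic<float> &addr, float val)
{
    float old = addr.load ();
    // retry until no other thread modified addr between our read and our write;
    // compare_exchange_weak refreshes `old` with the current value on failure
    while (!addr.compare_exchange_weak (old, old + val)) {}
    return old;
}
#endif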
__global__ void ForwardWarpKernel_PSF2x2(const float *u,
const float *v,
const float *src,
const int w,
const int h,
const int flow_stride,
const int image_stride,
const float time_scale,
float *normalization_factor,
float *dst)
{
int j = threadIdx.x + blockDim.x * blockIdx.x;
int i = threadIdx.y + blockDim.y * blockIdx.y;
if (i >= h || j >= w) return;
int flow_row_offset = i * flow_stride;
int image_row_offset = i * image_stride;
//bottom left corner of a target pixel
float cx = u[flow_row_offset + j] * time_scale + (float)j + 1.0f;
float cy = v[flow_row_offset + j] * time_scale + (float)i + 1.0f;
// pixel containing bottom left corner
float px;
float py;
float dx = modff (cx, &px);
float dy = modff (cy, &py);
// target pixel integer coords
int tx;
int ty;
tx = (int) px;
ty = (int) py;
float value = src[image_row_offset + j];
float weight;
// fill pixel containing bottom right corner
if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
{
weight = dx * dy;
_atomicAdd (dst + ty * image_stride + tx, value * weight);
_atomicAdd (normalization_factor + ty * image_stride + tx, weight);
}
// fill pixel containing bottom left corner
tx -= 1;
if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
{
weight = (1.0f - dx) * dy;
_atomicAdd (dst + ty * image_stride + tx, value * weight);
_atomicAdd (normalization_factor + ty * image_stride + tx, weight);
}
// fill pixel containing upper left corner
ty -= 1;
if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
{
weight = (1.0f - dx) * (1.0f - dy);
_atomicAdd (dst + ty * image_stride + tx, value * weight);
_atomicAdd (normalization_factor + ty * image_stride + tx, weight);
}
// fill pixel containing upper right corner
tx += 1;
if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
{
weight = dx * (1.0f - dy);
_atomicAdd (dst + ty * image_stride + tx, value * weight);
_atomicAdd (normalization_factor + ty * image_stride + tx, weight);
}
}
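// For reference, a single-threaded CPU sketch of the same 2x2 splat for one
// source pixel (hypothetical helper, inactive block). The four bilinear
// weights dx*dy, (1-dx)*dy, (1-dx)*(1-dy) and dx*(1-dy) sum to one, so the
// accumulated normalization factor integrates each source pixel exactly once.
#if 0
void splat2x2 (float cx, float cy, float value,
               int w, int h, int stride, float *dst, float *norm)
{
    float px, py;
    float dx = modff (cx, &px);
    float dy = modff (cy, &py);
    int tx = (int) px;
    int ty = (int) py;
    const int   offs[4][2] = { {0, 0}, {-1, 0}, {-1, -1}, {0, -1} };
    const float wgt[4]     = { dx * dy, (1.0f - dx) * dy,
                               (1.0f - dx) * (1.0f - dy), dx * (1.0f - dy) };
    for (int k = 0; k < 4; ++k)
    {
        int x = tx + offs[k][0];
        int y = ty + offs[k][1];
        if (x < 0 || x >= w || y < 0 || y >= h) continue;
        dst [y * stride + x] += value * wgt[k];
        norm[y * stride + x] += wgt[k];
    }
}
#endif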
__global__ void ForwardWarpKernel_PSF1x1(const float *u,
const float *v,
const float *src,
const int w,
const int h,
const int flow_stride,
const int image_stride,
const float time_scale,
float *dst)
{
int j = threadIdx.x + blockDim.x * blockIdx.x;
int i = threadIdx.y + blockDim.y * blockIdx.y;
if (i >= h || j >= w) return;
int flow_row_offset = i * flow_stride;
int image_row_offset = i * image_stride;
float u_ = u[flow_row_offset + j];
float v_ = v[flow_row_offset + j];
//bottom left corner of target pixel
float cx = u_ * time_scale + (float)j + 1.0f;
float cy = v_ * time_scale + (float)i + 1.0f;
// pixel containing bottom left corner
int tx = __float2int_rn (cx);
int ty = __float2int_rn (cy);
float value = src[image_row_offset + j];
// fill pixel
if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
{
_atomicAdd (dst + ty * image_stride + tx, value);
}
}
__global__ void NormalizeKernel(const float *normalization_factor, int w, int h, int s, float *image)
{
int i = threadIdx.y + blockDim.y * blockIdx.y;
int j = threadIdx.x + blockDim.x * blockIdx.x;
if (i >= h || j >= w) return;
const int pos = i * s + j;
float scale = normalization_factor[pos];
float invScale = (scale == 0.0f) ? 1.0f : (1.0f / scale);
image[pos] *= invScale;
}
__global__ void MemsetKernel(const float value, int w, int h, float *image)
{
int i = threadIdx.y + blockDim.y * blockIdx.y;
int j = threadIdx.x + blockDim.x * blockIdx.x;
if (i >= h || j >= w) return;
const int pos = i * w + j;
image[pos] = value;
}
NCVStatus nppiStVectorWarpGetBufferSize (NcvSize32u srcSize, Ncv32u nSrcStep, Ncv32u *hpSize)
{
ncvAssertReturn (hpSize != NULL, NPPST_NULL_POINTER_ERROR);
ncvAssertReturn (srcSize.width * sizeof (Ncv32f) <= nSrcStep,
NPPST_INVALID_STEP);
*hpSize = nSrcStep * srcSize.height;
return NPPST_SUCCESS;
}
// does not require normalization
NCVStatus nppiStVectorWarp_PSF1x1_32f_C1(const Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
const Ncv32f *pU,
const Ncv32f *pV,
Ncv32u nVFStep,
Ncv32f timeScale,
Ncv32f *pDst)
{
ncvAssertReturn (pSrc != NULL &&
pU != NULL &&
pV != NULL &&
pDst != NULL, NPPST_NULL_POINTER_ERROR);
ncvAssertReturn (srcSize.width * sizeof (Ncv32f) <= nSrcStep &&
srcSize.width * sizeof (Ncv32f) <= nVFStep,
NPPST_INVALID_STEP);
Ncv32u srcStep = nSrcStep / sizeof (Ncv32f);
Ncv32u vfStep = nVFStep / sizeof (Ncv32f);
dim3 ctaSize (32, 6);
dim3 gridSize (iDivUp (srcSize.width, ctaSize.x), iDivUp (srcSize.height, ctaSize.y));
ForwardWarpKernel_PSF1x1 <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream()>>>
(pU, pV, pSrc, srcSize.width, srcSize.height, vfStep, srcStep, timeScale, pDst);
return NPPST_SUCCESS;
}
NCVStatus nppiStVectorWarp_PSF2x2_32f_C1(const Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
const Ncv32f *pU,
const Ncv32f *pV,
Ncv32u nVFStep,
Ncv32f *pBuffer,
Ncv32f timeScale,
Ncv32f *pDst)
{
ncvAssertReturn (pSrc != NULL &&
pU != NULL &&
pV != NULL &&
pDst != NULL &&
pBuffer != NULL, NPPST_NULL_POINTER_ERROR);
ncvAssertReturn (srcSize.width * sizeof (Ncv32f) <= nSrcStep &&
srcSize.width * sizeof (Ncv32f) <= nVFStep, NPPST_INVALID_STEP);
Ncv32u srcStep = nSrcStep / sizeof (Ncv32f);
Ncv32u vfStep = nVFStep / sizeof(Ncv32f);
dim3 ctaSize(32, 6);
dim3 gridSize (iDivUp (srcSize.width, ctaSize.x), iDivUp (srcSize.height, ctaSize.y));
dim3 memsetGrid (iDivUp (srcStep, ctaSize.x), iDivUp (srcSize.height, ctaSize.y));
MemsetKernel <<<memsetGrid, ctaSize, 0, nppStGetActiveCUDAstream()>>>
(0, srcStep, srcSize.height, pBuffer); // clear the whole pitched buffer, not just the width columns
ForwardWarpKernel_PSF2x2 <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream()>>>
(pU, pV, pSrc, srcSize.width, srcSize.height, vfStep, srcStep, timeScale, pBuffer, pDst);
NormalizeKernel <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream()>>>
(pBuffer, srcSize.width, srcSize.height, srcStep, pDst);
return NPPST_SUCCESS;
}
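// A hedged usage sketch for the 2x2 warp (pointer names hypothetical,
// inactive block): the normalization buffer is sized by the query above
// and consumed internally by the Memset/ForwardWarp/Normalize sequence.
#if 0
    Ncv32u bufBytes = 0;
    nppiStVectorWarpGetBufferSize (size, nStep, &bufBytes);

    Ncv32f *d_norm = NULL;
    cudaMalloc ((void **)&d_norm, bufBytes); // per-pixel weight accumulator

    // forward-warp d_img by (d_u, d_v), scaled by time t, into d_out
    nppiStVectorWarp_PSF2x2_32f_C1 (d_img, size, nStep,
                                    d_u, d_v, nStep,
                                    d_norm, t, d_out);
    cudaFree (d_norm);
#endif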
//==============================================================================
//
// Resize.cu
//
//==============================================================================
texture <float, 2, cudaReadModeElementType> texSrc2D;
__forceinline__
__device__ float processLine(int spos,
float xmin,
float xmax,
int ixmin,
int ixmax,
float fxmin,
float cxmax)
{
// first element
float wsum = 1.0f - xmin + fxmin;
float sum = tex1Dfetch(texSrc, spos) * (1.0f - xmin + fxmin);
spos++;
for (int ix = ixmin + 1; ix < ixmax; ++ix)
{
sum += tex1Dfetch(texSrc, spos);
spos++;
wsum += 1.0f;
}
sum += tex1Dfetch(texSrc, spos) * (cxmax - xmax);
wsum += cxmax - xmax;
return sum / wsum;
}
__global__ void resizeSuperSample_32f(NcvSize32u srcSize,
Ncv32u srcStep,
NcvRect32u srcROI,
Ncv32f *dst,
NcvSize32u dstSize,
Ncv32u dstStep,
NcvRect32u dstROI,
Ncv32f scaleX,
Ncv32f scaleY)
{
// position within dst ROI
const int ix = blockIdx.x * blockDim.x + threadIdx.x;
const int iy = blockIdx.y * blockDim.y + threadIdx.y;
if (ix >= dstROI.width || iy >= dstROI.height)
{
return;
}
float rw = (float) srcROI.width;
float rh = (float) srcROI.height;
// source position
float x = scaleX * (float) ix;
float y = scaleY * (float) iy;
// x sampling range
float xBegin = fmax (x - scaleX, 0.0f);
float xEnd = fmin (x + scaleX, rw - 1.0f);
// y sampling range
float yBegin = fmax (y - scaleY, 0.0f);
float yEnd = fmin (y + scaleY, rh - 1.0f);
// x range of source samples
float floorXBegin = floorf (xBegin);
float ceilXEnd = ceilf (xEnd);
int iXBegin = srcROI.x + (int) floorXBegin;
int iXEnd = srcROI.x + (int) ceilXEnd;
// y range of source samples
float floorYBegin = floorf (yBegin);
float ceilYEnd = ceilf (yEnd);
int iYBegin = srcROI.y + (int) floorYBegin;
int iYEnd = srcROI.y + (int) ceilYEnd;
// first row
int pos = iYBegin * srcStep + iXBegin;
float wsum = 1.0f - yBegin + floorYBegin;
float sum = processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
ceilXEnd) * (1.0f - yBegin + floorYBegin);
pos += srcStep;
for (int iRow = iYBegin + 1; iRow < iYEnd; ++iRow) // source row counter, distinct from the dst-pixel iy
{
sum += processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
ceilXEnd);
pos += srcStep;
wsum += 1.0f;
}
sum += processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
ceilXEnd) * (ceilYEnd - yEnd);
wsum += ceilYEnd - yEnd;
sum /= wsum;
dst[(ix + dstROI.x) + (iy + dstROI.y) * dstStep] = sum;
}
// bicubic interpolation
__forceinline__
__device__ float bicubicCoeff(float x_)
{
float x = fabsf(x_);
if (x <= 1.0f)
{
return x * x * (1.5f * x - 2.5f) + 1.0f;
}
else if (x < 2.0f)
{
return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f;
}
else
{
return 0.0f;
}
}
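// These are the Catmull-Rom weights (a = -0.5): for any fractional offset d
// in [0, 1) the four taps at distances d+1, d, 1-d and 2-d form a partition
// of unity, which a quick host-side check confirms (illustrative only,
// inactive block reusing the formula above):
#if 0
#include <cstdio>
int main ()
{
    for (float d = 0.0f; d < 1.0f; d += 0.125f)
    {
        float s = bicubicCoeff (d + 1.0f) + bicubicCoeff (d) +
                  bicubicCoeff (1.0f - d) + bicubicCoeff (2.0f - d);
        printf ("d=%5.3f sum=%f\n", d, s); // prints 1.000000 for every d
    }
    return 0;
}
#endif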
__global__ void resizeBicubic(NcvSize32u srcSize,
NcvRect32u srcROI,
NcvSize32u dstSize,
Ncv32u dstStep,
Ncv32f *dst,
NcvRect32u dstROI,
Ncv32f scaleX,
Ncv32f scaleY)
{
const int ix = blockIdx.x * blockDim.x + threadIdx.x;
const int iy = blockIdx.y * blockDim.y + threadIdx.y;
if (ix >= dstROI.width || iy >= dstROI.height)
{
return;
}
const float dx = 1.0f / srcROI.width;
const float dy = 1.0f / srcROI.height;
float rx = (float) srcROI.x;
float ry = (float) srcROI.y;
float rw = (float) srcROI.width;
float rh = (float) srcROI.height;
float x = scaleX * (float) ix;
float y = scaleY * (float) iy;
// sampling range
// border mode is clamp
float xmin = fmax (ceilf (x - 2.0f), 0.0f);
float xmax = fmin (floorf (x + 2.0f), rw - 1.0f);
float ymin = fmax (ceilf (y - 2.0f), 0.0f);
float ymax = fmin (floorf (y + 2.0f), rh - 1.0f);
// shift data window to match ROI
rx += 0.5f;
ry += 0.5f;
x += rx;
y += ry;
xmin += rx;
xmax += rx;
ymin += ry;
ymax += ry;
float sum = 0.0f;
float wsum = 0.0f;
for (float cy = ymin; cy <= ymax; cy += 1.0f)
{
for (float cx = xmin; cx <= xmax; cx += 1.0f)
{
float xDist = x - cx;
float yDist = y - cy;
float wx = bicubicCoeff (xDist);
float wy = bicubicCoeff (yDist);
wx *= wy;
sum += wx * tex2D (texSrc2D, cx * dx, cy * dy);
wsum += wx;
}
}
dst[(ix + dstROI.x)+ (iy + dstROI.y) * dstStep] = sum / wsum;
}
NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
NcvRect32u srcROI,
Ncv32f *pDst,
NcvSize32u dstSize,
Ncv32u nDstStep,
NcvRect32u dstROI,
Ncv32f xFactor,
Ncv32f yFactor,
NppStInterpMode interpolation)
{
NCVStatus status = NPPST_SUCCESS;
ncvAssertReturn (pSrc != NULL && pDst != NULL, NPPST_NULL_POINTER_ERROR);
ncvAssertReturn (xFactor != 0.0 && yFactor != 0.0, NPPST_INVALID_SCALE);
ncvAssertReturn (nSrcStep >= sizeof (Ncv32f) * (Ncv32u) srcSize.width &&
nDstStep >= sizeof (Ncv32f) * (Ncv32u) dstSize.width,
NPPST_INVALID_STEP);
Ncv32u srcStep = nSrcStep / sizeof (Ncv32f);
Ncv32u dstStep = nDstStep / sizeof (Ncv32f);
// TODO: preprocess ROI to prevent out of bounds access
if (interpolation == nppStSupersample)
{
// bind texture
cudaBindTexture (0, texSrc, pSrc, srcSize.height * nSrcStep);
// invoke kernel
dim3 ctaSize (32, 6);
dim3 gridSize ((dstROI.width + ctaSize.x - 1) / ctaSize.x,
(dstROI.height + ctaSize.y - 1) / ctaSize.y);
resizeSuperSample_32f <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
(srcSize, srcStep, srcROI, pDst, dstSize, dstStep, dstROI, 1.0f / xFactor, 1.0f / yFactor);
}
else if (interpolation == nppStBicubic)
{
texSrc2D.addressMode[0] = cudaAddressModeMirror;
texSrc2D.addressMode[1] = cudaAddressModeMirror;
texSrc2D.normalized = true;
cudaChannelFormatDesc desc = cudaCreateChannelDesc <float> ();
cudaBindTexture2D (0, texSrc2D, pSrc, desc, srcSize.width, srcSize.height,
nSrcStep);
dim3 ctaSize (32, 6);
dim3 gridSize ((dstSize.width + ctaSize.x - 1) / ctaSize.x,
(dstSize.height + ctaSize.y - 1) / ctaSize.y);
resizeBicubic <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
(srcSize, srcROI, dstSize, dstStep, pDst, dstROI, 1.0f / xFactor, 1.0f / yFactor);
}
else
{
status = NPPST_ERROR;
}
return status;
}
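// A hedged sketch of a 2x supersampled downscale (names hypothetical,
// inactive block). Note that xFactor/yFactor are destination/source scale
// ratios, so a factor of 0.5 halves each dimension.
#if 0
    NcvSize32u srcSize (W, H);
    NcvSize32u dstSize (W / 2, H / 2);
    NcvRect32u srcROI (0, 0, W, H);
    NcvRect32u dstROI (0, 0, W / 2, H / 2);

    NCVStatus st = nppiStResize_32f_C1R (d_src, srcSize, srcPitchBytes, srcROI,
                                         d_dst, dstSize, dstPitchBytes, dstROI,
                                         0.5f, 0.5f, nppStSupersample);
#endif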
modules/gpu/src/nvidia/NPP_staging/NPP_staging.hpp
...
...
@@ -84,6 +84,255 @@ cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream);
*/
/** Border type
*
* Filtering operations assume that each pixel has a neighborhood of pixels.
* The following structure describes possible ways to define non-existent pixels.
*/
enum NppStBorderType
{
    nppStBorderNone   = 0, ///< There is no need to define additional pixels, image is extended already
    nppStBorderClamp  = 1, ///< Clamp out of range position to borders
    nppStBorderWrap   = 2, ///< Wrap out of range position. Image becomes periodic.
    nppStBorderMirror = 3  ///< Reflect out of range position across borders
};
/**
* Filter types for image resizing
*/
enum NppStInterpMode
{
    nppStSupersample, ///< Supersampling. For downscaling only
    nppStBicubic      ///< Bicubic convolution filter, a = -0.5 (cubic Hermite spline)
};
/** Frame interpolation state
*
* This structure holds parameters required for frame interpolation.
* Forward displacement field is a per-pixel mapping from frame 0 to frame 1.
* Backward displacement field is a per-pixel mapping from frame 1 to frame 0.
*/
struct NppStInterpolationState
{
    NcvSize32u size;      ///< frame size
    Ncv32u nStep;         ///< pitch
    Ncv32f pos;           ///< new frame position
    Ncv32f *pSrcFrame0;   ///< frame 0
    Ncv32f *pSrcFrame1;   ///< frame 1
    Ncv32f *pFU;          ///< forward horizontal displacement
    Ncv32f *pFV;          ///< forward vertical displacement
    Ncv32f *pBU;          ///< backward horizontal displacement
    Ncv32f *pBV;          ///< backward vertical displacement
    Ncv32f *pNewFrame;    ///< new frame
    Ncv32f *ppBuffers[6]; ///< temporary buffers
};
/** Size of a buffer required for interpolation.
*
* Several such buffers are required; \see NppStInterpolationState.
*
* \param srcSize [IN] Frame size (both frames must be of the same size)
* \param nStep [IN] Frame line step
* \param hpSize [OUT] Where to store computed size (host memory)
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStGetInterpolationBufferSize(NcvSize32u srcSize,
                                           Ncv32u nStep,
                                           Ncv32u *hpSize);
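/** Usage sketch (illustrative only; `frameSize` and `framePitchBytes` are hypothetical):
 * \code
 * Ncv32u bufBytes = 0;
 * nppiStGetInterpolationBufferSize(frameSize, framePitchBytes, &bufBytes);
 * // each of the six NppStInterpolationState::ppBuffers entries must be at least this large
 * \endcode
 */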
/** Interpolate frames (images) using provided optical flow (displacement field).
* 32-bit floating point images, single channel
*
* \param pState [IN] structure containing all required parameters (host memory)
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStInterpolateFrames(const NppStInterpolationState *pState);
/** Row linear filter. 32-bit floating point image, single channel
*
* Apply horizontal linear filter
*
* \param pSrc [IN] Source image pointer (CUDA device memory)
* \param srcSize [IN] Source image size
* \param nSrcStep [IN] Source image line step
* \param pDst [OUT] Destination image pointer (CUDA device memory)
* \param dstSize [IN] Destination image size
* \param oROI [IN] Region of interest in the source image
* \param borderType [IN] Type of border
* \param pKernel [IN] Pointer to row kernel values (CUDA device memory)
* \param nKernelSize [IN] Size of the kernel in pixels
* \param nAnchor [IN] The kernel row alignment with respect to the position of the input pixel
* \param multiplier [IN] Value by which the computed result is multiplied
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
                                        NcvSize32u srcSize,
                                        Ncv32u nSrcStep,
                                        Ncv32f *pDst,
                                        NcvSize32u dstSize,
                                        Ncv32u nDstStep,
                                        NcvRect32u oROI,
                                        NppStBorderType borderType,
                                        const Ncv32f *pKernel,
                                        Ncv32s nKernelSize,
                                        Ncv32s nAnchor,
                                        Ncv32f multiplier);
/** Column linear filter. 32-bit floating point image, single channel
*
* Apply vertical linear filter
*
* \param pSrc [IN] Source image pointer (CUDA device memory)
* \param srcSize [IN] Source image size
* \param nSrcStep [IN] Source image line step
* \param pDst [OUT] Destination image pointer (CUDA device memory)
* \param dstSize [IN] Destination image size
* \param oROI [IN] Region of interest in the source image
* \param borderType [IN] Type of border
* \param pKernel [IN] Pointer to column kernel values (CUDA device memory)
* \param nKernelSize [IN] Size of the kernel in pixels
* \param nAnchor [IN] The kernel column alignment with respect to the position of the input pixel
* \param multiplier [IN] Value by which the computed result is multiplied
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
                                           NcvSize32u srcSize,
                                           Ncv32u nSrcStep,
                                           Ncv32f *pDst,
                                           NcvSize32u dstSize,
                                           Ncv32u nDstStep,
                                           NcvRect32u oROI,
                                           NppStBorderType borderType,
                                           const Ncv32f *pKernel,
                                           Ncv32s nKernelSize,
                                           Ncv32s nAnchor,
                                           Ncv32f multiplier);
/** Size of buffer required for vector image warping.
*
* \param srcSize [IN] Source image size
* \param nStep [IN] Source image line step
* \param hpSize [OUT] Where to store computed size (host memory)
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStVectorWarpGetBufferSize(NcvSize32u srcSize,
                                        Ncv32u nSrcStep,
                                        Ncv32u *hpSize);
/** Warp image using provided 2D vector field and 1x1 point spread function.
* 32-bit floating point image, single channel
*
* During warping pixels from the source image may fall between pixels of the destination image.
* PSF (point spread function) describes how the source image pixel affects pixels of the destination.
* For 1x1 PSF only single pixel with the largest intersection is affected (similar to nearest interpolation).
*
* Destination image size and line step must be the same as the source image size and line step
*
* \param pSrc [IN] Source image pointer (CUDA device memory)
* \param srcSize [IN] Source image size
* \param nSrcStep [IN] Source image line step
* \param pU [IN] Pointer to horizontal displacement field (CUDA device memory)
* \param pV [IN] Pointer to vertical displacement field (CUDA device memory)
* \param nVFStep [IN] Displacement field line step
* \param timeScale [IN] Value by which displacement field will be scaled for warping
* \param pDst [OUT] Destination image pointer (CUDA device memory)
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStVectorWarp_PSF1x1_32f_C1(const Ncv32f *pSrc,
                                         NcvSize32u srcSize,
                                         Ncv32u nSrcStep,
                                         const Ncv32f *pU,
                                         const Ncv32f *pV,
                                         Ncv32u nVFStep,
                                         Ncv32f timeScale,
                                         Ncv32f *pDst);
/** Warp image using provided 2D vector field and 2x2 point spread function.
* 32-bit floating point image, single channel
*
* During warping pixels from the source image may fall between pixels of the destination image.
* PSF (point spread function) describes how the source image pixel affects pixels of the destination.
* For 2x2 PSF all four intersected pixels will be affected.
*
* Destination image size and line step must be the same as the source image size and line step
*
* \param pSrc [IN] Source image pointer (CUDA device memory)
* \param srcSize [IN] Source image size
* \param nSrcStep [IN] Source image line step
* \param pU [IN] Pointer to horizontal displacement field (CUDA device memory)
* \param pV [IN] Pointer to vertical displacement field (CUDA device memory)
* \param nVFStep [IN] Displacement field line step
* \param timeScale [IN] Value by which displacement field will be scaled for warping
* \param pDst [OUT] Destination image pointer (CUDA device memory)
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStVectorWarp_PSF2x2_32f_C1(const Ncv32f *pSrc,
                                         NcvSize32u srcSize,
                                         Ncv32u nSrcStep,
                                         const Ncv32f *pU,
                                         const Ncv32f *pV,
                                         Ncv32u nVFStep,
                                         Ncv32f *pBuffer,
                                         Ncv32f timeScale,
                                         Ncv32f *pDst);
/** Resize. 32-bit floating point image, single channel
*
* Resizes image using specified filter (interpolation type)
*
* \param pSrc [IN] Source image pointer (CUDA device memory)
* \param srcSize [IN] Source image size
* \param nSrcStep [IN] Source image line step
* \param srcROI [IN] Source image region of interest
* \param pDst [OUT] Destination image pointer (CUDA device memory)
* \param dstSize [IN] Destination image size
* \param nDstStep [IN] Destination image line step
* \param dstROI [IN] Destination image region of interest
* \param xFactor [IN] Horizontal scale factor
* \param yFactor [IN] Vertical scale factor
* \param interpolation [IN] Interpolation type
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc,
                               NcvSize32u srcSize,
                               Ncv32u nSrcStep,
                               NcvRect32u srcROI,
                               Ncv32f *pDst,
                               NcvSize32u dstSize,
                               Ncv32u nDstStep,
                               NcvRect32u dstROI,
                               Ncv32f xFactor,
                               Ncv32f yFactor,
                               NppStInterpMode interpolation);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit unsigned pixels, single channel.
*
...
...
samples/gpu/opticalflow_nvidia_api.cpp
0 → 100644
#if _MSC_VER >= 1400
#pragma warning( disable : 4201 4408 4127 4100)
#endif
#include <iostream>
#include <iomanip>
#include <memory>
#include <exception>
#include <ctime>
#include <cstring> // strcmp
#include <cstdlib> // atof, atoi
#include <cctype>  // toupper
#include "cvconfig.h"
#include "opencv2/opencv.hpp"
#include "opencv2/gpu/gpu.hpp"
#ifdef HAVE_CUDA
#include "NPP_staging/NPP_staging.hpp"
#include "NCVBroxOpticalFlow.hpp"
#endif
#if !defined(HAVE_CUDA)
int main(int argc, const char **argv)
{
    std::cout << "Please compile the library with CUDA support" << std::endl;
    return -1;
}
#else
using std::tr1::shared_ptr;
#define PARAM_INPUT "--input"
#define PARAM_SCALE "--scale"
#define PARAM_ALPHA "--alpha"
#define PARAM_GAMMA "--gamma"
#define PARAM_INNER "--inner"
#define PARAM_OUTER "--outer"
#define PARAM_SOLVER "--solver"
#define PARAM_TIME_STEP "--time-step"
#define PARAM_HELP "--help"
shared_ptr<INCVMemAllocator> g_pGPUMemAllocator;
shared_ptr<INCVMemAllocator> g_pHostMemAllocator;
class RgbToMonochrome
{
public:
    float operator ()(unsigned char b, unsigned char g, unsigned char r)
    {
        float _r = static_cast<float>(r) / 255.0f;
        float _g = static_cast<float>(g) / 255.0f;
        float _b = static_cast<float>(b) / 255.0f;
        return (_r + _g + _b) / 3.0f;
    }
};
class RgbToR
{
public:
    float operator ()(unsigned char b, unsigned char g, unsigned char r)
    {
        return static_cast<float>(r) / 255.0f;
    }
};

class RgbToG
{
public:
    float operator ()(unsigned char b, unsigned char g, unsigned char r)
    {
        return static_cast<float>(g) / 255.0f;
    }
};

class RgbToB
{
public:
    float operator ()(unsigned char b, unsigned char g, unsigned char r)
    {
        return static_cast<float>(b) / 255.0f;
    }
};
template<class T>
NCVStatus CopyData(IplImage *image, shared_ptr<NCVMatrixAlloc<Ncv32f> > &dst)
{
    dst = shared_ptr<NCVMatrixAlloc<Ncv32f> >(new NCVMatrixAlloc<Ncv32f>(*g_pHostMemAllocator, image->width, image->height));
    ncvAssertReturn(dst->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    unsigned char *row = reinterpret_cast<unsigned char*>(image->imageData);
    T convert;
    for (int i = 0; i < image->height; ++i)
    {
        for (int j = 0; j < image->width; ++j)
        {
            if (image->nChannels < 3)
            {
                dst->ptr()[j + i * dst->stride()] = static_cast<float>(*(row + j * image->nChannels)) / 255.0f;
            }
            else
            {
                unsigned char *color = row + j * image->nChannels;
                dst->ptr()[j + i * dst->stride()] = convert(color[0], color[1], color[2]);
            }
        }
        row += image->widthStep;
    }
    return NCV_SUCCESS;
}
template<class T>
NCVStatus CopyData(const IplImage *image, const NCVMatrixAlloc<Ncv32f> &dst)
{
    unsigned char *row = reinterpret_cast<unsigned char*>(image->imageData);
    T convert;
    for (int i = 0; i < image->height; ++i)
    {
        for (int j = 0; j < image->width; ++j)
        {
            if (image->nChannels < 3)
            {
                dst.ptr()[j + i * dst.stride()] = static_cast<float>(*(row + j * image->nChannels)) / 255.0f;
            }
            else
            {
                unsigned char *color = row + j * image->nChannels;
                dst.ptr()[j + i * dst.stride()] = convert(color[0], color[1], color[2]);
            }
        }
        row += image->widthStep;
    }
    return NCV_SUCCESS;
}
NCVStatus LoadImages(const char *frame0Name,
                     const char *frame1Name,
                     int &width,
                     int &height,
                     shared_ptr<NCVMatrixAlloc<Ncv32f> > &src,
                     shared_ptr<NCVMatrixAlloc<Ncv32f> > &dst,
                     IplImage *&firstFrame,
                     IplImage *&lastFrame)
{
    IplImage *image = cvLoadImage(frame0Name);
    if (image == 0)
    {
        std::cout << "Could not open '" << frame0Name << "'\n";
        return NCV_FILE_ERROR;
    }
    firstFrame = image;
    // copy data to src
    ncvAssertReturnNcvStat(CopyData<RgbToMonochrome>(image, src));

    IplImage *image2 = cvLoadImage(frame1Name);
    if (image2 == 0)
    {
        std::cout << "Could not open '" << frame1Name << "'\n";
        return NCV_FILE_ERROR;
    }
    lastFrame = image2;
    ncvAssertReturnNcvStat(CopyData<RgbToMonochrome>(image2, dst));

    width  = image->width;
    height = image->height;
    return NCV_SUCCESS;
}
template<typename T>
inline T Clamp(T x, T a, T b)
{
    return ((x) > (a) ? ((x) < (b) ? (x) : (b)) : (a));
}
template<typename T>
inline T MapValue(T x, T a, T b, T c, T d)
{
    x = Clamp(x, a, b);
    return c + (d - c) * (x - a) / (b - a);
}
NCVStatus ShowFlow(NCVMatrixAlloc<Ncv32f> &u, NCVMatrixAlloc<Ncv32f> &v, const char *name)
{
    IplImage *flowField;

    NCVMatrixAlloc<Ncv32f> host_u(*g_pHostMemAllocator, u.width(), u.height());
    ncvAssertReturn(host_u.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVMatrixAlloc<Ncv32f> host_v(*g_pHostMemAllocator, u.width(), u.height());
    ncvAssertReturn(host_v.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    ncvAssertReturnNcvStat(u.copySolid(host_u, 0));
    ncvAssertReturnNcvStat(v.copySolid(host_v, 0));

    float *ptr_u = host_u.ptr();
    float *ptr_v = host_v.ptr();

    float maxDisplacement = 1.0f;

    for (Ncv32u i = 0; i < u.height(); ++i)
    {
        for (Ncv32u j = 0; j < u.width(); ++j)
        {
            float d = std::max(fabsf(*ptr_u), fabsf(*ptr_v));
            if (d > maxDisplacement) maxDisplacement = d;
            ++ptr_u;
            ++ptr_v;
        }
        ptr_u += u.stride() - u.width();
        ptr_v += v.stride() - v.width();
    }

    CvSize image_size = cvSize(u.width(), u.height());
    flowField = cvCreateImage(image_size, IPL_DEPTH_8U, 4);
    if (flowField == 0) return NCV_NULL_PTR;

    unsigned char *row = reinterpret_cast<unsigned char*>(flowField->imageData);

    ptr_u = host_u.ptr();
    ptr_v = host_v.ptr();

    for (int i = 0; i < flowField->height; ++i)
    {
        for (int j = 0; j < flowField->width; ++j)
        {
            (row + j * flowField->nChannels)[0] = 0;
            (row + j * flowField->nChannels)[1] = static_cast<unsigned char>(MapValue(-(*ptr_v), -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
            (row + j * flowField->nChannels)[2] = static_cast<unsigned char>(MapValue(*ptr_u, -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
            (row + j * flowField->nChannels)[3] = 255;
            ++ptr_u;
            ++ptr_v;
        }
        row += flowField->widthStep;
        ptr_u += u.stride() - u.width();
        ptr_v += v.stride() - v.width();
    }

    cvShowImage(name, flowField);

    return NCV_SUCCESS;
}
IplImage *CreateImage(NCVMatrixAlloc<Ncv32f> &h_r, NCVMatrixAlloc<Ncv32f> &h_g, NCVMatrixAlloc<Ncv32f> &h_b)
{
    CvSize imageSize = cvSize(h_r.width(), h_r.height());
    IplImage *image = cvCreateImage(imageSize, IPL_DEPTH_8U, 4);
    if (image == 0) return 0;

    unsigned char *row = reinterpret_cast<unsigned char*>(image->imageData);

    for (int i = 0; i < image->height; ++i)
    {
        for (int j = 0; j < image->width; ++j)
        {
            int offset = j * image->nChannels;
            int pos = i * h_r.stride() + j;
            row[offset + 0] = static_cast<unsigned char>(h_b.ptr()[pos] * 255.0f);
            row[offset + 1] = static_cast<unsigned char>(h_g.ptr()[pos] * 255.0f);
            row[offset + 2] = static_cast<unsigned char>(h_r.ptr()[pos] * 255.0f);
            row[offset + 3] = 255;
        }
        row += image->widthStep;
    }
    return image;
}
void PrintHelp()
{
    std::cout << "Usage help:\n";
    std::cout << std::setiosflags(std::ios::left);
    std::cout << "\t" << std::setw(15) << PARAM_ALPHA << " - set alpha\n";
    std::cout << "\t" << std::setw(15) << PARAM_GAMMA << " - set gamma\n";
    std::cout << "\t" << std::setw(15) << PARAM_INNER << " - set number of inner iterations\n";
    std::cout << "\t" << std::setw(15) << PARAM_INPUT << " - specify input file names (2 image files)\n";
    std::cout << "\t" << std::setw(15) << PARAM_OUTER << " - set number of outer iterations\n";
    std::cout << "\t" << std::setw(15) << PARAM_SCALE << " - set pyramid scale factor\n";
    std::cout << "\t" << std::setw(15) << PARAM_SOLVER << " - set number of basic solver iterations\n";
    std::cout << "\t" << std::setw(15) << PARAM_TIME_STEP << " - set frame interpolation time step\n";
    std::cout << "\t" << std::setw(15) << PARAM_HELP << " - display this help message\n";
}
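// For example, the sample can be run on the bundled Middlebury pair
// (paths are assumed to be relative to the working directory):
//
//     opticalflow_nvidia_api --input rubberwhale1.png rubberwhale2.png --time-step 0.25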
int ProcessCommandLine(int argc, char **argv,
                       Ncv32f &timeStep,
                       char *&frame0Name,
                       char *&frame1Name,
                       NCVBroxOpticalFlowDescriptor &desc)
{
    timeStep = 0.25f;
    for (int iarg = 1; iarg < argc; ++iarg)
    {
        if (strcmp(argv[iarg], PARAM_INPUT) == 0)
        {
            if (iarg + 2 < argc)
            {
                frame0Name = argv[++iarg];
                frame1Name = argv[++iarg];
            }
            else
                return -1;
        }
        else if (strcmp(argv[iarg], PARAM_SCALE) == 0)
        {
            if (iarg + 1 < argc)
                desc.scale_factor = static_cast<Ncv32f>(atof(argv[++iarg]));
            else
                return -1;
        }
        else if (strcmp(argv[iarg], PARAM_ALPHA) == 0)
        {
            if (iarg + 1 < argc)
                desc.alpha = static_cast<Ncv32f>(atof(argv[++iarg]));
            else
                return -1;
        }
        else if (strcmp(argv[iarg], PARAM_GAMMA) == 0)
        {
            if (iarg + 1 < argc)
                desc.gamma = static_cast<Ncv32f>(atof(argv[++iarg]));
            else
                return -1;
        }
        else if (strcmp(argv[iarg], PARAM_INNER) == 0)
        {
            if (iarg + 1 < argc)
                desc.number_of_inner_iterations = static_cast<Ncv32u>(atoi(argv[++iarg]));
            else
                return -1;
        }
        else if (strcmp(argv[iarg], PARAM_OUTER) == 0)
        {
            if (iarg + 1 < argc)
                desc.number_of_outer_iterations = static_cast<Ncv32u>(atoi(argv[++iarg]));
            else
                return -1;
        }
        else if (strcmp(argv[iarg], PARAM_SOLVER) == 0)
        {
            if (iarg + 1 < argc)
                desc.number_of_solver_iterations = static_cast<Ncv32u>(atoi(argv[++iarg]));
            else
                return -1;
        }
        else if (strcmp(argv[iarg], PARAM_TIME_STEP) == 0)
        {
            if (iarg + 1 < argc)
                timeStep = static_cast<Ncv32f>(atof(argv[++iarg]));
            else
                return -1;
        }
        else if (strcmp(argv[iarg], PARAM_HELP) == 0)
        {
            PrintHelp();
            return 0;
        }
    }
    return 0;
}
int main(int argc, char **argv)
{
    char *frame0Name = 0, *frame1Name = 0;
    Ncv32f timeStep = 0.01f;

    NCVBroxOpticalFlowDescriptor desc;
    desc.alpha = 0.197f;
    desc.gamma = 50.0f;
    desc.number_of_inner_iterations = 10;
    desc.number_of_outer_iterations = 77;
    desc.number_of_solver_iterations = 10;
    desc.scale_factor = 0.8f;

    int result = ProcessCommandLine(argc, argv, timeStep, frame0Name, frame1Name, desc);
    if (argc == 1 || result)
    {
        PrintHelp();
        return result;
    }

    std::cout << "OpenCV / NVIDIA Computer Vision\n";
    std::cout << "Optical Flow Demo: Frame Interpolation\n";
    std::cout << "=========================================\n";
    std::cout << "Press:\nESC to quit\n'a' to move to the previous frame\n's' to move to the next frame\n";

    int devId;
    ncvAssertCUDAReturn(cudaGetDevice(&devId), -1);
    cudaDeviceProp devProp;
    ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), -1);
    std::cout << "Using GPU: " << devId << "(" << devProp.name << "), arch=" << devProp.major << "." << devProp.minor << std::endl;

    g_pGPUMemAllocator = shared_ptr<INCVMemAllocator>(new NCVMemNativeAllocator(NCVMemoryTypeDevice, devProp.textureAlignment));
    ncvAssertPrintReturn(g_pGPUMemAllocator->isInitialized(), "Device memory allocator isn't initialized", -1);

    g_pHostMemAllocator = shared_ptr<INCVMemAllocator>(new NCVMemNativeAllocator(NCVMemoryTypeHostPageable, devProp.textureAlignment));
    ncvAssertPrintReturn(g_pHostMemAllocator->isInitialized(), "Host memory allocator isn't initialized", -1);

    int width, height;

    shared_ptr<NCVMatrixAlloc<Ncv32f> > src_host;
    shared_ptr<NCVMatrixAlloc<Ncv32f> > dst_host;

    IplImage *firstFrame, *lastFrame;
    if (frame0Name != 0 && frame1Name != 0)
    {
        ncvAssertReturnNcvStat(LoadImages(frame0Name, frame1Name, width, height, src_host, dst_host, firstFrame, lastFrame));
    }
    else
    {
        ncvAssertReturnNcvStat(LoadImages("frame10.bmp", "frame11.bmp", width, height, src_host, dst_host, firstFrame, lastFrame));
    }

    shared_ptr<NCVMatrixAlloc<Ncv32f> > src(new NCVMatrixAlloc<Ncv32f>(*g_pGPUMemAllocator, src_host->width(), src_host->height()));
    ncvAssertReturn(src->isMemAllocated(), -1);

    shared_ptr<NCVMatrixAlloc<Ncv32f> > dst(new NCVMatrixAlloc<Ncv32f>(*g_pGPUMemAllocator, src_host->width(), src_host->height()));
    ncvAssertReturn(dst->isMemAllocated(), -1);

    ncvAssertReturnNcvStat(src_host->copySolid(*src, 0));
    ncvAssertReturnNcvStat(dst_host->copySolid(*dst, 0));
#if defined SAFE_MAT_DECL
#undef SAFE_MAT_DECL
#endif
#define SAFE_MAT_DECL(name, allocator, sx, sy) \
    NCVMatrixAlloc<Ncv32f> name(*allocator, sx, sy); \
    ncvAssertReturn(name.isMemAllocated(), -1);
    SAFE_MAT_DECL(u, g_pGPUMemAllocator, width, height);
    SAFE_MAT_DECL(v, g_pGPUMemAllocator, width, height);
    SAFE_MAT_DECL(uBck, g_pGPUMemAllocator, width, height);
    SAFE_MAT_DECL(vBck, g_pGPUMemAllocator, width, height);

    SAFE_MAT_DECL(h_r, g_pHostMemAllocator, width, height);
    SAFE_MAT_DECL(h_g, g_pHostMemAllocator, width, height);
    SAFE_MAT_DECL(h_b, g_pHostMemAllocator, width, height);
    std::cout << "Estimating optical flow\nForward...\n";

    if (NCV_SUCCESS != NCVBroxOpticalFlow(desc, *g_pGPUMemAllocator, *src, *dst, u, v, 0))
    {
        std::cout << "Failed\n";
        return -1;
    }

    std::cout << "Backward...\n";

    if (NCV_SUCCESS != NCVBroxOpticalFlow(desc, *g_pGPUMemAllocator, *dst, *src, uBck, vBck, 0))
    {
        std::cout << "Failed\n";
        return -1;
    }
    // matrix for temporary data
    SAFE_MAT_DECL(d_temp, g_pGPUMemAllocator, width, height);

    // first frame color components (GPU memory)
    SAFE_MAT_DECL(d_r, g_pGPUMemAllocator, width, height);
    SAFE_MAT_DECL(d_g, g_pGPUMemAllocator, width, height);
    SAFE_MAT_DECL(d_b, g_pGPUMemAllocator, width, height);

    // second frame color components (GPU memory)
    SAFE_MAT_DECL(d_rt, g_pGPUMemAllocator, width, height);
    SAFE_MAT_DECL(d_gt, g_pGPUMemAllocator, width, height);
    SAFE_MAT_DECL(d_bt, g_pGPUMemAllocator, width, height);

    // intermediate frame color components (GPU memory)
    SAFE_MAT_DECL(d_rNew, g_pGPUMemAllocator, width, height);
    SAFE_MAT_DECL(d_gNew, g_pGPUMemAllocator, width, height);
    SAFE_MAT_DECL(d_bNew, g_pGPUMemAllocator, width, height);

    // interpolated forward flow
    SAFE_MAT_DECL(ui, g_pGPUMemAllocator, width, height);
    SAFE_MAT_DECL(vi, g_pGPUMemAllocator, width, height);

    // interpolated backward flow
    SAFE_MAT_DECL(ubi, g_pGPUMemAllocator, width, height);
    SAFE_MAT_DECL(vbi, g_pGPUMemAllocator, width, height);

    // occlusion masks
    SAFE_MAT_DECL(occ0, g_pGPUMemAllocator, width, height);
    SAFE_MAT_DECL(occ1, g_pGPUMemAllocator, width, height);
    // prepare color components on host and copy them to device memory
    ncvAssertReturnNcvStat(CopyData<RgbToR>(firstFrame, h_r));
    ncvAssertReturnNcvStat(CopyData<RgbToG>(firstFrame, h_g));
    ncvAssertReturnNcvStat(CopyData<RgbToB>(firstFrame, h_b));

    ncvAssertReturnNcvStat(h_r.copySolid(d_r, 0));
    ncvAssertReturnNcvStat(h_g.copySolid(d_g, 0));
    ncvAssertReturnNcvStat(h_b.copySolid(d_b, 0));

    ncvAssertReturnNcvStat(CopyData<RgbToR>(lastFrame, h_r));
    ncvAssertReturnNcvStat(CopyData<RgbToG>(lastFrame, h_g));
    ncvAssertReturnNcvStat(CopyData<RgbToB>(lastFrame, h_b));

    ncvAssertReturnNcvStat(h_r.copySolid(d_rt, 0));
    ncvAssertReturnNcvStat(h_g.copySolid(d_gt, 0));
    ncvAssertReturnNcvStat(h_b.copySolid(d_bt, 0));

    std::cout << "Interpolating...\n";
    std::cout.precision(4);

    std::vector<IplImage*> frames;
    frames.push_back(firstFrame);
// compute interpolated frames
    for (Ncv32f timePos = timeStep; timePos < 1.0f; timePos += timeStep)
    {
        ncvAssertCUDAReturn(cudaMemset(ui.ptr(), 0, ui.pitch() * ui.height()), NCV_CUDA_ERROR);
        ncvAssertCUDAReturn(cudaMemset(vi.ptr(), 0, vi.pitch() * vi.height()), NCV_CUDA_ERROR);
        ncvAssertCUDAReturn(cudaMemset(ubi.ptr(), 0, ubi.pitch() * ubi.height()), NCV_CUDA_ERROR);
        ncvAssertCUDAReturn(cudaMemset(vbi.ptr(), 0, vbi.pitch() * vbi.height()), NCV_CUDA_ERROR);
        ncvAssertCUDAReturn(cudaMemset(occ0.ptr(), 0, occ0.pitch() * occ0.height()), NCV_CUDA_ERROR);
        ncvAssertCUDAReturn(cudaMemset(occ1.ptr(), 0, occ1.pitch() * occ1.height()), NCV_CUDA_ERROR);

        NppStInterpolationState state;
        // the interpolation state needs to be filled only once, except for pSrcFrame0, pSrcFrame1, and pNewFrame;
        // only the scratch buffers have to be reset to 0 between calls, since the interpolator doesn't do this itself
        state.size = NcvSize32u(width, height);
        state.nStep = d_r.pitch();
        state.pSrcFrame0 = d_r.ptr();
        state.pSrcFrame1 = d_rt.ptr();
        state.pFU = u.ptr();
        state.pFV = v.ptr();
        state.pBU = uBck.ptr();
        state.pBV = vBck.ptr();
        state.pos = timePos;
        state.pNewFrame = d_rNew.ptr();
        state.ppBuffers[0] = occ0.ptr();
        state.ppBuffers[1] = occ1.ptr();
        state.ppBuffers[2] = ui.ptr();
        state.ppBuffers[3] = vi.ptr();
        state.ppBuffers[4] = ubi.ptr();
        state.ppBuffers[5] = vbi.ptr();

        // interpolate red channel
        nppiStInterpolateFrames(&state);

        // reset buffers
        ncvAssertCUDAReturn(cudaMemset(ui.ptr(), 0, ui.pitch() * ui.height()), NCV_CUDA_ERROR);
        ncvAssertCUDAReturn(cudaMemset(vi.ptr(), 0, vi.pitch() * vi.height()), NCV_CUDA_ERROR);
        ncvAssertCUDAReturn(cudaMemset(ubi.ptr(), 0, ubi.pitch() * ubi.height()), NCV_CUDA_ERROR);
        ncvAssertCUDAReturn(cudaMemset(vbi.ptr(), 0, vbi.pitch() * vbi.height()), NCV_CUDA_ERROR);
        ncvAssertCUDAReturn(cudaMemset(occ0.ptr(), 0, occ0.pitch() * occ0.height()), NCV_CUDA_ERROR);
        ncvAssertCUDAReturn(cudaMemset(occ1.ptr(), 0, occ1.pitch() * occ1.height()), NCV_CUDA_ERROR);

        // interpolate green channel
        state.pSrcFrame0 = d_g.ptr();
        state.pSrcFrame1 = d_gt.ptr();
        state.pNewFrame = d_gNew.ptr();
        nppiStInterpolateFrames(&state);

        // reset buffers
        ncvAssertCUDAReturn(cudaMemset(ui.ptr(), 0, ui.pitch() * ui.height()), NCV_CUDA_ERROR);
        ncvAssertCUDAReturn(cudaMemset(vi.ptr(), 0, vi.pitch() * vi.height()), NCV_CUDA_ERROR);
        ncvAssertCUDAReturn(cudaMemset(ubi.ptr(), 0, ubi.pitch() * ubi.height()), NCV_CUDA_ERROR);
        ncvAssertCUDAReturn(cudaMemset(vbi.ptr(), 0, vbi.pitch() * vbi.height()), NCV_CUDA_ERROR);
        ncvAssertCUDAReturn(cudaMemset(occ0.ptr(), 0, occ0.pitch() * occ0.height()), NCV_CUDA_ERROR);
        ncvAssertCUDAReturn(cudaMemset(occ1.ptr(), 0, occ1.pitch() * occ1.height()), NCV_CUDA_ERROR);

        // interpolate blue channel
        state.pSrcFrame0 = d_b.ptr();
        state.pSrcFrame1 = d_bt.ptr();
        state.pNewFrame = d_bNew.ptr();
        nppiStInterpolateFrames(&state);

        // copy to host memory
        ncvAssertReturnNcvStat(d_rNew.copySolid(h_r, 0));
        ncvAssertReturnNcvStat(d_gNew.copySolid(h_g, 0));
        ncvAssertReturnNcvStat(d_bNew.copySolid(h_b, 0));

        // convert to IplImage
        IplImage *newFrame = CreateImage(h_r, h_g, h_b);
        if (newFrame == 0)
        {
            std::cout << "Could not create new frame in host memory\n";
            break;
        }
        frames.push_back(newFrame);

        std::cout << timePos * 100.0f << "%\r";
    }
    std::cout << std::setw(5) << "100%\n";
    frames.push_back(lastFrame);

    Ncv32u currentFrame = 0;

    ShowFlow(u, v, "Forward flow");
    ShowFlow(uBck, vBck, "Backward flow");

    cvShowImage("Interpolated frame", frames[currentFrame]);

    bool qPressed = false;
    while (!qPressed)
    {
        int key = toupper(cvWaitKey(10));
        switch (key)
        {
        case 27:
            qPressed = true;
            break;
        case 'A':
            if (currentFrame > 0) --currentFrame;
            cvShowImage("Interpolated frame", frames[currentFrame]);
            break;
        case 'S':
            if (currentFrame < frames.size() - 1) ++currentFrame;
            cvShowImage("Interpolated frame", frames[currentFrame]);
            break;
        }
    }

    cvDestroyAllWindows();

    std::vector<IplImage*>::iterator iter;
    for (iter = frames.begin(); iter != frames.end(); ++iter)
    {
        cvReleaseImage(&(*iter));
    }

    return 0;
}
#endif
samples/gpu/rubberwhale1.png
0 → 100644
352 KB
samples/gpu/rubberwhale2.png
0 → 100644
353 KB