Commit 42c7aece authored by Anton Obukhov

[+] Added Brox optical flow (implementation courtesy of Michael Smirnov)

parent f838db92
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
////////////////////////////////////////////////////////////////////////////////
//
// NVIDIA CUDA implementation of Brox et al Optical Flow algorithm
//
// Algorithm is explained in the original paper:
// T. Brox, A. Bruhn, N. Papenberg, J. Weickert:
// High accuracy optical flow estimation based on a theory for warping.
// ECCV 2004.
//
// Implementation by Mikhail Smirnov
// email: msmirnov@nvidia.com, devsupport@nvidia.com
//
// Credits for help with the code to:
// Alexey Mendelenko, Anton Obukhov, and Alexander Kharlamov.
//
////////////////////////////////////////////////////////////////////////////////
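////////////////////////////////////////////////////////////////////////////////
//
// Quick reference (informal summary of the method; see the paper for the exact model):
// the code below minimizes a discretized version of the Brox et al. energy
//
//   E(u,v) = Integral Psi( |I1(x+w) - I0(x)|^2 + gamma*|grad I1(x+w) - grad I0(x)|^2 ) dx
//          + alpha * Integral Psi( |grad u|^2 + |grad v|^2 ) dx,
//
// with the robust penalizer Psi(s^2) = sqrt(s^2 + eps^2) and flow w = (u, v).
// The loop over pyramid levels is the warping iteration, the inner loop is the
// lagged-nonlinearity fixed point iteration, and the innermost loop is a
// red-black SOR solver for the resulting linear system in the increments (du, dv).
//
////////////////////////////////////////////////////////////////////////////////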
#include <iostream>
#include <vector>
#include <memory>
#include "NPP_staging\NPP_staging.hpp"
#include "NCVBroxOpticalFlow.hpp"
using std::tr1::shared_ptr;
typedef NCVVectorAlloc<Ncv32f> FloatVector;
/////////////////////////////////////////////////////////////////////////////////////////
// Implementation specific constants
/////////////////////////////////////////////////////////////////////////////////////////
__device__ const float eps2 = 1e-6f;
/////////////////////////////////////////////////////////////////////////////////////////
// Additional defines
/////////////////////////////////////////////////////////////////////////////////////////
// rounded up division
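// e.g. iDivUp(1000, 256) == 4, so four blocks of 256 threads cover 1000 elements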
inline int iDivUp(int a, int b)
{
return (a + b - 1)/b;
}
/////////////////////////////////////////////////////////////////////////////////////////
// Texture references
/////////////////////////////////////////////////////////////////////////////////////////
texture<float, 2, cudaReadModeElementType> tex_coarse;
texture<float, 2, cudaReadModeElementType> tex_fine;
texture<float, 2, cudaReadModeElementType> tex_I1;
texture<float, 2, cudaReadModeElementType> tex_I0;
texture<float, 2, cudaReadModeElementType> tex_Ix;
texture<float, 2, cudaReadModeElementType> tex_Ixx;
texture<float, 2, cudaReadModeElementType> tex_Ix0;
texture<float, 2, cudaReadModeElementType> tex_Iy;
texture<float, 2, cudaReadModeElementType> tex_Iyy;
texture<float, 2, cudaReadModeElementType> tex_Iy0;
texture<float, 2, cudaReadModeElementType> tex_Ixy;
texture<float, 1, cudaReadModeElementType> tex_u;
texture<float, 1, cudaReadModeElementType> tex_v;
texture<float, 1, cudaReadModeElementType> tex_du;
texture<float, 1, cudaReadModeElementType> tex_dv;
texture<float, 1, cudaReadModeElementType> tex_numerator_dudv;
texture<float, 1, cudaReadModeElementType> tex_numerator_u;
texture<float, 1, cudaReadModeElementType> tex_numerator_v;
texture<float, 1, cudaReadModeElementType> tex_inv_denominator_u;
texture<float, 1, cudaReadModeElementType> tex_inv_denominator_v;
texture<float, 1, cudaReadModeElementType> tex_diffusivity_x;
texture<float, 1, cudaReadModeElementType> tex_diffusivity_y;
/////////////////////////////////////////////////////////////////////////////////////////
// SUPPLEMENTARY FUNCTIONS
/////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/// \brief performs pointwise summation of two vectors stored in device memory
/// \param d_res - pointer to resulting vector (device memory)
/// \param d_op1 - term #1 (device memory)
/// \param d_op2 - term #2 (device memory)
/// \param len - vector size
///////////////////////////////////////////////////////////////////////////////
__global__ void pointwise_add(float *d_res, const float *d_op1, const float *d_op2, const int len)
{
const int pos = blockIdx.x*blockDim.x + threadIdx.x;
if(pos >= len) return;
d_res[pos] = d_op1[pos] + d_op2[pos];
}
///////////////////////////////////////////////////////////////////////////////
/// \brief wrapper for summation kernel.
/// Computes \b op1 + \b op2 and stores result to \b res
/// \param res array, containing op1 + op2 (device memory)
/// \param op1 term #1 (device memory)
/// \param op2 term #2 (device memory)
/// \param count vector size
///////////////////////////////////////////////////////////////////////////////
static void add(float *res, const float *op1, const float *op2, const int count, cudaStream_t stream)
{
dim3 threads(256);
dim3 blocks(iDivUp(count, threads.x));
pointwise_add<<<blocks, threads, 0, stream>>>(res, op1, op2, count);
}
///////////////////////////////////////////////////////////////////////////////
/// \brief wrapper for summation kernel.
/// Increments \b res by \b rhs
/// \param res initial vector, will be replaced with result (device memory)
/// \param rhs increment (device memory)
/// \param count vector size
///////////////////////////////////////////////////////////////////////////////
static void add(float *res, const float *rhs, const int count, cudaStream_t stream)
{
add(res, res, rhs, count, stream);
}
///////////////////////////////////////////////////////////////////////////////
/// \brief kernel for scaling vector by scalar
/// \param d_res scaled vector (device memory)
/// \param d_src source vector (device memory)
/// \param scale scalar to scale by
/// \param len vector size (number of elements)
///////////////////////////////////////////////////////////////////////////////
__global__ void scaleVector(float *d_res, const float *d_src, float scale, const int len)
{
const int pos = blockIdx.x * blockDim.x + threadIdx.x;
if (pos >= len) return;
d_res[pos] = d_src[pos] * scale;
}
///////////////////////////////////////////////////////////////////////////////
/// \brief scale vector by scalar
///
/// kernel wrapper
/// \param d_res scaled vector (device memory)
/// \param d_src source vector (device memory)
/// \param scale scalar to scale by
/// \param len vector size (number of elements)
/// \param stream CUDA stream
///////////////////////////////////////////////////////////////////////////////
static void ScaleVector(float *d_res, const float *d_src, float scale, const int len, cudaStream_t stream)
{
dim3 threads(256);
dim3 blocks(iDivUp(len, threads.x));
scaleVector<<<blocks, threads, 0, stream>>>(d_res, d_src, scale, len);
}
const int SOR_TILE_WIDTH = 32;
const int SOR_TILE_HEIGHT = 6;
const int PSOR_TILE_WIDTH = 32;
const int PSOR_TILE_HEIGHT = 6;
const int PSOR_PITCH = PSOR_TILE_WIDTH + 4;
const int PSOR_HEIGHT = PSOR_TILE_HEIGHT + 4;
///////////////////////////////////////////////////////////////////////////////
///\brief Utility function. Compute smoothness term diffusivity along x axis
///\param s (out) pointer to memory location for result (diffusivity)
///\param pos (in) position within shared memory array containing \b u
///\param u (in) shared memory array containing \b u
///\param v (in) shared memory array containing \b v
///\param du (in) shared memory array containing \b du
///\param dv (in) shared memory array containing \b dv
///////////////////////////////////////////////////////////////////////////////
__forceinline__ __device__ void diffusivity_along_x(float *s, int pos, const float *u, const float *v, const float *du, const float *dv)
{
//x derivative between pixels (i,j) and (i-1,j)
const int left = pos-1;
float u_x = u[pos] + du[pos] - u[left] - du[left];
float v_x = v[pos] + dv[pos] - v[left] - dv[left];
const int up = pos + PSOR_PITCH;
const int down = pos - PSOR_PITCH;
const int up_left = up - 1;
const int down_left = down-1;
//y derivative averaged over pixels (i,j) and (i-1,j)
float u_y = 0.25f*(u[up] + du[up] + u[up_left] + du[up_left] - u[down] - du[down] - u[down_left] - du[down_left]);
float v_y = 0.25f*(v[up] + dv[up] + v[up_left] + dv[up_left] - v[down] - dv[down] - v[down_left] - dv[down_left]);
*s = 0.5f / sqrtf(u_x*u_x + v_x*v_x + u_y*u_y + v_y*v_y + eps2);
}
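// In the notation of Brox et al. this is the derivative of the robust penalizer,
//     Psi'(s^2) = 1 / (2 * sqrt(s^2 + eps2)),
// evaluated at s^2 = u_x^2 + v_x^2 + u_y^2 + v_y^2, i.e. the smoothness-term
// diffusivity at the midpoint between pixels (i,j) and (i-1,j).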
///////////////////////////////////////////////////////////////////////////////
///\brief Utility function. Compute smoothness term diffusivity along y axis
///\param s (out) pointer to memory location for result (diffusivity)
///\param pos (in) position within shared memory array containing \b u
///\param u (in) shared memory array containing \b u
///\param v (in) shared memory array containing \b v
///\param du (in) shared memory array containing \b du
///\param dv (in) shared memory array containing \b dv
///////////////////////////////////////////////////////////////////////////////
__forceinline__ __device__ void diffusivity_along_y(float *s, int pos, const float *u, const float *v, const float *du, const float *dv)
{
//y derivative between pixels (i,j) and (i,j-1)
const int down = pos-PSOR_PITCH;
float u_y = u[pos] + du[pos] - u[down] - du[down];
float v_y = v[pos] + dv[pos] - v[down] - dv[down];
const int right = pos + 1;
const int left = pos - 1;
const int down_right = down + 1;
const int down_left = down - 1;
//x derivative averaged over pixels (i,j) and (i,j-1)
float u_x = 0.25f*(u[right] + u[down_right] + du[right] + du[down_right] - u[left] - u[down_left] - du[left] - du[down_left]);
float v_x = 0.25f*(v[right] + v[down_right] + dv[right] + dv[down_right] - v[left] - v[down_left] - dv[left] - dv[down_left]);
*s = 0.5f/sqrtf(u_x*u_x + v_x*v_x + u_y*u_y + v_y*v_y + eps2);
}
///////////////////////////////////////////////////////////////////////////////
///\brief Utility function. Load element of 2D global memory to shared memory
///\param smem pointer to shared memory array
///\param is shared memory array column
///\param js shared memory array row
///\param i global memory array column
///\param j global memory array row
///\param w number of columns in global memory array
///\param h number of rows in global memory array
///\param p global memory array pitch in floats
///////////////////////////////////////////////////////////////////////////////
template<int tex_id>
__forceinline__ __device__ void load_array_element(float *smem, int is, int js, int i, int j, int w, int h, int p)
{
//position within shared memory array
const int ijs = js * PSOR_PITCH + is;
//mirror reflection across borders
i = max(i, -i-1);
i = min(i, w-i+w-1);
j = max(j, -j-1);
j = min(j, h-j+h-1);
const int pos = j * p + i;
switch(tex_id){
case 0:
smem[ijs] = tex1Dfetch(tex_u, pos);
break;
case 1:
smem[ijs] = tex1Dfetch(tex_v, pos);
break;
case 2:
smem[ijs] = tex1Dfetch(tex_du, pos);
break;
case 3:
smem[ijs] = tex1Dfetch(tex_dv, pos);
break;
}
}
///////////////////////////////////////////////////////////////////////////////
///\brief Utility function. Load part (tile) of 2D global memory to shared memory
///\param smem pointer to target shared memory array
///\param ig column number within source
///\param jg row number within source
///\param w number of columns in global memory array
///\param h number of rows in global memory array
///\param p global memory array pitch in floats
///////////////////////////////////////////////////////////////////////////////
template<int tex>
__forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, int h, int p)
{
const int i = threadIdx.x + 2;
const int j = threadIdx.y + 2;
load_array_element<tex>(smem, i, j, ig, jg, w, h, p);//load current pixel
__syncthreads();
if(threadIdx.y < 2)
{
//load bottom shadow elements
load_array_element<tex>(smem, i, j-2, ig, jg-2, w, h, p);
if(threadIdx.x < 2)
{
//load bottom right shadow elements
load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j-2, ig+PSOR_TILE_WIDTH, jg-2, w, h, p);
//load middle right shadow elements
load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
}
else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
{
//load bottom left shadow elements
load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j-2, ig-PSOR_TILE_WIDTH, jg-2, w, h, p);
//load middle left shadow elements
load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
}
}
else if(threadIdx.y >= PSOR_TILE_HEIGHT-2)
{
//load upper shadow elements
load_array_element<tex>(smem, i, j+2, ig, jg+2, w, h, p);
if(threadIdx.x < 2)
{
//load upper right shadow elements
load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j+2, ig+PSOR_TILE_WIDTH, jg+2, w, h, p);
//load middle right shadow elements
load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
}
else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
{
//load upper left shadow elements
load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j+2, ig-PSOR_TILE_WIDTH, jg+2, w, h, p);
//load middle left shadow elements
load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
}
}
else
{
//load middle shadow elements
if(threadIdx.x < 2)
{
//load middle right shadow elements
load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
}
else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
{
//load middle left shadow elements
load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
}
}
__syncthreads();
}
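// The shared memory arrays are laid out as PSOR_PITCH x PSOR_HEIGHT =
// (PSOR_TILE_WIDTH+4) x (PSOR_TILE_HEIGHT+4): the tile itself plus a two-pixel
// apron on every side, filled above with values mirrored at the image borders so
// the finite-difference stencils can read the neighbours of every tile pixel.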
///////////////////////////////////////////////////////////////////////////////
/// \brief computes matrix of linearised system for \c du, \c dv
/// Computed values reside in GPU memory. \n
/// Matrix computation is divided into two steps. This kernel performs the first step:\n
/// - compute smoothness term diffusivity between pixels - psi dash smooth
/// - compute robustness factor in the data term - psi dash data
/// \param diffusivity_x (in/out) diffusivity between pixels along x axis in smoothness term
/// \param diffusivity_y (in/out) diffusivity between pixels along y axis in smoothness term
/// \param denominator_u (in/out) precomputed part of expression for new du value in SOR iteration
/// \param denominator_v (in/out) precomputed part of expression for new dv value in SOR iteration
/// \param numerator_dudv (in/out) precomputed part of expression for new du and dv value in SOR iteration
/// \param numerator_u (in/out) precomputed part of expression for new du value in SOR iteration
/// \param numerator_v (in/out) precomputed part of expression for new dv value in SOR iteration
/// \param w (in) frame width
/// \param h (in) frame height
/// \param s (in) pitch in floats
/// \param alpha (in) alpha in Brox model (flow smoothness)
/// \param gamma (in) gamma in Brox model (edge importance)
///////////////////////////////////////////////////////////////////////////////
__global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity_y,
float *denominator_u, float *denominator_v,
float *numerator_dudv,
float *numerator_u, float *numerator_v,
int w, int h, int s,
float alpha, float gamma)
{
__shared__ float u[PSOR_PITCH * PSOR_HEIGHT];
__shared__ float v[PSOR_PITCH * PSOR_HEIGHT];
__shared__ float du[PSOR_PITCH * PSOR_HEIGHT];
__shared__ float dv[PSOR_PITCH * PSOR_HEIGHT];
//position within tile
const int i = threadIdx.x;
const int j = threadIdx.y;
//position within smem arrays
const int ijs = (j+2) * PSOR_PITCH + i + 2;
//position within global memory
const int ig = blockIdx.x * blockDim.x + threadIdx.x;
const int jg = blockIdx.y * blockDim.y + threadIdx.y;
const int ijg = jg * s + ig;
//position within texture
float x = (float)ig + 0.5f;
float y = (float)jg + 0.5f;
//load u and v to smem
load_array<0>(u, ig, jg, w, h, s);
load_array<1>(v, ig, jg, w, h, s);
load_array<2>(du, ig, jg, w, h, s);
load_array<3>(dv, ig, jg, w, h, s);
//warped position
float wx = (x + u[ijs])/(float)w;
float wy = (y + v[ijs])/(float)h;
x /= (float)w;
y /= (float)h;
//compute image derivatives
const float Iz = tex2D(tex_I1, wx, wy) - tex2D(tex_I0, x, y);
const float Ix = tex2D(tex_Ix, wx, wy);
const float Ixz = Ix - tex2D(tex_Ix0, x, y);
const float Ixy = tex2D(tex_Ixy, wx, wy);
const float Ixx = tex2D(tex_Ixx, wx, wy);
const float Iy = tex2D(tex_Iy, wx, wy);
const float Iyz = Iy - tex2D(tex_Iy0, x, y);
const float Iyy = tex2D(tex_Iyy, wx, wy);
//compute data term
float q0, q1, q2;
q0 = Iz + Ix * du[ijs] + Iy * dv[ijs];
q1 = Ixz + Ixx * du[ijs] + Ixy * dv[ijs];
q2 = Iyz + Ixy * du[ijs] + Iyy * dv[ijs];
float data_term = 0.5f * rsqrtf(q0*q0 + gamma*(q1*q1 + q2*q2) + eps2);
//scale data term by 1/alpha
data_term /= alpha;
//compute smoothness term (diffusivity)
float sx, sy;
if(ig >= w || jg >= h) return;
diffusivity_along_x(&sx, ijs, u, v, du, dv);
diffusivity_along_y(&sy, ijs, u, v, du, dv);
if(ig == 0) sx = 0.0f;
if(jg == 0) sy = 0.0f;
numerator_dudv[ijg] = data_term * (Ix*Iy + gamma * Ixy*(Ixx + Iyy));
numerator_u[ijg] = data_term * (Ix*Iz + gamma * (Ixx*Ixz + Ixy*Iyz));
numerator_v[ijg] = data_term * (Iy*Iz + gamma * (Iyy*Iyz + Ixy*Ixz));
denominator_u[ijg] = data_term * (Ix*Ix + gamma * (Ixy*Ixy + Ixx*Ixx));
denominator_v[ijg] = data_term * (Iy*Iy + gamma * (Ixy*Ixy + Iyy*Iyy));
diffusivity_x[ijg] = sx;
diffusivity_y[ijg] = sy;
}
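// Per pixel the linearised data term yields a symmetric 2x2 system in (du, dv),
//
//   [ denominator_u   numerator_dudv ] [du]   [ -numerator_u ]
//   [ numerator_dudv  denominator_v  ] [dv] = [ -numerator_v ]
//
// with all entries already scaled by psi'_data/alpha. The smoothness term is added
// separately: prepare_sor_stage_2 puts the neighbouring diffusivities on the diagonal,
// and sor_pass applies the couplings to the neighbouring pixels on the fly.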
///////////////////////////////////////////////////////////////////////////////
///\brief completes the matrix of the linearised system for \c du, \c dv (second step)
///\param inv_denominator_u (in/out) data-term diagonal for du; replaced by the reciprocal of the full diagonal
///\param inv_denominator_v (in/out) data-term diagonal for dv; replaced by the reciprocal of the full diagonal
///\param w (in) frame width
///\param h (in) frame height
///\param s (in) pitch in floats
///////////////////////////////////////////////////////////////////////////////
__global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denominator_v,
int w, int h, int s)
{
__shared__ float sx[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)];
__shared__ float sy[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)];
//position within tile
const int i = threadIdx.x;
const int j = threadIdx.y;
//position within smem arrays
const int ijs = j*(PSOR_TILE_WIDTH+1) + i;
//position within global memory
const int ig = blockIdx.x * blockDim.x + threadIdx.x;
const int jg = blockIdx.y * blockDim.y + threadIdx.y;
const int ijg = jg*s + ig;
int inside = ig < w && jg < h;
float denom_u;
float denom_v;
if(inside)
{
denom_u = inv_denominator_u[ijg];
denom_v = inv_denominator_v[ijg];
}
if(inside)
{
sx[ijs] = tex1Dfetch(tex_diffusivity_x, ijg);
sy[ijs] = tex1Dfetch(tex_diffusivity_y, ijg);
}
else
{
sx[ijs] = 0.0f;
sy[ijs] = 0.0f;
}
int up = ijs+PSOR_TILE_WIDTH+1;
if(j == PSOR_TILE_HEIGHT-1)
{
if(jg < h-1 && inside)
{
sy[up] = tex1Dfetch(tex_diffusivity_y, ijg + s);
}
else
{
sy[up] = 0.0f;
}
}
int right = ijs + 1;
if(threadIdx.x == PSOR_TILE_WIDTH-1)
{
if(ig < w-1 && inside)
{
sx[right] = tex1Dfetch(tex_diffusivity_x, ijg + 1);
}
else
{
sx[right] = 0.0f;
}
}
__syncthreads();
float diffusivity_sum;
diffusivity_sum = sx[ijs] + sx[ijs+1] + sy[ijs] + sy[ijs+PSOR_TILE_WIDTH+1];
if(inside)
{
denom_u += diffusivity_sum;
denom_v += diffusivity_sum;
inv_denominator_u[ijg] = 1.0f/denom_u;
inv_denominator_v[ijg] = 1.0f/denom_v;
}
}
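// After this kernel inv_denominator_u/v hold 1 / (data-term diagonal + sum of the four
// neighbouring diffusivities), i.e. the reciprocal diagonal of the full system solved by sor_pass.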
/////////////////////////////////////////////////////////////////////////////////////////
// Red-Black SOR
/////////////////////////////////////////////////////////////////////////////////////////
template<int isBlack> __global__ void sor_pass(float *new_du,
float *new_dv,
const float *g_inv_denominator_u,
const float *g_inv_denominator_v,
const float *g_numerator_u,
const float *g_numerator_v,
const float *g_numerator_dudv,
float omega,
int width,
int height,
int stride)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if(i >= width || j >= height)
return;
const int pos = j * stride + i;
const int pos_r = pos + 1;
const int pos_u = pos + stride;
const int pos_d = pos - stride;
const int pos_l = pos - 1;
//load smooth term
float s_up, s_left, s_right, s_down;
s_left = tex1Dfetch(tex_diffusivity_x, pos);
s_down = tex1Dfetch(tex_diffusivity_y, pos);
if(i < width-1)
s_right = tex1Dfetch(tex_diffusivity_x, pos_r);
else
s_right = 0.0f; //Neumann BC
if(j < height-1)
s_up = tex1Dfetch(tex_diffusivity_y, pos_u);
else
s_up = 0.0f; //Neumann BC
//load u, v and du, dv
float u_up, u_left, u_right, u_down, u;
float v_up, v_left, v_right, v_down, v;
float du_up, du_left, du_right, du_down, du;
float dv_up, dv_left, dv_right, dv_down, dv;
u_left = tex1Dfetch(tex_u, pos_l);
u_right = tex1Dfetch(tex_u, pos_r);
u_down = tex1Dfetch(tex_u, pos_d);
u_up = tex1Dfetch(tex_u, pos_u);
u = tex1Dfetch(tex_u, pos);
v_left = tex1Dfetch(tex_v, pos_l);
v_right = tex1Dfetch(tex_v, pos_r);
v_down = tex1Dfetch(tex_v, pos_d);
v = tex1Dfetch(tex_v, pos);
v_up = tex1Dfetch(tex_v, pos_u);
du = tex1Dfetch(tex_du, pos);
du_left = tex1Dfetch(tex_du, pos_l);
du_right = tex1Dfetch(tex_du, pos_r);
du_down = tex1Dfetch(tex_du, pos_d);
du_up = tex1Dfetch(tex_du, pos_u);
dv = tex1Dfetch(tex_dv, pos);
dv_left = tex1Dfetch(tex_dv, pos_l);
dv_right = tex1Dfetch(tex_dv, pos_r);
dv_down = tex1Dfetch(tex_dv, pos_d);
dv_up = tex1Dfetch(tex_dv, pos_u);
float numerator_dudv = g_numerator_dudv[pos];
if((i+j)%2 == isBlack)
{
// update du
float numerator_u = (s_left*(u_left + du_left) + s_up*(u_up + du_up) + s_right*(u_right + du_right) + s_down*(u_down + du_down) -
u * (s_left + s_right + s_up + s_down) - g_numerator_u[pos] - numerator_dudv*dv);
du = (1.0f - omega) * du + omega * g_inv_denominator_u[pos] * numerator_u;
// update dv
float numerator_v = (s_left*(v_left + dv_left) + s_up*(v_up + dv_up) + s_right*(v_right + dv_right) + s_down*(v_down + dv_down) -
v * (s_left + s_right + s_up + s_down) - g_numerator_v[pos] - numerator_dudv*du);
dv = (1.0f - omega) * dv + omega * g_inv_denominator_v[pos] * numerator_v;
}
new_du[pos] = du;
new_dv[pos] = dv;
}
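// One red-black half-sweep: for pixels of the active colour the relaxed update is
//
//   du <- (1-omega)*du + omega * inv_den_u * ( sum_n s_n*(u_n + du_n) - u*sum_n s_n
//                                              - numerator_u - numerator_dudv*dv )
//
// and analogously for dv (reusing the du just computed), where the sum runs over the
// four neighbours, s_n are the diffusivities and omega ~ 1.99 is the relaxation factor.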
///////////////////////////////////////////////////////////////////////////////
// utility functions
///////////////////////////////////////////////////////////////////////////////
void initTexture1D(texture<float, 1, cudaReadModeElementType> &tex)
{
tex.addressMode[0] = cudaAddressModeClamp;
tex.filterMode = cudaFilterModePoint;
tex.normalized = false;
}
void initTexture2D(texture<float, 2, cudaReadModeElementType> &tex)
{
tex.addressMode[0] = cudaAddressModeMirror;
tex.addressMode[1] = cudaAddressModeMirror;
tex.filterMode = cudaFilterModeLinear;
tex.normalized = true;
}
void InitTextures()
{
initTexture2D(tex_I0);
initTexture2D(tex_I1);
initTexture2D(tex_fine); // for downsampling
initTexture2D(tex_coarse); // for prolongation
initTexture2D(tex_Ix);
initTexture2D(tex_Ixx);
initTexture2D(tex_Ix0);
initTexture2D(tex_Iy);
initTexture2D(tex_Iyy);
initTexture2D(tex_Iy0);
initTexture2D(tex_Ixy);
initTexture1D(tex_u);
initTexture1D(tex_v);
initTexture1D(tex_du);
initTexture1D(tex_dv);
initTexture1D(tex_diffusivity_x);
initTexture1D(tex_diffusivity_y);
initTexture1D(tex_inv_denominator_u);
initTexture1D(tex_inv_denominator_v);
initTexture1D(tex_numerator_dudv);
initTexture1D(tex_numerator_u);
initTexture1D(tex_numerator_v);
}
/////////////////////////////////////////////////////////////////////////////////////////
// MAIN FUNCTION
/////////////////////////////////////////////////////////////////////////////////////////
NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
INCVMemAllocator &gpu_mem_allocator,
const NCVMatrix<Ncv32f> &frame0,
const NCVMatrix<Ncv32f> &frame1,
NCVMatrix<Ncv32f> &uOut,
NCVMatrix<Ncv32f> &vOut,
cudaStream_t stream)
{
ncvAssertPrintReturn(desc.alpha > 0.0f , "Invalid alpha" , NCV_INCONSISTENT_INPUT);
ncvAssertPrintReturn(desc.gamma >= 0.0f , "Invalid gamma" , NCV_INCONSISTENT_INPUT);
ncvAssertPrintReturn(desc.number_of_inner_iterations > 0 , "Invalid number of inner iterations" , NCV_INCONSISTENT_INPUT);
ncvAssertPrintReturn(desc.number_of_outer_iterations > 0 , "Invalid number of outer iterations" , NCV_INCONSISTENT_INPUT);
ncvAssertPrintReturn(desc.number_of_solver_iterations > 0, "Invalid number of solver iterations", NCV_INCONSISTENT_INPUT);
const Ncv32u kSourceWidth = frame0.width();
const Ncv32u kSourceHeight = frame0.height();
ncvAssertPrintReturn(frame1.width() == kSourceWidth && frame1.height() == kSourceHeight, "Frame dims do not match", NCV_INCONSISTENT_INPUT);
ncvAssertReturn(uOut.width() == kSourceWidth && vOut.width() == kSourceWidth &&
uOut.height() == kSourceHeight && vOut.height() == kSourceHeight, NCV_INCONSISTENT_INPUT);
ncvAssertReturn(gpu_mem_allocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);
bool kSkipProcessing = gpu_mem_allocator.isCounting();
cudaDeviceProp device_props;
int cuda_device;
ncvAssertCUDAReturn(cudaGetDevice(&cuda_device), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaGetDeviceProperties(&device_props, cuda_device), NCV_CUDA_ERROR);
Ncv32u alignmentValue = gpu_mem_allocator.alignment ();
const Ncv32u kStrideAlignmentFloat = alignmentValue / sizeof(float);
const Ncv32u kSourcePitch = alignUp(kSourceWidth, kStrideAlignmentFloat) * sizeof(float);
const Ncv32f scale_factor = desc.scale_factor;
const Ncv32f alpha = desc.alpha;
const Ncv32f gamma = desc.gamma;
const Ncv32u kSizeInPixelsAligned = alignUp(kSourceWidth, kStrideAlignmentFloat)*kSourceHeight;
#if defined SAFE_VECTOR_DECL
#undef SAFE_VECTOR_DECL
#endif
#define SAFE_VECTOR_DECL(name, allocator, size) \
FloatVector name((allocator), (size)); \
ncvAssertReturn(name.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
// matrix elements
SAFE_VECTOR_DECL(diffusivity_x, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(diffusivity_y, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(denom_u, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(denom_v, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(num_dudv, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(num_u, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(num_v, gpu_mem_allocator, kSizeInPixelsAligned);
// flow components
SAFE_VECTOR_DECL(u, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(v, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(u_new, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(v_new, gpu_mem_allocator, kSizeInPixelsAligned);
// flow increments
SAFE_VECTOR_DECL(du, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(dv, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(du_new, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(dv_new, gpu_mem_allocator, kSizeInPixelsAligned);
// temporary storage
SAFE_VECTOR_DECL(device_buffer, gpu_mem_allocator,
alignUp(kSourceWidth, kStrideAlignmentFloat)
* alignUp(kSourceHeight, kStrideAlignmentFloat));
// image derivatives
SAFE_VECTOR_DECL(Ix, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(Ixx, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(Ix0, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(Iy, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(Iyy, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(Iy0, gpu_mem_allocator, kSizeInPixelsAligned);
SAFE_VECTOR_DECL(Ixy, gpu_mem_allocator, kSizeInPixelsAligned);
// spatial derivative filter size
const int kDFilterSize = 5;
const float derivativeFilterHost[kDFilterSize] = {1.0f, -8.0f, 0.0f, 8.0f, -1.0f};
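// (five-point central-difference stencil; combined with the 1.0f/12.0f multiplier passed
// to the filter calls below it approximates the first derivative along rows/columns)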
SAFE_VECTOR_DECL(derivativeFilter, gpu_mem_allocator, kDFilterSize);
ncvAssertCUDAReturn(
cudaMemcpy(derivativeFilter.ptr(),
derivativeFilterHost,
sizeof(float) * kDFilterSize,
cudaMemcpyHostToDevice),
NCV_CUDA_ERROR);
InitTextures();
//prepare image pyramid
std::vector< shared_ptr<FloatVector> > img0_pyramid;
std::vector< shared_ptr<FloatVector> > img1_pyramid;
std::vector<Ncv32u> w_pyramid;
std::vector<Ncv32u> h_pyramid;
cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<float>();
float scale = 1.0f;
//cuda arrays for frames
shared_ptr<FloatVector> I0(new FloatVector(gpu_mem_allocator, kSizeInPixelsAligned));
ncvAssertReturn(I0->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
shared_ptr<FloatVector> I1(new FloatVector(gpu_mem_allocator, kSizeInPixelsAligned));
ncvAssertReturn(I1->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
if (!kSkipProcessing)
{
//copy frame data to device
size_t dst_width_in_bytes = alignUp(kSourceWidth, kStrideAlignmentFloat) * sizeof(float);
size_t src_width_in_bytes = kSourceWidth * sizeof(float);
size_t src_pitch_in_bytes = frame0.pitch();
ncvAssertCUDAReturn( cudaMemcpy2DAsync(I0->ptr(), dst_width_in_bytes, frame0.ptr(),
src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
ncvAssertCUDAReturn( cudaMemcpy2DAsync(I1->ptr(), dst_width_in_bytes, frame1.ptr(),
src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
}
//prepare pyramid
img0_pyramid.push_back(I0);
img1_pyramid.push_back(I1);
w_pyramid.push_back(kSourceWidth);
h_pyramid.push_back(kSourceHeight);
scale *= scale_factor;
Ncv32u prev_level_width = kSourceWidth;
Ncv32u prev_level_height = kSourceHeight;
while((prev_level_width > 15) && (prev_level_height > 15) && (static_cast<Ncv32u>(img0_pyramid.size()) < desc.number_of_outer_iterations))
{
//current resolution
Ncv32u level_width = static_cast<Ncv32u>(ceilf(kSourceWidth * scale));
Ncv32u level_height = static_cast<Ncv32u>(ceilf(kSourceHeight * scale));
Ncv32u level_width_aligned = alignUp(level_width, kStrideAlignmentFloat);
Ncv32u buffer_size = alignUp(level_width, kStrideAlignmentFloat) * level_height; // buffer size in floats
Ncv32u prev_level_pitch = alignUp(prev_level_width, kStrideAlignmentFloat) * sizeof(float);
shared_ptr<FloatVector> level_frame0(new FloatVector(gpu_mem_allocator, buffer_size));
ncvAssertReturn(level_frame0->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
shared_ptr<FloatVector> level_frame1(new FloatVector(gpu_mem_allocator, buffer_size));
ncvAssertReturn(level_frame1->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
if (!kSkipProcessing)
{
NcvSize32u srcSize (prev_level_width, prev_level_height);
NcvSize32u dstSize (level_width, level_height);
NcvRect32u srcROI (0, 0, prev_level_width, prev_level_height);
NcvRect32u dstROI (0, 0, level_width, level_height);
// frame 0
nppiStResize_32f_C1R (I0->ptr(), srcSize, prev_level_pitch, srcROI,
level_frame0->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample);
// frame 1
nppiStResize_32f_C1R (I1->ptr(), srcSize, prev_level_pitch, srcROI,
level_frame1->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample);
}
//store pointers
img0_pyramid.push_back(level_frame0);
img1_pyramid.push_back(level_frame1);
w_pyramid.push_back(level_width);
h_pyramid.push_back(level_height);
scale *= scale_factor;
prev_level_width = level_width;
prev_level_height = level_height;
I0 = level_frame0;
I1 = level_frame1;
}
if (!kSkipProcessing)
{
//initial flow values are 0
ncvAssertCUDAReturn(cudaMemsetAsync(u.ptr(), 0, kSizeInPixelsAligned * sizeof(float), stream), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaMemsetAsync(v.ptr(), 0, kSizeInPixelsAligned * sizeof(float), stream), NCV_CUDA_ERROR);
//select images with lowest resolution
size_t pitch = alignUp(w_pyramid.back(), kStrideAlignmentFloat) * sizeof(float);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, img0_pyramid.back()->ptr(), channel_desc, w_pyramid.back(), h_pyramid.back(), pitch), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, img1_pyramid.back()->ptr(), channel_desc, w_pyramid.back(), h_pyramid.back(), pitch), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
}
FloatVector* ptrU = &u;
FloatVector* ptrV = &v;
FloatVector* ptrUNew = &u_new;
FloatVector* ptrVNew = &v_new;
std::vector< shared_ptr<FloatVector> >::const_reverse_iterator img0Iter = img0_pyramid.rbegin();
std::vector< shared_ptr<FloatVector> >::const_reverse_iterator img1Iter = img1_pyramid.rbegin();
//outer loop
//warping fixed point iteration
while(!w_pyramid.empty())
{
//current grid dimensions
const Ncv32u kLevelWidth = w_pyramid.back();
const Ncv32u kLevelHeight = h_pyramid.back();
const Ncv32u kLevelStride = alignUp(kLevelWidth, kStrideAlignmentFloat);
//size of current image in bytes
const int kLevelSizeInBytes = kLevelStride * kLevelHeight * sizeof(float);
//number of points at current resolution
const int kLevelSizeInPixels = kLevelStride * kLevelHeight;
if (!kSkipProcessing)
{
//initial guess for du and dv
ncvAssertCUDAReturn(cudaMemsetAsync(du.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaMemsetAsync(dv.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);
}
//texture format descriptor
cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<float>();
I0 = *img0Iter;
I1 = *img1Iter;
++img0Iter;
++img1Iter;
if (!kSkipProcessing)
{
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, I0->ptr(), channel_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, I1->ptr(), channel_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);
}
//compute derivatives
dim3 dBlocks(iDivUp(kLevelWidth, 32), iDivUp(kLevelHeight, 6));
dim3 dThreads(32, 6);
const int kPitchTex = kLevelStride * sizeof(float);
if (!kSkipProcessing)
{
NcvSize32u srcSize(kLevelWidth, kLevelHeight);
Ncv32u nSrcStep = kLevelStride * sizeof(float);
NcvRect32u oROI(0, 0, kLevelWidth, kLevelHeight);
// Ix0
nppiStFilterRowBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Ix0.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f);
// Iy0
nppiStFilterColumnBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Iy0.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f);
// Ix
nppiStFilterRowBorder_32f_C1R (I1->ptr(), srcSize, nSrcStep, Ix.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f);
// Iy
nppiStFilterColumnBorder_32f_C1R (I1->ptr(), srcSize, nSrcStep, Iy.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f);
// Ixx
nppiStFilterRowBorder_32f_C1R (Ix.ptr(), srcSize, nSrcStep, Ixx.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f);
// Iyy
nppiStFilterColumnBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Iyy.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f);
// Ixy
nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f);
}
if (!kSkipProcessing)
{
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix, Ix.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixx, Ixx.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix0, Ix0.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy, Iy.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iyy, Iyy.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy0, Iy0.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixy, Ixy.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
// flow
ncvAssertCUDAReturn(cudaBindTexture(0, tex_u, ptrU->ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_v, ptrV->ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
// flow increments
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
}
dim3 psor_blocks(iDivUp(kLevelWidth, PSOR_TILE_WIDTH), iDivUp(kLevelHeight, PSOR_TILE_HEIGHT));
dim3 psor_threads(PSOR_TILE_WIDTH, PSOR_TILE_HEIGHT);
dim3 sor_blocks(iDivUp(kLevelWidth, SOR_TILE_WIDTH), iDivUp(kLevelHeight, SOR_TILE_HEIGHT));
dim3 sor_threads(SOR_TILE_WIDTH, SOR_TILE_HEIGHT);
// inner loop
// lagged nonlinearity fixed point iteration
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
for (Ncv32u current_inner_iteration = 0; current_inner_iteration < desc.number_of_inner_iterations; ++current_inner_iteration)
{
//compute coefficients
if (!kSkipProcessing)
{
prepare_sor_stage_1_tex<<<psor_blocks, psor_threads, 0, stream>>>
(diffusivity_x.ptr(),
diffusivity_y.ptr(),
denom_u.ptr(),
denom_v.ptr(),
num_dudv.ptr(),
num_u.ptr(),
num_v.ptr(),
kLevelWidth,
kLevelHeight,
kLevelStride,
alpha,
gamma);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
prepare_sor_stage_2<<<psor_blocks, psor_threads, 0, stream>>>(denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride);
// linear system coefficients
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_u, denom_u.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_v, denom_v.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
}
//solve linear system
for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration)
{
float omega = 1.99f;
if (!kSkipProcessing)
{
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
sor_pass<0><<<sor_blocks, sor_threads, 0, stream>>>
(du_new.ptr(),
dv_new.ptr(),
denom_u.ptr(),
denom_v.ptr(),
num_u.ptr(),
num_v.ptr(),
num_dudv.ptr(),
omega,
kLevelWidth,
kLevelHeight,
kLevelStride);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du_new.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv_new.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
sor_pass<1><<<sor_blocks, sor_threads, 0, stream>>>
(du.ptr(),
dv.ptr(),
denom_u.ptr(),
denom_v.ptr(),
num_u.ptr(),
num_v.ptr(),
num_dudv.ptr(),
omega,
kLevelWidth,
kLevelHeight,
kLevelStride);
}
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
}//end of solver loop
}// end of inner loop
//update u and v
if (!kSkipProcessing)
{
add(ptrU->ptr(), du.ptr(), kLevelSizeInPixels, stream);
add(ptrV->ptr(), dv.ptr(), kLevelSizeInPixels, stream);
}
//prolongate using texture
w_pyramid.pop_back();
h_pyramid.pop_back();
if (!w_pyramid.empty())
{
//compute new image size
Ncv32u nw = w_pyramid.back();
Ncv32u nh = h_pyramid.back();
Ncv32u ns = alignUp(nw, kStrideAlignmentFloat);
dim3 p_blocks(iDivUp(nw, 32), iDivUp(nh, 8));
dim3 p_threads(32, 8);
if (!kSkipProcessing)
{
NcvSize32u srcSize (kLevelWidth, kLevelHeight);
NcvSize32u dstSize (nw, nh);
NcvRect32u srcROI (0, 0, kLevelWidth, kLevelHeight);
NcvRect32u dstROI (0, 0, nw, nh);
nppiStResize_32f_C1R (ptrU->ptr(), srcSize, kLevelStride * sizeof (float), srcROI,
ptrUNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic);
ScaleVector(ptrUNew->ptr(), ptrUNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
nppiStResize_32f_C1R (ptrV->ptr(), srcSize, kLevelStride * sizeof (float), srcROI,
ptrVNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic);
ScaleVector(ptrVNew->ptr(), ptrVNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
}
std::swap<FloatVector*>(ptrU, ptrUNew);
std::swap<FloatVector*>(ptrV, ptrVNew);
}
scale /= scale_factor;
}
// end of warping iterations
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
ncvAssertCUDAReturn( cudaMemcpy2DAsync
(uOut.ptr(), uOut.pitch(), ptrU->ptr(),
kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
ncvAssertCUDAReturn( cudaMemcpy2DAsync
(vOut.ptr(), vOut.pitch(), ptrV->ptr(),
kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
return NCV_SUCCESS;
}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
////////////////////////////////////////////////////////////////////////////////
//
// NVIDIA CUDA implementation of Brox et al Optical Flow algorithm
//
// Algorithm is explained in the original paper:
// T. Brox, A. Bruhn, N. Papenberg, J. Weickert:
// High accuracy optical flow estimation based on a theory for warping.
// ECCV 2004.
//
// Implementation by Mikhail Smirnov
// email: msmirnov@nvidia.com, devsupport@nvidia.com
//
// Credits for help with the code to:
// Alexey Mendelenko, Anton Obukhov, and Alexander Kharlamov.
//
////////////////////////////////////////////////////////////////////////////////
#ifndef _ncv_optical_flow_h_
#define _ncv_optical_flow_h_
#include "NCV.hpp"
/// \brief Model and solver parameters
struct NCVBroxOpticalFlowDescriptor
{
/// flow smoothness
Ncv32f alpha;
/// gradient constancy importance
Ncv32f gamma;
/// pyramid scale factor
Ncv32f scale_factor;
/// number of lagged non-linearity iterations (inner loop)
Ncv32u number_of_inner_iterations;
/// number of warping iterations (number of pyramid levels)
Ncv32u number_of_outer_iterations;
/// number of linear system solver iterations
Ncv32u number_of_solver_iterations;
};
/////////////////////////////////////////////////////////////////////////////////////////
/// \brief Compute optical flow
///
/// Based on the method by Brox et al. [2004]
/// \param [in] desc model and solver parameters
/// \param [in] gpu_mem_allocator GPU memory allocator
/// \param [in] frame0 source frame
/// \param [in] frame1 frame to track
/// \param [out] u flow horizontal component (along \b x axis)
/// \param [out] v flow vertical component (along \b y axis)
/// \return computation status
/////////////////////////////////////////////////////////////////////////////////////////
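/// \par Usage sketch (illustrative only; the parameter values below are not prescribed by the library):
/// \code
/// NCVBroxOpticalFlowDescriptor desc;
/// desc.alpha = 0.197f;                     // flow smoothness weight
/// desc.gamma = 50.0f;                      // gradient constancy weight
/// desc.scale_factor = 0.8f;                // pyramid downscaling factor
/// desc.number_of_inner_iterations  = 10;   // lagged non-linearity iterations
/// desc.number_of_outer_iterations  = 77;   // warping iterations (caps pyramid depth)
/// desc.number_of_solver_iterations = 10;   // SOR iterations per linearisation
/// // frame0, frame1, u, v are device NCVMatrix<Ncv32f> objects of equal size and
/// // alloc is any initialized INCVMemAllocator; 0 selects the default CUDA stream.
/// NCVStatus status = NCVBroxOpticalFlow(desc, alloc, frame0, frame1, u, v, 0);
/// \endcode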
NCV_EXPORTS
NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
INCVMemAllocator &gpu_mem_allocator,
const NCVMatrix<Ncv32f> &frame0,
const NCVMatrix<Ncv32f> &frame1,
NCVMatrix<Ncv32f> &u,
NCVMatrix<Ncv32f> &v,
cudaStream_t stream);
#endif
@@ -1610,3 +1610,952 @@ NCVStatus nppsStCompact_32f_host(Ncv32f *h_src, Ncv32u srcLen,
{
return nppsStCompact_32u_host((Ncv32u *)h_src, srcLen, (Ncv32u *)h_dst, dstLen, *(Ncv32u *)&elemRemove);
}
//==============================================================================
//
// Filter.cu
//
//==============================================================================
texture <float, 1, cudaReadModeElementType> texSrc;
texture <float, 1, cudaReadModeElementType> texKernel;
__forceinline__ __device__ float getValueMirrorRow(const int rowOffset,
int i,
int w)
{
if (i < 0) i = 1 - i;
if (i >= w) i = w + w - i - 1;
return tex1Dfetch (texSrc, rowOffset + i);
}
__forceinline__ __device__ float getValueMirrorColumn(const int offset,
const int rowStep,
int j,
int h)
{
if (j < 0) j = 1 - j;
if (j >= h) j = h + h - j - 1;
return tex1Dfetch (texSrc, offset + j * rowStep);
}
__global__ void FilterRowBorderMirror_32f_C1R(Ncv32u srcStep,
Ncv32f *pDst,
NcvSize32u dstSize,
Ncv32u dstStep,
NcvRect32u roi,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier)
{
// position within ROI
const int ix = blockDim.x * blockIdx.x + threadIdx.x;
const int iy = blockDim.y * blockIdx.y + threadIdx.y;
if (ix >= roi.width || iy >= roi.height)
{
return;
}
const int p = nKernelSize - nAnchor - 1;
const int j = roi.y + iy;
const int rowOffset = j * srcStep + roi.x;
float sum = 0.0f;
for (int m = 0; m < nKernelSize; ++m)
{
sum += getValueMirrorRow (rowOffset, ix + m - p, roi.width)
* tex1Dfetch (texKernel, m);
}
pDst[iy * dstStep + ix] = sum * multiplier;
}
__global__ void FilterColumnBorderMirror_32f_C1R(Ncv32u srcStep,
Ncv32f *pDst,
NcvSize32u dstSize,
Ncv32u dstStep,
NcvRect32u roi,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier)
{
const int ix = blockDim.x * blockIdx.x + threadIdx.x;
const int iy = blockDim.y * blockIdx.y + threadIdx.y;
if (ix >= roi.width || iy >= roi.height)
{
return;
}
const int p = nKernelSize - nAnchor - 1;
const int i = roi.x + ix;
const int offset = i + roi.y * srcStep;
float sum = 0.0f;
for (int m = 0; m < nKernelSize; ++m)
{
sum += getValueMirrorColumn (offset, srcStep, iy + m - p, roi.height)
* tex1Dfetch (texKernel, m);
}
pDst[ix + iy * dstStep] = sum * multiplier;
}
NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
Ncv32f *pDst,
NcvSize32u dstSize,
Ncv32u nDstStep,
NcvRect32u oROI,
NppStBorderType borderType,
const Ncv32f *pKernel,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier)
{
ncvAssertReturn (pSrc != NULL &&
pDst != NULL &&
pKernel != NULL, NCV_NULL_PTR);
ncvAssertReturn (oROI.width > 0 && oROI.height > 0, NPPST_INVALID_ROI);
ncvAssertReturn (srcSize.width * sizeof (Ncv32f) <= nSrcStep &&
dstSize.width * sizeof (Ncv32f) <= nDstStep &&
oROI.width * sizeof (Ncv32f) <= nSrcStep &&
oROI.width * sizeof (Ncv32f) <= nDstStep &&
nSrcStep % sizeof (Ncv32f) == 0 &&
nDstStep % sizeof (Ncv32f) == 0, NPPST_INVALID_STEP);
Ncv32u srcStep = nSrcStep / sizeof (Ncv32f);
Ncv32u dstStep = nDstStep / sizeof (Ncv32f);
// adjust ROI size to be within source image
if (oROI.x + oROI.width > srcSize.width)
{
oROI.width = srcSize.width - oROI.x;
}
if (oROI.y + oROI.height > srcSize.height)
{
oROI.height = srcSize.height - oROI.y;
}
cudaChannelFormatDesc floatChannel = cudaCreateChannelDesc <float> ();
texSrc.normalized = false;
texKernel.normalized = false;
cudaBindTexture (0, texSrc, pSrc, floatChannel, srcSize.height * nSrcStep);
cudaBindTexture (0, texKernel, pKernel, floatChannel, nKernelSize * sizeof (Ncv32f));
dim3 ctaSize (32, 6);
dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x,
(oROI.height + ctaSize.y - 1) / ctaSize.y);
switch (borderType)
{
case nppStBorderNone:
return NPPST_ERROR;
case nppStBorderClamp:
return NPPST_ERROR;
case nppStBorderWrap:
return NPPST_ERROR;
case nppStBorderMirror:
FilterRowBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
(srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
break;
default:
return NPPST_ERROR;
}
return NPPST_SUCCESS;
}
NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
Ncv32f *pDst,
NcvSize32u dstSize,
Ncv32u nDstStep,
NcvRect32u oROI,
NppStBorderType borderType,
const Ncv32f *pKernel,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier)
{
ncvAssertReturn (pSrc != NULL &&
pDst != NULL &&
pKernel != NULL, NCV_NULL_PTR);
ncvAssertReturn (oROI.width > 0 && oROI.height > 0, NPPST_INVALID_ROI);
ncvAssertReturn (srcSize.width * sizeof (Ncv32f) <= nSrcStep &&
dstSize.width * sizeof (Ncv32f) <= nDstStep &&
oROI.width * sizeof (Ncv32f) <= nSrcStep &&
oROI.width * sizeof (Ncv32f) <= nDstStep &&
nSrcStep % sizeof (Ncv32f) == 0 &&
nDstStep % sizeof (Ncv32f) == 0, NPPST_INVALID_STEP);
Ncv32u srcStep = nSrcStep / sizeof (Ncv32f);
Ncv32u dstStep = nDstStep / sizeof (Ncv32f);
// adjust ROI size to be within source image
if (oROI.x + oROI.width > srcSize.width)
{
oROI.width = srcSize.width - oROI.x;
}
if (oROI.y + oROI.height > srcSize.height)
{
oROI.height = srcSize.height - oROI.y;
}
cudaChannelFormatDesc floatChannel = cudaCreateChannelDesc <float> ();
texSrc.normalized = false;
texKernel.normalized = false;
cudaBindTexture (0, texSrc, pSrc, floatChannel, srcSize.height * nSrcStep);
cudaBindTexture (0, texKernel, pKernel, floatChannel, nKernelSize * sizeof (Ncv32f));
dim3 ctaSize (32, 6);
dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x,
(oROI.height + ctaSize.y - 1) / ctaSize.y);
switch (borderType)
{
case nppStBorderClamp:
return NPPST_ERROR;
case nppStBorderWrap:
return NPPST_ERROR;
case nppStBorderMirror:
FilterColumnBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
(srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
break;
default:
return NPPST_ERROR;
}
return NPPST_SUCCESS;
}
//==============================================================================
//
// FrameInterpolate.cu
//
//==============================================================================
inline Ncv32u iDivUp(Ncv32u num, Ncv32u denom)
{
return (num + denom - 1)/denom;
}
texture<float, 2, cudaReadModeElementType> tex_src1;
texture<float, 2, cudaReadModeElementType> tex_src0;
__global__ void BlendFramesKernel(const float *u, const float *v, // forward flow
const float *ur, const float *vr, // backward flow
const float *o0, const float *o1, // coverage masks
int w, int h, int s,
float theta, float *out)
{
const int ix = threadIdx.x + blockDim.x * blockIdx.x;
const int iy = threadIdx.y + blockDim.y * blockIdx.y;
const int pos = ix + s * iy;
if (ix >= w || iy >= h) return;
float _u = u[pos];
float _v = v[pos];
float _ur = ur[pos];
float _vr = vr[pos];
float x = (float)ix + 0.5f;
float y = (float)iy + 0.5f;
bool b0 = o0[pos] > 1e-4f;
bool b1 = o1[pos] > 1e-4f;
if (b0 && b1)
{
// pixel is visible on both frames
out[pos] = tex2D(tex_src0, x - _u * theta, y - _v * theta) * (1.0f - theta) +
tex2D(tex_src1, x + _u * (1.0f - theta), y + _v * (1.0f - theta)) * theta;
}
else if (b0)
{
// visible on the first frame only
out[pos] = tex2D(tex_src0, x - _u * theta, y - _v * theta);
}
else
{
// visible on the second frame only
out[pos] = tex2D(tex_src1, x - _ur * (1.0f - theta), y - _vr * (1.0f - theta));
}
}
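// For a pixel visible in both frames the interpolated value is
//   out = (1-theta) * I0(x - theta*u, y - theta*v) + theta * I1(x + (1-theta)*u, y + (1-theta)*v),
// i.e. frame 0 is sampled backwards along the forward flow and frame 1 forwards along it.
// Pixels visible only in frame 0 keep the frame 0 sample; all remaining pixels are taken
// from frame 1 using the backward flow (ur, vr).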
NCVStatus BlendFrames(const Ncv32f *src0,
const Ncv32f *src1,
const Ncv32f *ufi,
const Ncv32f *vfi,
const Ncv32f *ubi,
const Ncv32f *vbi,
const Ncv32f *o1,
const Ncv32f *o2,
Ncv32u width,
Ncv32u height,
Ncv32u stride,
Ncv32f theta,
Ncv32f *out)
{
tex_src1.addressMode[0] = cudaAddressModeClamp;
tex_src1.addressMode[1] = cudaAddressModeClamp;
tex_src1.filterMode = cudaFilterModeLinear;
tex_src1.normalized = false;
tex_src0.addressMode[0] = cudaAddressModeClamp;
tex_src0.addressMode[1] = cudaAddressModeClamp;
tex_src0.filterMode = cudaFilterModeLinear;
tex_src0.normalized = false;
cudaChannelFormatDesc desc = cudaCreateChannelDesc <float> ();
const Ncv32u pitch = stride * sizeof (float);
ncvAssertCUDAReturn (cudaBindTexture2D (0, tex_src1, src1, desc, width, height, pitch), NPPST_TEXTURE_BIND_ERROR);
ncvAssertCUDAReturn (cudaBindTexture2D (0, tex_src0, src0, desc, width, height, pitch), NPPST_TEXTURE_BIND_ERROR);
dim3 threads (32, 4);
dim3 blocks (iDivUp (width, threads.x), iDivUp (height, threads.y));
BlendFramesKernel<<<blocks, threads, 0, nppStGetActiveCUDAstream ()>>>
(ufi, vfi, ubi, vbi, o1, o2, width, height, stride, theta, out);
ncvAssertCUDAReturn (cudaGetLastError (), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
return NPPST_SUCCESS;
}
NCVStatus nppiStGetInterpolationBufferSize(NcvSize32u srcSize,
Ncv32u nStep,
Ncv32u *hpSize)
{
NCVStatus status = NPPST_ERROR;
status = nppiStVectorWarpGetBufferSize(srcSize, nStep, hpSize);
return status;
}
NCVStatus nppiStInterpolateFrames(const NppStInterpolationState *pState)
{
// check state validity
ncvAssertReturn (pState->pSrcFrame0 != 0 &&
pState->pSrcFrame1 != 0 &&
pState->pFU != 0 &&
pState->pFV != 0 &&
pState->pBU != 0 &&
pState->pBV != 0 &&
pState->pNewFrame != 0 &&
pState->ppBuffers[0] != 0 &&
pState->ppBuffers[1] != 0 &&
pState->ppBuffers[2] != 0 &&
pState->ppBuffers[3] != 0 &&
pState->ppBuffers[4] != 0 &&
pState->ppBuffers[5] != 0, NPPST_NULL_POINTER_ERROR);
ncvAssertReturn (pState->size.width > 0 &&
pState->size.height > 0, NPPST_ERROR);
ncvAssertReturn (pState->nStep >= pState->size.width * sizeof (Ncv32f) &&
pState->nStep > 0 &&
pState->nStep % sizeof (Ncv32f) == 0,
NPPST_INVALID_STEP);
// change notation
Ncv32f *cov0 = pState->ppBuffers[0];
Ncv32f *cov1 = pState->ppBuffers[1];
Ncv32f *fwdU = pState->ppBuffers[2]; // forward u
Ncv32f *fwdV = pState->ppBuffers[3]; // forward v
Ncv32f *bwdU = pState->ppBuffers[4]; // backward u
Ncv32f *bwdV = pState->ppBuffers[5]; // backward v
// warp flow
ncvAssertReturnNcvStat (
nppiStVectorWarp_PSF2x2_32f_C1 (pState->pFU,
pState->size,
pState->nStep,
pState->pFU,
pState->pFV,
pState->nStep,
cov0,
pState->pos,
fwdU) );
ncvAssertReturnNcvStat (
nppiStVectorWarp_PSF2x2_32f_C1 (pState->pFV,
pState->size,
pState->nStep,
pState->pFU,
pState->pFV,
pState->nStep,
cov0,
pState->pos,
fwdV) );
// warp backward flow
ncvAssertReturnNcvStat (
nppiStVectorWarp_PSF2x2_32f_C1 (pState->pBU,
pState->size,
pState->nStep,
pState->pBU,
pState->pBV,
pState->nStep,
cov1,
1.0f - pState->pos,
bwdU) );
ncvAssertReturnNcvStat (
nppiStVectorWarp_PSF2x2_32f_C1 (pState->pBV,
pState->size,
pState->nStep,
pState->pBU,
pState->pBV,
pState->nStep,
cov1,
1.0f - pState->pos,
bwdV) );
// interpolate frame
ncvAssertReturnNcvStat (
BlendFrames (pState->pSrcFrame0,
pState->pSrcFrame1,
fwdU,
fwdV,
bwdU,
bwdV,
cov0,
cov1,
pState->size.width,
pState->size.height,
pState->nStep / sizeof (Ncv32f),
pState->pos,
pState->pNewFrame) );
return NPPST_SUCCESS;
}
//==============================================================================
//
// VectorWarpFrame.cu
//
//==============================================================================
#if __CUDA_ARCH__ < 200
// FP32 atomic add
static __forceinline__ __device__ float _atomicAdd(float *addr, float val)
{
float old = *addr, assumed;
do {
assumed = old;
old = __int_as_float(atomicCAS((int*)addr,
__float_as_int(assumed),
__float_as_int(val + assumed)));
} while( assumed!=old );
return old;
}
#else
#define _atomicAdd atomicAdd
#endif
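// Forward warp with a 2x2 point spread function: each source pixel is shifted along
// the flow (scaled by time_scale) and splatted into the four destination pixels its
// footprint overlaps, weighted by the overlap area. The same weights are accumulated
// into normalization_factor so the result can be renormalized by NormalizeKernel.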
__global__ void ForwardWarpKernel_PSF2x2(const float *u,
const float *v,
const float *src,
const int w,
const int h,
const int flow_stride,
const int image_stride,
const float time_scale,
float *normalization_factor,
float *dst)
{
int j = threadIdx.x + blockDim.x * blockIdx.x;
int i = threadIdx.y + blockDim.y * blockIdx.y;
if (i >= h || j >= w) return;
int flow_row_offset = i * flow_stride;
int image_row_offset = i * image_stride;
//bottom left corner of a target pixel
float cx = u[flow_row_offset + j] * time_scale + (float)j + 1.0f;
float cy = v[flow_row_offset + j] * time_scale + (float)i + 1.0f;
// pixel containing bottom left corner
float px;
float py;
float dx = modff (cx, &px);
float dy = modff (cy, &py);
// target pixel integer coords
int tx;
int ty;
tx = (int) px;
ty = (int) py;
float value = src[image_row_offset + j];
float weight;
// fill pixel containing bottom right corner
if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
{
weight = dx * dy;
_atomicAdd (dst + ty * image_stride + tx, value * weight);
_atomicAdd (normalization_factor + ty * image_stride + tx, weight);
}
// fill pixel containing bottom left corner
tx -= 1;
if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
{
weight = (1.0f - dx) * dy;
_atomicAdd (dst + ty * image_stride + tx, value * weight);
_atomicAdd (normalization_factor + ty * image_stride + tx, weight);
}
// fill pixel containing upper left corner
ty -= 1;
if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
{
weight = (1.0f - dx) * (1.0f - dy);
_atomicAdd (dst + ty * image_stride + tx, value * weight);
_atomicAdd (normalization_factor + ty * image_stride + tx, weight);
}
// fill pixel containing upper right corner
tx += 1;
if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
{
weight = dx * (1.0f - dy);
_atomicAdd (dst + ty * image_stride + tx, value * weight);
_atomicAdd (normalization_factor + ty * image_stride + tx, weight);
}
}
__global__ void ForwardWarpKernel_PSF1x1(const float *u,
const float *v,
const float *src,
const int w,
const int h,
const int flow_stride,
const int image_stride,
const float time_scale,
float *dst)
{
int j = threadIdx.x + blockDim.x * blockIdx.x;
int i = threadIdx.y + blockDim.y * blockIdx.y;
if (i >= h || j >= w) return;
int flow_row_offset = i * flow_stride;
int image_row_offset = i * image_stride;
float u_ = u[flow_row_offset + j];
float v_ = v[flow_row_offset + j];
//bottom left corner of target pixel
float cx = u_ * time_scale + (float)j + 1.0f;
float cy = v_ * time_scale + (float)i + 1.0f;
// pixel containing bottom left corner
int tx = __float2int_rn (cx);
int ty = __float2int_rn (cy);
float value = src[image_row_offset + j];
// fill pixel
if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
{
_atomicAdd (dst + ty * image_stride + tx, value);
}
}
__global__ void NormalizeKernel(const float *normalization_factor, int w, int h, int s, float *image)
{
int i = threadIdx.y + blockDim.y * blockIdx.y;
int j = threadIdx.x + blockDim.x * blockIdx.x;
if (i >= h || j >= w) return;
const int pos = i * s + j;
float scale = normalization_factor[pos];
float invScale = (scale == 0.0f) ? 1.0f : (1.0f / scale);
image[pos] *= invScale;
}
__global__ void MemsetKernel(const float value, int w, int h, int s, float *image)
{
int i = threadIdx.y + blockDim.y * blockIdx.y;
int j = threadIdx.x + blockDim.x * blockIdx.x;
if (i >= h || j >= w) return;
// use the image stride so that pitched buffers are addressed consistently
// with ForwardWarpKernel_PSF2x2 and NormalizeKernel
const int pos = i * s + j;
image[pos] = value;
}
NCVStatus nppiStVectorWarpGetBufferSize (NcvSize32u srcSize, Ncv32u nSrcStep, Ncv32u *hpSize)
{
ncvAssertReturn (hpSize != NULL, NPPST_NULL_POINTER_ERROR);
ncvAssertReturn (srcSize.width * sizeof (Ncv32f) <= nSrcStep,
NPPST_INVALID_STEP);
*hpSize = nSrcStep * srcSize.height;
return NPPST_SUCCESS;
}
// does not require normalization
NCVStatus nppiStVectorWarp_PSF1x1_32f_C1(const Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
const Ncv32f *pU,
const Ncv32f *pV,
Ncv32u nVFStep,
Ncv32f timeScale,
Ncv32f *pDst)
{
ncvAssertReturn (pSrc != NULL &&
pU != NULL &&
pV != NULL &&
pDst != NULL, NPPST_NULL_POINTER_ERROR);
ncvAssertReturn (srcSize.width * sizeof (Ncv32f) <= nSrcStep &&
srcSize.width * sizeof (Ncv32f) <= nVFStep,
NPPST_INVALID_STEP);
Ncv32u srcStep = nSrcStep / sizeof (Ncv32f);
Ncv32u vfStep = nVFStep / sizeof (Ncv32f);
dim3 ctaSize (32, 6);
dim3 gridSize (iDivUp (srcSize.width, ctaSize.x), iDivUp (srcSize.height, ctaSize.y));
ForwardWarpKernel_PSF1x1 <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream()>>>
(pU, pV, pSrc, srcSize.width, srcSize.height, vfStep, srcStep, timeScale, pDst);
return NPPST_SUCCESS;
}
NCVStatus nppiStVectorWarp_PSF2x2_32f_C1(const Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
const Ncv32f *pU,
const Ncv32f *pV,
Ncv32u nVFStep,
Ncv32f *pBuffer,
Ncv32f timeScale,
Ncv32f *pDst)
{
ncvAssertReturn (pSrc != NULL &&
pU != NULL &&
pV != NULL &&
pDst != NULL &&
pBuffer != NULL, NPPST_NULL_POINTER_ERROR);
ncvAssertReturn (srcSize.width * sizeof (Ncv32f) <= nSrcStep &&
srcSize.width * sizeof (Ncv32f) <= nVFStep, NPPST_INVALID_STEP);
Ncv32u srcStep = nSrcStep / sizeof (Ncv32f);
Ncv32u vfStep = nVFStep / sizeof(Ncv32f);
dim3 ctaSize(32, 6);
dim3 gridSize (iDivUp (srcSize.width, ctaSize.x), iDivUp (srcSize.height, ctaSize.y));
MemsetKernel <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream()>>>
(0, srcSize.width, srcSize.height, srcStep, pBuffer);
ForwardWarpKernel_PSF2x2 <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream()>>>
(pU, pV, pSrc, srcSize.width, srcSize.height, vfStep, srcStep, timeScale, pBuffer, pDst);
NormalizeKernel <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream()>>>
(pBuffer, srcSize.width, srcSize.height, srcStep, pDst);
return NPPST_SUCCESS;
}
//==============================================================================
//
// Resize.cu
//
//==============================================================================
texture <float, 2, cudaReadModeElementType> texSrc2D;
__forceinline__
__device__ float processLine(int spos,
float xmin,
float xmax,
int ixmin,
int ixmax,
float fxmin,
float cxmax)
{
// first element
float wsum = 1.0f - xmin + fxmin;
float sum = tex1Dfetch(texSrc, spos) * (1.0f - xmin + fxmin);
spos++;
for (int ix = ixmin + 1; ix < ixmax; ++ix)
{
sum += tex1Dfetch(texSrc, spos);
spos++;
wsum += 1.0f;
}
sum += tex1Dfetch(texSrc, spos) * (cxmax - xmax);
wsum += cxmax - xmax;
return sum / wsum;
}
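// Supersampling resize: every destination pixel averages the source samples inside
// its footprint [x - scaleX, x + scaleX] x [y - scaleY, y + scaleY], with fractional
// weights for the partially covered border rows and columns; processLine accumulates
// one such row. Intended for downscaling, where the footprint spans several source pixels.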
__global__ void resizeSuperSample_32f(NcvSize32u srcSize,
Ncv32u srcStep,
NcvRect32u srcROI,
Ncv32f *dst,
NcvSize32u dstSize,
Ncv32u dstStep,
NcvRect32u dstROI,
Ncv32f scaleX,
Ncv32f scaleY)
{
// position within dst ROI
const int ix = blockIdx.x * blockDim.x + threadIdx.x;
const int iy = blockIdx.y * blockDim.y + threadIdx.y;
if (ix >= dstROI.width || iy >= dstROI.height)
{
return;
}
float rw = (float) srcROI.width;
float rh = (float) srcROI.height;
// source position
float x = scaleX * (float) ix;
float y = scaleY * (float) iy;
// x sampling range
float xBegin = fmax (x - scaleX, 0.0f);
float xEnd = fmin (x + scaleX, rw - 1.0f);
// y sampling range
float yBegin = fmax (y - scaleY, 0.0f);
float yEnd = fmin (y + scaleY, rh - 1.0f);
// x range of source samples
float floorXBegin = floorf (xBegin);
float ceilXEnd = ceilf (xEnd);
int iXBegin = srcROI.x + (int) floorXBegin;
int iXEnd = srcROI.x + (int) ceilXEnd;
// y range of source samples
float floorYBegin = floorf (yBegin);
float ceilYEnd = ceilf (yEnd);
int iYBegin = srcROI.y + (int) floorYBegin;
int iYEnd = srcROI.y + (int) ceilYEnd;
// first row
int pos = iYBegin * srcStep + iXBegin;
float wsum = 1.0f - yBegin + floorYBegin;
float sum = processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
ceilXEnd) * (1.0f - yBegin + floorYBegin);
pos += srcStep;
for (int iy = iYBegin + 1; iy < iYEnd; ++iy)
{
sum += processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
ceilXEnd);
pos += srcStep;
wsum += 1.0f;
}
sum += processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
ceilXEnd) * (ceilYEnd - yEnd);
wsum += ceilYEnd - yEnd;
sum /= wsum;
dst[(ix + dstROI.x) + (iy + dstROI.y) * dstStep] = sum;
}
// bicubic interpolation
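// Weights follow the bicubic convolution kernel with a = -0.5 (Catmull-Rom / cubic Hermite):
//   w(x) =  1.5|x|^3 - 2.5|x|^2 + 1           for |x| <= 1
//   w(x) = -0.5|x|^3 + 2.5|x|^2 - 4|x| + 2    for 1 < |x| < 2
//   w(x) =  0                                 otherwise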
__forceinline__
__device__ float bicubicCoeff(float x_)
{
float x = fabsf(x_);
if (x <= 1.0f)
{
return x * x * (1.5f * x - 2.5f) + 1.0f;
}
else if (x < 2.0f)
{
return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f;
}
else
{
return 0.0f;
}
}
__global__ void resizeBicubic(NcvSize32u srcSize,
NcvRect32u srcROI,
NcvSize32u dstSize,
Ncv32u dstStep,
Ncv32f *dst,
NcvRect32u dstROI,
Ncv32f scaleX,
Ncv32f scaleY)
{
const int ix = blockIdx.x * blockDim.x + threadIdx.x;
const int iy = blockIdx.y * blockDim.y + threadIdx.y;
if (ix >= dstROI.width || iy >= dstROI.height)
{
return;
}
const float dx = 1.0f / srcROI.width;
const float dy = 1.0f / srcROI.height;
float rx = (float) srcROI.x;
float ry = (float) srcROI.y;
float rw = (float) srcROI.width;
float rh = (float) srcROI.height;
float x = scaleX * (float) ix;
float y = scaleY * (float) iy;
// sampling range
// border mode is clamp
float xmin = fmax (ceilf (x - 2.0f), 0.0f);
float xmax = fmin (floorf (x + 2.0f), rw - 1.0f);
float ymin = fmax (ceilf (y - 2.0f), 0.0f);
float ymax = fmin (floorf (y + 2.0f), rh - 1.0f);
// shift data window to match ROI
rx += 0.5f;
ry += 0.5f;
x += rx;
y += ry;
xmin += rx;
xmax += rx;
ymin += ry;
ymax += ry;
float sum = 0.0f;
float wsum = 0.0f;
for (float cy = ymin; cy <= ymax; cy += 1.0f)
{
for (float cx = xmin; cx <= xmax; cx += 1.0f)
{
float xDist = x - cx;
float yDist = y - cy;
float wx = bicubicCoeff (xDist);
float wy = bicubicCoeff (yDist);
wx *= wy;
sum += wx * tex2D (texSrc2D, cx * dx, cy * dy);
wsum += wx;
}
}
dst[(ix + dstROI.x)+ (iy + dstROI.y) * dstStep] = sum / wsum;
}
NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
NcvRect32u srcROI,
Ncv32f *pDst,
NcvSize32u dstSize,
Ncv32u nDstStep,
NcvRect32u dstROI,
Ncv32f xFactor,
Ncv32f yFactor,
NppStInterpMode interpolation)
{
NCVStatus status = NPPST_SUCCESS;
ncvAssertReturn (pSrc != NULL && pDst != NULL, NPPST_NULL_POINTER_ERROR);
ncvAssertReturn (xFactor != 0.0 && yFactor != 0.0, NPPST_INVALID_SCALE);
ncvAssertReturn (nSrcStep >= sizeof (Ncv32f) * (Ncv32u) srcSize.width &&
nDstStep >= sizeof (Ncv32f) * (Ncv32u) dstSize.width,
NPPST_INVALID_STEP);
Ncv32u srcStep = nSrcStep / sizeof (Ncv32f);
Ncv32u dstStep = nDstStep / sizeof (Ncv32f);
// TODO: preprocess ROI to prevent out of bounds access
if (interpolation == nppStSupersample)
{
// bind texture
cudaBindTexture (0, texSrc, pSrc, srcSize.height * nSrcStep);
// invoke kernel
dim3 ctaSize (32, 6);
dim3 gridSize ((dstROI.width + ctaSize.x - 1) / ctaSize.x,
(dstROI.height + ctaSize.y - 1) / ctaSize.y);
resizeSuperSample_32f <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
(srcSize, srcStep, srcROI, pDst, dstSize, dstStep, dstROI, 1.0f / xFactor, 1.0f / yFactor);
}
else if (interpolation == nppStBicubic)
{
texSrc2D.addressMode[0] = cudaAddressModeMirror;
texSrc2D.addressMode[1] = cudaAddressModeMirror;
texSrc2D.normalized = true;
cudaChannelFormatDesc desc = cudaCreateChannelDesc <float> ();
cudaBindTexture2D (0, texSrc2D, pSrc, desc, srcSize.width, srcSize.height,
nSrcStep);
dim3 ctaSize (32, 6);
dim3 gridSize ((dstSize.width + ctaSize.x - 1) / ctaSize.x,
(dstSize.height + ctaSize.y - 1) / ctaSize.y);
resizeBicubic <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
(srcSize, srcROI, dstSize, dstStep, pDst, dstROI, 1.0f / xFactor, 1.0f / yFactor);
}
else
{
status = NPPST_ERROR;
}
return status;
}
......@@ -84,6 +84,255 @@ cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream);
*/
/** Border type
*
* Filtering operations assume that each pixel has a neighborhood of pixels.
* The following enumeration describes possible ways to define non-existent pixels.
*/
enum NppStBorderType
{
nppStBorderNone = 0, ///< There is no need to define additional pixels, the image is extended already
nppStBorderClamp = 1, ///< Clamp out of range positions to the borders
nppStBorderWrap = 2, ///< Wrap out of range positions. The image becomes periodic.
nppStBorderMirror = 3 ///< Reflect out of range positions across the borders
};
/**
* Filter types for image resizing
*/
enum NppStInterpMode
{
nppStSupersample, ///< Supersampling. For downscaling only
nppStBicubic ///< Bicubic convolution filter, a = -0.5 (cubic Hermite spline)
};
/** Frame interpolation state
*
* This structure holds parameters required for frame interpolation.
* Forward displacement field is a per-pixel mapping from frame 0 to frame 1.
* Backward displacement field is a per-pixel mapping from frame 1 to frame 0.
*/
struct NppStInterpolationState
{
NcvSize32u size; ///< frame size
Ncv32u nStep; ///< pitch
Ncv32f pos; ///< new frame position
Ncv32f *pSrcFrame0; ///< frame 0
Ncv32f *pSrcFrame1; ///< frame 1
Ncv32f *pFU; ///< forward horizontal displacement
Ncv32f *pFV; ///< forward vertical displacement
Ncv32f *pBU; ///< backward horizontal displacement
Ncv32f *pBV; ///< backward vertical displacement
Ncv32f *pNewFrame; ///< new frame
Ncv32f *ppBuffers[6]; ///< temporary buffers
};
/** Size of a buffer required for interpolation.
*
* Several such buffers are required, \see NppStInterpolationState.
*
* \param srcSize [IN] Frame size (both frames must be of the same size)
* \param nStep [IN] Frame line step
* \param hpSize [OUT] Where to store computed size (host memory)
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStGetInterpolationBufferSize(NcvSize32u srcSize,
Ncv32u nStep,
Ncv32u *hpSize);
/** Interpolate frames (images) using provided optical flow (displacement field).
* 32-bit floating point images, single channel
*
* \param pState [IN] structure containing all required parameters (host memory)
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStInterpolateFrames(const NppStInterpolationState *pState);
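/* A minimal usage sketch (illustrative only, not part of the API): the device frame
* and flow pointers (dFrame0, dFrame1, dFwdU, dFwdV, dBwdU, dBwdV, dNewFrame) are
* assumed to be already allocated with line step nStep, and error checking is omitted.
*
* \code
* Ncv32u bufSize;
* nppiStGetInterpolationBufferSize (frameSize, nStep, &bufSize);
*
* NppStInterpolationState state;
* state.size = frameSize;
* state.nStep = nStep;
* state.pos = 0.5f; // halfway between the two frames
* state.pSrcFrame0 = dFrame0;
* state.pSrcFrame1 = dFrame1;
* state.pFU = dFwdU; state.pFV = dFwdV;
* state.pBU = dBwdU; state.pBV = dBwdV;
* state.pNewFrame = dNewFrame;
* for (int i = 0; i < 6; ++i)
* {
* cudaMalloc ((void**)&state.ppBuffers[i], bufSize);
* cudaMemset (state.ppBuffers[i], 0, bufSize); // the interpolator expects zeroed buffers
* }
* nppiStInterpolateFrames (&state);
* \endcode
*/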
/** Row linear filter. 32-bit floating point image, single channel
*
* Apply horizontal linear filter
*
* \param pSrc [IN] Source image pointer (CUDA device memory)
* \param srcSize [IN] Source image size
* \param nSrcStep [IN] Source image line step
* \param pDst [OUT] Destination image pointer (CUDA device memory)
* \param dstSize [IN] Destination image size
* \param nDstStep [IN] Destination image line step
* \param oROI [IN] Region of interest in the source image
* \param borderType [IN] Type of border
* \param pKernel [IN] Pointer to row kernel values (CUDA device memory)
* \param nKernelSize [IN] Size of the kernel in pixels
* \param nAnchor [IN] The kernel row alignment with respect to the position of the input pixel
* \param multiplier [IN] Value by which the computed result is multiplied
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
Ncv32f *pDst,
NcvSize32u dstSize,
Ncv32u nDstStep,
NcvRect32u oROI,
NppStBorderType borderType,
const Ncv32f *pKernel,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier);
/** Column linear filter. 32-bit floating point image, single channel
*
* Apply vertical linear filter
*
* \param pSrc [IN] Source image pointer (CUDA device memory)
* \param srcSize [IN] Source image size
* \param nSrcStep [IN] Source image line step
* \param pDst [OUT] Destination image pointer (CUDA device memory)
* \param dstSize [IN] Destination image size
* \param nDstStep [IN] Destination image line step
* \param oROI [IN] Region of interest in the source image
* \param borderType [IN] Type of border
* \param pKernel [IN] Pointer to column kernel values (CUDA device memory)
* \param nKernelSize [IN] Size of the kernel in pixels
* \param nAnchor [IN] The kernel column alignment with respect to the position of the input pixel
* \param multiplier [IN] Value by which the computed result is multiplied
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
Ncv32f *pDst,
NcvSize32u dstSize,
Ncv32u nDstStep,
NcvRect32u oROI,
NppStBorderType borderType,
const Ncv32f *pKernel,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier);
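/* A usage sketch for separable smoothing (illustrative only): a row pass into a
* temporary image followed by a column pass. dSrc, dTmp and dDst are assumed to be
* pitched device images of identical size and line step, dKernel a device array of
* nKernelSize filter taps; nAnchor = nKernelSize / 2 assumes a centered kernel, and
* error checking is omitted.
*
* \code
* NcvRect32u roi;
* roi.x = 0; roi.y = 0;
* roi.width = size.width; roi.height = size.height;
* nppiStFilterRowBorder_32f_C1R (dSrc, size, nStep, dTmp, size, nStep, roi,
* nppStBorderMirror, dKernel, nKernelSize, nKernelSize / 2, 1.0f);
* nppiStFilterColumnBorder_32f_C1R (dTmp, size, nStep, dDst, size, nStep, roi,
* nppStBorderMirror, dKernel, nKernelSize, nKernelSize / 2, 1.0f);
* \endcode
*/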
/** Size of buffer required for vector image warping.
*
* \param srcSize [IN] Source image size
* \param nStep [IN] Source image line step
* \param hpSize [OUT] Where to store computed size (host memory)
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStVectorWarpGetBufferSize(NcvSize32u srcSize,
Ncv32u nSrcStep,
Ncv32u *hpSize);
/** Warp image using provided 2D vector field and 1x1 point spread function.
* 32-bit floating point image, single channel
*
* During warping, pixels from the source image may fall between pixels of the destination image.
* The PSF (point spread function) describes how a source pixel affects pixels of the destination.
* With a 1x1 PSF only the single destination pixel with the largest intersection is affected (similar to nearest-neighbor interpolation).
*
* Destination image size and line step must be the same as the source image size and line step.
*
* \param pSrc [IN] Source image pointer (CUDA device memory)
* \param srcSize [IN] Source image size
* \param nSrcStep [IN] Source image line step
* \param pU [IN] Pointer to horizontal displacement field (CUDA device memory)
* \param pV [IN] Pointer to vertical displacement field (CUDA device memory)
* \param nVFStep [IN] Displacement field line step
* \param timeScale [IN] Value by which displacement field will be scaled for warping
* \param pDst [OUT] Destination image pointer (CUDA device memory)
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStVectorWarp_PSF1x1_32f_C1(const Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
const Ncv32f *pU,
const Ncv32f *pV,
Ncv32u nVFStep,
Ncv32f timeScale,
Ncv32f *pDst);
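/* A usage sketch (illustrative only): warp a frame half way along its flow field.
* dSrc, dU, dV and dDst are assumed to be pitched device images with identical size
* and line step; error checking is omitted. The 1x1 PSF variant needs no
* normalization buffer, but the destination is accumulated into and should be zeroed first.
*
* \code
* cudaMemset (dDst, 0, nStep * size.height);
* nppiStVectorWarp_PSF1x1_32f_C1 (dSrc, size, nStep,
* dU, dV, nStep,
* 0.5f, dDst);
* \endcode
*/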
/** Warp image using provided 2D vector field and 2x2 point spread function.
* 32-bit floating point image, single channel
*
* During warping, pixels from the source image may fall between pixels of the destination image.
* The PSF (point spread function) describes how a source pixel affects pixels of the destination.
* With a 2x2 PSF all four intersected destination pixels are affected, weighted by overlap area.
*
* Destination image size and line step must be the same as the source image size and line step.
*
* \param pSrc [IN] Source image pointer (CUDA device memory)
* \param srcSize [IN] Source image size
* \param nSrcStep [IN] Source image line step
* \param pU [IN] Pointer to horizontal displacement field (CUDA device memory)
* \param pV [IN] Pointer to vertical displacement field (CUDA device memory)
* \param nVFStep [IN] Displacement field line step
* \param timeScale [IN] Value by which displacement field will be scaled for warping
* \param pDst [OUT] Destination image pointer (CUDA device memory)
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStVectorWarp_PSF2x2_32f_C1(const Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
const Ncv32f *pU,
const Ncv32f *pV,
Ncv32u nVFStep,
Ncv32f *pBuffer,
Ncv32f timeScale,
Ncv32f *pDst);
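/* A usage sketch (illustrative only): as above, but with the 2x2 PSF, which needs a
* scratch buffer for the accumulated normalization weights. dSrc, dU, dV and dDst are
* assumed to be pitched device images with identical size and line step; error
* checking is omitted.
*
* \code
* Ncv32u bufSize;
* nppiStVectorWarpGetBufferSize (size, nStep, &bufSize);
* Ncv32f *dBuffer;
* cudaMalloc ((void**)&dBuffer, bufSize);
* cudaMemset (dDst, 0, nStep * size.height); // the warp accumulates into dDst
* nppiStVectorWarp_PSF2x2_32f_C1 (dSrc, size, nStep,
* dU, dV, nStep,
* dBuffer, 0.5f, dDst);
* cudaFree (dBuffer);
* \endcode
*/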
/** Resize. 32-bit floating point image, single channel
*
* Resizes image using specified filter (interpolation type)
*
* \param pSrc [IN] Source image pointer (CUDA device memory)
* \param srcSize [IN] Source image size
* \param nSrcStep [IN] Source image line step
* \param srcROI [IN] Source image region of interest
* \param pDst [OUT] Destination image pointer (CUDA device memory)
* \param dstSize [IN] Destination image size
* \param nDstStep [IN] Destination image line step
* \param dstROI [IN] Destination image region of interest
* \param xFactor [IN] Row scale factor
* \param yFactor [IN] Column scale factor
* \param interpolation [IN] Interpolation type
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
NcvRect32u srcROI,
Ncv32f *pDst,
NcvSize32u dstSize,
Ncv32u nDstStep,
NcvRect32u dstROI,
Ncv32f xFactor,
Ncv32f yFactor,
NppStInterpMode interpolation);
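/* A usage sketch (illustrative only): downscale a full frame to half its size with
* supersampling. dSrc and dDst are assumed to be pitched device images, the ROIs cover
* both images entirely, and error checking is omitted.
*
* \code
* NcvRect32u srcROI;
* srcROI.x = 0; srcROI.y = 0;
* srcROI.width = srcSize.width; srcROI.height = srcSize.height;
* NcvRect32u dstROI;
* dstROI.x = 0; dstROI.y = 0;
* dstROI.width = dstSize.width; dstROI.height = dstSize.height;
* nppiStResize_32f_C1R (dSrc, srcSize, nSrcStep, srcROI,
* dDst, dstSize, nDstStep, dstROI,
* 0.5f, 0.5f, nppStSupersample);
* \endcode
*/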
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit unsigned pixels, single channel.
*
......
#if _MSC_VER >= 1400
#pragma warning( disable : 4201 4408 4127 4100)
#endif
#include <iostream>
#include <iomanip>
#include <memory>
#include <exception>
#include <ctime>
#include <cctype>
#include <cstdlib>
#include <cstring>
#include "cvconfig.h"
#include "opencv2/opencv.hpp"
#include "opencv2/gpu/gpu.hpp"
#ifdef HAVE_CUDA
#include "NPP_staging/NPP_staging.hpp"
#include "NCVBroxOpticalFlow.hpp"
#endif
#if !defined(HAVE_CUDA)
int main( int argc, const char** argv )
{
cout << "Please compile the library with CUDA support" << endl;
return -1;
}
#else
using std::tr1::shared_ptr;
#define PARAM_INPUT "--input"
#define PARAM_SCALE "--scale"
#define PARAM_ALPHA "--alpha"
#define PARAM_GAMMA "--gamma"
#define PARAM_INNER "--inner"
#define PARAM_OUTER "--outer"
#define PARAM_SOLVER "--solver"
#define PARAM_TIME_STEP "--time-step"
#define PARAM_HELP "--help"
shared_ptr<INCVMemAllocator> g_pGPUMemAllocator;
shared_ptr<INCVMemAllocator> g_pHostMemAllocator;
class RgbToMonochrome
{
public:
float operator ()(unsigned char b, unsigned char g, unsigned char r)
{
float _r = static_cast<float>(r)/255.0f;
float _g = static_cast<float>(g)/255.0f;
float _b = static_cast<float>(b)/255.0f;
return (_r + _g + _b)/3.0f;
}
};
class RgbToR
{
public:
float operator ()(unsigned char b, unsigned char g, unsigned char r)
{
return static_cast<float>(r)/255.0f;
}
};
class RgbToG
{
public:
float operator ()(unsigned char b, unsigned char g, unsigned char r)
{
return static_cast<float>(g)/255.0f;
}
};
class RgbToB
{
public:
float operator ()(unsigned char b, unsigned char g, unsigned char r)
{
return static_cast<float>(b)/255.0f;
}
};
template<class T>
NCVStatus CopyData(IplImage *image, shared_ptr<NCVMatrixAlloc<Ncv32f>> &dst)
{
dst = shared_ptr<NCVMatrixAlloc<Ncv32f>> (new NCVMatrixAlloc<Ncv32f> (*g_pHostMemAllocator, image->width, image->height));
ncvAssertReturn (dst->isMemAllocated (), NCV_ALLOCATOR_BAD_ALLOC);
unsigned char *row = reinterpret_cast<unsigned char*> (image->imageData);
T convert;
for (int i = 0; i < image->height; ++i)
{
for (int j = 0; j < image->width; ++j)
{
if (image->nChannels < 3)
{
dst->ptr ()[j + i*dst->stride ()] = static_cast<float> (*(row + j*image->nChannels))/255.0f;
}
else
{
unsigned char *color = row + j * image->nChannels;
dst->ptr ()[j +i*dst->stride ()] = convert (color[0], color[1], color[2]);
}
}
row += image->widthStep;
}
return NCV_SUCCESS;
}
template<class T>
NCVStatus CopyData(const IplImage *image, const NCVMatrixAlloc<Ncv32f> &dst)
{
unsigned char *row = reinterpret_cast<unsigned char*> (image->imageData);
T convert;
for (int i = 0; i < image->height; ++i)
{
for (int j = 0; j < image->width; ++j)
{
if (image->nChannels < 3)
{
dst.ptr ()[j + i*dst.stride ()] = static_cast<float>(*(row + j*image->nChannels))/255.0f;
}
else
{
unsigned char *color = row + j * image->nChannels;
dst.ptr ()[j +i*dst.stride()] = convert (color[0], color[1], color[2]);
}
}
row += image->widthStep;
}
return NCV_SUCCESS;
}
NCVStatus LoadImages (const char *frame0Name,
const char *frame1Name,
int &width,
int &height,
shared_ptr<NCVMatrixAlloc<Ncv32f>> &src,
shared_ptr<NCVMatrixAlloc<Ncv32f>> &dst,
IplImage *&firstFrame,
IplImage *&lastFrame)
{
IplImage *image;
image = cvLoadImage (frame0Name);
if (image == 0)
{
std::cout << "Could not open '" << frame0Name << "'\n";
return NCV_FILE_ERROR;
}
firstFrame = image;
// copy data to src
ncvAssertReturnNcvStat (CopyData<RgbToMonochrome> (image, src));
IplImage *image2;
image2 = cvLoadImage (frame1Name);
if (image2 == 0)
{
std::cout << "Could not open '" << frame1Name << "'\n";
return NCV_FILE_ERROR;
}
lastFrame = image2;
ncvAssertReturnNcvStat (CopyData<RgbToMonochrome> (image2, dst));
width = image->width;
height = image->height;
return NCV_SUCCESS;
}
template<typename T>
inline T Clamp (T x, T a, T b)
{
return ((x) > (a) ? ((x) < (b) ? (x) : (b)) : (a));
}
template<typename T>
inline T MapValue (T x, T a, T b, T c, T d)
{
x = Clamp (x, a, b);
return c + (d - c) * (x - a) / (b - a);
}
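// Visualizes a flow field as a BGRA image: the horizontal component u is mapped to the
// red channel and the negated vertical component v to the green channel, both rescaled
// by the largest displacement found in the field.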
NCVStatus ShowFlow (NCVMatrixAlloc<Ncv32f> &u, NCVMatrixAlloc<Ncv32f> &v, const char *name)
{
IplImage *flowField;
NCVMatrixAlloc<Ncv32f> host_u(*g_pHostMemAllocator, u.width(), u.height());
ncvAssertReturn(host_u.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NCVMatrixAlloc<Ncv32f> host_v (*g_pHostMemAllocator, u.width (), u.height ());
ncvAssertReturn (host_v.isMemAllocated (), NCV_ALLOCATOR_BAD_ALLOC);
ncvAssertReturnNcvStat (u.copySolid (host_u, 0));
ncvAssertReturnNcvStat (v.copySolid (host_v, 0));
float *ptr_u = host_u.ptr ();
float *ptr_v = host_v.ptr ();
float maxDisplacement = 1.0f;
for (Ncv32u i = 0; i < u.height (); ++i)
{
for (Ncv32u j = 0; j < u.width (); ++j)
{
float d = std::max ( fabsf(*ptr_u), fabsf(*ptr_v) );
if (d > maxDisplacement) maxDisplacement = d;
++ptr_u;
++ptr_v;
}
ptr_u += u.stride () - u.width ();
ptr_v += v.stride () - v.width ();
}
CvSize image_size = cvSize (u.width (), u.height ());
flowField = cvCreateImage (image_size, IPL_DEPTH_8U, 4);
if (flowField == 0) return NCV_NULL_PTR;
unsigned char *row = reinterpret_cast<unsigned char *> (flowField->imageData);
ptr_u = host_u.ptr();
ptr_v = host_v.ptr();
for (int i = 0; i < flowField->height; ++i)
{
for (int j = 0; j < flowField->width; ++j)
{
(row + j * flowField->nChannels)[0] = 0;
(row + j * flowField->nChannels)[1] = static_cast<unsigned char> (MapValue (-(*ptr_v), -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
(row + j * flowField->nChannels)[2] = static_cast<unsigned char> (MapValue (*ptr_u , -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
(row + j * flowField->nChannels)[3] = 255;
++ptr_u;
++ptr_v;
}
row += flowField->widthStep;
ptr_u += u.stride () - u.width ();
ptr_v += v.stride () - v.width ();
}
cvShowImage (name, flowField);
return NCV_SUCCESS;
}
IplImage *CreateImage (NCVMatrixAlloc<Ncv32f> &h_r, NCVMatrixAlloc<Ncv32f> &h_g, NCVMatrixAlloc<Ncv32f> &h_b)
{
CvSize imageSize = cvSize (h_r.width (), h_r.height ());
IplImage *image = cvCreateImage (imageSize, IPL_DEPTH_8U, 4);
if (image == 0) return 0;
unsigned char *row = reinterpret_cast<unsigned char*> (image->imageData);
for (int i = 0; i < image->height; ++i)
{
for (int j = 0; j < image->width; ++j)
{
int offset = j * image->nChannels;
int pos = i * h_r.stride () + j;
row[offset + 0] = static_cast<unsigned char> (h_b.ptr ()[pos] * 255.0f);
row[offset + 1] = static_cast<unsigned char> (h_g.ptr ()[pos] * 255.0f);
row[offset + 2] = static_cast<unsigned char> (h_r.ptr ()[pos] * 255.0f);
row[offset + 3] = 255;
}
row += image->widthStep;
}
return image;
}
void PrintHelp ()
{
std::cout << "Usage help:\n";
std::cout << std::setiosflags(std::ios::left);
std::cout << "\t" << std::setw(15) << PARAM_ALPHA << " - set alpha\n";
std::cout << "\t" << std::setw(15) << PARAM_GAMMA << " - set gamma\n";
std::cout << "\t" << std::setw(15) << PARAM_INNER << " - set number of inner iterations\n";
std::cout << "\t" << std::setw(15) << PARAM_INPUT << " - specify input file names (2 image files)\n";
std::cout << "\t" << std::setw(15) << PARAM_OUTER << " - set number of outer iterations\n";
std::cout << "\t" << std::setw(15) << PARAM_SCALE << " - set pyramid scale factor\n";
std::cout << "\t" << std::setw(15) << PARAM_SOLVER << " - set number of basic solver iterations\n";
std::cout << "\t" << std::setw(15) << PARAM_TIME_STEP << " - set frame interpolation time step\n";
std::cout << "\t" << std::setw(15) << PARAM_HELP << " - display this help message\n";
}
int ProcessCommandLine(int argc, char **argv,
Ncv32f &timeStep,
char *&frame0Name,
char *&frame1Name,
NCVBroxOpticalFlowDescriptor &desc)
{
timeStep = 0.25f;
for (int iarg = 1; iarg < argc; ++iarg)
{
if (strcmp(argv[iarg], PARAM_INPUT) == 0)
{
if (iarg + 2 < argc)
{
frame0Name = argv[++iarg];
frame1Name = argv[++iarg];
}
else
return -1;
}
else if(strcmp(argv[iarg], PARAM_SCALE) == 0)
{
if (iarg + 1 < argc)
desc.scale_factor = static_cast<Ncv32f>(atof(argv[++iarg]));
else
return -1;
}
else if(strcmp(argv[iarg], PARAM_ALPHA) == 0)
{
if (iarg + 1 < argc)
desc.alpha = static_cast<Ncv32f>(atof(argv[++iarg]));
else
return -1;
}
else if(strcmp(argv[iarg], PARAM_GAMMA) == 0)
{
if (iarg + 1 < argc)
desc.gamma = static_cast<Ncv32f>(atof(argv[++iarg]));
else
return -1;
}
else if(strcmp(argv[iarg], PARAM_INNER) == 0)
{
if (iarg + 1 < argc)
desc.number_of_inner_iterations = static_cast<Ncv32u>(atoi(argv[++iarg]));
else
return -1;
}
else if(strcmp(argv[iarg], PARAM_OUTER) == 0)
{
if (iarg + 1 < argc)
desc.number_of_outer_iterations = static_cast<Ncv32u>(atoi(argv[++iarg]));
else
return -1;
}
else if(strcmp(argv[iarg], PARAM_SOLVER) == 0)
{
if (iarg + 1 < argc)
desc.number_of_solver_iterations = static_cast<Ncv32u>(atoi(argv[++iarg]));
else
return -1;
}
else if(strcmp(argv[iarg], PARAM_TIME_STEP) == 0)
{
if (iarg + 1 < argc)
timeStep = static_cast<Ncv32f>(atof(argv[++iarg]));
else
return -1;
}
else if(strcmp(argv[iarg], PARAM_HELP) == 0)
{
PrintHelp ();
return 0;
}
}
return 0;
}
int main(int argc, char **argv)
{
char *frame0Name = 0, *frame1Name = 0;
Ncv32f timeStep = 0.01f;
NCVBroxOpticalFlowDescriptor desc;
desc.alpha = 0.197f;
desc.gamma = 50.0f;
desc.number_of_inner_iterations = 10;
desc.number_of_outer_iterations = 77;
desc.number_of_solver_iterations = 10;
desc.scale_factor = 0.8f;
int result = ProcessCommandLine (argc, argv, timeStep, frame0Name, frame1Name, desc);
if (argc == 1 || result)
{
PrintHelp();
return result;
}
std::cout << "OpenCV / NVIDIA Computer Vision\n";
std::cout << "Optical Flow Demo: Frame Interpolation\n";
std::cout << "=========================================\n";
std::cout << "Press:\n ESC to quit\n 'a' to move to the previous frame\n 's' to move to the next frame\n";
int devId;
ncvAssertCUDAReturn(cudaGetDevice(&devId), -1);
cudaDeviceProp devProp;
ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), -1);
std::cout << "Using GPU: " << devId << "(" << devProp.name <<
"), arch=" << devProp.major << "." << devProp.minor << std::endl;
g_pGPUMemAllocator = shared_ptr<INCVMemAllocator> (new NCVMemNativeAllocator (NCVMemoryTypeDevice, devProp.textureAlignment));
ncvAssertPrintReturn (g_pGPUMemAllocator->isInitialized (), "Device memory allocator isn't initialized", -1);
g_pHostMemAllocator = shared_ptr<INCVMemAllocator> (new NCVMemNativeAllocator (NCVMemoryTypeHostPageable, devProp.textureAlignment));
ncvAssertPrintReturn (g_pHostMemAllocator->isInitialized (), "Host memory allocator isn't initialized", -1);
int width, height;
shared_ptr<NCVMatrixAlloc<Ncv32f>> src_host;
shared_ptr<NCVMatrixAlloc<Ncv32f>> dst_host;
IplImage *firstFrame, *lastFrame;
if (frame0Name != 0 && frame1Name != 0)
{
ncvAssertReturnNcvStat (LoadImages (frame0Name, frame1Name, width, height, src_host, dst_host, firstFrame, lastFrame));
}
else
{
ncvAssertReturnNcvStat (LoadImages ("frame10.bmp", "frame11.bmp", width, height, src_host, dst_host, firstFrame, lastFrame));
}
shared_ptr<NCVMatrixAlloc<Ncv32f>> src (new NCVMatrixAlloc<Ncv32f> (*g_pGPUMemAllocator, src_host->width (), src_host->height ()));
ncvAssertReturn(src->isMemAllocated(), -1);
shared_ptr<NCVMatrixAlloc<Ncv32f>> dst (new NCVMatrixAlloc<Ncv32f> (*g_pGPUMemAllocator, src_host->width (), src_host->height ()));
ncvAssertReturn (dst->isMemAllocated (), -1);
ncvAssertReturnNcvStat (src_host->copySolid ( *src, 0 ));
ncvAssertReturnNcvStat (dst_host->copySolid ( *dst, 0 ));
#if defined SAFE_MAT_DECL
#undef SAFE_MAT_DECL
#endif
#define SAFE_MAT_DECL(name, allocator, sx, sy) \
NCVMatrixAlloc<Ncv32f> name(*allocator, sx, sy);\
ncvAssertReturn(name.isMemAllocated(), -1);
SAFE_MAT_DECL (u, g_pGPUMemAllocator, width, height);
SAFE_MAT_DECL (v, g_pGPUMemAllocator, width, height);
SAFE_MAT_DECL (uBck, g_pGPUMemAllocator, width, height);
SAFE_MAT_DECL (vBck, g_pGPUMemAllocator, width, height);
SAFE_MAT_DECL (h_r, g_pHostMemAllocator, width, height);
SAFE_MAT_DECL (h_g, g_pHostMemAllocator, width, height);
SAFE_MAT_DECL (h_b, g_pHostMemAllocator, width, height);
std::cout << "Estimating optical flow\nForward...\n";
if (NCV_SUCCESS != NCVBroxOpticalFlow (desc, *g_pGPUMemAllocator, *src, *dst, u, v, 0))
{
std::cout << "Failed\n";
return -1;
}
std::cout << "Backward...\n";
if (NCV_SUCCESS != NCVBroxOpticalFlow (desc, *g_pGPUMemAllocator, *dst, *src, uBck, vBck, 0))
{
std::cout << "Failed\n";
return -1;
}
// matrix for temporary data
SAFE_MAT_DECL (d_temp, g_pGPUMemAllocator, width, height);
// first frame color components (GPU memory)
SAFE_MAT_DECL (d_r, g_pGPUMemAllocator, width, height);
SAFE_MAT_DECL (d_g, g_pGPUMemAllocator, width, height);
SAFE_MAT_DECL (d_b, g_pGPUMemAllocator, width, height);
// second frame color components (GPU memory)
SAFE_MAT_DECL (d_rt, g_pGPUMemAllocator, width, height);
SAFE_MAT_DECL (d_gt, g_pGPUMemAllocator, width, height);
SAFE_MAT_DECL (d_bt, g_pGPUMemAllocator, width, height);
// intermediate frame color components (GPU memory)
SAFE_MAT_DECL (d_rNew, g_pGPUMemAllocator, width, height);
SAFE_MAT_DECL (d_gNew, g_pGPUMemAllocator, width, height);
SAFE_MAT_DECL (d_bNew, g_pGPUMemAllocator, width, height);
// interpolated forward flow
SAFE_MAT_DECL (ui, g_pGPUMemAllocator, width, height);
SAFE_MAT_DECL (vi, g_pGPUMemAllocator, width, height);
// interpolated backward flow
SAFE_MAT_DECL (ubi, g_pGPUMemAllocator, width, height);
SAFE_MAT_DECL (vbi, g_pGPUMemAllocator, width, height);
// occlusion masks
SAFE_MAT_DECL (occ0, g_pGPUMemAllocator, width, height);
SAFE_MAT_DECL (occ1, g_pGPUMemAllocator, width, height);
// prepare color components on host and copy them to device memory
ncvAssertReturnNcvStat (CopyData<RgbToR> (firstFrame, h_r));
ncvAssertReturnNcvStat (CopyData<RgbToG> (firstFrame, h_g));
ncvAssertReturnNcvStat (CopyData<RgbToB> (firstFrame, h_b));
ncvAssertReturnNcvStat (h_r.copySolid ( d_r, 0 ));
ncvAssertReturnNcvStat (h_g.copySolid ( d_g, 0 ));
ncvAssertReturnNcvStat (h_b.copySolid ( d_b, 0 ));
ncvAssertReturnNcvStat (CopyData<RgbToR> (lastFrame, h_r));
ncvAssertReturnNcvStat (CopyData<RgbToG> (lastFrame, h_g));
ncvAssertReturnNcvStat (CopyData<RgbToB> (lastFrame, h_b));
ncvAssertReturnNcvStat (h_r.copySolid ( d_rt, 0 ));
ncvAssertReturnNcvStat (h_g.copySolid ( d_gt, 0 ));
ncvAssertReturnNcvStat (h_b.copySolid ( d_bt, 0 ));
std::cout << "Interpolating...\n";
std::cout.precision (4);
std::vector<IplImage*> frames;
frames.push_back (firstFrame);
// compute interpolated frames
for (Ncv32f timePos = timeStep; timePos < 1.0f; timePos += timeStep)
{
ncvAssertCUDAReturn (cudaMemset (ui.ptr (), 0, ui.pitch () * ui.height ()), NCV_CUDA_ERROR);
ncvAssertCUDAReturn (cudaMemset (vi.ptr (), 0, vi.pitch () * vi.height ()), NCV_CUDA_ERROR);
ncvAssertCUDAReturn (cudaMemset (ubi.ptr (), 0, ubi.pitch () * ubi.height ()), NCV_CUDA_ERROR);
ncvAssertCUDAReturn (cudaMemset (vbi.ptr (), 0, vbi.pitch () * vbi.height ()), NCV_CUDA_ERROR);
ncvAssertCUDAReturn (cudaMemset (occ0.ptr (), 0, occ0.pitch () * occ0.height ()), NCV_CUDA_ERROR);
ncvAssertCUDAReturn (cudaMemset (occ1.ptr (), 0, occ1.pitch () * occ1.height ()), NCV_CUDA_ERROR);
NppStInterpolationState state;
// the interpolation state only needs to be filled in once, except for pSrcFrame0, pSrcFrame1, and pNewFrame;
// the buffers, however, must be reset to 0 before every call since the interpolator doesn't do this itself
state.size = NcvSize32u (width, height);
state.nStep = d_r.pitch ();
state.pSrcFrame0 = d_r.ptr ();
state.pSrcFrame1 = d_rt.ptr ();
state.pFU = u.ptr ();
state.pFV = v.ptr ();
state.pBU = uBck.ptr ();
state.pBV = vBck.ptr ();
state.pos = timePos;
state.pNewFrame = d_rNew.ptr ();
state.ppBuffers[0] = occ0.ptr ();
state.ppBuffers[1] = occ1.ptr ();
state.ppBuffers[2] = ui.ptr ();
state.ppBuffers[3] = vi.ptr ();
state.ppBuffers[4] = ubi.ptr ();
state.ppBuffers[5] = vbi.ptr ();
// interpolate red channel
nppiStInterpolateFrames (&state);
// reset buffers
ncvAssertCUDAReturn (cudaMemset (ui.ptr (), 0, ui.pitch () * ui.height ()), NCV_CUDA_ERROR);
ncvAssertCUDAReturn (cudaMemset (vi.ptr (), 0, vi.pitch () * vi.height ()), NCV_CUDA_ERROR);
ncvAssertCUDAReturn (cudaMemset (ubi.ptr (), 0, ubi.pitch () * ubi.height ()), NCV_CUDA_ERROR);
ncvAssertCUDAReturn (cudaMemset (vbi.ptr (), 0, vbi.pitch () * vbi.height ()), NCV_CUDA_ERROR);
ncvAssertCUDAReturn (cudaMemset (occ0.ptr (), 0, occ0.pitch () * occ0.height ()), NCV_CUDA_ERROR);
ncvAssertCUDAReturn (cudaMemset (occ1.ptr (), 0, occ1.pitch () * occ1.height ()), NCV_CUDA_ERROR);
// interpolate green channel
state.pSrcFrame0 = d_g.ptr ();
state.pSrcFrame1 = d_gt.ptr ();
state.pNewFrame = d_gNew.ptr ();
nppiStInterpolateFrames (&state);
// reset buffers
ncvAssertCUDAReturn (cudaMemset (ui.ptr (), 0, ui.pitch () * ui.height ()), NCV_CUDA_ERROR);
ncvAssertCUDAReturn (cudaMemset (vi.ptr (), 0, vi.pitch () * vi.height ()), NCV_CUDA_ERROR);
ncvAssertCUDAReturn (cudaMemset (ubi.ptr (), 0, ubi.pitch () * ubi.height ()), NCV_CUDA_ERROR);
ncvAssertCUDAReturn (cudaMemset (vbi.ptr (), 0, vbi.pitch () * vbi.height ()), NCV_CUDA_ERROR);
ncvAssertCUDAReturn (cudaMemset (occ0.ptr (), 0, occ0.pitch () * occ0.height ()), NCV_CUDA_ERROR);
ncvAssertCUDAReturn (cudaMemset (occ1.ptr (), 0, occ1.pitch () * occ1.height ()), NCV_CUDA_ERROR);
// interpolate blue channel
state.pSrcFrame0 = d_b.ptr ();
state.pSrcFrame1 = d_bt.ptr ();
state.pNewFrame = d_bNew.ptr ();
nppiStInterpolateFrames (&state);
// copy to host memory
ncvAssertReturnNcvStat (d_rNew.copySolid (h_r, 0));
ncvAssertReturnNcvStat (d_gNew.copySolid (h_g, 0));
ncvAssertReturnNcvStat (d_bNew.copySolid (h_b, 0));
// convert to IplImage
IplImage *newFrame = CreateImage (h_r, h_g, h_b);
if (newFrame == 0)
{
std::cout << "Could not create new frame in host memory\n";
break;
}
frames.push_back (newFrame);
std::cout << timePos * 100.0f << "%\r";
}
std::cout << std::setw (5) << "100%\n";
frames.push_back (lastFrame);
Ncv32u currentFrame;
currentFrame = 0;
ShowFlow (u, v, "Forward flow");
ShowFlow (uBck, vBck, "Backward flow");
cvShowImage ("Interpolated frame", frames[currentFrame]);
bool qPressed = false;
while ( !qPressed )
{
int key = toupper (cvWaitKey (10));
switch (key)
{
case 27:
qPressed = true;
break;
case 'A':
if (currentFrame > 0) --currentFrame;
cvShowImage ("Interpolated frame", frames[currentFrame]);
break;
case 'S':
if (currentFrame < frames.size()-1) ++currentFrame;
cvShowImage ("Interpolated frame", frames[currentFrame]);
break;
}
}
cvDestroyAllWindows ();
std::vector<IplImage*>::iterator iter;
for (iter = frames.begin (); iter != frames.end (); ++iter)
{
cvReleaseImage (&(*iter));
}
return 0;
}
#endif