Merge pull request #943 from jet47:cuda-5.5-support

75cf5cc4 · Roman Donchenko · OpenCV Buildbot · 5237647f · bcf8bdb4 · 75cf5cc4
Commit 75cf5cc4, authored Jun 03, 2013 by Roman Donchenko; committed by OpenCV Buildbot on Jun 03, 2013
10 changed files
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@@ -26,6 +26,15 @@ if(CUDA_FOUND)
    set(HAVE_CUBLAS 1)
  endif()
+  # Locate the NPP libraries. Which helper libraries are looked up depends on
+  # CUDA_VERSION: below 5.5 a single "npp" library is searched for; otherwise
+  # nppc, nppi and npps are searched for separately and their results are
+  # combined into CUDA_npp_LIBRARY.
+  #
+  # The variable NAME is passed to if() (not ${CUDA_VERSION}) so that if()
+  # dereferences it itself; an empty value then cannot produce a malformed
+  # condition.
+  if(CUDA_VERSION VERSION_LESS "5.5")
+    find_cuda_helper_libs(npp)
+  else()
+    find_cuda_helper_libs(nppc)
+    find_cuda_helper_libs(nppi)
+    find_cuda_helper_libs(npps)
+    set(CUDA_npp_LIBRARY ${CUDA_nppc_LIBRARY} ${CUDA_nppi_LIBRARY} ${CUDA_npps_LIBRARY})
+  endif()
  if(WITH_NVCUVID)
    find_cuda_helper_libs(nvcuvid)
    set(HAVE_NVCUVID 1)
@@ -136,8 +145,6 @@ if(CUDA_FOUND)
  mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD CUDA_SDK_ROOT_DIR)
-  find_cuda_helper_libs(npp)
  macro(ocv_cuda_compile VAR)
    foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
      set(${var}_backup_in_cuda_compile_ "${${var}}")

--- a/modules/gpu/include/opencv2/gpu/device/detail/color_detail.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/detail/color_detail.hpp
--- a/modules/gpu/include/opencv2/gpu/device/functional.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/functional.hpp
--- a/modules/gpu/include/opencv2/gpu/device/utility.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/utility.hpp
@@ -124,8 +124,8 @@ namespace cv { namespace gpu { namespace device
    struct WithOutMask
    {
-        __device__ __forceinline__ WithOutMask(){}
+        __host__ __device__ __forceinline__ WithOutMask(){}
-        __device__ __forceinline__ WithOutMask(const WithOutMask& mask){}
+        __host__ __device__ __forceinline__ WithOutMask(const WithOutMask&){}
        __device__ __forceinline__ void next() const
        {

--- a/modules/gpu/src/cuda/calib3d.cu
+++ b/modules/gpu/src/cuda/calib3d.cu
@@ -67,8 +67,8 @@ namespace cv { namespace gpu { namespace device
                        crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
                        crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
            }
-            __device__ __forceinline__ TransformOp() {}
+            __host__ __device__ __forceinline__ TransformOp() {}
-            __device__ __forceinline__ TransformOp(const TransformOp&) {}
+            __host__ __device__ __forceinline__ TransformOp(const TransformOp&) {}
        };
        void call(const PtrStepSz<float3> src, const float* rot,
@@ -106,8 +106,8 @@ namespace cv { namespace gpu { namespace device
                        (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
                        (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
            }
-            __device__ __forceinline__ ProjectOp() {}
+            __host__ __device__ __forceinline__ ProjectOp() {}
-            __device__ __forceinline__ ProjectOp(const ProjectOp&) {}
+            __host__ __device__ __forceinline__ ProjectOp(const ProjectOp&) {}
        };
        void call(const PtrStepSz<float3> src, const float* rot,

--- a/modules/gpu/src/cuda/canny.cu
+++ b/modules/gpu/src/cuda/canny.cu
@@ -62,8 +62,8 @@ namespace canny
            return ::abs(x) + ::abs(y);
        }
-        __device__ __forceinline__ L1() {}
+        __host__ __device__ __forceinline__ L1() {}
-        __device__ __forceinline__ L1(const L1&) {}
+        __host__ __device__ __forceinline__ L1(const L1&) {}
    };
    struct L2 : binary_function<int, int, float>
    {
@@ -72,8 +72,8 @@ namespace canny
            return ::sqrtf(x * x + y * y);
        }
-        __device__ __forceinline__ L2() {}
+        __host__ __device__ __forceinline__ L2() {}
-        __device__ __forceinline__ L2(const L2&) {}
+        __host__ __device__ __forceinline__ L2(const L2&) {}
    };
 }
@@ -470,8 +470,8 @@ namespace canny
            return (uchar)(-(e >> 1));
        }
-        __device__ __forceinline__ GetEdges() {}
+        __host__ __device__ __forceinline__ GetEdges() {}
-        __device__ __forceinline__ GetEdges(const GetEdges&) {}
+        __host__ __device__ __forceinline__ GetEdges(const GetEdges&) {}
    };
 }

--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ -162,8 +162,8 @@ namespace arithm
            return vadd4(a, b);
        }
-        __device__ __forceinline__ VAdd4() {}
+        __host__ __device__ __forceinline__ VAdd4() {}
-        __device__ __forceinline__ VAdd4(const VAdd4& other) {}
+        __host__ __device__ __forceinline__ VAdd4(const VAdd4&) {}
    };
    ////////////////////////////////////
@@ -175,8 +175,8 @@ namespace arithm
            return vadd2(a, b);
        }
-        __device__ __forceinline__ VAdd2() {}
+        __host__ __device__ __forceinline__ VAdd2() {}
-        __device__ __forceinline__ VAdd2(const VAdd2& other) {}
+        __host__ __device__ __forceinline__ VAdd2(const VAdd2&) {}
    };
    ////////////////////////////////////
@@ -188,8 +188,8 @@ namespace arithm
            return saturate_cast<D>(a + b);
        }
-        __device__ __forceinline__ AddMat() {}
+        __host__ __device__ __forceinline__ AddMat() {}
-        __device__ __forceinline__ AddMat(const AddMat& other) {}
+        __host__ __device__ __forceinline__ AddMat(const AddMat&) {}
    };
 }
@@ -397,8 +397,8 @@ namespace arithm
            return vsub4(a, b);
        }
-        __device__ __forceinline__ VSub4() {}
+        __host__ __device__ __forceinline__ VSub4() {}
-        __device__ __forceinline__ VSub4(const VSub4& other) {}
+        __host__ __device__ __forceinline__ VSub4(const VSub4&) {}
    };
    ////////////////////////////////////
@@ -410,8 +410,8 @@ namespace arithm
            return vsub2(a, b);
        }
-        __device__ __forceinline__ VSub2() {}
+        __host__ __device__ __forceinline__ VSub2() {}
-        __device__ __forceinline__ VSub2(const VSub2& other) {}
+        __host__ __device__ __forceinline__ VSub2(const VSub2&) {}
    };
    ////////////////////////////////////
@@ -423,8 +423,8 @@ namespace arithm
            return saturate_cast<D>(a - b);
        }
-        __device__ __forceinline__ SubMat() {}
+        __host__ __device__ __forceinline__ SubMat() {}
-        __device__ __forceinline__ SubMat(const SubMat& other) {}
+        __host__ __device__ __forceinline__ SubMat(const SubMat&) {}
    };
 }
@@ -617,8 +617,8 @@ namespace arithm
            return res;
        }
-        __device__ __forceinline__ Mul_8uc4_32f() {}
+        __host__ __device__ __forceinline__ Mul_8uc4_32f() {}
-        __device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f& other) {}
+        __host__ __device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f&) {}
    };
    struct Mul_16sc4_32f : binary_function<short4, float, short4>
@@ -629,8 +629,8 @@ namespace arithm
                               saturate_cast<short>(a.z * b), saturate_cast<short>(a.w * b));
        }
-        __device__ __forceinline__ Mul_16sc4_32f() {}
+        __host__ __device__ __forceinline__ Mul_16sc4_32f() {}
-        __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f& other) {}
+        __host__ __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f&) {}
    };
    template <typename T, typename D> struct Mul : binary_function<T, T, D>
@@ -640,8 +640,8 @@ namespace arithm
            return saturate_cast<D>(a * b);
        }
-        __device__ __forceinline__ Mul() {}
+        __host__ __device__ __forceinline__ Mul() {}
-        __device__ __forceinline__ Mul(const Mul& other) {}
+        __host__ __device__ __forceinline__ Mul(const Mul&) {}
    };
    template <typename T, typename S, typename D> struct MulScale : binary_function<T, T, D>
@@ -888,8 +888,8 @@ namespace arithm
            return b != 0 ? saturate_cast<D>(a / b) : 0;
        }
-        __device__ __forceinline__ Div() {}
+        __host__ __device__ __forceinline__ Div() {}
-        __device__ __forceinline__ Div(const Div& other) {}
+        __host__ __device__ __forceinline__ Div(const Div&) {}
    };
    template <typename T> struct Div<T, float> : binary_function<T, T, float>
    {
@@ -898,8 +898,8 @@ namespace arithm
            return b != 0 ? static_cast<float>(a) / b : 0;
        }
-        __device__ __forceinline__ Div() {}
+        __host__ __device__ __forceinline__ Div() {}
-        __device__ __forceinline__ Div(const Div& other) {}
+        __host__ __device__ __forceinline__ Div(const Div&) {}
    };
    template <typename T> struct Div<T, double> : binary_function<T, T, double>
    {
@@ -908,8 +908,8 @@ namespace arithm
            return b != 0 ? static_cast<double>(a) / b : 0;
        }
-        __device__ __forceinline__ Div() {}
+        __host__ __device__ __forceinline__ Div() {}
-        __device__ __forceinline__ Div(const Div& other) {}
+        __host__ __device__ __forceinline__ Div(const Div&) {}
    };
    template <typename T, typename S, typename D> struct DivScale : binary_function<T, T, D>
@@ -1196,8 +1196,8 @@ namespace arithm
            return vabsdiff4(a, b);
        }
-        __device__ __forceinline__ VAbsDiff4() {}
+        __host__ __device__ __forceinline__ VAbsDiff4() {}
-        __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {}
+        __host__ __device__ __forceinline__ VAbsDiff4(const VAbsDiff4&) {}
    };
    ////////////////////////////////////
@@ -1209,8 +1209,8 @@ namespace arithm
            return vabsdiff2(a, b);
        }
-        __device__ __forceinline__ VAbsDiff2() {}
+        __host__ __device__ __forceinline__ VAbsDiff2() {}
-        __device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {}
+        __host__ __device__ __forceinline__ VAbsDiff2(const VAbsDiff2&) {}
    };
    ////////////////////////////////////
@@ -1235,8 +1235,8 @@ namespace arithm
            return saturate_cast<T>(_abs(a - b));
        }
-        __device__ __forceinline__ AbsDiffMat() {}
+        __host__ __device__ __forceinline__ AbsDiffMat() {}
-        __device__ __forceinline__ AbsDiffMat(const AbsDiffMat& other) {}
+        __host__ __device__ __forceinline__ AbsDiffMat(const AbsDiffMat&) {}
    };
 }
@@ -1370,8 +1370,8 @@ namespace arithm
            return saturate_cast<T>(x * x);
        }
-        __device__ __forceinline__ Sqr() {}
+        __host__ __device__ __forceinline__ Sqr() {}
-        __device__ __forceinline__ Sqr(const Sqr& other) {}
+        __host__ __device__ __forceinline__ Sqr(const Sqr&) {}
    };
 }
@@ -1466,8 +1466,8 @@ namespace arithm
            return saturate_cast<T>(f(x));
        }
-        __device__ __forceinline__ Exp() {}
+        __host__ __device__ __forceinline__ Exp() {}
-        __device__ __forceinline__ Exp(const Exp& other) {}
+        __host__ __device__ __forceinline__ Exp(const Exp&) {}
    };
 }
@@ -1507,8 +1507,8 @@ namespace arithm
            return vcmpeq4(a, b);
        }
-        __device__ __forceinline__ VCmpEq4() {}
+        __host__ __device__ __forceinline__ VCmpEq4() {}
-        __device__ __forceinline__ VCmpEq4(const VCmpEq4& other) {}
+        __host__ __device__ __forceinline__ VCmpEq4(const VCmpEq4&) {}
    };
    struct VCmpNe4 : binary_function<uint, uint, uint>
    {
@@ -1517,8 +1517,8 @@ namespace arithm
            return vcmpne4(a, b);
        }
-        __device__ __forceinline__ VCmpNe4() {}
+        __host__ __device__ __forceinline__ VCmpNe4() {}
-        __device__ __forceinline__ VCmpNe4(const VCmpNe4& other) {}
+        __host__ __device__ __forceinline__ VCmpNe4(const VCmpNe4&) {}
    };
    struct VCmpLt4 : binary_function<uint, uint, uint>
    {
@@ -1527,8 +1527,8 @@ namespace arithm
            return vcmplt4(a, b);
        }
-        __device__ __forceinline__ VCmpLt4() {}
+        __host__ __device__ __forceinline__ VCmpLt4() {}
-        __device__ __forceinline__ VCmpLt4(const VCmpLt4& other) {}
+        __host__ __device__ __forceinline__ VCmpLt4(const VCmpLt4&) {}
    };
    struct VCmpLe4 : binary_function<uint, uint, uint>
    {
@@ -1537,8 +1537,8 @@ namespace arithm
            return vcmple4(a, b);
        }
-        __device__ __forceinline__ VCmpLe4() {}
+        __host__ __device__ __forceinline__ VCmpLe4() {}
-        __device__ __forceinline__ VCmpLe4(const VCmpLe4& other) {}
+        __host__ __device__ __forceinline__ VCmpLe4(const VCmpLe4&) {}
    };
    ////////////////////////////////////
@@ -2008,8 +2008,8 @@ namespace arithm
            return vmin4(a, b);
        }
-        __device__ __forceinline__ VMin4() {}
+        __host__ __device__ __forceinline__ VMin4() {}
-        __device__ __forceinline__ VMin4(const VMin4& other) {}
+        __host__ __device__ __forceinline__ VMin4(const VMin4&) {}
    };
    ////////////////////////////////////
@@ -2021,8 +2021,8 @@ namespace arithm
            return vmin2(a, b);
        }
-        __device__ __forceinline__ VMin2() {}
+        __host__ __device__ __forceinline__ VMin2() {}
-        __device__ __forceinline__ VMin2(const VMin2& other) {}
+        __host__ __device__ __forceinline__ VMin2(const VMin2&) {}
    };
 }
@@ -2100,8 +2100,8 @@ namespace arithm
            return vmax4(a, b);
        }
-        __device__ __forceinline__ VMax4() {}
+        __host__ __device__ __forceinline__ VMax4() {}
-        __device__ __forceinline__ VMax4(const VMax4& other) {}
+        __host__ __device__ __forceinline__ VMax4(const VMax4&) {}
    };
    ////////////////////////////////////
@@ -2113,8 +2113,8 @@ namespace arithm
            return vmax2(a, b);
        }
-        __device__ __forceinline__ VMax2() {}
+        __host__ __device__ __forceinline__ VMax2() {}
-        __device__ __forceinline__ VMax2(const VMax2& other) {}
+        __host__ __device__ __forceinline__ VMax2(const VMax2&) {}
    };
 }

--- a/modules/gpu/src/error.cpp
+++ b/modules/gpu/src/error.cpp
@@ -81,48 +81,90 @@ namespace
    const ErrorEntry npp_errors [] =
    {
-        error_entry( NPP_NOT_SUPPORTED_MODE_ERROR ),
-        error_entry( NPP_ROUND_MODE_NOT_SUPPORTED_ERROR ),
-        error_entry( NPP_RESIZE_NO_OPERATION_ERROR ),
 #if defined (_MSC_VER)
        error_entry( NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY ),
 #endif
+#if NPP_VERSION < 5500
        error_entry( NPP_BAD_ARG_ERROR ),
-        error_entry( NPP_LUT_NUMBER_OF_LEVELS_ERROR ),
-        error_entry( NPP_TEXTURE_BIND_ERROR ),
        error_entry( NPP_COEFF_ERROR ),
        error_entry( NPP_RECT_ERROR ),
        error_entry( NPP_QUAD_ERROR ),
-        error_entry( NPP_WRONG_INTERSECTION_ROI_ERROR ),
-        error_entry( NPP_NOT_EVEN_STEP_ERROR ),
-        error_entry( NPP_INTERPOLATION_ERROR ),
-        error_entry( NPP_RESIZE_FACTOR_ERROR ),
-        error_entry( NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR ),
        error_entry( NPP_MEMFREE_ERR ),
        error_entry( NPP_MEMSET_ERR ),
-        error_entry( NPP_MEMCPY_ERROR ),
        error_entry( NPP_MEM_ALLOC_ERR ),
        error_entry( NPP_HISTO_NUMBER_OF_LEVELS_ERROR ),
        error_entry( NPP_MIRROR_FLIP_ERR ),
        error_entry( NPP_INVALID_INPUT ),
+        error_entry( NPP_POINTER_ERROR ),
+        error_entry( NPP_WARNING ),
+        error_entry( NPP_ODD_ROI_WARNING ),
+#else
+        error_entry( NPP_INVALID_HOST_POINTER_ERROR ),
+        error_entry( NPP_INVALID_DEVICE_POINTER_ERROR ),
+        error_entry( NPP_LUT_PALETTE_BITSIZE_ERROR ),
+        error_entry( NPP_ZC_MODE_NOT_SUPPORTED_ERROR ),
+        error_entry( NPP_MEMFREE_ERROR ),
+        error_entry( NPP_MEMSET_ERROR ),
+        error_entry( NPP_QUALITY_INDEX_ERROR ),
+        error_entry( NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR ),
+        error_entry( NPP_CHANNEL_ORDER_ERROR ),
+        error_entry( NPP_ZERO_MASK_VALUE_ERROR ),
+        error_entry( NPP_QUADRANGLE_ERROR ),
+        error_entry( NPP_RECTANGLE_ERROR ),
+        error_entry( NPP_COEFFICIENT_ERROR ),
+        error_entry( NPP_NUMBER_OF_CHANNELS_ERROR ),
+        error_entry( NPP_COI_ERROR ),
+        error_entry( NPP_DIVISOR_ERROR ),
+        error_entry( NPP_CHANNEL_ERROR ),
+        error_entry( NPP_STRIDE_ERROR ),
+        error_entry( NPP_ANCHOR_ERROR ),
+        error_entry( NPP_MASK_SIZE_ERROR ),
+        error_entry( NPP_MIRROR_FLIP_ERROR ),
+        error_entry( NPP_MOMENT_00_ZERO_ERROR ),
+        error_entry( NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR ),
+        error_entry( NPP_THRESHOLD_ERROR ),
+        error_entry( NPP_CONTEXT_MATCH_ERROR ),
+        error_entry( NPP_FFT_FLAG_ERROR ),
+        error_entry( NPP_FFT_ORDER_ERROR ),
+        error_entry( NPP_SCALE_RANGE_ERROR ),
+        error_entry( NPP_DATA_TYPE_ERROR ),
+        error_entry( NPP_OUT_OFF_RANGE_ERROR ),
+        error_entry( NPP_DIVIDE_BY_ZERO_ERROR ),
+        error_entry( NPP_MEMORY_ALLOCATION_ERR ),
+        error_entry( NPP_RANGE_ERROR ),
+        error_entry( NPP_BAD_ARGUMENT_ERROR ),
+        error_entry( NPP_NO_MEMORY_ERROR ),
+        error_entry( NPP_ERROR_RESERVED ),
+        error_entry( NPP_NO_OPERATION_WARNING ),
+        error_entry( NPP_DIVIDE_BY_ZERO_WARNING ),
+        error_entry( NPP_WRONG_INTERSECTION_ROI_WARNING ),
+#endif
+        error_entry( NPP_NOT_SUPPORTED_MODE_ERROR ),
+        error_entry( NPP_ROUND_MODE_NOT_SUPPORTED_ERROR ),
+        error_entry( NPP_RESIZE_NO_OPERATION_ERROR ),
+        error_entry( NPP_LUT_NUMBER_OF_LEVELS_ERROR ),
+        error_entry( NPP_TEXTURE_BIND_ERROR ),
+        error_entry( NPP_WRONG_INTERSECTION_ROI_ERROR ),
+        error_entry( NPP_NOT_EVEN_STEP_ERROR ),
+        error_entry( NPP_INTERPOLATION_ERROR ),
+        error_entry( NPP_RESIZE_FACTOR_ERROR ),
+        error_entry( NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR ),
+        error_entry( NPP_MEMCPY_ERROR ),
        error_entry( NPP_ALIGNMENT_ERROR ),
        error_entry( NPP_STEP_ERROR ),
        error_entry( NPP_SIZE_ERROR ),
-        error_entry( NPP_POINTER_ERROR ),
        error_entry( NPP_NULL_POINTER_ERROR ),
        error_entry( NPP_CUDA_KERNEL_EXECUTION_ERROR ),
        error_entry( NPP_NOT_IMPLEMENTED_ERROR ),
        error_entry( NPP_ERROR ),
        error_entry( NPP_NO_ERROR ),
        error_entry( NPP_SUCCESS ),
-        error_entry( NPP_WARNING ),
        error_entry( NPP_WRONG_INTERSECTION_QUAD_WARNING ),
        error_entry( NPP_MISALIGNED_DST_ROI_WARNING ),
        error_entry( NPP_AFFINE_QUAD_INCORRECT_WARNING ),
-        error_entry( NPP_DOUBLE_SIZE_WARNING ),
+        error_entry( NPP_DOUBLE_SIZE_WARNING )
-        error_entry( NPP_ODD_ROI_WARNING )
    };
    const size_t npp_error_num = sizeof(npp_errors) / sizeof(npp_errors[0]);

--- a/modules/gpu/src/matrix_reductions.cpp
+++ b/modules/gpu/src/matrix_reductions.cpp
@@ -187,10 +187,20 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
    CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
-    typedef NppStatus (*npp_norm_diff_func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
+#if CUDA_VERSION < 5050
-        NppiSize oSizeROI, Npp64f* pRetVal);
+    typedef NppStatus (*func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2, NppiSize oSizeROI, Npp64f* pRetVal);
-    static const npp_norm_diff_func_t npp_norm_diff_func[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
+    static const func_t funcs[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
+#else
+    typedef NppStatus (*func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
+        NppiSize oSizeROI, Npp64f* pRetVal, Npp8u * pDeviceBuffer);
+    typedef NppStatus (*buf_size_func_t)(NppiSize oSizeROI, int* hpBufferSize);
+    static const func_t funcs[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
+    static const buf_size_func_t buf_size_funcs[] = {nppiNormDiffInfGetBufferHostSize_8u_C1R, nppiNormDiffL1GetBufferHostSize_8u_C1R, nppiNormDiffL2GetBufferHostSize_8u_C1R};
+#endif
    NppiSize sz;
    sz.width  = src1.cols;
@@ -202,7 +212,16 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
    DeviceBuffer dbuf;
-    nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf) );
+#if CUDA_VERSION < 5050
+    nppSafeCall( funcs[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf) );
+#else
+    // This branch uses the func_t signature with the extra pDeviceBuffer argument:
+    // query the required buffer size, allocate it, then pass it to the norm call.
+    int bufSize = 0;
+    // The size query returns an NppStatus as well; check it so a failure is
+    // reported here instead of allocating `buf` from an unset size.
+    nppSafeCall( buf_size_funcs[funcIdx](sz, &bufSize) );
+    GpuMat buf(1, bufSize, CV_8UC1);
+    nppSafeCall( funcs[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf, buf.data) );
+#endif
    cudaSafeCall( cudaDeviceSynchronize() );

--- a/modules/gpu/src/precomp.hpp
+++ b/modules/gpu/src/precomp.hpp
@@ -116,11 +116,13 @@
    #define CUDART_MINIMUM_REQUIRED_VERSION 4010
    #define NPP_MINIMUM_REQUIRED_VERSION 4100
+    #define NPP_VERSION (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD)
    #if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
        #error "Insufficient Cuda Runtime library version, please update it."
    #endif
-    #if (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD < NPP_MINIMUM_REQUIRED_VERSION)
+    #if (NPP_VERSION < NPP_MINIMUM_REQUIRED_VERSION)
        #error "Insufficient NPP version, please update it."
    #endif