added WITH_CUFFT and WITH_CUBLAS flags to cmake scripts

fixed gpu module error reporting; added asynchronous versions of some functions

added WITH_CUFFT and WITH_CUBLAS flags to cmake scripts
fixed gpu module error reporting; added asynchronous versions of some functions
40ee754e · Vladislav Vinogradov · a73b509b · 40ee754e · 40ee754e · 40ee754e
Commit 40ee754e authored Oct 19, 2011 by Vladislav Vinogradov
16 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -452,8 +452,12 @@ set(WITH_EIGEN ON CACHE BOOL "Include Eigen2/Eigen3 support")

 if( CMAKE_VERSION VERSION_GREATER "2.8")
    set(WITH_CUDA ON CACHE BOOL "Include NVidia Cuda Runtime support")
+    set(WITH_CUFFT ON CACHE BOOL "Include NVidia Cuda Fast Fourier Transform (FFT) library support")
+    set(WITH_CUBLAS OFF CACHE BOOL "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support")
 else()
    set(WITH_CUDA OFF CACHE BOOL "Include NVidia Cuda Runtime support")
+    set(WITH_CUFFT OFF CACHE BOOL "Include NVidia Cuda Fast Fourier Transform (FFT) library support")
+    set(WITH_CUBLAS OFF CACHE BOOL "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support")
 endif()

 set(WITH_OPENNI OFF CACHE BOOL "Include OpenNI support")
@@ -995,6 +999,15 @@ if(WITH_CUDA)
    
    if(CUDA_FOUND)
        set(HAVE_CUDA 1)
+
+        if(WITH_CUFFT)
+            set(HAVE_CUFFT 1)
+        endif()
+
+        if(WITH_CUBLAS)
+            set(HAVE_CUBLAS 1)
+        endif()
+
        message(STATUS "CUDA detected: " ${CUDA_VERSION})

        set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0)" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")

--- a/cvconfig.h.cmake
+++ b/cvconfig.h.cmake
@@ -172,6 +172,12 @@
 /* NVidia Cuda Runtime API*/
 #cmakedefine HAVE_CUDA

+/* NVidia Cuda Fast Fourier Transform (FFT) API*/
+#cmakedefine HAVE_CUFFT
+
+/* NVidia Cuda Basic Linear Algebra Subprograms (BLAS) API*/
+#cmakedefine HAVE_CUBLAS
+
 /* Compile for 'real' NVIDIA GPU architectures */
 #define CUDA_ARCH_BIN "${OPENCV_CUDA_ARCH_BIN}"


--- a/modules/core/include/opencv2/core/types_c.h
+++ b/modules/core/include/opencv2/core/types_c.h
@@ -250,9 +250,7 @@ enum {
 CV_StsBadMemBlock=            -214, /* an allocated block has been corrupted */
 CV_StsAssert=                 -215, /* assertion failed */    
 CV_GpuNotSupported=           -216,  
- CV_GpuApiCallError=           -217, 
- CV_GpuNppCallError=           -218,
- CV_GpuCufftCallError=         -219
+ CV_GpuApiCallError=           -217
 };

 /****************************************************************************************\

--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -629,9 +629,8 @@ CV_IMPL const char* cvErrorStr( int status )
    case CV_StsNotImplemented :      return "The function/feature is not implemented";
    case CV_StsBadMemBlock :         return "Memory block has been corrupted";
    case CV_StsAssert :              return "Assertion failed";
-    case CV_GpuNotSupported : return "No GPU support";
-    case CV_GpuApiCallError : return "Gpu Api call";
-    case CV_GpuNppCallError : return "Npp Api call";
+    case CV_GpuNotSupported :        return "No GPU support";
+    case CV_GpuApiCallError :        return "Gpu Api call";
    };

    sprintf(buf, "Unknown %s code %d", status >= 0 ? "status":"error", status);

--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@@ -120,12 +120,19 @@ set_target_properties(${the_target} PROPERTIES
 target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${DEPS} )

 if (HAVE_CUDA)
-    target_link_libraries(${the_target} ${CUDA_LIBRARIES})    
-    CUDA_ADD_CUFFT_TO_TARGET(${the_target})
+    target_link_libraries(${the_target} ${CUDA_LIBRARIES})

    unset(CUDA_npp_LIBRARY CACHE)
    find_cuda_helper_libs(npp)
    target_link_libraries(${the_target} ${CUDA_npp_LIBRARY})
+
+    if(HAVE_CUFFT)
+        CUDA_ADD_CUFFT_TO_TARGET(${the_target})
+    endif()
+
+    if(HAVE_CUBLAS)
+        CUDA_ADD_CUBLAS_TO_TARGET(${the_target})
+    endif()
 endif()

 if(MSVC)

--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -141,8 +141,8 @@ namespace cv

        //////////////////////////////// Error handling ////////////////////////

-        CV_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func);
-        CV_EXPORTS void nppError( int err, const char *file, const int line, const char *func);
+        //CV_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func);
+        //CV_EXPORTS void nppError( int err, const char *file, const int line, const char *func);

        //////////////////////////////// CudaMem ////////////////////////////////
        // CudaMem is limited cv::Mat with page locked memory allocation.
@@ -628,11 +628,11 @@ namespace cv

        //! Does mean shift filtering on GPU.
        CV_EXPORTS void meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
-            TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
+            TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), Stream& stream = Stream::Null());

        //! Does mean shift procedure on GPU.
        CV_EXPORTS void meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr,
-            TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
+            TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), Stream& stream = Stream::Null());

        //! Does mean shift segmentation with elimination of small regions.
        CV_EXPORTS void meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr, int minsize,
@@ -683,10 +683,12 @@ namespace cv
        //! rotate 8bit single or four channel image
        //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
        //! supports CV_8UC1, CV_8UC4 types
-        CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());
+        CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, 
+            int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());

        //! copies 2D array to a larger destination array and pads borders with user-specifiable constant
-        CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, const Scalar& value = Scalar(), Stream& stream = Stream::Null());
+        CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, 
+            const Scalar& value = Scalar(), Stream& stream = Stream::Null());

        //! computes the integral image
        //! sum will have CV_32S type, but will contain unsigned int values
@@ -715,21 +717,26 @@ namespace cv
        CV_EXPORTS void rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& stream = Stream::Null());

        //! computes Harris cornerness criteria at each image pixel
-        CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType=BORDER_REFLECT101);
-        CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k, int borderType=BORDER_REFLECT101);
+        CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, 
+            int borderType = BORDER_REFLECT101);
+        CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k, 
+            int borderType = BORDER_REFLECT101);
+        CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k, 
+            int borderType = BORDER_REFLECT101, Stream& stream = Stream::Null());

        //! computes minimum eigen value of 2x2 derivative covariation matrix at each pixel - the cornerness criteria
        CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType=BORDER_REFLECT101);
        CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType=BORDER_REFLECT101);
+        CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, 
+            int borderType=BORDER_REFLECT101, Stream& stream = Stream::Null());

        //! performs per-element multiplication of two full (not packed) Fourier spectrums
        //! supports 32FC2 matrixes only (interleaved format)
-        CV_EXPORTS void mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB=false);
+        CV_EXPORTS void mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB=false, Stream& stream = Stream::Null());

        //! performs per-element multiplication of two full (not packed) Fourier spectrums and scales the result
        //! supports 32FC2 matrixes only (interleaved format)
-        CV_EXPORTS void mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, 
-                                             float scale, bool conjB=false);
+        CV_EXPORTS void mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null());

        //! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix.
        //! Param dft_size is the size of DFT transform.
@@ -742,19 +749,14 @@ namespace cv
        //! in CUFFT's format. Result as full complex matrix for such kind of transform cannot be retrieved.
        //!
        //! For complex-to-real transform it is assumed that the source matrix is packed in CUFFT's format.
-        CV_EXPORTS void dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0);
+        CV_EXPORTS void dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0, Stream& stream = Stream::Null());

        //! computes convolution (or cross-correlation) of two images using discrete Fourier transform
        //! supports source images of 32FC1 type only
        //! result matrix will have 32FC1 type
-        CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, 
-                                 bool ccorr=false);
-
        struct CV_EXPORTS ConvolveBuf;
-
-        //! buffered version
-        CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, 
-                                 bool ccorr, ConvolveBuf& buf);
+        CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr = false);
+        CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream = Stream::Null());

        struct CV_EXPORTS ConvolveBuf
        {
@@ -766,7 +768,7 @@ namespace cv

        private:
            static Size estimateBlockSize(Size result_size, Size templ_size);
-            friend void convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&);
+            friend void convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream& stream);

            Size result_size;
            Size block_size;
@@ -778,7 +780,7 @@ namespace cv
        };

        //! computes the proximity map for the raster template and the image where the template is searched for
-        CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method);
+        CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream& stream = Stream::Null());

        //! smoothes the source image and downsamples it
        CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());

--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -93,7 +93,7 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
        sz.width  = src.cols;
        sz.height = src.rows;

-        nppSafeCall( nppiStTranspose_32u_C1R(const_cast<Ncv32u*>(src.ptr<Ncv32u>()), static_cast<int>(src.step), 
+        ncvSafeCall( nppiStTranspose_32u_C1R(const_cast<Ncv32u*>(src.ptr<Ncv32u>()), static_cast<int>(src.step), 
            dst.ptr<Ncv32u>(), static_cast<int>(dst.step), sz) );
    }
    else // if (src.elemSize() == 8)
@@ -104,7 +104,7 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
        sz.width  = src.cols;
        sz.height = src.rows;

-        nppSafeCall( nppiStTranspose_64u_C1R(const_cast<Ncv64u*>(src.ptr<Ncv64u>()), static_cast<int>(src.step), 
+        ncvSafeCall( nppiStTranspose_64u_C1R(const_cast<Ncv64u*>(src.ptr<Ncv64u>()), static_cast<int>(src.step), 
            dst.ptr<Ncv64u>(), static_cast<int>(dst.step), sz) );		
    }


--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@@ -66,10 +66,7 @@ struct cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl
    CascadeClassifierImpl(const string& filename) : lastAllocatedFrameSize(-1, -1)
    {
        ncvSetDebugOutputHandler(NCVDebugOutputHandler);
-        if (ncvStat != load(filename))
-        {
-            CV_Error(CV_GpuApiCallError, "Error in GPU cacade load");
-        }
+        ncvSafeCall( load(filename) );
    }


@@ -287,11 +284,7 @@ int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat& image, GpuMa
    }

    unsigned int numDetections;
-    NCVStatus ncvStat = impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, ncvMinSize, numDetections);
-    if (ncvStat != NCV_SUCCESS)
-    {
-        CV_Error(CV_GpuApiCallError, "Error in face detectioln");
-    }
+    ncvSafeCall( impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, ncvMinSize, numDetections) );

    return numDetections;
 }

--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
--- a/modules/gpu/src/cuda/match_template.cu
+++ b/modules/gpu/src/cuda/match_template.cu
--- a/modules/gpu/src/cuda/safe_call.hpp
+++ b/modules/gpu/src/cuda/safe_call.hpp
@@ -45,16 +45,18 @@

 #include "cuda_runtime_api.h"
 #include "cufft.h"
-//#include <nppdefs.h>
+#include "NCV.hpp"

 #if defined(__GNUC__)
    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
-    #define cufftSafeCall(expr)  ___cufftSafeCall(expr, __FILE__, __LINE__, __func__)
    #define nppSafeCall(expr)  ___nppSafeCall(expr, __FILE__, __LINE__, __func__)
+    #define ncvSafeCall(expr)  ___ncvSafeCall(expr, __FILE__, __LINE__, __func__)
+    #define cufftSafeCall(expr)  ___cufftSafeCall(expr, __FILE__, __LINE__, __func__)
 #else /* defined(__CUDACC__) || defined(__MSVC__) */
    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__)
-    #define cufftSafeCall(expr)  ___cufftSafeCall(expr, __FILE__, __LINE__)
    #define nppSafeCall(expr)  ___nppSafeCall(expr, __FILE__, __LINE__)
+    #define ncvSafeCall(expr)  ___ncvSafeCall(expr, __FILE__, __LINE__)
+    #define cufftSafeCall(expr)  ___cufftSafeCall(expr, __FILE__, __LINE__)
 #endif

 namespace cv
@@ -62,8 +64,9 @@ namespace cv
    namespace gpu
    {
        void error(const char *error_string, const char *file, const int line, const char *func = "");
-        void nppError(int err, const char *file, const int line, const char *func = "");   
-        void cufftError(int err, const char *file, const int line, const char *func = "");   
+        void nppError(int err, const char *file, const int line, const char *func = "");
+        void ncvError(int err, const char *file, const int line, const char *func = "");
+        void cufftError(int err, const char *file, const int line, const char *func = "");

        static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
        {
@@ -71,17 +74,23 @@ namespace cv
                cv::gpu::error(cudaGetErrorString(err), file, line, func);
        }

-        static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")
-        {
-            if (CUFFT_SUCCESS != err)
-                cv::gpu::cufftError(err, file, line, func);
-        }
-
        static inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
        {
            if (err < 0)
                cv::gpu::nppError(err, file, line, func);
        }
+
+        static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")
+        {
+            if (NCV_SUCCESS != err)
+                cv::gpu::ncvError(err, file, line, func);
+        }
+
+        static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")
+        {
+            if (CUFFT_SUCCESS != err)
+                cv::gpu::cufftError(err, file, line, func);
+        }
    }
 }


--- a/modules/gpu/src/error.cpp
+++ b/modules/gpu/src/error.cpp
@@ -42,30 +42,45 @@

 #include "precomp.hpp"

-
 using namespace cv;
 using namespace cv::gpu;
+using namespace std;

-
-#if !defined (HAVE_CUDA)
-
-#else /* !defined (HAVE_CUDA) */
-
+#ifdef HAVE_CUDA

 namespace 
 {
    #define error_entry(entry)  { entry, #entry }

-    //////////////////////////////////////////////////////////////////////////
-    // NPP errors
-
-    struct NppError
+    struct ErrorEntry
    {
-        int error;
+        int code;
        string str;
-    } 
+    }; 
+
+    struct ErrorEntryComparer
+    {
+        int code;
+        ErrorEntryComparer(int code_) : code(code_) {};
+        bool operator()(const ErrorEntry& e) const { return e.code == code; }
+    };
+
+    string getErrorString(int code, const ErrorEntry* errors, size_t n)
+    {
+        size_t idx = find_if(errors, errors + n, ErrorEntryComparer(code)) - errors;
+
+        const string& msg = (idx != n) ? errors[idx].str : string("Unknown error code");
+
+        ostringstream ostr;
+        ostr << msg << " [Code = " << code << "]";
+
+        return ostr.str();
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // NPP errors
    
-    npp_errors [] = 
+    const ErrorEntry npp_errors [] = 
    {
        error_entry( NPP_NOT_SUPPORTED_MODE_ERROR ),
        error_entry( NPP_ROUND_MODE_NOT_SUPPORTED_ERROR ),
@@ -74,6 +89,7 @@ namespace
 #if defined (_MSC_VER)
        error_entry( NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY ),
 #endif
+
        error_entry( NPP_BAD_ARG_ERROR ),
        error_entry( NPP_LUT_NUMBER_OF_LEVELS_ERROR ),
        error_entry( NPP_TEXTURE_BIND_ERROR ),
@@ -110,106 +126,116 @@ namespace
        error_entry( NPP_ODD_ROI_WARNING )
    };

-    const size_t error_num = sizeof(npp_errors) / sizeof(npp_errors[0]);
+    const size_t npp_error_num = sizeof(npp_errors) / sizeof(npp_errors[0]);

-    struct Searcher
+    //////////////////////////////////////////////////////////////////////////
+    // NCV errors
+    
+    const ErrorEntry ncv_errors [] = 
    {
-        int err;
-        Searcher(int err_) : err(err_) {};
-        bool operator()(const NppError& e) const { return e.error == err; }
+        error_entry( NCV_SUCCESS ),
+        error_entry( NCV_UNKNOWN_ERROR ),
+        error_entry( NCV_CUDA_ERROR ),
+        error_entry( NCV_NPP_ERROR ),
+        error_entry( NCV_FILE_ERROR ),
+        error_entry( NCV_NULL_PTR ),
+        error_entry( NCV_INCONSISTENT_INPUT ),
+        error_entry( NCV_TEXTURE_BIND_ERROR ),
+        error_entry( NCV_DIMENSIONS_INVALID ),
+        error_entry( NCV_INVALID_ROI ),
+        error_entry( NCV_INVALID_STEP ),
+        error_entry( NCV_INVALID_SCALE ),
+        error_entry( NCV_INVALID_SCALE ),
+        error_entry( NCV_ALLOCATOR_NOT_INITIALIZED ),
+        error_entry( NCV_ALLOCATOR_BAD_ALLOC ),
+        error_entry( NCV_ALLOCATOR_BAD_DEALLOC ),
+        error_entry( NCV_ALLOCATOR_INSUFFICIENT_CAPACITY ),
+        error_entry( NCV_ALLOCATOR_DEALLOC_ORDER ),
+        error_entry( NCV_ALLOCATOR_BAD_REUSE ),
+        error_entry( NCV_MEM_COPY_ERROR ),
+        error_entry( NCV_MEM_RESIDENCE_ERROR ),
+        error_entry( NCV_MEM_INSUFFICIENT_CAPACITY ),
+        error_entry( NCV_HAAR_INVALID_PIXEL_STEP ),
+        error_entry( NCV_HAAR_TOO_MANY_FEATURES_IN_CLASSIFIER ),
+        error_entry( NCV_HAAR_TOO_MANY_FEATURES_IN_CASCADE ),
+        error_entry( NCV_HAAR_TOO_LARGE_FEATURES ),
+        error_entry( NCV_HAAR_XML_LOADING_EXCEPTION ),
+        error_entry( NCV_NOIMPL_HAAR_TILTED_FEATURES ),
+        error_entry( NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW ),
+        error_entry( NPPST_SUCCESS ),
+        error_entry( NPPST_ERROR ),
+        error_entry( NPPST_CUDA_KERNEL_EXECUTION_ERROR ),
+        error_entry( NPPST_NULL_POINTER_ERROR ),
+        error_entry( NPPST_TEXTURE_BIND_ERROR ),
+        error_entry( NPPST_MEMCPY_ERROR ),
+        error_entry( NPPST_MEM_ALLOC_ERR ),
+        error_entry( NPPST_MEMFREE_ERR ),
+        error_entry( NPPST_INVALID_ROI ),
+        error_entry( NPPST_INVALID_STEP ),
+        error_entry( NPPST_INVALID_SCALE ),
+        error_entry( NPPST_MEM_INSUFFICIENT_BUFFER ),
+        error_entry( NPPST_MEM_RESIDENCE_ERROR ),
+        error_entry( NPPST_MEM_INTERNAL_ERROR )
    };

+    const size_t ncv_error_num = sizeof(ncv_errors) / sizeof(ncv_errors[0]); // entry count of ncv_errors, the table ncvError() searches
+
    //////////////////////////////////////////////////////////////////////////
    // CUFFT errors

-    struct CufftError
-    {
-        int code;
-        string message;
-    };
-
-    const CufftError cufft_errors[] = 
-    {
-        error_entry(CUFFT_INVALID_PLAN),
-        error_entry(CUFFT_ALLOC_FAILED),
-        error_entry(CUFFT_INVALID_TYPE),
-        error_entry(CUFFT_INVALID_VALUE),
-        error_entry(CUFFT_INTERNAL_ERROR),
-        error_entry(CUFFT_EXEC_FAILED),
-        error_entry(CUFFT_SETUP_FAILED),
-        error_entry(CUFFT_INVALID_SIZE),
-        error_entry(CUFFT_UNALIGNED_DATA)
-    };
-
-    struct CufftErrorComparer
+    const ErrorEntry cufft_errors[] = 
    {
-        CufftErrorComparer(int code_): code(code_) {}
-        bool operator()(const CufftError& other) const 
-        { 
-            return other.code == code; 
-        }
-        int code;
+        error_entry( CUFFT_INVALID_PLAN ),
+        error_entry( CUFFT_ALLOC_FAILED ),
+        error_entry( CUFFT_INVALID_TYPE ),
+        error_entry( CUFFT_INVALID_VALUE ),
+        error_entry( CUFFT_INTERNAL_ERROR ),
+        error_entry( CUFFT_EXEC_FAILED ),
+        error_entry( CUFFT_SETUP_FAILED ),
+        error_entry( CUFFT_INVALID_SIZE ),
+        error_entry( CUFFT_UNALIGNED_DATA )
    };

    const int cufft_error_num = sizeof(cufft_errors) / sizeof(cufft_errors[0]);
-
 }

 namespace cv
 {
    namespace gpu
    {
-        const string getNppErrorString( int err )
-        {
-            size_t idx = std::find_if(npp_errors, npp_errors + error_num, Searcher(err)) - npp_errors;
-            const string& msg = (idx != error_num) ? npp_errors[idx].str : string("Unknown error code");
-
-            std::stringstream interpreter;
-            interpreter << msg <<" [Code = " << err << "]";
-
-            return interpreter.str();
-        }
-
-        void nppError( int err, const char *file, const int line, const char *func)
-        {                    
-            cv::error( cv::Exception(CV_GpuNppCallError, getNppErrorString(err), func, file, line) );                
-        }
-
-        const string getCufftErrorString(int err_code)
-        {
-            const CufftError* cufft_error = std::find_if(
-                    cufft_errors, cufft_errors + cufft_error_num, 
-                    CufftErrorComparer(err_code));
-
-            bool found = cufft_error != cufft_errors + cufft_error_num;
-
-            std::stringstream ss;
-            ss << (found ? cufft_error->message : "Unknown error code");
-            ss << " [Code = " << err_code << "]";
-
-            return ss.str();
-        }
-
-        void cufftError(int err, const char *file, const int line, const char *func)
-        {
-            cv::error(cv::Exception(CV_GpuCufftCallError, getCufftErrorString(err), func, file, line));
-        }
-
        void error(const char *error_string, const char *file, const int line, const char *func)
        {          
            int code = CV_GpuApiCallError;

-            if (std::uncaught_exception())
+            if (uncaught_exception())
            {
                const char* errorStr = cvErrorStr(code);            
                const char* function = func ? func : "unknown function";    

-                std::cerr << "OpenCV Error: " << errorStr << "(" << error_string << ") in " << function << ", file " << file << ", line " << line;
-                std::cerr.flush();            
+                cerr << "OpenCV Error: " << errorStr << "(" << error_string << ") in " << function << ", file " << file << ", line " << line;
+                cerr.flush();            
            }
            else    
                cv::error( cv::Exception(code, error_string, func, file, line) );
        }
+
+        void nppError(int code, const char *file, const int line, const char *func)
+        {
+            string msg = getErrorString(code, npp_errors, npp_error_num);
+            cv::gpu::error(msg.c_str(), file, line, func);
+        }
+
+        void ncvError(int code, const char *file, const int line, const char *func)
+        {
+            string msg = getErrorString(code, ncv_errors, ncv_error_num);
+            cv::gpu::error(msg.c_str(), file, line, func);
+        }
+
+        void cufftError(int code, const char *file, const int line, const char *func)
+        {
+            string msg = getErrorString(code, cufft_errors, cufft_error_num);
+            cv::gpu::error(msg.c_str(), file, line, func);
+        }
    }
 }


--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
--- a/modules/gpu/src/match_template.cpp
+++ b/modules/gpu/src/match_template.cpp
--- a/modules/gpu/src/optical_flow.cpp
+++ b/modules/gpu/src/optical_flow.cpp
@@ -59,10 +59,8 @@ namespace
                      NCVMatrix<Ncv32f>& u, NCVMatrix<Ncv32f>& v, const cudaDeviceProp& devProp)
    {
        NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
-        CV_Assert(gpuCounter.isInitialized());

-        NCVStatus ncvStat = NCVBroxOpticalFlow(desc, gpuCounter, frame0, frame1, u, v, 0);
-        CV_Assert(ncvStat == NCV_SUCCESS);
+        ncvSafeCall( NCVBroxOpticalFlow(desc, gpuCounter, frame0, frame1, u, v, 0) );

        return gpuCounter.maxSize();
    }
@@ -130,10 +128,8 @@ void cv::gpu::BroxOpticalFlow::operator ()(const GpuMat& frame0, const GpuMat& f
    ensureSizeIsEnough(1, bufSize, CV_8UC1, buf);

    NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), buf.ptr());
-    CV_Assert(gpuAllocator.isInitialized());
    
-    NCVStatus ncvStat = NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, stream);
-    CV_Assert(ncvStat == NCV_SUCCESS);
+    ncvSafeCall( NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, stream) );
 }

 void cv::gpu::interpolateFrames(const GpuMat& frame0, const GpuMat& frame1, const GpuMat& fu, const GpuMat& fv, const GpuMat& bu, const GpuMat& bv, 
@@ -189,7 +185,7 @@ void cv::gpu::interpolateFrames(const GpuMat& frame0, const GpuMat& frame1, cons
    state.ppBuffers[4] = bui.ptr<Ncv32f>();
    state.ppBuffers[5] = bvi.ptr<Ncv32f>();

-    nppSafeCall( nppiStInterpolateFrames(&state) );
+    ncvSafeCall( nppiStInterpolateFrames(&state) );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );

--- a/modules/gpu/src/precomp.hpp
+++ b/modules/gpu/src/precomp.hpp
@@ -39,15 +39,16 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
+
 #ifndef __OPENCV_PRECOMP_H__
 #define __OPENCV_PRECOMP_H__

 #if _MSC_VER >= 1200
-#pragma warning( disable: 4251 4710 4711 4514 4996 )
+    #pragma warning( disable: 4251 4710 4711 4514 4996 )
 #endif

 #ifdef HAVE_CVCONFIG_H
-#include "cvconfig.h"
+    #include "cvconfig.h"
 #endif

 #include <iostream>
@@ -65,33 +66,43 @@
 #include "opencv2/calib3d/calib3d.hpp"
 #include "opencv2/core/internal.hpp"

-#if defined(HAVE_CUDA)
+#define OPENCV_GPU_UNUSED(x) (void)x
+
+#ifdef HAVE_CUDA

-    #include "internal_shared.hpp"
    #include "cuda_runtime_api.h"
-    #include "cufft.h"
+    #include "npp.h"
+    
+    #ifdef HAVE_CUFFT
+        #include "cufft.h"
+    #endif
+
+    #ifdef HAVE_CUBLAS
+        #include "cublas.h"
+    #endif
+
+    #include "internal_shared.hpp"
    #include "opencv2/gpu/stream_accessor.hpp"
-    #include "npp.h"    
    
    #include "nvidia/core/NCV.hpp"
    #include "nvidia/NPP_staging/NPP_staging.hpp"
    #include "nvidia/NCVHaarObjectDetection.hpp"
    #include "nvidia/NCVBroxOpticalFlow.hpp"

-#define CUDART_MINIMUM_REQUIRED_VERSION 4000
-#define NPP_MINIMUM_REQUIRED_VERSION 4000
+    #define CUDART_MINIMUM_REQUIRED_VERSION 4000
+    #define NPP_MINIMUM_REQUIRED_VERSION 4000

-#if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
-    #error "Insufficient Cuda Runtime library version, please update it."
-#endif
+    #if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
+        #error "Insufficient Cuda Runtime library version, please update it."
+    #endif

-#if (NPP_VERSION_MAJOR*1000+NPP_VERSION_MINOR*100+NPP_VERSION_BUILD < NPP_MINIMUM_REQUIRED_VERSION)
-    #error "Insufficient NPP version, please update it."
-#endif
+    #if (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD < NPP_MINIMUM_REQUIRED_VERSION)
+        #error "Insufficient NPP version, please update it."
+    #endif

-#if defined(CUDA_ARCH_BIN_OR_PTX_10)
-    #error "OpenCV GPU module doesn't support NVIDIA compute capability 1.0"
-#endif
+    #if defined(CUDA_ARCH_BIN_OR_PTX_10)
+        #error "OpenCV GPU module doesn't support NVIDIA compute capability 1.0"
+    #endif

    static inline void throw_nogpu() { CV_Error(CV_GpuNotSupported, "The called functionality is disabled for current build or platform"); }