Fixed build under CUDA 4.1

f8aba860 · Vladislav Vinogradov · 7ddb706b
Commit f8aba860 authored Jan 30, 2012 by Vladislav Vinogradov
4 changed files
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@@ -680,6 +680,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
    bool aligned = isAligned(src1.data, 16) && isAligned(src2.data, 16) && isAligned(dst.data, 16);
+#if CUDART_VERSION == 4000 
    if (aligned && src1.depth() == CV_8U && (src1.cols * src1.channels()) % 4 == 0)
    {
        NppStreamHandler h(stream);
@@ -692,42 +693,48 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
-    else if (aligned && src1.depth() == CV_8U)
+    else 
+#endif
    {
-        NppStreamHandler h(stream);
+        if (aligned && src1.depth() == CV_8U)
+        {
+            NppStreamHandler h(stream);
-        nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), 
+            nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), 
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
+                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
-        if (stream == 0)
+            if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
+                cudaSafeCall( cudaDeviceSynchronize() );
-    }
+        }
-    else if (aligned && src1.depth() == CV_32S)
+#if CUDART_VERSION == 4000 
-    {
+        else if (aligned && src1.depth() == CV_32S)
-        NppStreamHandler h(stream);
+        {
+            NppStreamHandler h(stream);
-        nppSafeCall( nppiAbsDiff_32s_C1R(src1.ptr<Npp32s>(), static_cast<int>(src1.step), src2.ptr<Npp32s>(), static_cast<int>(src2.step), 
+            nppSafeCall( nppiAbsDiff_32s_C1R(src1.ptr<Npp32s>(), static_cast<int>(src1.step), src2.ptr<Npp32s>(), static_cast<int>(src2.step), 
-            dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) );
+                dst.ptr<Npp32s>(), static_cast<int>(dst.step), sz) );
-        if (stream == 0)
+            if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
+                cudaSafeCall( cudaDeviceSynchronize() );
-    }
+        }
-    else if (aligned && src1.depth() == CV_32F)
+#endif
-    {
+        else if (aligned && src1.depth() == CV_32F)
-        NppStreamHandler h(stream);
+        {
+            NppStreamHandler h(stream);
-        nppSafeCall( nppiAbsDiff_32f_C1R(src1.ptr<Npp32f>(), static_cast<int>(src1.step), src2.ptr<Npp32f>(), static_cast<int>(src2.step), 
+            nppSafeCall( nppiAbsDiff_32f_C1R(src1.ptr<Npp32f>(), static_cast<int>(src1.step), src2.ptr<Npp32f>(), static_cast<int>(src2.step), 
-            dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
+                dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
-        if (stream == 0)
+            if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
+                cudaSafeCall( cudaDeviceSynchronize() );
-    }
+        }
-    else
+        else
-    {
+        {
-        const func_t func = funcs[src1.depth()];
+            const func_t func = funcs[src1.depth()];
-        CV_Assert(func != 0);
+            CV_Assert(func != 0);
-        func(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
+            func(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
+    }
    }
 }

--- a/modules/gpu/src/graphcuts.cpp
+++ b/modules/gpu/src/graphcuts.cpp
@@ -77,8 +77,18 @@ void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTrans
    NppStreamHandler h(stream);
+#if CUDART_VERSION > 4000 
+    NppiGraphcutState* pState;
+    nppSafeCall( nppiGraphcutInitAlloc(sznpp, &pState, buf.ptr<Npp8u>()) );
+    nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(),
+        static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), pState) );
+    nppSafeCall( nppiGraphcutFree(pState) );
+#else
    nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(),
        static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), buf.ptr<Npp8u>()) );
+#endif
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );

--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -935,6 +935,31 @@ void cv::gpu::columnSum(const GpuMat& src, GpuMat& dst)
 void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& s)
 {
+#if CUDART_VERSION > 4000 
+    CV_Assert(src.type() == CV_32SC1 && sqr.type() == CV_64FC1);
+    dst.create(src.size(), CV_32FC1);
+    NppiSize sz;
+    sz.width = src.cols;
+    sz.height = src.rows;
+    NppiRect nppRect;
+    nppRect.height = rect.height;
+    nppRect.width = rect.width;
+    nppRect.x = rect.x;
+    nppRect.y = rect.y;
+    cudaStream_t stream = StreamAccessor::getStream(s);
+    NppStreamHandler h(stream);
+    nppSafeCall( nppiRectStdDev_32s32f_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), sqr.ptr<Npp64f>(), static_cast<int>(sqr.step),
+                dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, nppRect) );
+    if (stream == 0)
+        cudaSafeCall( cudaDeviceSynchronize() );
+#else
    CV_Assert(src.type() == CV_32SC1 && sqr.type() == CV_32FC1);
    dst.create(src.size(), CV_32FC1);
@@ -958,6 +983,7 @@ void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, cons
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
+#endif
 }

--- a/modules/gpu/src/matrix_reductions.cpp
+++ b/modules/gpu/src/matrix_reductions.cpp
@@ -117,7 +117,15 @@ void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
    DeviceBuffer dbuf(2);
+#if CUDART_VERSION > 4000 
+    int bufSize;
+    nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) );
+    GpuMat buf(1, bufSize, CV_8UC1);
+    nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, buf.ptr<Npp8u>(), dbuf, (double*)dbuf + 1) );
+#else
    nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, dbuf, (double*)dbuf + 1) );
+#endif
    cudaSafeCall( cudaDeviceSynchronize() );