switched to new device layer in bitwize operations

b11cccaa · Vladislav Vinogradov · fdfffa52 · b11cccaa · b11cccaa · b11cccaa
Commit b11cccaa authored Aug 26, 2013 by Vladislav Vinogradov
4 changed files
--- a/modules/cudaarithm/src/cuda/bitwise_mat.cu
+++ b/modules/cudaarithm/src/cuda/bitwise_mat.cu
@@ -40,87 +40,124 @@
 //
 //M*/

-#if !defined CUDA_DISABLER
+#include "opencv2/opencv_modules.hpp"

-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/simd_functions.hpp"
+#ifndef HAVE_OPENCV_CUDEV

-#include "arithm_func_traits.hpp"
+#error "opencv_cudev is required"

-using namespace cv::cuda;
-using namespace cv::cuda::device;
+#else

-namespace cv { namespace cuda { namespace device
-{
-    template <typename T> struct TransformFunctorTraits< bit_not<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
-    {
-    };
+#include "opencv2/cudaarithm.hpp"
+#include "opencv2/cudev.hpp"

-    template <typename T> struct TransformFunctorTraits< bit_and<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
-    {
-    };
+using namespace cv::cudev;

-    template <typename T> struct TransformFunctorTraits< bit_or<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
-    {
-    };
+void bitMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op);

-    template <typename T> struct TransformFunctorTraits< bit_xor<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
-    {
-    };
-}}}
+//////////////////////////////////////////////////////////////////////////////
+/// bitwise_not

-namespace arithm
+namespace
 {
-    template <typename T> void bitMatNot(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+    template <typename T>
+    void bitMatNot(const GpuMat& src, GpuMat& dst, const GpuMat& mask, Stream& stream)
    {
+        GlobPtrSz<T> vsrc = globPtr((T*) src.data, src.step, src.rows, src.cols * src.channels());
+        GlobPtrSz<T> vdst = globPtr((T*) dst.data, dst.step, src.rows, src.cols * src.channels());
+
        if (mask.data)
-            device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), mask, stream);
+            gridTransformUnary(vsrc, vdst, bit_not<T>(), singleMaskChannels(globPtr<uchar>(mask), src.channels()), stream);
        else
-            device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), WithOutMask(), stream);
+            gridTransformUnary(vsrc, vdst, bit_not<T>(), stream);
    }
+}

-    template <typename T> void bitMatAnd(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+void cv::cuda::bitwise_not(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream)
+{
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();
+
+    const int depth = src.depth();
+
+    CV_DbgAssert( depth <= CV_32F );
+    CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
+
+    _dst.create(src.size(), src.type());
+    GpuMat dst = _dst.getGpuMat();
+
+    if (depth == CV_32F || depth == CV_32S)
    {
-        if (mask.data)
-            device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), mask, stream);
-        else
-            device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), WithOutMask(), stream);
+        bitMatNot<uint>(src, dst, mask, stream);
    }
-
-    template <typename T> void bitMatOr(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+    else if (depth == CV_16S || depth == CV_16U)
    {
-        if (mask.data)
-            device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), mask, stream);
-        else
-            device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), WithOutMask(), stream);
+        bitMatNot<ushort>(src, dst, mask, stream);
    }
+    else
+    {
+        bitMatNot<uchar>(src, dst, mask, stream);
+    }
+}

-    template <typename T> void bitMatXor(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+//////////////////////////////////////////////////////////////////////////////
+/// Binary bitwise logical operations
+
+namespace
+{
+    template <template <typename> class Op, typename T>
+    void bitMatOp(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)
    {
+        GlobPtrSz<T> vsrc1 = globPtr((T*) src1.data, src1.step, src1.rows, src1.cols * src1.channels());
+        GlobPtrSz<T> vsrc2 = globPtr((T*) src2.data, src2.step, src1.rows, src1.cols * src1.channels());
+        GlobPtrSz<T> vdst = globPtr((T*) dst.data, dst.step, src1.rows, src1.cols * src1.channels());
+
        if (mask.data)
-            device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), mask, stream);
+            gridTransformBinary(vsrc1, vsrc2, vdst, Op<T>(), singleMaskChannels(globPtr<uchar>(mask), src1.channels()), stream);
        else
-            device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), WithOutMask(), stream);
+            gridTransformBinary(vsrc1, vsrc2, vdst, Op<T>(), stream);
    }
+}

-    template void bitMatNot<uchar>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void bitMatNot<ushort>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void bitMatNot<uint>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+void bitMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op)
+{
+    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream);
+    static const func_t funcs32[] =
+    {
+        bitMatOp<bit_and, uint>,
+        bitMatOp<bit_or, uint>,
+        bitMatOp<bit_xor, uint>
+    };
+    static const func_t funcs16[] =
+    {
+        bitMatOp<bit_and, ushort>,
+        bitMatOp<bit_or, ushort>,
+        bitMatOp<bit_xor, ushort>
+    };
+    static const func_t funcs8[] =
+    {
+        bitMatOp<bit_and, uchar>,
+        bitMatOp<bit_or, uchar>,
+        bitMatOp<bit_xor, uchar>
+    };

-    template void bitMatAnd<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void bitMatAnd<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void bitMatAnd<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    const int depth = src1.depth();

-    template void bitMatOr<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void bitMatOr<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void bitMatOr<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    CV_DbgAssert( depth <= CV_32F );
+    CV_DbgAssert( op >= 0 && op < 3 );

-    template void bitMatXor<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void bitMatXor<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void bitMatXor<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    if (depth == CV_32F || depth == CV_32S)
+    {
+        funcs32[op](src1, src2, dst, mask, stream);
+    }
+    else if (depth == CV_16S || depth == CV_16U)
+    {
+        funcs16[op](src1, src2, dst, mask, stream);
+    }
+    else
+    {
+        funcs8[op](src1, src2, dst, mask, stream);
+    }
 }

-#endif // CUDA_DISABLER
+#endif
--- a/modules/cudaarithm/src/cuda/bitwise_scalar.cu
+++ b/modules/cudaarithm/src/cuda/bitwise_scalar.cu
@@ -40,65 +40,132 @@
 //
 //M*/

-#if !defined CUDA_DISABLER
+#include "opencv2/opencv_modules.hpp"

-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/simd_functions.hpp"
+#ifndef HAVE_OPENCV_CUDEV

-#include "arithm_func_traits.hpp"
+#error "opencv_cudev is required"

-using namespace cv::cuda;
-using namespace cv::cuda::device;
+#else

-namespace cv { namespace cuda { namespace device
+#include "opencv2/cudev.hpp"
+#include "opencv2/core/private.cuda.hpp"
+
+using namespace cv::cudev;
+
+void bitScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op);
+
+namespace
 {
-    template <typename T> struct TransformFunctorTraits< binder2nd< bit_and<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
+    template <template <typename> class Op, typename T>
+    void bitScalarOp(const GpuMat& src, uint value, GpuMat& dst, Stream& stream)
    {
-    };
+        gridTransformUnary(globPtr<T>(src), globPtr<T>(dst), bind2nd(Op<T>(), value), stream);
+    }
+
+    typedef void (*bit_scalar_func_t)(const GpuMat& src, uint value, GpuMat& dst, Stream& stream);

-    template <typename T> struct TransformFunctorTraits< binder2nd< bit_or<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
+    template <typename T, bit_scalar_func_t func> struct BitScalar
    {
+        static void call(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream)
+        {
+            func(src, cv::saturate_cast<T>(value[0]), dst, stream);
+        }
    };

-    template <typename T> struct TransformFunctorTraits< binder2nd< bit_xor<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
+    template <bit_scalar_func_t func> struct BitScalar4
    {
+        static void call(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream)
+        {
+            uint packedVal = 0;
+
+            packedVal |= cv::saturate_cast<uchar>(value[0]);
+            packedVal |= cv::saturate_cast<uchar>(value[1]) << 8;
+            packedVal |= cv::saturate_cast<uchar>(value[2]) << 16;
+            packedVal |= cv::saturate_cast<uchar>(value[3]) << 24;
+
+            func(src, packedVal, dst, stream);
+        }
    };
-}}}

-namespace arithm
-{
-    template <typename T> void bitScalarAnd(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
+    template <int DEPTH, int cn> struct NppBitwiseCFunc
    {
-        device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_and<T>(), src2), WithOutMask(), stream);
-    }
+        typedef typename NPPTypeTraits<DEPTH>::npp_type npp_type;
+
+        typedef NppStatus (*func_t)(const npp_type* pSrc1, int nSrc1Step, const npp_type* pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI);
+    };

-    template <typename T> void bitScalarOr(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
+    template <int DEPTH, int cn, typename NppBitwiseCFunc<DEPTH, cn>::func_t func> struct NppBitwiseC
    {
-        device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_or<T>(), src2), WithOutMask(), stream);
-    }
+        typedef typename NppBitwiseCFunc<DEPTH, cn>::npp_type npp_type;
+
+        static void call(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& _stream)
+        {
+            cudaStream_t stream = StreamAccessor::getStream(_stream);
+            NppStreamHandler h(stream);
+
+            NppiSize oSizeROI;
+            oSizeROI.width = src.cols;
+            oSizeROI.height = src.rows;
+
+            const npp_type pConstants[] =
+            {
+                cv::saturate_cast<npp_type>(value[0]),
+                cv::saturate_cast<npp_type>(value[1]),
+                cv::saturate_cast<npp_type>(value[2]),
+                cv::saturate_cast<npp_type>(value[3])
+            };
+
+            nppSafeCall( func(src.ptr<npp_type>(), static_cast<int>(src.step), pConstants, dst.ptr<npp_type>(), static_cast<int>(dst.step), oSizeROI) );
+
+            if (stream == 0)
+                CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
+        }
+    };
+}
+
+void bitScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op)
+{
+    (void) mask;

-    template <typename T> void bitScalarXor(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
+    typedef void (*func_t)(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream);
+    static const func_t funcs[3][6][4] =
    {
-        device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_xor<T>(), src2), WithOutMask(), stream);
-    }
+        {
+            {BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call  , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call},
+            {BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call  , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call},
+            {BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
+            {BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
+            {BitScalar<uint, bitScalarOp<bit_and, uint> >::call    , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call},
+            {BitScalar<uint, bitScalarOp<bit_and, uint> >::call    , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call}
+        },
+        {
+            {BitScalar<uchar, bitScalarOp<bit_or, uchar> >::call  , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_or, uint> >::call},
+            {BitScalar<uchar, bitScalarOp<bit_or, uchar> >::call  , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_or, uint> >::call},
+            {BitScalar<ushort, bitScalarOp<bit_or, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
+            {BitScalar<ushort, bitScalarOp<bit_or, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
+            {BitScalar<uint, bitScalarOp<bit_or, uint> >::call    , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call},
+            {BitScalar<uint, bitScalarOp<bit_or, uint> >::call    , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call}
+        },
+        {
+            {BitScalar<uchar, bitScalarOp<bit_xor, uchar> >::call  , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_xor, uint> >::call},
+            {BitScalar<uchar, bitScalarOp<bit_xor, uchar> >::call  , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_xor, uint> >::call},
+            {BitScalar<ushort, bitScalarOp<bit_xor, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
+            {BitScalar<ushort, bitScalarOp<bit_xor, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
+            {BitScalar<uint, bitScalarOp<bit_xor, uint> >::call    , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call},
+            {BitScalar<uint, bitScalarOp<bit_xor, uint> >::call    , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call}
+        }
+    };

-    template void bitScalarAnd<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarAnd<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarAnd<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarAnd<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+    const int depth = src.depth();
+    const int cn = src.channels();

-    template void bitScalarOr<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarOr<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarOr<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarOr<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+    CV_DbgAssert( depth <= CV_32F );
+    CV_DbgAssert( cn == 1 || cn == 3 || cn == 4 );
+    CV_DbgAssert( mask.empty() );
+    CV_DbgAssert( op >= 0 && op < 3 );

-    template void bitScalarXor<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarXor<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarXor<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarXor<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+    funcs[op][depth][cn - 1](src, value, dst, stream);
 }

-#endif // CUDA_DISABLER
+#endif
--- a/modules/cudaarithm/src/element_operations.cpp
+++ b/modules/cudaarithm/src/element_operations.cpp
--- a/modules/cudev/include/opencv2/cudev/ptr2d/mask.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/mask.hpp
@@ -62,6 +62,42 @@ struct WithOutMask
    }
 };

+template <class MaskPtr> struct SingleMaskChannels
+{
+    typedef typename PtrTraits<MaskPtr>::value_type value_type;
+    typedef typename PtrTraits<MaskPtr>::index_type index_type;
+
+    MaskPtr mask;
+    int channels;
+
+    __device__ __forceinline__ value_type operator()(index_type y, index_type x) const
+    {
+        return mask(y, x / channels);
+    }
+
+};
+
+template <class MaskPtr> struct SingleMaskChannelsSz : SingleMaskChannels<MaskPtr>
+{
+    int rows, cols;
+};
+
+template <class MaskPtr>
+__host__ SingleMaskChannelsSz<typename PtrTraits<MaskPtr>::ptr_type>
+singleMaskChannels(const MaskPtr& mask, int channels)
+{
+    SingleMaskChannelsSz<typename PtrTraits<MaskPtr>::ptr_type> ptr;
+    ptr.mask = shrinkPtr(mask);
+    ptr.channels = channels;
+    ptr.rows = getRows(mask);
+    ptr.cols = getCols(mask) * channels;
+    return ptr;
+}
+
+template <class MaskPtr> struct PtrTraits< SingleMaskChannelsSz<MaskPtr> > : PtrTraitsBase<SingleMaskChannelsSz<MaskPtr>, SingleMaskChannels<MaskPtr> >
+{
+};
+
 }}

 #endif