Added TransformFunctorTraits and optimized some functions that use transform

5e9ae6b1 · Vladislav Vinogradov · 6ce2277c · 5e9ae6b1 · 5e9ae6b1 · 5e9ae6b1
Commit 5e9ae6b1 authored Aug 17, 2011 by Vladislav Vinogradov
11 changed files
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ -47,37 +47,33 @@
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "internal_shared.hpp"
-using namespace cv::gpu;
+namespace cv { namespace gpu { namespace device
-using namespace cv::gpu::device;
-namespace cv { namespace gpu { namespace mathfunc
 {
    //////////////////////////////////////////////////////////////////////////////////////
    // Compare
-    template <typename T1, typename T2> struct NotEqual : binary_function<T1, T2, uchar>
+    template <typename T> struct NotEqual : binary_function<T, T, uchar>
    {
-        __device__ __forceinline__ uchar operator()(const T1& src1, const T2& src2) const
+        __device__ __forceinline__ uchar operator()(T src1, T src2) const
        {
            return static_cast<uchar>(static_cast<int>(src1 != src2) * 255);
        }
    };
-    template <typename T1, typename T2>
+    template <typename T>
    inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
    {
-        NotEqual<T1, T2> op;
+        NotEqual<T> op;
-        transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, stream);
+        transform(static_cast< DevMem2D_<T> >(src1), static_cast< DevMem2D_<T> >(src2), dst, op, stream);
    }
    void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
    {
-        compare_ne<uint, uint>(src1, src2, dst, stream);
+        compare_ne<uint>(src1, src2, dst, stream);
    }
    void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
    {
-        compare_ne<float, float>(src1, src2, dst, stream);
+        compare_ne<float>(src1, src2, dst, stream);
    }
@@ -354,6 +350,35 @@ namespace cv { namespace gpu { namespace mathfunc
    //////////////////////////////////////////////////////////////////////////
    // min/max
+    namespace detail
+    {
+        template <size_t size, typename F> struct MinMaxTraits : DefaultTransformFunctorTraits<F>
+        {
+        };
+        template <typename F> struct MinMaxTraits<2, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_shift = 4 };
+        };
+        template <typename F> struct MinMaxTraits<4, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_block_dim_y = 4 };
+            enum { smart_shift = 4 };
+        };
+    }
+    template <typename T> struct TransformFunctorTraits< minimum<T> > : detail::MinMaxTraits< sizeof(T), minimum<T> >
+    {
+    };
+    template <typename T> struct TransformFunctorTraits< maximum<T> > : detail::MinMaxTraits< sizeof(T), maximum<T> >
+    {
+    };
+    template <typename T> struct TransformFunctorTraits< binder2nd< minimum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< minimum<T> > >
+    {
+    };
+    template <typename T> struct TransformFunctorTraits< binder2nd< maximum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< maximum<T> > >
+    {
+    };
    template <typename T>
    void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
@@ -413,7 +438,39 @@ namespace cv { namespace gpu { namespace mathfunc
    //////////////////////////////////////////////////////////////////////////
-    // threshold  
+    // threshold
+    namespace detail
+    {
+        template <size_t size, typename F> struct ThresholdTraits : DefaultTransformFunctorTraits<F>
+        {
+        };
+        template <typename F> struct ThresholdTraits<2, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_shift = 4 };
+        };
+        template <typename F> struct ThresholdTraits<4, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_block_dim_y = 4 };
+            enum { smart_shift = 4 };
+        };
+    }
+    template <typename T> struct TransformFunctorTraits< thresh_binary_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_binary_func<T> >
+    {
+    };
+    template <typename T> struct TransformFunctorTraits< thresh_binary_inv_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_binary_inv_func<T> >
+    {
+    };
+    template <typename T> struct TransformFunctorTraits< thresh_trunc_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_trunc_func<T> >
+    {
+    };
+    template <typename T> struct TransformFunctorTraits< thresh_to_zero_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_to_zero_func<T> >
+    {
+    };
+    template <typename T> struct TransformFunctorTraits< thresh_to_zero_inv_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_to_zero_inv_func<T> >
+    {
+    };
    template <template <typename> class Op, typename T>
    void threshold_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, T thresh, T maxVal, 
@@ -454,8 +511,13 @@ namespace cv { namespace gpu { namespace mathfunc
    //////////////////////////////////////////////////////////////////////////
    // subtract
-    template <typename T>
+    template <> struct TransformFunctorTraits< minus<short> > : DefaultTransformFunctorTraits< minus<short> >
-    void subtractCaller(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <typename T> void subtractCaller(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream)
    {
        transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<T>)dst, minus<T>(), stream);
    }
@@ -499,10 +561,35 @@ namespace cv { namespace gpu { namespace mathfunc
        __device__ __forceinline__ float operator()(const float& e) const
        {
-            return __powf(fabs(e), power);
+            return __powf(::fabs(e), power);
        }
    };
+    namespace detail
+    {
+        template <size_t size, typename T> struct PowOpTraits : DefaultTransformFunctorTraits< PowOp<T> >
+        {
+        };
+        template <typename T> struct PowOpTraits<1, T> : DefaultTransformFunctorTraits< PowOp<T> >
+        {
+            enum { smart_block_dim_y = 8 };
+            enum { smart_shift = 8 };
+        };
+        template <typename T> struct PowOpTraits<2, T> : DefaultTransformFunctorTraits< PowOp<T> >
+        {
+            enum { smart_shift = 4 };
+        };
+        template <typename T> struct PowOpTraits<4, T> : DefaultTransformFunctorTraits< PowOp<T> >
+        {
+            enum { smart_block_dim_y = 4 };
+            enum { smart_shift = 4 };
+        };
+    }
+    template <typename T> struct TransformFunctorTraits< PowOp<T> > : detail::PowOpTraits<sizeof(T), T>
+    {
+    };
    template<typename T>
    void pow_caller(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream)
    {
@@ -514,6 +601,5 @@ namespace cv { namespace gpu { namespace mathfunc
    template void pow_caller<short>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
    template void pow_caller<ushort>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
    template void pow_caller<int>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
-    template void pow_caller<uint>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
    template void pow_caller<float>(const DevMem2D& src, float power, DevMem2D dst, cudaStream_t stream);
 }}}
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -40,14 +40,9 @@
 //
 //M*/
-#include "opencv2/gpu/device/limits.hpp"
-#include "opencv2/gpu/device/saturate_cast.hpp"
-#include "opencv2/gpu/device/vec_math.hpp"
-#include "opencv2/gpu/device/transform.hpp"
 #include "internal_shared.hpp"
 using namespace cv::gpu;
-using namespace cv::gpu::device;
 #ifndef CV_PI
 #define CV_PI   3.1415926535897932384626433832795f

--- a/modules/gpu/src/cuda/matrix_operations.cu
+++ b/modules/gpu/src/cuda/matrix_operations.cu
@@ -45,9 +45,7 @@
 #include "opencv2/gpu/device/transform.hpp"
 #include "opencv2/gpu/device/functional.hpp"
-using namespace cv::gpu::device;
+namespace cv { namespace gpu { namespace device {
-namespace cv { namespace gpu { namespace matrix_operations {
    template <typename T> struct shift_and_sizeof;
    template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; };
@@ -249,7 +247,55 @@ namespace cv { namespace gpu { namespace matrix_operations {
        const double alpha, beta;
    };
+    namespace detail
+    {
+        template <size_t src_size, size_t dst_size, typename F> struct ConvertTraitsDispatcher : DefaultTransformFunctorTraits<F>
+        {
+        };
+        template <typename F> struct ConvertTraitsDispatcher<1, 1, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_shift = 8 };
+        };
+        template <typename F> struct ConvertTraitsDispatcher<1, 2, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_shift = 4 };
+        };
+        template <typename F> struct ConvertTraitsDispatcher<1, 4, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_block_dim_y = 8 };
+            enum { smart_shift = 4 };
+        };
+        template <typename F> struct ConvertTraitsDispatcher<2, 2, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_shift = 4 };
+        };
+        template <typename F> struct ConvertTraitsDispatcher<2, 4, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_shift = 2 };
+        };
+        template <typename F> struct ConvertTraitsDispatcher<4, 2, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_block_dim_y = 8 };
+            enum { smart_shift = 4 };
+        };
+        template <typename F> struct ConvertTraitsDispatcher<4, 4, F> : DefaultTransformFunctorTraits<F>
+        {
+            enum { smart_block_dim_y = 8 };
+            enum { smart_shift = 2 };
+        };
+        template <typename F> struct ConvertTraits : ConvertTraitsDispatcher<sizeof(typename F::argument_type), sizeof(typename F::result_type), F>
+        {
+        };
+    }
+    template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
+    {
+    };
    template<typename T, typename D>
    void cvt_(const DevMem2D& src, const DevMem2D& dst, double alpha, double beta, cudaStream_t stream)
    {

--- a/modules/gpu/src/cudastream.cpp
+++ b/modules/gpu/src/cudastream.cpp
@@ -71,23 +71,16 @@ cv::gpu::Stream::operator bool() const { throw_nogpu(); return false; }
 #include "opencv2/gpu/stream_accessor.hpp"
-namespace cv 
+namespace cv { namespace gpu { namespace device {            
-{
+    void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
-    namespace gpu
-    {
-        namespace matrix_operations
-        {            
-            void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
-            template <typename T>
+    template <typename T>
-            void set_to_gpu(const DevMem2D& mat, const T* scalar, int channels, cudaStream_t stream);
+    void set_to_gpu(const DevMem2D& mat, const T* scalar, int channels, cudaStream_t stream);
-            template <typename T>
+    template <typename T>
-            void set_to_gpu(const DevMem2D& mat, const T* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
+    void set_to_gpu(const DevMem2D& mat, const T* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
-            void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
+    void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
-        }
+}}}
-    }
-}
 struct Stream::Impl
 {
@@ -108,14 +101,14 @@ namespace
    void kernelSet(GpuMat& src, const Scalar& s, cudaStream_t stream)
    {
        Scalar_<T> sf = s;
-        matrix_operations::set_to_gpu(src, sf.val, src.channels(), stream);
+        device::set_to_gpu(src, sf.val, src.channels(), stream);
    }
    template <typename T>
    void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream)
    {
        Scalar_<T> sf = s;
-        matrix_operations::set_to_gpu(src, sf.val, mask, src.channels(), stream);
+        device::set_to_gpu(src, sf.val, mask, src.channels(), stream);
    }
 }
@@ -262,7 +255,7 @@ void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype,
        psrc = &(temp = src);
    dst.create( src.size(), rtype );
-    matrix_operations::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta, impl->stream);
+    device::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta, impl->stream);
 }
 cv::gpu::Stream::operator bool() const

--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
--- a/modules/gpu/src/gpumat.cpp
+++ b/modules/gpu/src/gpumat.cpp
@@ -393,7 +393,7 @@ void cv::gpu::ensureSizeIsEnough(int, int, int, GpuMat&) { throw_nogpu(); }
 #else /* !defined (HAVE_CUDA) */
-namespace cv { namespace gpu { namespace matrix_operations
+namespace cv { namespace gpu { namespace device
 {
    void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
@@ -449,7 +449,7 @@ void cv::gpu::GpuMat::copyTo(GpuMat& mat, const GpuMat& mask) const
    else
    {
        mat.create(size(), type());
-        cv::gpu::matrix_operations::copy_to_with_mask(*this, mat, depth(), mask, channels());
+        device::copy_to_with_mask(*this, mat, depth(), mask, channels());
    }
 }
@@ -508,7 +508,7 @@ namespace
    void convertToKernelCaller(const GpuMat& src, GpuMat& dst)
    {
-        matrix_operations::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0);
+        device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0);
    }
 }
@@ -540,7 +540,7 @@ void cv::gpu::GpuMat::convertTo( GpuMat& dst, int rtype, double alpha, double be
    dst.create( size(), rtype );
    if (!noScale)
-        matrix_operations::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta);
+        device::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta);
    else
    {
        typedef void (*convert_caller_t)(const GpuMat& src, GpuMat& dst);
@@ -681,7 +681,7 @@ namespace
    void kernelSet(GpuMat& src, const Scalar& s)
    {
        Scalar_<T> sf = s;
-        matrix_operations::set_to_gpu(src, sf.val, src.channels(), 0);
+        device::set_to_gpu(src, sf.val, src.channels(), 0);
    }
    template<int SDEPTH, int SCN> struct NppSetMaskFunc
@@ -732,7 +732,7 @@ namespace
    void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask)
    {
        Scalar_<T> sf = s;
-        matrix_operations::set_to_gpu(src, sf.val, mask, src.channels(), 0);
+        device::set_to_gpu(src, sf.val, mask, src.channels(), 0);
    }
 }

--- a/modules/gpu/src/opencv2/gpu/device/detail/transform.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/detail/transform.hpp
--- a/modules/gpu/src/opencv2/gpu/device/functional.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/functional.hpp
@@ -46,18 +46,25 @@
 #include <thrust/functional.h>
 #include "internal_shared.hpp"
 #include "saturate_cast.hpp"
+#include "vec_traits.hpp"
 namespace cv { namespace gpu { namespace device
 {
+    // Function Objects
    using thrust::unary_function;
    using thrust::binary_function;
+    // Arithmetic Operations
    using thrust::plus;
    using thrust::minus;
    using thrust::multiplies;
    using thrust::divides;
    using thrust::modulus;
    using thrust::negate;
+    // Comparison Operations
    using thrust::equal_to;
    using thrust::not_equal_to;
@@ -65,11 +72,15 @@ namespace cv { namespace gpu { namespace device
    using thrust::less;
    using thrust::greater_equal;
    using thrust::less_equal;
+    // Logical Operations
    using thrust::logical_and;
    using thrust::logical_or;
    using thrust::logical_not;
+    // Bitwise Operations
    using thrust::bit_and;
    using thrust::bit_or;
    using thrust::bit_xor;
@@ -78,7 +89,13 @@ namespace cv { namespace gpu { namespace device
        __forceinline__ __device__ T operator ()(const T& v) const {return ~v;}
    };
-    using thrust::identity;
+    // Generalized Identity Operations
+    using thrust::identity;    
+    using thrust::project1st;
+    using thrust::project2nd;
+    // Min/Max Operations
 #define OPENCV_GPU_IMPLEMENT_MINMAX(name, type, op) \
    template <> struct name<type> : binary_function<type, type, type> \
@@ -115,15 +132,8 @@ namespace cv { namespace gpu { namespace device
    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, double, fmin)
 #undef OPENCV_GPU_IMPLEMENT_MINMAX
-    using thrust::project1st;
-    using thrust::project2nd;
-    using thrust::unary_negate;
-    using thrust::not1;
-    using thrust::binary_negate;
+    // Math functions
-    using thrust::not2;
 #define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(func) \
    template <typename T> struct func ## _func : unary_function<T, float> \
@@ -192,6 +202,8 @@ namespace cv { namespace gpu { namespace device
        }
    };
+    // Saturate Cast Functor
    template <typename T, typename D> struct saturate_cast_func : unary_function<T, D>
    {
        __forceinline__ __device__ D operator ()(const T& v) const
@@ -200,6 +212,8 @@ namespace cv { namespace gpu { namespace device
        }
    };
+    // Threshold Functors
    template <typename T> struct thresh_binary_func : unary_function<T, T>
    {
        __forceinline__ __host__ __device__ thresh_binary_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
@@ -256,7 +270,15 @@ namespace cv { namespace gpu { namespace device
        }
        const T thresh;
-    };
+    };    
+    // Function Object Adaptors
+    using thrust::unary_negate;
+    using thrust::not1;
+    using thrust::binary_negate;
+    using thrust::not2;
    template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type> 
    {
@@ -291,46 +313,77 @@ namespace cv { namespace gpu { namespace device
        return binder2nd<Op>(op, typename Op::second_argument_type(x));
    }
-    template <typename T1, typename T2> struct BinOpTraits
+    // Functor Traits
-    {
-        typedef int argument_type;
+    template <typename F> struct IsUnaryFunction
-    };
-    template <typename T> struct BinOpTraits<T, T>
-    {
-        typedef T argument_type;
-    };
-    template <typename T> struct BinOpTraits<T, double>
-    {
-        typedef double argument_type;
-    };
-    template <typename T> struct BinOpTraits<double, T>
-    {
-        typedef double argument_type;
-    };
-    template <> struct BinOpTraits<double, double>
-    {
-        typedef double argument_type;
-    };
-    template <typename T> struct BinOpTraits<T, float>
    {
-        typedef float argument_type;
+        struct Yes {};
+        struct No {Yes a[2];};
+        template <typename T, typename D> static Yes check(unary_function<T, D>*);
+        static No check(...);
+        enum { value = (sizeof(check((F*)0)) == sizeof(Yes)) };
    };
-    template <typename T> struct BinOpTraits<float, T>
+    template <typename F> struct IsBinaryFunction
    {
-        typedef float argument_type;
+        struct Yes {};
+        struct No {Yes a[2];};
+        template <typename T1, typename T2, typename D> static Yes check(binary_function<T1, T2, D>*);
+        static No check(...);
+        enum { value = (sizeof(check((F*)0)) == sizeof(Yes)) };
    };
-    template <> struct BinOpTraits<float, float>
+    namespace detail
    {
-        typedef float argument_type;
+        template <size_t src_elem_size, size_t dst_elem_size> struct UnOpShift { enum { shift = 1 }; };
-    };
+        template <size_t src_elem_size> struct UnOpShift<src_elem_size, 1> { enum { shift = 4 }; };
-    template <> struct BinOpTraits<double, float>
+        template <size_t src_elem_size> struct UnOpShift<src_elem_size, 2> { enum { shift = 2 }; };
+        template <typename T, typename D> struct DefaultUnaryShift
+        {
+            enum { shift = detail::UnOpShift<sizeof(T), sizeof(D)>::shift };
+        };
+        template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size> struct BinOpShift { enum { shift = 1 }; };
+        template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 1> { enum { shift = 4 }; };
+        template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 2> { enum { shift = 2 }; };
+        template <typename T1, typename T2, typename D> struct DefaultBinaryShift
+        {
+            enum { shift = detail::BinOpShift<sizeof(T1), sizeof(T2), sizeof(D)>::shift };
+        };
+        template <typename Func, bool unary = IsUnaryFunction<Func>::value> struct ShiftDispatcher;
+        template <typename Func> struct ShiftDispatcher<Func, true>
+        {
+            enum { shift = DefaultUnaryShift<typename Func::argument_type, typename Func::result_type>::shift };
+        };
+        template <typename Func> struct ShiftDispatcher<Func, false>
+        {
+            enum { shift = DefaultBinaryShift<typename Func::first_argument_type, typename Func::second_argument_type, typename Func::result_type>::shift };
+        };
+    }
+    template <typename Func> struct DefaultTransformShift
    {
-        typedef double argument_type;
+        enum { shift = detail::ShiftDispatcher<Func>::shift };
    };
-    template <> struct BinOpTraits<float, double>
+    template <typename Func> struct DefaultTransformFunctorTraits
    {
-        typedef double argument_type;
+        enum { simple_block_dim_x = 16 };
+        enum { simple_block_dim_y = 16 };
+        enum { smart_block_dim_x = 16 };
+        enum { smart_block_dim_y = 16 };
+        enum { smart_shift = DefaultTransformShift<Func>::shift };
    };
+    template <typename Func> struct TransformFunctorTraits : DefaultTransformFunctorTraits<Func> {};
 }}}
 #endif // __OPENCV_GPU_FUNCTIONAL_HPP__
--- a/modules/gpu/src/opencv2/gpu/device/vec_math.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/vec_math.hpp
@@ -150,6 +150,50 @@ namespace cv {  namespace gpu { namespace device
        return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x), f(a.y), f(a.z), f(a.w)); \
    }
+    namespace detail
+    {    
+        template <typename T1, typename T2> struct BinOpTraits
+        {
+            typedef int argument_type;
+        };
+        template <typename T> struct BinOpTraits<T, T>
+        {
+            typedef T argument_type;
+        };
+        template <typename T> struct BinOpTraits<T, double>
+        {
+            typedef double argument_type;
+        };
+        template <typename T> struct BinOpTraits<double, T>
+        {
+            typedef double argument_type;
+        };
+        template <> struct BinOpTraits<double, double>
+        {
+            typedef double argument_type;
+        };
+        template <typename T> struct BinOpTraits<T, float>
+        {
+            typedef float argument_type;
+        };
+        template <typename T> struct BinOpTraits<float, T>
+        {
+            typedef float argument_type;
+        };
+        template <> struct BinOpTraits<float, float>
+        {
+            typedef float argument_type;
+        };
+        template <> struct BinOpTraits<double, float>
+        {
+            typedef double argument_type;
+        };
+        template <> struct BinOpTraits<float, double>
+        {
+            typedef double argument_type;
+        };
+    }
 #define OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, op, func) \
    static __device__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a, const type ## 1 & b) \
    { \
@@ -157,16 +201,16 @@ namespace cv {  namespace gpu { namespace device
        return VecTraits<TypeVec<func<type>::result_type, 1>::vec_type>::make(f(a.x, b.x)); \
    } \
    template <typename T> \
-    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \
+    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \
    { \
-        func<typename BinOpTraits<type, T>::argument_type> f; \
+        func<typename detail::BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(v.x, s)); \
+        return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(v.x, s)); \
    } \
    template <typename T> \
-    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \
+    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \
    { \
-        func<typename BinOpTraits<type, T>::argument_type> f; \
+        func<typename detail::BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(s, v.x)); \
+        return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(s, v.x)); \
    } \
    static __device__ TypeVec<func<type>::result_type, 2>::vec_type op(const type ## 2 & a, const type ## 2 & b) \
    { \
@@ -174,16 +218,16 @@ namespace cv {  namespace gpu { namespace device
        return VecTraits<TypeVec<func<type>::result_type, 2>::vec_type>::make(f(a.x, b.x), f(a.y, b.y)); \
    } \
    template <typename T> \
-    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(const type ## 2 & v, T s) \
+    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(const type ## 2 & v, T s) \
    { \
-        func<typename BinOpTraits<type, T>::argument_type> f; \
+        func<typename detail::BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(v.x, s), f(v.y, s)); \
+        return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(v.x, s), f(v.y, s)); \
    } \
    template <typename T> \
-    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(T s, const type ## 2 & v) \
+    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(T s, const type ## 2 & v) \
    { \
-        func<typename BinOpTraits<type, T>::argument_type> f; \
+        func<typename detail::BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(s, v.x), f(s, v.y)); \
+        return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(s, v.x), f(s, v.y)); \
    } \
    static __device__ TypeVec<func<type>::result_type, 3>::vec_type op(const type ## 3 & a, const type ## 3 & b) \
    { \
@@ -191,16 +235,16 @@ namespace cv {  namespace gpu { namespace device
        return VecTraits<TypeVec<func<type>::result_type, 3>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z)); \
    } \
    template <typename T> \
-    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(const type ## 3 & v, T s) \
+    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(const type ## 3 & v, T s) \
    { \
-        func<typename BinOpTraits<type, T>::argument_type> f; \
+        func<typename detail::BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s)); \
+        return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s)); \
    } \
    template <typename T> \
-    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(T s, const type ## 3 & v) \
+    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(T s, const type ## 3 & v) \
    { \
-        func<typename BinOpTraits<type, T>::argument_type> f; \
+        func<typename detail::BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z)); \
+        return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z)); \
    } \
    static __device__ TypeVec<func<type>::result_type, 4>::vec_type op(const type ## 4 & a, const type ## 4 & b) \
    { \
@@ -208,16 +252,16 @@ namespace cv {  namespace gpu { namespace device
        return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z), f(a.w, b.w)); \
    } \
    template <typename T> \
-    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(const type ## 4 & v, T s) \
+    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(const type ## 4 & v, T s) \
    { \
-        func<typename BinOpTraits<type, T>::argument_type> f; \
+        func<typename detail::BinOpTraits<type, T>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s), f(v.w, s)); \
+        return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s), f(v.w, s)); \
    } \
    template <typename T> \
-    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(T s, const type ## 4 & v) \
+    static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(T s, const type ## 4 & v) \
    { \
-        func<typename BinOpTraits<T, type>::argument_type> f; \
+        func<typename detail::BinOpTraits<T, type>::argument_type> f; \
-        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z), f(s, v.w)); \
+        return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z), f(s, v.w)); \
    }
 #define OPENCV_GPU_IMPLEMENT_VEC_OP(type) \

--- a/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp
@@ -49,6 +49,79 @@ namespace cv { namespace gpu { namespace device
 {
    template<typename T, int N> struct TypeVec;
+    struct __align__(8) uchar8
+    {
+        uchar a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ uchar8 make_uchar8(uchar a0, uchar a1, uchar a2, uchar a3, uchar a4, uchar a5, uchar a6, uchar a7)
+    {
+        uchar8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct __align__(8) char8
+    {
+        schar a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ char8 make_char8(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7)
+    {
+        char8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct __align__(16) ushort8
+    {
+        ushort a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ ushort8 make_ushort8(ushort a0, ushort a1, ushort a2, ushort a3, ushort a4, ushort a5, ushort a6, ushort a7)
+    {
+        ushort8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct __align__(16) short8
+    {
+        short a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ short8 make_short8(short a0, short a1, short a2, short a3, short a4, short a5, short a6, short a7)
+    {
+        short8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct __align__(32) uint8
+    {
+        uint a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ uint8 make_uint8(uint a0, uint a1, uint a2, uint a3, uint a4, uint a5, uint a6, uint a7)
+    {
+        uint8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct __align__(32) int8
+    {
+        int a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ int8 make_int8(int a0, int a1, int a2, int a3, int a4, int a5, int a6, int a7)
+    {
+        int8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct __align__(32) float8
+    {
+        float a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ float8 make_float8(float a0, float a1, float a2, float a3, float a4, float a5, float a6, float a7)
+    {
+        float8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct double8
+    {
+        double a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ double8 make_double8(double a0, double a1, double a2, double a3, double a4, double a5, double a6, double a7)
+    {
+        double8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
 #define OPENCV_GPU_IMPLEMENT_TYPE_VEC(type) \
    template<> struct TypeVec<type, 1> { typedef type vec_type; }; \
    template<> struct TypeVec<type ## 1, 1> { typedef type ## 1 vec_type; }; \
@@ -57,7 +130,9 @@ namespace cv { namespace gpu { namespace device
    template<> struct TypeVec<type, 3> { typedef type ## 3 vec_type; }; \
    template<> struct TypeVec<type ## 3, 3> { typedef type ## 3 vec_type; }; \
    template<> struct TypeVec<type, 4> { typedef type ## 4 vec_type; }; \
-    template<> struct TypeVec<type ## 4, 4> { typedef type ## 4 vec_type; };
+    template<> struct TypeVec<type ## 4, 4> { typedef type ## 4 vec_type; }; \
+    template<> struct TypeVec<type, 8> { typedef type ## 8 vec_type; }; \
+    template<> struct TypeVec<type ## 8, 8> { typedef type ## 8 vec_type; };
    OPENCV_GPU_IMPLEMENT_TYPE_VEC(uchar)
    OPENCV_GPU_IMPLEMENT_TYPE_VEC(char)
@@ -74,11 +149,13 @@ namespace cv { namespace gpu { namespace device
    template<> struct TypeVec<schar, 2> { typedef char2 vec_type; };
    template<> struct TypeVec<schar, 3> { typedef char3 vec_type; };
    template<> struct TypeVec<schar, 4> { typedef char4 vec_type; };
+    template<> struct TypeVec<schar, 8> { typedef char8 vec_type; };
    template<> struct TypeVec<bool, 1> { typedef uchar vec_type; };
    template<> struct TypeVec<bool, 2> { typedef uchar2 vec_type; };
    template<> struct TypeVec<bool, 3> { typedef uchar3 vec_type; };
    template<> struct TypeVec<bool, 4> { typedef uchar4 vec_type; };
+    template<> struct TypeVec<bool, 8> { typedef uchar8 vec_type; };
    template<typename T> struct VecTraits;
@@ -87,36 +164,43 @@ namespace cv { namespace gpu { namespace device
    { \
        typedef type elem_type; \
        enum {cn=1}; \
-        static __device__ __host__ type all(type v) {return v;} \
+        static __device__ __host__ __forceinline__ type all(type v) {return v;} \
-        static __device__ __host__ type make(type x) {return x;} \
+        static __device__ __host__ __forceinline__ type make(type x) {return x;} \
    }; \
    template<> struct VecTraits<type ## 1> \
    { \
        typedef type elem_type; \
        enum {cn=1}; \
-        static __device__ __host__ type ## 1 all(type v) {return make_ ## type ## 1(v);} \
+        static __device__ __host__ __forceinline__ type ## 1 all(type v) {return make_ ## type ## 1(v);} \
-        static __device__ __host__ type ## 1 make(type x) {return make_ ## type ## 1(x);} \
+        static __device__ __host__ __forceinline__ type ## 1 make(type x) {return make_ ## type ## 1(x);} \
    }; \
    template<> struct VecTraits<type ## 2> \
    { \
        typedef type elem_type; \
        enum {cn=2}; \
-        static __device__ __host__ type ## 2 all(type v) {return make_ ## type ## 2(v, v);} \
+        static __device__ __host__ __forceinline__ type ## 2 all(type v) {return make_ ## type ## 2(v, v);} \
-        static __device__ __host__ type ## 2 make(type x, type y) {return make_ ## type ## 2(x, y);} \
+        static __device__ __host__ __forceinline__ type ## 2 make(type x, type y) {return make_ ## type ## 2(x, y);} \
    }; \
    template<> struct VecTraits<type ## 3> \
    { \
        typedef type elem_type; \
        enum {cn=3}; \
-        static __device__ __host__ type ## 3 all(type v) {return make_ ## type ## 3(v, v, v);} \
+        static __device__ __host__ __forceinline__ type ## 3 all(type v) {return make_ ## type ## 3(v, v, v);} \
-        static __device__ __host__ type ## 3 make(type x, type y, type z) {return make_ ## type ## 3(x, y, z);} \
+        static __device__ __host__ __forceinline__ type ## 3 make(type x, type y, type z) {return make_ ## type ## 3(x, y, z);} \
    }; \
    template<> struct VecTraits<type ## 4> \
    { \
        typedef type elem_type; \
        enum {cn=4}; \
-        static __device__ __host__ type ## 4 all(type v) {return make_ ## type ## 4(v, v, v, v);} \
+        static __device__ __host__ __forceinline__ type ## 4 all(type v) {return make_ ## type ## 4(v, v, v, v);} \
-        static __device__ __host__ type ## 4 make(type x, type y, type z, type w) {return make_ ## type ## 4(x, y, z, w);} \
+        static __device__ __host__ __forceinline__ type ## 4 make(type x, type y, type z, type w) {return make_ ## type ## 4(x, y, z, w);} \
+    }; \
+    template<> struct VecTraits<type ## 8> \
+    { \
+        typedef type elem_type; \
+        enum {cn=8}; \
+        static __device__ __host__ __forceinline__ type ## 8 all(type v) {return make_ ## type ## 8(v, v, v, v, v, v, v, v);} \
+        static __device__ __host__ __forceinline__ type ## 8 make(type a0, type a1, type a2, type a3, type a4, type a5, type a6, type a7) {return make_ ## type ## 8(a0, a1, a2, a3, a4, a5, a6, a7);} \
    };
    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uchar)
@@ -134,8 +218,8 @@ namespace cv { namespace gpu { namespace device
    { 
        typedef schar elem_type; 
        enum {cn=1}; 
-        static __device__ __host__ schar all(schar v) {return v;}
+        static __device__ __host__ __forceinline__ schar all(schar v) {return v;}
-        static __device__ __host__ schar make(schar x) {return x;}
+        static __device__ __host__ __forceinline__ schar make(schar x) {return x;}
    };
 }}}

--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -286,7 +286,7 @@ TEST(BruteForceMatcher)
 {
    // Init CPU matcher
-    int desc_len = 64;
+    int desc_len = 128;
    BruteForceMatcher< L2<float> > matcher;
@@ -328,7 +328,7 @@ TEST(BruteForceMatcher)
    d_matcher.knnMatch(d_query, d_train, d_matches, knn);
    GPU_OFF;
-    /*SUBTEST << "radiusMatch";
+    SUBTEST << "radiusMatch";
    float max_distance = 3.8f;
    CPU_ON;
@@ -337,7 +337,7 @@ TEST(BruteForceMatcher)
    GPU_ON;
    d_matcher.radiusMatch(d_query, d_train, d_matches, max_distance);
-    GPU_OFF;*/
+    GPU_OFF;
 }
@@ -689,60 +689,66 @@ TEST(threshold)
    Mat src, dst;
    gpu::GpuMat d_src, d_dst;
-    for (int size = 2000; size <= 4000; size += 1000)
+    for (int size = 1000; size <= 4000; size += 1000)
    {
-        SUBTEST << "size " << size << ", 8U, THRESH_TRUNC";
+        SUBTEST << "size " << size << ", 8U, THRESH_BINARY";
        gen(src, size, size, CV_8U, 0, 100);
        dst.create(size, size, CV_8U);
        CPU_ON; 
-        threshold(src, dst, 50.0, 0.0, THRESH_TRUNC);
+        threshold(src, dst, 50.0, 0.0, THRESH_BINARY);
        CPU_OFF;
        d_src = src;
        d_dst.create(size, size, CV_8U);
        GPU_ON;
-        gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
+        gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
        GPU_OFF;
    }
-    for (int size = 2000; size <= 4000; size += 1000)
+    for (int size = 1000; size <= 4000; size += 1000)
    {
-        SUBTEST << "size " << size << ", 8U, THRESH_BINARY";
+        SUBTEST << "size " << size << ", 32F, THRESH_BINARY";
-        gen(src, size, size, CV_8U, 0, 100);
+        gen(src, size, size, CV_32F, 0, 100);
-        dst.create(size, size, CV_8U);
+        dst.create(size, size, CV_32F);
        CPU_ON; 
        threshold(src, dst, 50.0, 0.0, THRESH_BINARY);
        CPU_OFF;
        d_src = src;
-        d_dst.create(size, size, CV_8U);
+        d_dst.create(size, size, CV_32F);
        GPU_ON;
        gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
        GPU_OFF;
    }
+}
-    for (int size = 2000; size <= 4000; size += 1000)
+TEST(pow)
+{
+    Mat src, dst;
+    gpu::GpuMat d_src, d_dst;
+    for (int size = 1000; size <= 4000; size += 1000)
    {
-        SUBTEST << "size " << size << ", 32F, THRESH_TRUNC";
+        SUBTEST << "size " << size << ", 32F";
        gen(src, size, size, CV_32F, 0, 100);
        dst.create(size, size, CV_32F);
-        CPU_ON; 
+        CPU_ON;
-        threshold(src, dst, 50.0, 0.0, THRESH_TRUNC);
+        pow(src, -2.0, dst);
        CPU_OFF;
        d_src = src;
        d_dst.create(size, size, CV_32F);
        GPU_ON;
-        gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
+        gpu::pow(d_src, -2.0, d_dst);
        GPU_OFF;
    }
 }