Merge remote-tracking branch 'upstream/3.4' into merge-3.4

51e54305 · Alexander Alekhin · 000a13b6 · 9e3b6a22 · 51e54305 · 51e54305
Commit 51e54305 authored May 10, 2018 by Alexander Alekhin
20 changed files
--- a/doc/tutorials/core/interoperability_with_OpenCV_1/interoperability_with_OpenCV_1.markdown
+++ b/doc/tutorials/core/interoperability_with_OpenCV_1/interoperability_with_OpenCV_1.markdown
@@ -69,7 +69,7 @@ CvMat* mI        =  &I.operator CvMat();
 @endcode
 One of the biggest complaints of the C interface is that it leaves all the memory management to you.
 You need to figure out when it is safe to release your unused objects and make sure you do so before
-the program finishes or you could have troublesome memory leeks. To work around this issue in OpenCV
+the program finishes or you could have troublesome memory leaks. To work around this issue in OpenCV
 there is introduced a sort of smart pointer. This will automatically release the object when it's no
 longer in use. To use this declare the pointers as a specialization of the *Ptr* :
 @code{.cpp}

--- a/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown
+++ b/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown
@@ -29,7 +29,7 @@ Execute in webcam:

 @code{.bash}

-$ example_dnn_object_detection --config=[PATH-TO-DARKNET]/cfg/yolo.cfg --model=[PATH-TO-DARKNET]/yolo.weights --classes=object_detection_classes_pascal_voc.txt --width=416 --height=416 --scale=0.00392
+$ example_dnn_object_detection --config=[PATH-TO-DARKNET]/cfg/yolo.cfg --model=[PATH-TO-DARKNET]/yolo.weights --classes=object_detection_classes_pascal_voc.txt --width=416 --height=416 --scale=0.00392 --rgb

 @endcode

@@ -37,7 +37,7 @@ Execute with image or video file:

 @code{.bash}

-$ example_dnn_object_detection --config=[PATH-TO-DARKNET]/cfg/yolo.cfg --model=[PATH-TO-DARKNET]/yolo.weights --classes=object_detection_classes_pascal_voc.txt --width=416 --height=416 --scale=0.00392 --input=[PATH-TO-IMAGE-OR-VIDEO-FILE]
+$ example_dnn_object_detection --config=[PATH-TO-DARKNET]/cfg/yolo.cfg --model=[PATH-TO-DARKNET]/yolo.weights --classes=object_detection_classes_pascal_voc.txt --width=416 --height=416 --scale=0.00392 --input=[PATH-TO-IMAGE-OR-VIDEO-FILE] --rgb

 @endcode


--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -786,10 +786,14 @@ template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
 { return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
 { return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
+{ return a; } \
 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
 { return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
-{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); }
+{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); } \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
+{ CV_UNUSED(b); return a; }

 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint8x16, u8)
 OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int8x16, s8)

--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -589,7 +589,7 @@ inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)
    return _Tpvec(vec_sld(a.val, b.val, CV_SHIFT));
 }

-#define OPENCV_IMPL_VSX_ROTATE_64(_Tpvec, suffix, rg1, rg2)       \
+#define OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, suffix, rg1, rg2)   \
 template<int imm>                                                 \
 inline _Tpvec v_rotate_##suffix(const _Tpvec& a, const _Tpvec& b) \
 {                                                                 \
@@ -598,11 +598,13 @@ inline _Tpvec v_rotate_##suffix(const _Tpvec& a, const _Tpvec& b) \
    return imm ? b : a;                                           \
 }

-OPENCV_IMPL_VSX_ROTATE_64(v_int64x2,  right, a, b)
-OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, right, a, b)
+#define OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(_Tpvec)    \
+OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, left,  b, a)  \
+OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, right, a, b)

-OPENCV_IMPL_VSX_ROTATE_64(v_int64x2,  left, b, a)
-OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, left, b, a)
+OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_float64x2)
+OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_uint64x2)
+OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_int64x2)

 /* Extract */
 template<int s, typename _Tpvec>
@@ -716,26 +718,33 @@ inline int v_signmask(const v_uint64x2& a)
 inline int v_signmask(const v_float64x2& a)
 { return v_signmask(v_reinterpret_as_s64(a)); }

-
 template<typename _Tpvec>
 inline bool v_check_all(const _Tpvec& a)
-{ return vec_all_lt(a.val, _Tpvec().val);}
-inline bool v_check_all(const v_uint8x16 &a)
+{ return vec_all_lt(a.val, _Tpvec().val); }
+inline bool v_check_all(const v_uint8x16& a)
 { return v_check_all(v_reinterpret_as_s8(a)); }
-inline bool v_check_all(const v_uint16x8 &a)
+inline bool v_check_all(const v_uint16x8& a)
 { return v_check_all(v_reinterpret_as_s16(a)); }
-inline bool v_check_all(const v_uint32x4 &a)
+inline bool v_check_all(const v_uint32x4& a)
+{ return v_check_all(v_reinterpret_as_s32(a)); }
+inline bool v_check_all(const v_float32x4& a)
 { return v_check_all(v_reinterpret_as_s32(a)); }
+inline bool v_check_all(const v_float64x2& a)
+{ return v_check_all(v_reinterpret_as_s64(a)); }

 template<typename _Tpvec>
 inline bool v_check_any(const _Tpvec& a)
-{ return vec_any_lt(a.val, _Tpvec().val);}
-inline bool v_check_any(const v_uint8x16 &a)
+{ return vec_any_lt(a.val, _Tpvec().val); }
+inline bool v_check_any(const v_uint8x16& a)
 { return v_check_any(v_reinterpret_as_s8(a)); }
-inline bool v_check_any(const v_uint16x8 &a)
+inline bool v_check_any(const v_uint16x8& a)
 { return v_check_any(v_reinterpret_as_s16(a)); }
-inline bool v_check_any(const v_uint32x4 &a)
+inline bool v_check_any(const v_uint32x4& a)
+{ return v_check_any(v_reinterpret_as_s32(a)); }
+inline bool v_check_any(const v_float32x4& a)
 { return v_check_any(v_reinterpret_as_s32(a)); }
+inline bool v_check_any(const v_float64x2& a)
+{ return v_check_any(v_reinterpret_as_s64(a)); }

 ////////// Other math /////////


--- a/modules/core/include/opencv2/core/operations.hpp
+++ b/modules/core/include/opencv2/core/operations.hpp
@@ -194,8 +194,8 @@ Matx<_Tp, n, m> Matx<_Tp, m, n>::inv(int method, bool *p_is_ok /*= NULL*/) const
 {
    Matx<_Tp, n, m> b;
    bool ok;
-    if( method == DECOMP_LU || method == DECOMP_CHOLESKY )
-        ok = cv::internal::Matx_FastInvOp<_Tp, m>()(*this, b, method);
+    if( m == n && (method == DECOMP_LU || method == DECOMP_CHOLESKY) )
+        ok = cv::internal::Matx_FastInvOp<_Tp, m>()(*reinterpret_cast<const Matx<_Tp, m, m>*>(this), reinterpret_cast<Matx<_Tp, m, m>&>(b), method);
    else
    {
        Mat A(*this, false), B(b, false);

--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@@ -837,17 +837,28 @@ template<typename R> struct TheTest
        Data<R> resC = v_rotate_right<s>(a);
        Data<R> resD = v_rotate_right<s>(a, b);

+        Data<R> resE = v_rotate_left<s>(a);
+        Data<R> resF = v_rotate_left<s>(a, b);
+
        for (int i = 0; i < R::nlanes; ++i)
        {
            if (i + s >= R::nlanes)
            {
                EXPECT_EQ((LaneType)0, resC[i]);
                EXPECT_EQ(dataB[i - R::nlanes + s], resD[i]);
+
+                EXPECT_EQ((LaneType)0, resE[i - R::nlanes + s]);
+                EXPECT_EQ(dataB[i], resF[i - R::nlanes + s]);
            }
            else
+            {
                EXPECT_EQ(dataA[i + s], resC[i]);
-        }
+                EXPECT_EQ(dataA[i + s], resD[i]);

+                EXPECT_EQ(dataA[i], resE[i + s]);
+                EXPECT_EQ(dataA[i], resF[i + s]);
+            }
+        }
        return *this;
    }


--- a/modules/cudafilters/src/cuda/column_filter.hpp
+++ b/modules/cudafilters/src/cuda/column_filter.hpp
@@ -52,10 +52,8 @@ namespace column_filter
 {
    #define MAX_KERNEL_SIZE 32

-    __constant__ float c_kernel[MAX_KERNEL_SIZE];
-
    template <int KSIZE, typename T, typename D, typename B>
-    __global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
+    __global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const float* kernel, const int anchor, const B brd)
    {
        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
            const int BLOCK_DIM_X = 16;
@@ -135,7 +133,7 @@ namespace column_filter

                #pragma unroll
                for (int k = 0; k < KSIZE; ++k)
-                    sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k];
+                    sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * kernel[k];

                dst(y, x) = saturate_cast<D>(sum);
            }
@@ -143,7 +141,7 @@ namespace column_filter
    }

    template <int KSIZE, typename T, typename D, template<typename> class B>
-    void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
+    void caller(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream)
    {
        int BLOCK_DIM_X;
        int BLOCK_DIM_Y;
@@ -167,7 +165,7 @@ namespace column_filter

        B<T> brd(src.rows);

-        linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
+        linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, kernel, anchor, brd);

        cudaSafeCall( cudaGetLastError() );

@@ -181,7 +179,7 @@ namespace filter
    template <typename T, typename D>
    void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
    {
-        typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
+        typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream);

        static const caller_t callers[5][33] =
        {
@@ -362,11 +360,6 @@ namespace filter
            }
        };

-        if (stream == 0)
-            cudaSafeCall( cudaMemcpyToSymbol(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
-        else
-            cudaSafeCall( cudaMemcpyToSymbolAsync(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
-
-        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
+        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, kernel, anchor, cc, stream);
    }
 }
--- a/modules/cudafilters/src/cuda/row_filter.hpp
+++ b/modules/cudafilters/src/cuda/row_filter.hpp
@@ -52,10 +52,8 @@ namespace row_filter
 {
    #define MAX_KERNEL_SIZE 32

-    __constant__ float c_kernel[MAX_KERNEL_SIZE];
-
    template <int KSIZE, typename T, typename D, typename B>
-    __global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
+    __global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const float* kernel, const int anchor, const B brd)
    {
        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
            const int BLOCK_DIM_X = 32;
@@ -135,7 +133,7 @@ namespace row_filter

                #pragma unroll
                for (int k = 0; k < KSIZE; ++k)
-                    sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k];
+                    sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * kernel[k];

                dst(y, x) = saturate_cast<D>(sum);
            }
@@ -143,7 +141,7 @@ namespace row_filter
    }

    template <int KSIZE, typename T, typename D, template<typename> class B>
-    void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
+    void caller(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream)
    {
        int BLOCK_DIM_X;
        int BLOCK_DIM_Y;
@@ -167,7 +165,7 @@ namespace row_filter

        B<T> brd(src.cols);

-        linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
+        linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, kernel, anchor, brd);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
@@ -180,7 +178,7 @@ namespace filter
    template <typename T, typename D>
    void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
    {
-        typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
+        typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream);

        static const caller_t callers[5][33] =
        {
@@ -361,11 +359,6 @@ namespace filter
            }
        };

-        if (stream == 0)
-            cudaSafeCall( cudaMemcpyToSymbol(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
-        else
-            cudaSafeCall( cudaMemcpyToSymbolAsync(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
-
-        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
+        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, kernel, anchor, cc, stream);
    }
 }
--- a/modules/cudaimgproc/src/canny.cpp
+++ b/modules/cudaimgproc/src/canny.cpp
@@ -58,9 +58,9 @@ namespace canny

    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream);

-    void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream);
+    void edgesHysteresisLocal(PtrStepSzi map, short2* st1, int* d_counter, cudaStream_t stream);

-    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream);
+    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, int* d_counter, cudaStream_t stream);

    void getEdges(PtrStepSzi map, PtrStepSzb dst, cudaStream_t stream);
 }
@@ -127,6 +127,8 @@ namespace
        Ptr<Filter> filterDX_, filterDY_;
 #endif
        int old_apperture_size_;
+
+        int *d_counter;
    };

    void CannyImpl::detect(InputArray _image, OutputArray _edges, Stream& stream)
@@ -218,12 +220,17 @@ namespace

    void CannyImpl::CannyCaller(GpuMat& edges, Stream& stream)
    {
-        map_.setTo(Scalar::all(0));
+        map_.setTo(Scalar::all(0), stream);
+
        canny::calcMap(dx_, dy_, mag_, map_, static_cast<float>(low_thresh_), static_cast<float>(high_thresh_), StreamAccessor::getStream(stream));

-        canny::edgesHysteresisLocal(map_, st1_.ptr<short2>(), StreamAccessor::getStream(stream));
+        cudaSafeCall( cudaMalloc(&d_counter, sizeof(int)) );
+
+        canny::edgesHysteresisLocal(map_, st1_.ptr<short2>(), d_counter, StreamAccessor::getStream(stream));
+
+        canny::edgesHysteresisGlobal(map_, st1_.ptr<short2>(), st2_.ptr<short2>(), d_counter, StreamAccessor::getStream(stream));

-        canny::edgesHysteresisGlobal(map_, st1_.ptr<short2>(), st2_.ptr<short2>(), StreamAccessor::getStream(stream));
+        cudaSafeCall( cudaFree(d_counter) );

        canny::getEdges(map_, edges, StreamAccessor::getStream(stream));
    }

--- a/modules/cudaimgproc/src/cuda/canny.cu
+++ b/modules/cudaimgproc/src/cuda/canny.cu
@@ -47,6 +47,7 @@
 #include "opencv2/core/cuda/transform.hpp"
 #include "opencv2/core/cuda/functional.hpp"
 #include "opencv2/core/cuda/utility.hpp"
+#include "opencv2/core/cuda.hpp"

 using namespace cv::cuda;
 using namespace cv::cuda::device;
@@ -102,6 +103,20 @@ namespace canny
        }
    };

+    struct SrcTexObject
+    {
+        int xoff;
+        int yoff;
+        cudaTextureObject_t tex_src_object;
+        __host__ SrcTexObject(int _xoff, int _yoff, cudaTextureObject_t _tex_src_object) : xoff(_xoff), yoff(_yoff), tex_src_object(_tex_src_object) { }
+
+        __device__ __forceinline__ int operator ()(int y, int x) const
+        {
+            return tex2D<uchar>(tex_src_object, x + xoff, y + yoff);
+        }
+
+    };
+
    template <class Norm> __global__
    void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
    {
@@ -120,29 +135,94 @@ namespace canny
        mag(y, x) = norm(dxVal, dyVal);
    }

+    template <class Norm> __global__
+    void calcMagnitudeKernel(const SrcTexObject src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (y >= mag.rows || x >= mag.cols)
+            return;
+
+        int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
+        int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
+
+        dx(y, x) = dxVal;
+        dy(y, x) = dyVal;
+
+        mag(y, x) = norm(dxVal, dyVal);
+    }
+
    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));

-        bindTexture(&tex_src, srcWhole);
-        SrcTex src(xoff, yoff);
+        bool cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);

-        if (L2Grad)
+        if (cc30)
        {
-            L2 norm;
-            calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
+            cudaResourceDesc resDesc;
+            memset(&resDesc, 0, sizeof(resDesc));
+            resDesc.resType = cudaResourceTypePitch2D;
+            resDesc.res.pitch2D.devPtr = srcWhole.ptr();
+            resDesc.res.pitch2D.height = srcWhole.rows;
+            resDesc.res.pitch2D.width = srcWhole.cols;
+            resDesc.res.pitch2D.pitchInBytes = srcWhole.step;
+            resDesc.res.pitch2D.desc = cudaCreateChannelDesc<uchar>();
+
+            cudaTextureDesc texDesc;
+            memset(&texDesc, 0, sizeof(texDesc));
+            texDesc.addressMode[0] = cudaAddressModeClamp;
+            texDesc.addressMode[1] = cudaAddressModeClamp;
+            texDesc.addressMode[2] = cudaAddressModeClamp;
+
+            cudaTextureObject_t tex = 0;
+            cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
+
+            SrcTexObject src(xoff, yoff, tex);
+
+            if (L2Grad)
+            {
+                L2 norm;
+                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
+            }
+            else
+            {
+                L1 norm;
+                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
+            }
+
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == NULL)
+                cudaSafeCall( cudaDeviceSynchronize() );
+            else
+                cudaSafeCall( cudaStreamSynchronize(stream) );
+
+            cudaSafeCall( cudaDestroyTextureObject(tex) );
        }
        else
        {
-            L1 norm;
-            calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
-        }
+            bindTexture(&tex_src, srcWhole);
+            SrcTex src(xoff, yoff);

-        cudaSafeCall( cudaGetLastError() );
+            if (L2Grad)
+            {
+                L2 norm;
+                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
+            }
+            else
+            {
+                L1 norm;
+                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
+            }

-        if (stream == NULL)
-            cudaSafeCall( cudaDeviceSynchronize() );
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == NULL)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
    }

    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
@@ -165,7 +245,6 @@ namespace canny
 namespace canny
 {
    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);
-
    __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
    {
        const int CANNY_SHIFT = 15;
@@ -218,18 +297,103 @@ namespace canny
        map(y, x) = edge_type;
    }

+    __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh, cudaTextureObject_t tex_mag)
+    {
+        const int CANNY_SHIFT = 15;
+        const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
+
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
+            return;
+
+        int dxVal = dx(y, x);
+        int dyVal = dy(y, x);
+
+        const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
+        const float m = tex2D<float>(tex_mag, x, y);
+
+        dxVal = ::abs(dxVal);
+        dyVal = ::abs(dyVal);
+
+        // 0 - the pixel can not belong to an edge
+        // 1 - the pixel might belong to an edge
+        // 2 - the pixel does belong to an edge
+        int edge_type = 0;
+
+        if (m > low_thresh)
+        {
+            const int tg22x = dxVal * TG22;
+            const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);
+
+            dyVal <<= CANNY_SHIFT;
+
+            if (dyVal < tg22x)
+            {
+                if (m > tex2D<float>(tex_mag, x - 1, y) && m >= tex2D<float>(tex_mag, x + 1, y))
+                    edge_type = 1 + (int)(m > high_thresh);
+            }
+            else if(dyVal > tg67x)
+            {
+                if (m > tex2D<float>(tex_mag, x, y - 1) && m >= tex2D<float>(tex_mag, x, y + 1))
+                    edge_type = 1 + (int)(m > high_thresh);
+            }
+            else
+            {
+                if (m > tex2D<float>(tex_mag, x - s, y - 1) && m >= tex2D<float>(tex_mag, x + s, y + 1))
+                    edge_type = 1 + (int)(m > high_thresh);
+            }
+        }
+
+        map(y, x) = edge_type;
+    }
+
    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));

-        bindTexture(&tex_mag, mag);
+        if (deviceSupports(FEATURE_SET_COMPUTE_30))
+        {
+            // Use the texture object
+            cudaResourceDesc resDesc;
+            memset(&resDesc, 0, sizeof(resDesc));
+            resDesc.resType = cudaResourceTypePitch2D;
+            resDesc.res.pitch2D.devPtr = mag.ptr();
+            resDesc.res.pitch2D.height = mag.rows;
+            resDesc.res.pitch2D.width = mag.cols;
+            resDesc.res.pitch2D.pitchInBytes = mag.step;
+            resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
+
+            cudaTextureDesc texDesc;
+            memset(&texDesc, 0, sizeof(texDesc));
+            texDesc.addressMode[0] = cudaAddressModeClamp;
+            texDesc.addressMode[1] = cudaAddressModeClamp;
+            texDesc.addressMode[2] = cudaAddressModeClamp;
+
+            cudaTextureObject_t tex=0;
+            cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
+            calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh, tex);
+            cudaSafeCall( cudaGetLastError() );

-        calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh);
-        cudaSafeCall( cudaGetLastError() );
+            if (stream == NULL)
+                cudaSafeCall( cudaDeviceSynchronize() );
+            else
+                cudaSafeCall( cudaStreamSynchronize(stream) );

-        if (stream == NULL)
-            cudaSafeCall( cudaDeviceSynchronize() );
+            cudaSafeCall( cudaDestroyTextureObject(tex) );
+        }
+        else
+        {
+            // Use the texture reference
+            bindTexture(&tex_mag, mag);
+            calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == NULL)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
    }
 }

@@ -237,14 +401,12 @@ namespace canny

 namespace canny
 {
-    __device__ int counter = 0;
-
    __device__ __forceinline__ bool checkIdx(int y, int x, int rows, int cols)
    {
        return (y >= 0) && (y < rows) && (x >= 0) && (x < cols);
    }

-    __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, short2* st)
+    __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, short2* st, int* d_counter)
    {
        __shared__ volatile int smem[18][18];

@@ -325,22 +487,19 @@ namespace canny

        if (n > 0)
        {
-            const int ind =  ::atomicAdd(&counter, 1);
+            const int ind =  ::atomicAdd(d_counter, 1);
            st[ind] = make_short2(x, y);
        }
    }

-    void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream)
+    void edgesHysteresisLocal(PtrStepSzi map, short2* st1, int* d_counter, cudaStream_t stream)
    {
-        void* counter_ptr;
-        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
-
-        cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );
+        cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(int), stream) );

        const dim3 block(16, 16);
        const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));

-        edgesHysteresisLocalKernel<<<grid, block, 0, stream>>>(map, st1);
+        edgesHysteresisLocalKernel<<<grid, block, 0, stream>>>(map, st1, d_counter);
        cudaSafeCall( cudaGetLastError() );

        if (stream == NULL)
@@ -355,7 +514,7 @@ namespace canny
    __constant__ int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
    __constant__ int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};

-    __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, short2* st1, short2* st2, const int count)
+    __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, short2* st1, short2* st2, int* d_counter, const int count)
    {
        const int stack_size = 512;

@@ -429,7 +588,7 @@ namespace canny
        {
            if (threadIdx.x == 0)
            {
-                s_ind = ::atomicAdd(&counter, s_counter);
+                s_ind = ::atomicAdd(d_counter, s_counter);

                if (s_ind + s_counter > map.cols * map.rows)
                    s_counter = 0;
@@ -444,29 +603,26 @@ namespace canny
        }
    }

-    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream)
+    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, int* d_counter, cudaStream_t stream)
    {
-        void* counter_ptr;
-        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );
-
        int count;
-        cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
+        cudaSafeCall( cudaMemcpyAsync(&count, d_counter, sizeof(int), cudaMemcpyDeviceToHost, stream) );
        cudaSafeCall( cudaStreamSynchronize(stream) );

        while (count > 0)
        {
-            cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );
+            cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(int), stream) );

            const dim3 block(128);
            const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);

-            edgesHysteresisGlobalKernel<<<grid, block, 0, stream>>>(map, st1, st2, count);
+            edgesHysteresisGlobalKernel<<<grid, block, 0, stream>>>(map, st1, st2, d_counter, count);
            cudaSafeCall( cudaGetLastError() );

            if (stream == NULL)
                cudaSafeCall( cudaDeviceSynchronize() );

-            cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
+            cudaSafeCall( cudaMemcpyAsync(&count, d_counter, sizeof(int), cudaMemcpyDeviceToHost, stream) );
            cudaSafeCall( cudaStreamSynchronize(stream) );

            count = min(count, map.cols * map.rows);

--- a/modules/cudaimgproc/test/test_canny.cpp
+++ b/modules/cudaimgproc/test/test_canny.cpp
@@ -92,9 +92,66 @@ CUDA_TEST_P(Canny, Accuracy)
    EXPECT_MAT_SIMILAR(edges_gold, edges, 2e-2);
 }

+class CannyAsyncParallelLoopBody : public cv::ParallelLoopBody
+{
+public:
+    CannyAsyncParallelLoopBody(const cv::cuda::GpuMat& d_img_, cv::cuda::GpuMat* edges_, double low_thresh_, double high_thresh_, int apperture_size_, bool useL2gradient_)
+        : d_img(d_img_), edges(edges_), low_thresh(low_thresh_), high_thresh(high_thresh_), apperture_size(apperture_size_), useL2gradient(useL2gradient_) {}
+    ~CannyAsyncParallelLoopBody() {};
+    void operator()(const cv::Range& r) const
+    {
+        for (int i = r.start; i < r.end; i++) {
+            cv::cuda::Stream stream;
+            cv::Ptr<cv::cuda::CannyEdgeDetector> canny = cv::cuda::createCannyEdgeDetector(low_thresh, high_thresh, apperture_size, useL2gradient);
+            canny->detect(d_img, edges[i], stream);
+            stream.waitForCompletion();
+        }
+    }
+protected:
+    const cv::cuda::GpuMat& d_img;
+    cv::cuda::GpuMat* edges;
+    double low_thresh;
+    double high_thresh;
+    int apperture_size;
+    bool useL2gradient;
+};
+
+#define NUM_STREAMS 64
+
+CUDA_TEST_P(Canny, Async)
+{
+    if (!supportFeature(devInfo, cv::cuda::FEATURE_SET_COMPUTE_30))
+    {
+        throw SkipTestException("CUDA device doesn't support texture objects");
+    }
+    else
+    {
+        const cv::Mat img = readImage("stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
+        ASSERT_FALSE(img.empty());
+
+        const cv::cuda::GpuMat d_img_roi = loadMat(img, useRoi);
+
+        double low_thresh = 50.0;
+        double high_thresh = 100.0;
+
+        // Synchronous call
+        cv::Ptr<cv::cuda::CannyEdgeDetector> canny = cv::cuda::createCannyEdgeDetector(low_thresh, high_thresh, apperture_size, useL2gradient);
+        cv::cuda::GpuMat edges_gold;
+        canny->detect(d_img_roi, edges_gold);
+
+        // Asynchronous call
+        cv::cuda::GpuMat edges[NUM_STREAMS];
+        cv::parallel_for_(cv::Range(0, NUM_STREAMS), CannyAsyncParallelLoopBody(d_img_roi, edges, low_thresh, high_thresh, apperture_size, useL2gradient));
+
+        // Compare the results of synchronous call and asynchronous call
+        for (int i = 0; i < NUM_STREAMS; i++)
+            EXPECT_MAT_NEAR(edges_gold, edges[i], 0.0);
+    }
+ }
+
 INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, Canny, testing::Combine(
    ALL_DEVICES,
-    testing::Values(AppertureSize(3), AppertureSize(5)),
+    testing::Values(AppertureSize(3), AppertureSize(5), AppertureSize(7)),
    testing::Values(L2gradient(false), L2gradient(true)),
    WHOLE_SUBMAT));


--- a/modules/imgproc/src/color_hsv.cpp
+++ b/modules/imgproc/src/color_hsv.cpp
@@ -84,15 +84,97 @@ struct RGB2HSV_f
    typedef float channel_type;

    RGB2HSV_f(int _srccn, int _blueIdx, float _hrange)
-    : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) {}
+    : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) {
+        #if CV_SIMD128
+        hasSIMD = hasSIMD128();
+        #endif
+    }
+
+    #if CV_SIMD128
+    inline void process(v_float32x4& v_r, v_float32x4& v_g,
+                        v_float32x4& v_b, float hscale) const
+    {
+        v_float32x4 v_min_rgb = v_min(v_min(v_r, v_g), v_b);
+        v_float32x4 v_max_rgb = v_max(v_max(v_r, v_g), v_b);
+
+        v_float32x4 v_eps = v_setall_f32(FLT_EPSILON);
+        v_float32x4 v_diff = v_max_rgb - v_min_rgb;
+        v_float32x4 v_s = v_diff / (v_abs(v_max_rgb) + v_eps);
+
+        v_float32x4 v_r_eq_max = v_r == v_max_rgb;
+        v_float32x4 v_g_eq_max = v_g == v_max_rgb;
+        v_float32x4 v_h = v_select(v_r_eq_max, v_g - v_b,
+                          v_select(v_g_eq_max, v_b - v_r, v_r - v_g));
+        v_float32x4 v_res = v_select(v_r_eq_max, (v_g < v_b) & v_setall_f32(360.0f),
+                            v_select(v_g_eq_max, v_setall_f32(120.0f), v_setall_f32(240.0f)));
+        v_float32x4 v_rev_diff = v_setall_f32(60.0f) / (v_diff + v_eps);
+        v_r = v_muladd(v_h, v_rev_diff, v_res) * v_setall_f32(hscale);
+
+        v_g = v_s;
+        v_b = v_max_rgb;
+    }
+    #endif

    void operator()(const float* src, float* dst, int n) const
    {
-        int i, bidx = blueIdx, scn = srccn;
+        int i = 0, bidx = blueIdx, scn = srccn;
        float hscale = hrange*(1.f/360.f);
        n *= 3;

-        for( i = 0; i < n; i += 3, src += scn )
+        #if CV_SIMD128
+        if (hasSIMD)
+        {
+            if (scn == 3) {
+                if (bidx) {
+                    for ( ; i <= n - 12; i += 12, src += scn * 4)
+                    {
+                        v_float32x4 v_r;
+                        v_float32x4 v_g;
+                        v_float32x4 v_b;
+                        v_load_deinterleave(src, v_r, v_g, v_b);
+                        process(v_r, v_g, v_b, hscale);
+                        v_store_interleave(dst + i, v_r, v_g, v_b);
+                    }
+                } else {
+                    for ( ; i <= n - 12; i += 12, src += scn * 4)
+                    {
+                        v_float32x4 v_r;
+                        v_float32x4 v_g;
+                        v_float32x4 v_b;
+                        v_load_deinterleave(src, v_r, v_g, v_b);
+                        process(v_b, v_g, v_r, hscale);
+                        v_store_interleave(dst + i, v_b, v_g, v_r);
+                    }
+                }
+            } else { // scn == 4
+                if (bidx) {
+                    for ( ; i <= n - 12; i += 12, src += scn * 4)
+                    {
+                        v_float32x4 v_r;
+                        v_float32x4 v_g;
+                        v_float32x4 v_b;
+                        v_float32x4 v_a;
+                        v_load_deinterleave(src, v_r, v_g, v_b, v_a);
+                        process(v_r, v_g, v_b, hscale);
+                        v_store_interleave(dst + i, v_r, v_g, v_b);
+                    }
+                } else {
+                    for ( ; i <= n - 12; i += 12, src += scn * 4)
+                    {
+                        v_float32x4 v_r;
+                        v_float32x4 v_g;
+                        v_float32x4 v_b;
+                        v_float32x4 v_a;
+                        v_load_deinterleave(src, v_r, v_g, v_b, v_a);
+                        process(v_b, v_g, v_r, hscale);
+                        v_store_interleave(dst + i, v_b, v_g, v_r);
+                    }
+                }
+            }
+        }
+        #endif
+
+        for( ; i < n; i += 3, src += scn )
        {
            float b = src[bidx], g = src[1], r = src[bidx^2];
            float h, s, v;
@@ -125,6 +207,9 @@ struct RGB2HSV_f

    int srccn, blueIdx;
    float hrange;
+    #if CV_SIMD128
+    bool hasSIMD;
+    #endif
 };


@@ -855,167 +940,111 @@ struct HLS2RGB_f

    HLS2RGB_f(int _dstcn, int _blueIdx, float _hrange)
    : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {
-        #if CV_SSE2
-        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
+        #if CV_SIMD128
+        hasSIMD = hasSIMD128();
        #endif
    }

-    #if CV_SSE2
-    void process(__m128& v_h0, __m128& v_h1, __m128& v_l0,
-                 __m128& v_l1, __m128& v_s0, __m128& v_s1) const
+    #if CV_SIMD128
+    inline void process(v_float32x4& v_h, v_float32x4& v_l, v_float32x4& v_s) const
    {
-        __m128 v_lel0 = _mm_cmple_ps(v_l0, _mm_set1_ps(0.5f));
-        __m128 v_lel1 = _mm_cmple_ps(v_l1, _mm_set1_ps(0.5f));
-        __m128 v_p20 = _mm_andnot_ps(v_lel0, _mm_sub_ps(_mm_add_ps(v_l0, v_s0), _mm_mul_ps(v_l0, v_s0)));
-        __m128 v_p21 = _mm_andnot_ps(v_lel1, _mm_sub_ps(_mm_add_ps(v_l1, v_s1), _mm_mul_ps(v_l1, v_s1)));
-        v_p20 = _mm_or_ps(v_p20, _mm_and_ps(v_lel0, _mm_mul_ps(v_l0, _mm_add_ps(_mm_set1_ps(1.0f), v_s0))));
-        v_p21 = _mm_or_ps(v_p21, _mm_and_ps(v_lel1, _mm_mul_ps(v_l1, _mm_add_ps(_mm_set1_ps(1.0f), v_s1))));
-
-        __m128 v_p10 = _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(2.0f), v_l0), v_p20);
-        __m128 v_p11 = _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(2.0f), v_l1), v_p21);
-
-        v_h0 = _mm_mul_ps(v_h0, _mm_set1_ps(hscale));
-        v_h1 = _mm_mul_ps(v_h1, _mm_set1_ps(hscale));
-
-        __m128 v_pre_sector0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_h0));
-        __m128 v_pre_sector1 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_h1));
-
-        v_h0 = _mm_sub_ps(v_h0, v_pre_sector0);
-        v_h1 = _mm_sub_ps(v_h1, v_pre_sector1);
-
-        __m128 v_p2_p10 = _mm_sub_ps(v_p20, v_p10);
-        __m128 v_p2_p11 = _mm_sub_ps(v_p21, v_p11);
-        __m128 v_tab20 = _mm_add_ps(v_p10, _mm_mul_ps(v_p2_p10, _mm_sub_ps(_mm_set1_ps(1.0f), v_h0)));
-        __m128 v_tab21 = _mm_add_ps(v_p11, _mm_mul_ps(v_p2_p11, _mm_sub_ps(_mm_set1_ps(1.0f), v_h1)));
-        __m128 v_tab30 = _mm_add_ps(v_p10, _mm_mul_ps(v_p2_p10, v_h0));
-        __m128 v_tab31 = _mm_add_ps(v_p11, _mm_mul_ps(v_p2_p11, v_h1));
-
-        __m128 v_sector0 = _mm_div_ps(v_pre_sector0, _mm_set1_ps(6.0f));
-        __m128 v_sector1 = _mm_div_ps(v_pre_sector1, _mm_set1_ps(6.0f));
-        v_sector0 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_sector0));
-        v_sector1 = _mm_cvtepi32_ps(_mm_cvttps_epi32(v_sector1));
-        v_sector0 = _mm_mul_ps(v_sector0, _mm_set1_ps(6.0f));
-        v_sector1 = _mm_mul_ps(v_sector1, _mm_set1_ps(6.0f));
-        v_sector0 = _mm_sub_ps(v_pre_sector0, v_sector0);
-        v_sector1 = _mm_sub_ps(v_pre_sector1, v_sector1);
-
-        v_h0 = _mm_and_ps(v_p10, _mm_cmplt_ps(v_sector0, _mm_set1_ps(2.0f)));
-        v_h1 = _mm_and_ps(v_p11, _mm_cmplt_ps(v_sector1, _mm_set1_ps(2.0f)));
-        v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab30, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
-        v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab31, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
-        v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
-        v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
-        v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(4.0f))));
-        v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(4.0f))));
-        v_h0 = _mm_or_ps(v_h0, _mm_and_ps(v_tab20, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(4.0f))));
-        v_h1 = _mm_or_ps(v_h1, _mm_and_ps(v_tab21, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(4.0f))));
-        v_l0 = _mm_and_ps(v_tab30, _mm_cmplt_ps(v_sector0, _mm_set1_ps(1.0f)));
-        v_l1 = _mm_and_ps(v_tab31, _mm_cmplt_ps(v_sector1, _mm_set1_ps(1.0f)));
-        v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(1.0f))));
-        v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(1.0f))));
-        v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_p20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
-        v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_p21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
-        v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_tab20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
-        v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_tab21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
-        v_l0 = _mm_or_ps(v_l0, _mm_and_ps(v_p10, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(3.0f))));
-        v_l1 = _mm_or_ps(v_l1, _mm_and_ps(v_p11, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(3.0f))));
-        v_s0 = _mm_and_ps(v_p20, _mm_cmplt_ps(v_sector0, _mm_set1_ps(1.0f)));
-        v_s1 = _mm_and_ps(v_p21, _mm_cmplt_ps(v_sector1, _mm_set1_ps(1.0f)));
-        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab20, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(1.0f))));
-        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab21, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(1.0f))));
-        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_p10, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(2.0f))));
-        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_p11, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(2.0f))));
-        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_p10, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(3.0f))));
-        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_p11, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(3.0f))));
-        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_tab30, _mm_cmpeq_ps(v_sector0, _mm_set1_ps(4.0f))));
-        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_tab31, _mm_cmpeq_ps(v_sector1, _mm_set1_ps(4.0f))));
-        v_s0 = _mm_or_ps(v_s0, _mm_and_ps(v_p20, _mm_cmpgt_ps(v_sector0, _mm_set1_ps(4.0f))));
-        v_s1 = _mm_or_ps(v_s1, _mm_and_ps(v_p21, _mm_cmpgt_ps(v_sector1, _mm_set1_ps(4.0f))));
+        v_float32x4 v_one = v_setall_f32(1.0f);
+
+        v_float32x4 v_l_le_half = v_l <= v_setall_f32(0.5f);
+        v_float32x4 v_ls = v_l * v_s;
+        v_float32x4 v_elem0 = v_select(v_l_le_half, v_ls, v_s - v_ls);
+
+        v_float32x4 v_hs_raw = v_h * v_setall_f32(hscale);
+        v_float32x4 v_pre_hs = v_cvt_f32(v_trunc(v_hs_raw));
+        v_float32x4 v_hs = v_hs_raw - v_pre_hs;
+        v_float32x4 v_sector = v_pre_hs - v_setall_f32(6.0f) * v_cvt_f32(v_trunc(v_hs_raw * v_setall_f32(1.0f / 6.0f)));
+        v_float32x4 v_elem1 = v_hs + v_hs;
+
+        v_float32x4 v_tab0 = v_l + v_elem0;
+        v_float32x4 v_tab1 = v_l - v_elem0;
+        v_float32x4 v_tab2 = v_l + v_elem0 - v_elem0 * v_elem1;
+        v_float32x4 v_tab3 = v_l - v_elem0 + v_elem0 * v_elem1;
+
+        v_float32x4 v_two  = v_setall_f32(2.0f);
+        v_float32x4 v_four = v_setall_f32(4.0f);
+
+        v_h = v_select(v_sector <  v_two , v_tab1,
+              v_select(v_sector <= v_two , v_tab3,
+              v_select(v_sector <= v_four, v_tab0, v_tab2)));
+
+        v_l = v_select(v_sector <  v_one , v_tab3,
+              v_select(v_sector <= v_two , v_tab0,
+              v_select(v_sector <  v_four, v_tab2, v_tab1)));
+
+        v_s = v_select(v_sector <  v_one , v_tab0,
+              v_select(v_sector <  v_two , v_tab2,
+              v_select(v_sector <  v_four, v_tab1,
+              v_select(v_sector <= v_four, v_tab3, v_tab0))));
    }
    #endif

    void operator()(const float* src, float* dst, int n) const
    {
        int i = 0, bidx = blueIdx, dcn = dstcn;
-        float _hscale = hscale;
        float alpha = ColorChannel<float>::max();
        n *= 3;

-        #if CV_SSE2
-        if (haveSIMD)
+        #if CV_SIMD128
+        if (hasSIMD)
        {
-            for( ; i <= n - 24; i += 24, dst += dcn * 8 )
+            if (dcn == 3)
            {
-                __m128 v_h0 = _mm_loadu_ps(src + i +  0);
-                __m128 v_h1 = _mm_loadu_ps(src + i +  4);
-                __m128 v_l0 = _mm_loadu_ps(src + i +  8);
-                __m128 v_l1 = _mm_loadu_ps(src + i + 12);
-                __m128 v_s0 = _mm_loadu_ps(src + i + 16);
-                __m128 v_s1 = _mm_loadu_ps(src + i + 20);
-
-                _mm_deinterleave_ps(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1);
-
-                process(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1);
-
-                if (dcn == 3)
+                if (bidx)
                {
-                    if (bidx)
+                    for (; i <= n - 12; i += 12, dst += dcn * 4)
                    {
-                        _mm_interleave_ps(v_s0, v_s1, v_l0, v_l1, v_h0, v_h1);
-
-                        _mm_storeu_ps(dst +  0, v_s0);
-                        _mm_storeu_ps(dst +  4, v_s1);
-                        _mm_storeu_ps(dst +  8, v_l0);
-                        _mm_storeu_ps(dst + 12, v_l1);
-                        _mm_storeu_ps(dst + 16, v_h0);
-                        _mm_storeu_ps(dst + 20, v_h1);
+                        v_float32x4 v_h;
+                        v_float32x4 v_l;
+                        v_float32x4 v_s;
+                        v_load_deinterleave(src + i, v_h, v_l, v_s);
+                        process(v_h, v_l, v_s);
+                        v_store_interleave(dst, v_s, v_l, v_h);
                    }
-                    else
+                } else {
+                    for (; i <= n - 12; i += 12, dst += dcn * 4)
                    {
-                        _mm_interleave_ps(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1);
-
-                        _mm_storeu_ps(dst +  0, v_h0);
-                        _mm_storeu_ps(dst +  4, v_h1);
-                        _mm_storeu_ps(dst +  8, v_l0);
-                        _mm_storeu_ps(dst + 12, v_l1);
-                        _mm_storeu_ps(dst + 16, v_s0);
-                        _mm_storeu_ps(dst + 20, v_s1);
+                        v_float32x4 v_h;
+                        v_float32x4 v_l;
+                        v_float32x4 v_s;
+                        v_load_deinterleave(src + i, v_h, v_l, v_s);
+                        process(v_h, v_l, v_s);
+                        v_store_interleave(dst, v_h, v_l, v_s);
                    }
                }
-                else
+            } else { // dcn == 4
+                if (bidx)
                {
-                    __m128 v_a0 = _mm_set1_ps(alpha);
-                    __m128 v_a1 = _mm_set1_ps(alpha);
-                    if (bidx)
+                    for (; i <= n - 12; i += 12, dst += dcn * 4)
                    {
-                        _mm_interleave_ps(v_s0, v_s1, v_l0, v_l1, v_h0, v_h1, v_a0, v_a1);
-
-                        _mm_storeu_ps(dst +  0, v_s0);
-                        _mm_storeu_ps(dst +  4, v_s1);
-                        _mm_storeu_ps(dst +  8, v_l0);
-                        _mm_storeu_ps(dst + 12, v_l1);
-                        _mm_storeu_ps(dst + 16, v_h0);
-                        _mm_storeu_ps(dst + 20, v_h1);
-                        _mm_storeu_ps(dst + 24, v_a0);
-                        _mm_storeu_ps(dst + 28, v_a1);
+                        v_float32x4 v_h;
+                        v_float32x4 v_l;
+                        v_float32x4 v_s;
+                        v_load_deinterleave(src + i, v_h, v_l, v_s);
+                        process(v_h, v_l, v_s);
+                        v_float32x4 v_a = v_setall_f32(alpha);
+                        v_store_interleave(dst, v_s, v_l, v_h, v_a);
                    }
-                    else
+                } else {
+                    for (; i <= n - 12; i += 12, dst += dcn * 4)
                    {
-                        _mm_interleave_ps(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1, v_a0, v_a1);
-
-                        _mm_storeu_ps(dst +  0, v_h0);
-                        _mm_storeu_ps(dst +  4, v_h1);
-                        _mm_storeu_ps(dst +  8, v_l0);
-                        _mm_storeu_ps(dst + 12, v_l1);
-                        _mm_storeu_ps(dst + 16, v_s0);
-                        _mm_storeu_ps(dst + 20, v_s1);
-                        _mm_storeu_ps(dst + 24, v_a0);
-                        _mm_storeu_ps(dst + 28, v_a1);
+                        v_float32x4 v_h;
+                        v_float32x4 v_l;
+                        v_float32x4 v_s;
+                        v_load_deinterleave(src + i, v_h, v_l, v_s);
+                        process(v_h, v_l, v_s);
+                        v_float32x4 v_a = v_setall_f32(alpha);
+                        v_store_interleave(dst, v_h, v_l, v_s, v_a);
                    }
                }
            }
        }
        #endif
+
        for( ; i < n; i += 3, dst += dcn )
        {
            float h = src[i], l = src[i+1], s = src[i+2];
@@ -1033,7 +1062,7 @@ struct HLS2RGB_f
                float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
                float p1 = 2*l - p2;

-                h *= _hscale;
+                h *= hscale;
                if( h < 0 )
                    do h += 6; while( h < 0 );
                else if( h >= 6 )
@@ -1063,8 +1092,8 @@ struct HLS2RGB_f

    int dstcn, blueIdx;
    float hscale;
-    #if CV_SSE2
-    bool haveSIMD;
+    #if CV_SIMD128
+    bool hasSIMD;
    #endif
 };


--- a/modules/photo/test/test_hdr.cpp
+++ b/modules/photo/test/test_hdr.cpp
@@ -213,7 +213,7 @@ TEST(Photo_MergeRobertson, regression)
    loadImage(test_path + "merge/robertson.hdr", expected);
    merge->process(images, result, times);

-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(__PPC64__)
    const float eps = 6.f;
 #else
    const float eps = 5.f;

--- a/modules/stitching/include/opencv2/stitching/detail/blenders.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/blenders.hpp
@@ -48,6 +48,7 @@
 #endif

 #include "opencv2/core.hpp"
+#include "opencv2/core/cuda.hpp"

 namespace cv {
 namespace detail {

--- a/modules/videoio/src/cap_ffmpeg_impl.hpp
+++ b/modules/videoio/src/cap_ffmpeg_impl.hpp
@@ -870,7 +870,12 @@ bool CvCapture_FFMPEG::open( const char* _filename )
            int enc_width = enc->width;
            int enc_height = enc->height;

-            AVCodec *codec = avcodec_find_decoder(enc->codec_id);
+            AVCodec *codec;
+            if(av_dict_get(dict, "video_codec", NULL, 0) == NULL) {
+                codec = avcodec_find_decoder(enc->codec_id);
+            } else {
+                codec = avcodec_find_decoder_by_name(av_dict_get(dict, "video_codec", NULL, 0)->value);
+            }
            if (!codec ||
 #if LIBAVCODEC_VERSION_INT >= ((53<<16)+(8<<8)+0)
                avcodec_open2(enc, codec, NULL)

--- a/modules/videoio/src/cap_qt.cpp
+++ b/modules/videoio/src/cap_qt.cpp
@@ -634,11 +634,11 @@ static int icvOpenCamera_QT (CvCapture_QT_Cam * capture, const int index)
    OPENCV_ASSERT (capture,            "icvOpenCamera_QT", "'capture' is a NULL-pointer");
    OPENCV_ASSERT (index >=0, "icvOpenCamera_QT", "camera index is negative");

-    ComponentDescription	component_description;
-    Component				component = 0;
+    ComponentDescription    component_description;
+    Component               component = 0;
    int                     number_of_inputs = 0;
    Rect                    myRect;
-    ComponentResult			result = noErr;
+    ComponentResult         result = noErr;


    // travers all components and count video digitizer channels

--- a/samples/cpp/tutorial_code/dnn/custom_layers.cpp
+++ b/samples/cpp/tutorial_code/dnn/custom_layers.cpp
@@ -16,18 +16,18 @@ public:
    virtual bool getMemoryShapes(const std::vector<std::vector<int> > &inputs,
                                 const int requiredOutputs,
                                 std::vector<std::vector<int> > &outputs,
-                                 std::vector<std::vector<int> > &internals) const;
+                                 std::vector<std::vector<int> > &internals) const CV_OVERRIDE;
    //! [MyLayer::getMemoryShapes]

    //! [MyLayer::forward]
-    virtual void forward(std::vector<cv::Mat*> &inputs, std::vector<cv::Mat> &outputs, std::vector<cv::Mat> &internals);
+    virtual void forward(std::vector<cv::Mat*> &inputs, std::vector<cv::Mat> &outputs, std::vector<cv::Mat> &internals) CV_OVERRIDE;
    //! [MyLayer::forward]

    //! [MyLayer::finalize]
-    virtual void finalize(const std::vector<cv::Mat*> &inputs, std::vector<cv::Mat> &outputs);
+    virtual void finalize(const std::vector<cv::Mat*> &inputs, std::vector<cv::Mat> &outputs) CV_OVERRIDE;
    //! [MyLayer::finalize]

-    virtual void forward(cv::InputArrayOfArrays inputs, cv::OutputArrayOfArrays outputs, cv::OutputArrayOfArrays internals);
+    virtual void forward(cv::InputArrayOfArrays inputs, cv::OutputArrayOfArrays outputs, cv::OutputArrayOfArrays internals) CV_OVERRIDE;
 };
 //! [A custom layer interface]

@@ -49,7 +49,7 @@ public:
    virtual bool getMemoryShapes(const std::vector<std::vector<int> > &inputs,
                                 const int requiredOutputs,
                                 std::vector<std::vector<int> > &outputs,
-                                 std::vector<std::vector<int> > &internals) const
+                                 std::vector<std::vector<int> > &internals) const CV_OVERRIDE
    {
        CV_UNUSED(requiredOutputs); CV_UNUSED(internals);
        std::vector<int> outShape(4);
@@ -62,7 +62,7 @@ public:
    }

    // Implementation of this custom layer is based on https://github.com/cdmh/deeplab-public/blob/master/src/caffe/layers/interp_layer.cpp
-    virtual void forward(std::vector<cv::Mat*> &inputs, std::vector<cv::Mat> &outputs, std::vector<cv::Mat> &internals)
+    virtual void forward(std::vector<cv::Mat*> &inputs, std::vector<cv::Mat> &outputs, std::vector<cv::Mat> &internals) CV_OVERRIDE
    {
        CV_UNUSED(internals);
        cv::Mat& inp = *inputs[0];
@@ -105,7 +105,7 @@ public:
        }
    }

-    virtual void forward(cv::InputArrayOfArrays, cv::OutputArrayOfArrays, cv::OutputArrayOfArrays) {}
+    virtual void forward(cv::InputArrayOfArrays, cv::OutputArrayOfArrays, cv::OutputArrayOfArrays) CV_OVERRIDE {}

 private:
    int outWidth, outHeight;
@@ -132,7 +132,7 @@ public:
    virtual bool getMemoryShapes(const std::vector<std::vector<int> > &inputs,
                                 const int requiredOutputs,
                                 std::vector<std::vector<int> > &outputs,
-                                 std::vector<std::vector<int> > &internals) const
+                                 std::vector<std::vector<int> > &internals) const CV_OVERRIDE
    {
        CV_UNUSED(requiredOutputs); CV_UNUSED(internals);
        std::vector<int> outShape(4);
@@ -146,7 +146,7 @@ public:

    // This implementation is based on a reference implementation from
    // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
-    virtual void forward(std::vector<cv::Mat*> &inputs, std::vector<cv::Mat> &outputs, std::vector<cv::Mat> &internals)
+    virtual void forward(std::vector<cv::Mat*> &inputs, std::vector<cv::Mat> &outputs, std::vector<cv::Mat> &internals) CV_OVERRIDE
    {
        CV_UNUSED(internals);
        cv::Mat& inp = *inputs[0];
@@ -187,7 +187,7 @@ public:
        }
    }

-    virtual void forward(cv::InputArrayOfArrays, cv::OutputArrayOfArrays, cv::OutputArrayOfArrays) {}
+    virtual void forward(cv::InputArrayOfArrays, cv::OutputArrayOfArrays, cv::OutputArrayOfArrays) CV_OVERRIDE {}

 private:
    static inline int offset(const cv::MatSize& size, int c, int x, int y, int b)

--- a/samples/dnn/object_detection.cpp
+++ b/samples/dnn/object_detection.cpp
@@ -7,12 +7,13 @@

 const char* keys =
    "{ help  h     | | Print help message. }"
-    "{ input i     | | Path to input image or video file. Skip this argument to capture frames from a camera.}"
+    "{ device      |  0 | camera device number. }"
+    "{ input i     | | Path to input image or video file. Skip this argument to capture frames from a camera. }"
    "{ model m     | | Path to a binary file of model contains trained weights. "
                      "It could be a file with extensions .caffemodel (Caffe), "
-                      ".pb (TensorFlow), .t7 or .net (Torch), .weights (Darknet) }"
+                      ".pb (TensorFlow), .t7 or .net (Torch), .weights (Darknet).}"
    "{ config c    | | Path to a text file of model contains network configuration. "
-                      "It could be a file with extensions .prototxt (Caffe), .pbtxt (TensorFlow), .cfg (Darknet) }"
+                      "It could be a file with extensions .prototxt (Caffe), .pbtxt (TensorFlow), .cfg (Darknet).}"
    "{ framework f | | Optional name of an origin framework of the model. Detect it automatically if it does not set. }"
    "{ classes     | | Optional path to a text file with names of classes to label detected objects. }"
    "{ mean        | | Preprocess input image by subtracting mean values. Mean values should be in BGR order and delimited by spaces. }"
@@ -91,7 +92,7 @@ int main(int argc, char** argv)
    if (parser.has("input"))
        cap.open(parser.get<String>("input"));
    else
-        cap.open(0);
+        cap.open(parser.get<int>("device"));

    // Process frames.
    Mat frame, blob;

--- a/samples/dnn/openpose.cpp
+++ b/samples/dnn/openpose.cpp
@@ -61,12 +61,16 @@ int main(int argc, char **argv)
        "{ p proto          |       | (required) model configuration, e.g. hand/pose.prototxt }"
        "{ m model          |       | (required) model weights, e.g. hand/pose_iter_102000.caffemodel }"
        "{ i image          |       | (required) path to image file (containing a single person, or hand) }"
+        "{ width            |  368  | Preprocess input image by resizing to a specific width. }"
+        "{ height           |  368  | Preprocess input image by resizing to a specific height. }"
        "{ t threshold      |  0.1  | threshold or confidence value for the heatmap }"
    );

    String modelTxt = parser.get<string>("proto");
    String modelBin = parser.get<string>("model");
    String imageFile = parser.get<String>("image");
+    int W_in = parser.get<int>("width");
+    int H_in = parser.get<int>("height");
    float thresh = parser.get<float>("threshold");
    if (parser.get<bool>("help") || modelTxt.empty() || modelBin.empty() || imageFile.empty())
    {
@@ -75,10 +79,6 @@ int main(int argc, char **argv)
        return 0;
    }

-    // fixed input size for the pretrained network
-    int W_in = 368;
-    int H_in = 368;
-
    // read the network model
    Net net = readNetFromCaffe(modelTxt, modelBin);


--- a/samples/dnn/segmentation.cpp
+++ b/samples/dnn/segmentation.cpp
@@ -7,12 +7,13 @@

 const char* keys =
    "{ help  h     | | Print help message. }"
-    "{ input i     | | Path to input image or video file. Skip this argument to capture frames from a camera.}"
+    "{ device      |  0 | camera device number. }"
+    "{ input i     | | Path to input image or video file. Skip this argument to capture frames from a camera. }"
    "{ model m     | | Path to a binary file of model contains trained weights. "
                      "It could be a file with extensions .caffemodel (Caffe), "
-                      ".pb (TensorFlow), .t7 or .net (Torch), .weights (Darknet) }"
+                      ".pb (TensorFlow), .t7 or .net (Torch), .weights (Darknet). }"
    "{ config c    | | Path to a text file of model contains network configuration. "
-                      "It could be a file with extensions .prototxt (Caffe), .pbtxt (TensorFlow), .cfg (Darknet) }"
+                      "It could be a file with extensions .prototxt (Caffe), .pbtxt (TensorFlow), .cfg (Darknet). }"
    "{ framework f | | Optional name of an origin framework of the model. Detect it automatically if it does not set. }"
    "{ classes     | | Optional path to a text file with names of classes. }"
    "{ colors      | | Optional path to a text file with colors for an every class. "
@@ -111,7 +112,7 @@ int main(int argc, char** argv)
    if (parser.has("input"))
        cap.open(parser.get<String>("input"));
    else
-        cap.open(0);
+        cap.open(parser.get<int>("device"));
    //! [Open a video file or an image file or a camera stream]

    // Process frames.