Commit 0771fd82 authored by cuda-geek, committed by OpenCV Buildbot

Merge pull request #557 from jet47:gpu-sanity

parents 2be893a2 c9868fc0
@@ -3,8 +3,7 @@
 using namespace std;
 using namespace testing;
 
-#define GPU_DENOISING_IMAGE_SIZES testing::Values(perf::szVGA, perf::szXGA, perf::sz720p, perf::sz1080p)
+#define GPU_DENOISING_IMAGE_SIZES testing::Values(perf::szVGA, perf::sz720p)
 
 //////////////////////////////////////////////////////////////////////
 // BilateralFilter
@@ -12,96 +11,86 @@ using namespace testing;
 DEF_PARAM_TEST(Sz_Depth_Cn_KernelSz, cv::Size, MatDepth, MatCn, int);
 
 PERF_TEST_P(Sz_Depth_Cn_KernelSz, Denoising_BilateralFilter,
-            Combine(GPU_DENOISING_IMAGE_SIZES, Values(CV_8U, CV_32F), GPU_CHANNELS_1_3, Values(3, 5, 9)))
+            Combine(GPU_DENOISING_IMAGE_SIZES,
+                    Values(CV_8U, CV_32F),
+                    GPU_CHANNELS_1_3,
+                    Values(3, 5, 9)))
 {
     declare.time(60.0);
 
-    cv::Size size = GET_PARAM(0);
-    int depth = GET_PARAM(1);
-    int channels = GET_PARAM(2);
-    int kernel_size = GET_PARAM(3);
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const int channels = GET_PARAM(2);
+    const int kernel_size = GET_PARAM(3);
 
-    float sigma_color = 7;
-    float sigma_spatial = 5;
-    int borderMode = cv::BORDER_REFLECT101;
+    const float sigma_color = 7;
+    const float sigma_spatial = 5;
+    const int borderMode = cv::BORDER_REFLECT101;
 
-    int type = CV_MAKE_TYPE(depth, channels);
+    const int type = CV_MAKE_TYPE(depth, channels);
 
     cv::Mat src(size, type);
-    fillRandom(src);
+    declare.in(src, WARMUP_RNG);
 
     if (PERF_RUN_GPU())
     {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_dst;
-
-        cv::gpu::bilateralFilter(d_src, d_dst, kernel_size, sigma_color, sigma_spatial, borderMode);
-
-        TEST_CYCLE()
-        {
-            cv::gpu::bilateralFilter(d_src, d_dst, kernel_size, sigma_color, sigma_spatial, borderMode);
-        }
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
 
-        GPU_SANITY_CHECK(d_dst);
+        TEST_CYCLE() cv::gpu::bilateralFilter(d_src, dst, kernel_size, sigma_color, sigma_spatial, borderMode);
+
+        GPU_SANITY_CHECK(dst);
     }
     else
    {
         cv::Mat dst;
 
-        cv::bilateralFilter(src, dst, kernel_size, sigma_color, sigma_spatial, borderMode);
-
-        TEST_CYCLE()
-        {
-            cv::bilateralFilter(src, dst, kernel_size, sigma_color, sigma_spatial, borderMode);
-        }
+        TEST_CYCLE() cv::bilateralFilter(src, dst, kernel_size, sigma_color, sigma_spatial, borderMode);
 
         CPU_SANITY_CHECK(dst);
     }
 }
 
 //////////////////////////////////////////////////////////////////////
 // nonLocalMeans
 
 DEF_PARAM_TEST(Sz_Depth_Cn_WinSz_BlockSz, cv::Size, MatDepth, MatCn, int, int);
 
 PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_NonLocalMeans,
-            Combine(GPU_DENOISING_IMAGE_SIZES, Values<MatDepth>(CV_8U), GPU_CHANNELS_1_3, Values(21), Values(5, 7)))
+            Combine(GPU_DENOISING_IMAGE_SIZES,
+                    Values<MatDepth>(CV_8U),
+                    GPU_CHANNELS_1_3,
+                    Values(21),
+                    Values(5)))
 {
     declare.time(60.0);
 
-    cv::Size size = GET_PARAM(0);
-    int depth = GET_PARAM(1);
-    int channels = GET_PARAM(2);
-    int search_widow_size = GET_PARAM(3);
-    int block_size = GET_PARAM(4);
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const int channels = GET_PARAM(2);
+    const int search_widow_size = GET_PARAM(3);
+    const int block_size = GET_PARAM(4);
 
-    float h = 10;
-    int borderMode = cv::BORDER_REFLECT101;
+    const float h = 10;
+    const int borderMode = cv::BORDER_REFLECT101;
 
-    int type = CV_MAKE_TYPE(depth, channels);
+    const int type = CV_MAKE_TYPE(depth, channels);
 
     cv::Mat src(size, type);
-    fillRandom(src);
+    declare.in(src, WARMUP_RNG);
 
     if (PERF_RUN_GPU())
     {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_dst;
-
-        cv::gpu::nonLocalMeans(d_src, d_dst, h, search_widow_size, block_size, borderMode);
-
-        TEST_CYCLE()
-        {
-            cv::gpu::nonLocalMeans(d_src, d_dst, h, search_widow_size, block_size, borderMode);
-        }
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
 
-        GPU_SANITY_CHECK(d_dst);
+        TEST_CYCLE() cv::gpu::nonLocalMeans(d_src, dst, h, search_widow_size, block_size, borderMode);
+
+        GPU_SANITY_CHECK(dst);
     }
     else
     {
-        FAIL() << "No such CPU implementation analogy";
+        FAIL_NO_CPU();
     }
 }
@@ -112,46 +101,41 @@ PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_NonLocalMeans,
 DEF_PARAM_TEST(Sz_Depth_Cn_WinSz_BlockSz, cv::Size, MatDepth, MatCn, int, int);
 
 PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_FastNonLocalMeans,
-            Combine(GPU_DENOISING_IMAGE_SIZES, Values<MatDepth>(CV_8U), GPU_CHANNELS_1_3, Values(21), Values(7)))
+            Combine(GPU_DENOISING_IMAGE_SIZES,
+                    Values<MatDepth>(CV_8U),
+                    GPU_CHANNELS_1_3,
+                    Values(21),
+                    Values(7)))
 {
-    declare.time(150.0);
+    declare.time(60.0);
 
-    cv::Size size = GET_PARAM(0);
-    int depth = GET_PARAM(1);
-    int search_widow_size = GET_PARAM(2);
-    int block_size = GET_PARAM(3);
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const int search_widow_size = GET_PARAM(2);
+    const int block_size = GET_PARAM(3);
 
-    float h = 10;
-    int type = CV_MAKE_TYPE(depth, 1);
+    const float h = 10;
+    const int type = CV_MAKE_TYPE(depth, 1);
 
     cv::Mat src(size, type);
-    fillRandom(src);
+    declare.in(src, WARMUP_RNG);
 
     if (PERF_RUN_GPU())
     {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_dst;
-
         cv::gpu::FastNonLocalMeansDenoising fnlmd;
 
-        fnlmd.simpleMethod(d_src, d_dst, h, search_widow_size, block_size);
-
-        TEST_CYCLE()
-        {
-            fnlmd.simpleMethod(d_src, d_dst, h, search_widow_size, block_size);
-        }
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
 
-        GPU_SANITY_CHECK(d_dst);
+        TEST_CYCLE() fnlmd.simpleMethod(d_src, dst, h, search_widow_size, block_size);
+
+        GPU_SANITY_CHECK(dst);
     }
     else
    {
         cv::Mat dst;
 
-        cv::fastNlMeansDenoising(src, dst, h, block_size, search_widow_size);
-
-        TEST_CYCLE()
-        {
-            cv::fastNlMeansDenoising(src, dst, h, block_size, search_widow_size);
-        }
+        TEST_CYCLE() cv::fastNlMeansDenoising(src, dst, h, block_size, search_widow_size);
 
         CPU_SANITY_CHECK(dst);
     }
@@ -163,47 +147,41 @@ PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_FastNonLocalMeans,
 DEF_PARAM_TEST(Sz_Depth_WinSz_BlockSz, cv::Size, MatDepth, int, int);
 
 PERF_TEST_P(Sz_Depth_WinSz_BlockSz, Denoising_FastNonLocalMeansColored,
-            Combine(GPU_DENOISING_IMAGE_SIZES, Values<MatDepth>(CV_8U), Values(21), Values(7)))
+            Combine(GPU_DENOISING_IMAGE_SIZES,
+                    Values<MatDepth>(CV_8U),
+                    Values(21),
+                    Values(7)))
 {
-    declare.time(350.0);
+    declare.time(60.0);
 
-    cv::Size size = GET_PARAM(0);
-    int depth = GET_PARAM(1);
-    int search_widow_size = GET_PARAM(2);
-    int block_size = GET_PARAM(3);
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const int search_widow_size = GET_PARAM(2);
+    const int block_size = GET_PARAM(3);
 
-    float h = 10;
-    int type = CV_MAKE_TYPE(depth, 3);
+    const float h = 10;
+    const int type = CV_MAKE_TYPE(depth, 3);
 
     cv::Mat src(size, type);
-    fillRandom(src);
+    declare.in(src, WARMUP_RNG);
 
     if (PERF_RUN_GPU())
     {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_dst;
-
         cv::gpu::FastNonLocalMeansDenoising fnlmd;
 
-        fnlmd.labMethod(d_src, d_dst, h, h, search_widow_size, block_size);
-
-        TEST_CYCLE()
-        {
-            fnlmd.labMethod(d_src, d_dst, h, h, search_widow_size, block_size);
-        }
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
 
-        GPU_SANITY_CHECK(d_dst);
+        TEST_CYCLE() fnlmd.labMethod(d_src, dst, h, h, search_widow_size, block_size);
+
+        GPU_SANITY_CHECK(dst);
    }
    else
    {
        cv::Mat dst;
 
-        cv::fastNlMeansDenoisingColored(src, dst, h, h, block_size, search_widow_size);
-
-        TEST_CYCLE()
-        {
-            cv::fastNlMeansDenoisingColored(src, dst, h, h, block_size, search_widow_size);
-        }
+        TEST_CYCLE() cv::fastNlMeansDenoisingColored(src, dst, h, h, block_size, search_widow_size);
 
         CPU_SANITY_CHECK(dst);
    }
 }
\ No newline at end of file
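The same shape repeats in every perf test touched by this merge: the input is warmed up with declare.in(..., WARMUP_RNG) instead of the removed fillRandom() helper, the timed call sits on a single TEST_CYCLE() statement, and each branch ends in GPU_SANITY_CHECK / CPU_SANITY_CHECK, or FAIL_NO_CPU() when no CPU counterpart exists. A minimal sketch of that shape (the Example_Filter test name and cv::gpu::someFilter call are hypothetical stand-ins, not real APIs; the fixture and macros are the ones used throughout this module):

    // Sketch of the refactored perf-test shape used throughout this merge.
    PERF_TEST_P(Sz_Depth_Cn, Example_Filter,
                Combine(GPU_TYPICAL_MAT_SIZES,
                        Values(CV_8U, CV_32F),
                        GPU_CHANNELS_1_3))
    {
        const cv::Size size = GET_PARAM(0);
        const int depth = GET_PARAM(1);
        const int channels = GET_PARAM(2);

        cv::Mat src(size, CV_MAKE_TYPE(depth, channels));
        declare.in(src, WARMUP_RNG);                    // random warm-up input (replaces fillRandom)

        if (PERF_RUN_GPU())
        {
            const cv::gpu::GpuMat d_src(src);
            cv::gpu::GpuMat dst;

            TEST_CYCLE() cv::gpu::someFilter(d_src, dst);   // hypothetical GPU call, timed

            GPU_SANITY_CHECK(dst);                      // downloads dst and regression-checks it
        }
        else
        {
            FAIL_NO_CPU();                              // used where no CPU counterpart exists
        }
    }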
@@ -3,8 +3,6 @@
 using namespace std;
 using namespace testing;
 
-namespace {
-
 DEF_PARAM_TEST_1(Image, string);
 
 struct GreedyLabeling
@@ -100,28 +98,45 @@ struct GreedyLabeling
     dot* stack;
 };
 
-PERF_TEST_P(Image, Labeling_ConnectedComponents, Values<string>("gpu/labeling/aloe-disp.png"))
+PERF_TEST_P(Image, Labeling_ConnectivityMask,
+            Values<string>("gpu/labeling/aloe-disp.png"))
 {
     declare.time(1.0);
 
-    cv::Mat image = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
+    const cv::Mat image = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(image.empty());
 
     if (PERF_RUN_GPU())
     {
+        cv::gpu::GpuMat d_image(image);
         cv::gpu::GpuMat mask;
-        mask.create(image.rows, image.cols, CV_8UC1);
 
-        cv::gpu::GpuMat components;
-        components.create(image.rows, image.cols, CV_32SC1);
+        TEST_CYCLE() cv::gpu::connectivityMask(d_image, mask, cv::Scalar::all(0), cv::Scalar::all(2));
 
-        cv::gpu::connectivityMask(cv::gpu::GpuMat(image), mask, cv::Scalar::all(0), cv::Scalar::all(2));
+        GPU_SANITY_CHECK(mask);
+    }
+    else
+    {
+        FAIL_NO_CPU();
+    }
+}
 
-        ASSERT_NO_THROW(cv::gpu::labelComponents(mask, components));
+PERF_TEST_P(Image, Labeling_ConnectedComponents,
+            Values<string>("gpu/labeling/aloe-disp.png"))
+{
+    declare.time(1.0);
 
-        TEST_CYCLE()
-        {
-            cv::gpu::labelComponents(mask, components);
-        }
+    const cv::Mat image = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(image.empty());
+
+    if (PERF_RUN_GPU())
+    {
+        cv::gpu::GpuMat d_mask;
+        cv::gpu::connectivityMask(cv::gpu::GpuMat(image), d_mask, cv::Scalar::all(0), cv::Scalar::all(2));
+
+        cv::gpu::GpuMat components;
+
+        TEST_CYCLE() cv::gpu::labelComponents(d_mask, components);
 
         GPU_SANITY_CHECK(components);
     }
@@ -129,17 +144,9 @@ PERF_TEST_P(Image, Labeling_ConnectedComponents, Values<string>("gpu/labeling/al
     {
         GreedyLabeling host(image);
 
-        host(host._labels);
-
-        declare.time(1.0);
-
-        TEST_CYCLE()
-        {
-            host(host._labels);
-        }
+        TEST_CYCLE() host(host._labels);
 
-        CPU_SANITY_CHECK(host._labels);
+        cv::Mat components = host._labels;
+        CPU_SANITY_CHECK(components);
     }
 }
-
-} // namespace
 #include "perf_precomp.hpp"
 
-namespace{
-
 static void printOsInfo()
 {
 #if defined _WIN32
@@ -69,6 +67,4 @@ static void printCudaInfo()
 #endif
 }
 
-}
-
-CV_PERF_TEST_MAIN(gpu, printCudaInfo())
+CV_PERF_TEST_MAIN(gpu, printCudaInfo())
\ No newline at end of file
@@ -3,137 +3,112 @@
 using namespace std;
 using namespace testing;
 
-namespace {
-
 //////////////////////////////////////////////////////////////////////
 // SetTo
 
-PERF_TEST_P(Sz_Depth_Cn, MatOp_SetTo, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), GPU_CHANNELS_1_3_4))
+PERF_TEST_P(Sz_Depth_Cn, MatOp_SetTo,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_16U, CV_32F, CV_64F),
+                    GPU_CHANNELS_1_3_4))
 {
-    cv::Size size = GET_PARAM(0);
-    int depth = GET_PARAM(1);
-    int channels = GET_PARAM(2);
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const int channels = GET_PARAM(2);
 
-    int type = CV_MAKE_TYPE(depth, channels);
+    const int type = CV_MAKE_TYPE(depth, channels);
 
-    cv::Scalar val(1, 2, 3, 4);
+    const cv::Scalar val(1, 2, 3, 4);
 
     if (PERF_RUN_GPU())
     {
-        cv::gpu::GpuMat d_src(size, type);
-
-        d_src.setTo(val);
-
-        TEST_CYCLE()
-        {
-            d_src.setTo(val);
-        }
+        cv::gpu::GpuMat dst(size, type);
 
-        GPU_SANITY_CHECK(d_src);
+        TEST_CYCLE() dst.setTo(val);
+
+        GPU_SANITY_CHECK(dst);
     }
     else
    {
-        cv::Mat src(size, type);
-
-        src.setTo(val);
-
-        TEST_CYCLE()
-        {
-            src.setTo(val);
-        }
+        cv::Mat dst(size, type);
 
-        CPU_SANITY_CHECK(src);
+        TEST_CYCLE() dst.setTo(val);
+
+        CPU_SANITY_CHECK(dst);
     }
 }
 
 //////////////////////////////////////////////////////////////////////
 // SetToMasked
 
-PERF_TEST_P(Sz_Depth_Cn, MatOp_SetToMasked, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), GPU_CHANNELS_1_3_4))
+PERF_TEST_P(Sz_Depth_Cn, MatOp_SetToMasked,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_16U, CV_32F, CV_64F),
+                    GPU_CHANNELS_1_3_4))
 {
-    cv::Size size = GET_PARAM(0);
-    int depth = GET_PARAM(1);
-    int channels = GET_PARAM(2);
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const int channels = GET_PARAM(2);
 
-    int type = CV_MAKE_TYPE(depth, channels);
+    const int type = CV_MAKE_TYPE(depth, channels);
 
     cv::Mat src(size, type);
-    fillRandom(src);
-
     cv::Mat mask(size, CV_8UC1);
-    fillRandom(mask, 0, 2);
+    declare.in(src, mask, WARMUP_RNG);
 
-    cv::Scalar val(1, 2, 3, 4);
+    const cv::Scalar val(1, 2, 3, 4);
 
     if (PERF_RUN_GPU())
     {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_mask(mask);
-
-        d_src.setTo(val, d_mask);
-
-        TEST_CYCLE()
-        {
-            d_src.setTo(val, d_mask);
-        }
+        cv::gpu::GpuMat dst(src);
+        const cv::gpu::GpuMat d_mask(mask);
 
-        GPU_SANITY_CHECK(d_src);
+        TEST_CYCLE() dst.setTo(val, d_mask);
+
+        GPU_SANITY_CHECK(dst);
     }
     else
    {
-        src.setTo(val, mask);
-
-        TEST_CYCLE()
-        {
-            src.setTo(val, mask);
-        }
+        cv::Mat dst = src;
 
-        CPU_SANITY_CHECK(src);
+        TEST_CYCLE() dst.setTo(val, mask);
+
+        CPU_SANITY_CHECK(dst);
     }
 }
 
 //////////////////////////////////////////////////////////////////////
 // CopyToMasked
 
-PERF_TEST_P(Sz_Depth_Cn, MatOp_CopyToMasked, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), GPU_CHANNELS_1_3_4))
+PERF_TEST_P(Sz_Depth_Cn, MatOp_CopyToMasked,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_16U, CV_32F, CV_64F),
+                    GPU_CHANNELS_1_3_4))
 {
-    cv::Size size = GET_PARAM(0);
-    int depth = GET_PARAM(1);
-    int channels = GET_PARAM(2);
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const int channels = GET_PARAM(2);
 
-    int type = CV_MAKE_TYPE(depth, channels);
+    const int type = CV_MAKE_TYPE(depth, channels);
 
     cv::Mat src(size, type);
-    fillRandom(src);
-
     cv::Mat mask(size, CV_8UC1);
-    fillRandom(mask, 0, 2);
+    declare.in(src, mask, WARMUP_RNG);
 
     if (PERF_RUN_GPU())
     {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_mask(mask);
-        cv::gpu::GpuMat d_dst;
-
-        d_src.copyTo(d_dst, d_mask);
-
-        TEST_CYCLE()
-        {
-            d_src.copyTo(d_dst, d_mask);
-        }
+        const cv::gpu::GpuMat d_src(src);
+        const cv::gpu::GpuMat d_mask(mask);
+        cv::gpu::GpuMat dst(d_src.size(), d_src.type(), cv::Scalar::all(0));
 
-        GPU_SANITY_CHECK(d_dst);
+        TEST_CYCLE() d_src.copyTo(dst, d_mask);
+
+        GPU_SANITY_CHECK(dst);
     }
     else
    {
-        cv::Mat dst;
-
-        src.copyTo(dst, mask);
-
-        TEST_CYCLE()
-        {
-            src.copyTo(dst, mask);
-        }
+        cv::Mat dst(src.size(), src.type(), cv::Scalar::all(0));
 
+        TEST_CYCLE() src.copyTo(dst, mask);
 
         CPU_SANITY_CHECK(dst);
     }
@@ -144,42 +119,36 @@ PERF_TEST_P(Sz_Depth_Cn, MatOp_CopyToMasked, Combine(GPU_TYPICAL_MAT_SIZES, Valu
 DEF_PARAM_TEST(Sz_2Depth, cv::Size, MatDepth, MatDepth);
 
-PERF_TEST_P(Sz_2Depth, MatOp_ConvertTo, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), Values(CV_8U, CV_16U, CV_32F, CV_64F)))
+PERF_TEST_P(Sz_2Depth, MatOp_ConvertTo,
+            Combine(GPU_TYPICAL_MAT_SIZES,
+                    Values(CV_8U, CV_16U, CV_32F, CV_64F),
+                    Values(CV_8U, CV_16U, CV_32F, CV_64F)))
 {
-    cv::Size size = GET_PARAM(0);
-    int depth1 = GET_PARAM(1);
-    int depth2 = GET_PARAM(2);
+    const cv::Size size = GET_PARAM(0);
+    const int depth1 = GET_PARAM(1);
+    const int depth2 = GET_PARAM(2);
 
     cv::Mat src(size, depth1);
-    fillRandom(src);
+    declare.in(src, WARMUP_RNG);
+
+    const double a = 0.5;
+    const double b = 1.0;
 
     if (PERF_RUN_GPU())
     {
-        cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat d_dst;
-
-        d_src.convertTo(d_dst, depth2, 0.5, 1.0);
-
-        TEST_CYCLE()
-        {
-            d_src.convertTo(d_dst, depth2, 0.5, 1.0);
-        }
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
 
-        GPU_SANITY_CHECK(d_dst);
+        TEST_CYCLE() d_src.convertTo(dst, depth2, a, b);
+
+        GPU_SANITY_CHECK(dst);
     }
     else
    {
         cv::Mat dst;
 
-        src.convertTo(dst, depth2, 0.5, 1.0);
-
-        TEST_CYCLE()
-        {
-            src.convertTo(dst, depth2, 0.5, 1.0);
-        }
+        TEST_CYCLE() src.convertTo(dst, depth2, a, b);
 
         CPU_SANITY_CHECK(dst);
     }
 }
-
-} // namespace
@@ -3,90 +3,47 @@
 using namespace std;
 using namespace testing;
 
-namespace {
-
 ///////////////////////////////////////////////////////////////
 // HOG
 
 DEF_PARAM_TEST_1(Image, string);
 
-PERF_TEST_P(Image, ObjDetect_HOG, Values<string>("gpu/hog/road.png"))
+PERF_TEST_P(Image, ObjDetect_HOG,
+            Values<string>("gpu/hog/road.png",
+                           "gpu/caltech/image_00000009_0.png",
+                           "gpu/caltech/image_00000032_0.png",
+                           "gpu/caltech/image_00000165_0.png",
+                           "gpu/caltech/image_00000261_0.png",
+                           "gpu/caltech/image_00000469_0.png",
+                           "gpu/caltech/image_00000527_0.png",
+                           "gpu/caltech/image_00000574_0.png"))
 {
-    cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
+    const cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
     ASSERT_FALSE(img.empty());
 
-    std::vector<cv::Rect> found_locations;
-
     if (PERF_RUN_GPU())
     {
-        cv::gpu::GpuMat d_img(img);
+        const cv::gpu::GpuMat d_img(img);
+        std::vector<cv::Rect> gpu_found_locations;
 
         cv::gpu::HOGDescriptor d_hog;
         d_hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
 
-        d_hog.detectMultiScale(d_img, found_locations);
-
-        TEST_CYCLE()
-        {
-            d_hog.detectMultiScale(d_img, found_locations);
-        }
-    }
-    else
-    {
-        cv::HOGDescriptor hog;
-        hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
-
-        hog.detectMultiScale(img, found_locations);
-
-        TEST_CYCLE()
-        {
-            hog.detectMultiScale(img, found_locations);
-        }
-    }
-
-    SANITY_CHECK(found_locations);
-}
-
-//===========test for CalTech data =============//
-
-DEF_PARAM_TEST_1(HOG, string);
-
-PERF_TEST_P(HOG, CalTech, Values<string>("gpu/caltech/image_00000009_0.png", "gpu/caltech/image_00000032_0.png",
-    "gpu/caltech/image_00000165_0.png", "gpu/caltech/image_00000261_0.png", "gpu/caltech/image_00000469_0.png",
-    "gpu/caltech/image_00000527_0.png", "gpu/caltech/image_00000574_0.png"))
-{
-    cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    std::vector<cv::Rect> found_locations;
-
-    if (PERF_RUN_GPU())
-    {
-        cv::gpu::GpuMat d_img(img);
-
-        cv::gpu::HOGDescriptor d_hog;
-        d_hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
-
-        d_hog.detectMultiScale(d_img, found_locations);
+        TEST_CYCLE() d_hog.detectMultiScale(d_img, gpu_found_locations);
 
-        TEST_CYCLE()
-        {
-            d_hog.detectMultiScale(d_img, found_locations);
-        }
+        SANITY_CHECK(gpu_found_locations);
     }
     else
     {
+        std::vector<cv::Rect> cpu_found_locations;
+
         cv::HOGDescriptor hog;
         hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
 
-        hog.detectMultiScale(img, found_locations);
-
-        TEST_CYCLE()
-        {
-            hog.detectMultiScale(img, found_locations);
-        }
-    }
+        TEST_CYCLE() hog.detectMultiScale(img, cpu_found_locations);
 
-    SANITY_CHECK(found_locations);
+        SANITY_CHECK(cpu_found_locations);
+    }
 }
 
 ///////////////////////////////////////////////////////////////
@@ -96,9 +53,9 @@ typedef pair<string, string> pair_string;
 DEF_PARAM_TEST_1(ImageAndCascade, pair_string);
 
 PERF_TEST_P(ImageAndCascade, ObjDetect_HaarClassifier,
     Values<pair_string>(make_pair("gpu/haarcascade/group_1_640x480_VGA.pgm", "gpu/perf/haarcascade_frontalface_alt.xml")))
 {
-    cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
+    const cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
     ASSERT_FALSE(img.empty());
 
     if (PERF_RUN_GPU())
@@ -106,33 +63,28 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_HaarClassifier,
         cv::gpu::CascadeClassifier_GPU d_cascade;
         ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
 
-        cv::gpu::GpuMat d_img(img);
-        cv::gpu::GpuMat d_objects_buffer;
-
-        d_cascade.detectMultiScale(d_img, d_objects_buffer);
-
-        TEST_CYCLE()
-        {
-            d_cascade.detectMultiScale(d_img, d_objects_buffer);
-        }
+        const cv::gpu::GpuMat d_img(img);
+        cv::gpu::GpuMat objects_buffer;
+        int detections_num = 0;
 
-        GPU_SANITY_CHECK(d_objects_buffer);
+        TEST_CYCLE() detections_num = d_cascade.detectMultiScale(d_img, objects_buffer);
+
+        std::vector<cv::Rect> gpu_rects(detections_num);
+        cv::Mat gpu_rects_mat(1, detections_num, cv::DataType<cv::Rect>::type, &gpu_rects[0]);
+        objects_buffer.colRange(0, detections_num).download(gpu_rects_mat);
+        cv::groupRectangles(gpu_rects, 3, 0.2);
+        SANITY_CHECK(gpu_rects);
     }
     else
    {
         cv::CascadeClassifier cascade;
         ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/perf/haarcascade_frontalface_alt.xml")));
 
-        std::vector<cv::Rect> rects;
-
-        cascade.detectMultiScale(img, rects);
-
-        TEST_CYCLE()
-        {
-            cascade.detectMultiScale(img, rects);
-        }
+        std::vector<cv::Rect> cpu_rects;
 
-        CPU_SANITY_CHECK(rects);
+        TEST_CYCLE() cascade.detectMultiScale(img, cpu_rects);
+
+        SANITY_CHECK(cpu_rects);
     }
 }
@@ -140,9 +92,9 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_HaarClassifier,
 // LBP cascade
 
 PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier,
     Values<pair_string>(make_pair("gpu/haarcascade/group_1_640x480_VGA.pgm", "gpu/lbpcascade/lbpcascade_frontalface.xml")))
 {
-    cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
+    const cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
     ASSERT_FALSE(img.empty());
 
     if (PERF_RUN_GPU())
@@ -150,34 +102,27 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier,
         cv::gpu::CascadeClassifier_GPU d_cascade;
         ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
 
-        cv::gpu::GpuMat d_img(img);
-        cv::gpu::GpuMat d_gpu_rects;
-
-        d_cascade.detectMultiScale(d_img, d_gpu_rects);
-
-        TEST_CYCLE()
-        {
-            d_cascade.detectMultiScale(d_img, d_gpu_rects);
-        }
+        const cv::gpu::GpuMat d_img(img);
+        cv::gpu::GpuMat objects_buffer;
+        int detections_num = 0;
 
-        GPU_SANITY_CHECK(d_gpu_rects);
+        TEST_CYCLE() detections_num = d_cascade.detectMultiScale(d_img, objects_buffer);
+
+        std::vector<cv::Rect> gpu_rects(detections_num);
+        cv::Mat gpu_rects_mat(1, detections_num, cv::DataType<cv::Rect>::type, &gpu_rects[0]);
+        objects_buffer.colRange(0, detections_num).download(gpu_rects_mat);
+        cv::groupRectangles(gpu_rects, 3, 0.2);
+        SANITY_CHECK(gpu_rects);
     }
     else
    {
         cv::CascadeClassifier cascade;
         ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/lbpcascade/lbpcascade_frontalface.xml")));
 
-        std::vector<cv::Rect> rects;
-
-        cascade.detectMultiScale(img, rects);
-
-        TEST_CYCLE()
-        {
-            cascade.detectMultiScale(img, rects);
-        }
+        std::vector<cv::Rect> cpu_rects;
 
-        CPU_SANITY_CHECK(rects);
+        TEST_CYCLE() cascade.detectMultiScale(img, cpu_rects);
+
+        SANITY_CHECK(cpu_rects);
     }
 }
-
-} // namespace
\ No newline at end of file
@@ -2,13 +2,6 @@
 using namespace std;
 using namespace cv;
-using namespace cv::gpu;
-
-void fillRandom(Mat& m, double a, double b)
-{
-    RNG rng(123456789);
-    rng.fill(m, RNG::UNIFORM, Scalar::all(a), Scalar::all(b));
-}
 
 Mat readImage(const string& fileName, int flags)
 {
@@ -188,4 +181,4 @@ void PrintTo(const CvtColorInfo& info, ostream* os)
     };
 
     *os << str[info.code];
 }
\ No newline at end of file
@@ -2,11 +2,9 @@
 #define __OPENCV_PERF_GPU_UTILITY_HPP__
 
 #include "opencv2/core/core.hpp"
-#include "opencv2/core/gpumat.hpp"
 #include "opencv2/imgproc/imgproc.hpp"
 #include "opencv2/ts/ts_perf.hpp"
 
-void fillRandom(cv::Mat& m, double a = 0.0, double b = 255.0);
 cv::Mat readImage(const std::string& fileName, int flags = cv::IMREAD_COLOR);
 
 using perf::MatType;
@@ -17,12 +15,13 @@ CV_ENUM(BorderMode, cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONS
 CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::INTER_AREA)
 #define ALL_INTERPOLATIONS testing::ValuesIn(Interpolation::all())
 
 CV_ENUM(NormType, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_HAMMING, cv::NORM_MINMAX)
 
-const int Gray = 1, TwoChannel = 2, BGR = 3, BGRA = 4;
+enum { Gray = 1, TwoChannel = 2, BGR = 3, BGRA = 4 };
 CV_ENUM(MatCn, Gray, TwoChannel, BGR, BGRA)
-#define GPU_CHANNELS_1_3_4 testing::Values(Gray, BGR, BGRA)
-#define GPU_CHANNELS_1_3 testing::Values(Gray, BGR)
+#define GPU_CHANNELS_1_3_4 testing::Values(MatCn(Gray), MatCn(BGR), MatCn(BGRA))
+#define GPU_CHANNELS_1_3 testing::Values(MatCn(Gray), MatCn(BGR))
 
 struct CvtColorInfo
 {
@@ -30,7 +29,8 @@ struct CvtColorInfo
     int dcn;
     int code;
 
-    explicit CvtColorInfo(int scn_=0, int dcn_=0, int code_=0) : scn(scn_), dcn(dcn_), code(code_) {}
+    CvtColorInfo() {}
+    explicit CvtColorInfo(int scn_, int dcn_, int code_) : scn(scn_), dcn(dcn_), code(code_) {}
 };
 
 void PrintTo(const CvtColorInfo& info, std::ostream* os);
@@ -46,39 +46,18 @@ DEF_PARAM_TEST(Sz_Depth_Cn, cv::Size, MatDepth, MatCn);
 #define GPU_TYPICAL_MAT_SIZES testing::Values(perf::sz720p, perf::szSXGA, perf::sz1080p)
 
-#define GPU_SANITY_CHECK(dmat, ...) \
-    do{ \
-        cv::Mat d##dmat(dmat); \
-        SANITY_CHECK(d##dmat, ## __VA_ARGS__); \
-    } while(0)
-
-#define CPU_SANITY_CHECK(cmat, ...) \
-    do{ \
-        SANITY_CHECK(cmat, ## __VA_ARGS__); \
-    } while(0)
-
-#define GPU_SANITY_CHECK_KEYPOINTS(alg, dmat, ...) \
-    do{ \
-        cv::Mat d##dmat(dmat); \
-        cv::Mat __pt_x = d##dmat.row(cv::gpu::alg##_GPU::X_ROW); \
-        cv::Mat __pt_y = d##dmat.row(cv::gpu::alg##_GPU::Y_ROW); \
-        cv::Mat __angle = d##dmat.row(cv::gpu::alg##_GPU::ANGLE_ROW); \
-        cv::Mat __octave = d##dmat.row(cv::gpu::alg##_GPU::OCTAVE_ROW); \
-        cv::Mat __size = d##dmat.row(cv::gpu::alg##_GPU::SIZE_ROW); \
-        ::perf::Regression::add(this, std::string(#dmat) + "-pt-x-row", __pt_x, ## __VA_ARGS__); \
-        ::perf::Regression::add(this, std::string(#dmat) + "-pt-y-row", __pt_y, ## __VA_ARGS__); \
-        ::perf::Regression::add(this, std::string(#dmat) + "-angle-row", __angle, ## __VA_ARGS__); \
-        ::perf::Regression::add(this, std::string(#dmat) + "octave-row", __octave, ## __VA_ARGS__); \
-        ::perf::Regression::add(this, std::string(#dmat) + "-pt-size-row", __size, ## __VA_ARGS__); \
-    } while(0)
-
-#define GPU_SANITY_CHECK_RESPONSE(alg, dmat, ...) \
-    do{ \
-        cv::Mat d##dmat(dmat); \
-        cv::Mat __response = d##dmat.row(cv::gpu::alg##_GPU::RESPONSE_ROW); \
-        ::perf::Regression::add(this, std::string(#dmat) + "-response-row", __response, ## __VA_ARGS__); \
-    } while(0)
-
-#define FAIL_NO_CPU() FAIL() << "No such CPU implementation analogy"
+#define FAIL_NO_CPU() FAIL() << "No such CPU implementation analogy"
+
+#define GPU_SANITY_CHECK(mat, ...) \
+    do{ \
+        cv::Mat gpu_##mat(mat); \
+        SANITY_CHECK(gpu_##mat, ## __VA_ARGS__); \
+    } while(0)
+
+#define CPU_SANITY_CHECK(mat, ...) \
+    do{ \
+        cv::Mat cpu_##mat(mat); \
+        SANITY_CHECK(cpu_##mat, ## __VA_ARGS__); \
+    } while(0)
 
 #endif // __OPENCV_PERF_GPU_UTILITY_HPP__
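For reference, the hand-expanded form of the new sanity macros at a call site such as GPU_SANITY_CHECK(dst) (derived from the definitions above; dst is just an example variable name):

    // GPU_SANITY_CHECK(dst) expands roughly to:
    do {
        cv::Mat gpu_dst(dst);      // download the GpuMat result into a host cv::Mat
        SANITY_CHECK(gpu_dst);     // record/compare it against stored perf regression data
    } while(0);

    // CPU_SANITY_CHECK(dst) expands roughly to:
    do {
        cv::Mat cpu_dst(dst);      // cheap header copy of the host result
        SANITY_CHECK(cpu_dst);
    } while(0);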
@@ -2284,15 +2284,18 @@ namespace arithm
     template void bitScalarAnd<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
     template void bitScalarAnd<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarAnd<uint>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+    template void bitScalarAnd<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+    template void bitScalarAnd<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
 
     template void bitScalarOr<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
     template void bitScalarOr<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarOr<uint>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+    template void bitScalarOr<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+    template void bitScalarOr<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
 
     template void bitScalarXor<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
     template void bitScalarXor<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
-    template void bitScalarXor<uint>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+    template void bitScalarXor<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+    template void bitScalarXor<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -2280,11 +2280,11 @@
 {
     typedef void (*bit_scalar_func_t)(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream);
 
-    template <bit_scalar_func_t func> struct BitScalar
+    template <typename T, bit_scalar_func_t func> struct BitScalar
     {
         static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream)
         {
-            func(src, static_cast<unsigned int>(sc.val[0]), dst, stream);
+            func(src, saturate_cast<T>(sc.val[0]), dst, stream);
        }
    };
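The switch from static_cast to saturate_cast is what makes out-of-range scalar values behave here: saturate_cast clamps to the destination type's range instead of performing a plain conversion. A small self-contained illustration (the example values are my own, not from the patch):

    #include <opencv2/core/core.hpp>
    #include <cassert>

    int main()
    {
        assert(cv::saturate_cast<unsigned char>(300.0)  == 255);    // clamped to CV_8U max
        assert(cv::saturate_cast<unsigned char>(-5.0)   == 0);      // clamped to CV_8U min
        assert(cv::saturate_cast<unsigned short>(70000) == 65535);  // clamped to CV_16U max
        return 0;
    }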
@@ -2292,14 +2292,12 @@
     {
         static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream)
         {
-            Scalar_<unsigned int> isc = sc;
-
             unsigned int packedVal = 0;
 
-            packedVal |= (isc.val[0] & 0xffff);
-            packedVal |= (isc.val[1] & 0xffff) << 8;
-            packedVal |= (isc.val[2] & 0xffff) << 16;
-            packedVal |= (isc.val[3] & 0xffff) << 24;
+            packedVal |= (saturate_cast<unsigned char>(sc.val[0]) & 0xffff);
+            packedVal |= (saturate_cast<unsigned char>(sc.val[1]) & 0xffff) << 8;
+            packedVal |= (saturate_cast<unsigned char>(sc.val[2]) & 0xffff) << 16;
+            packedVal |= (saturate_cast<unsigned char>(sc.val[3]) & 0xffff) << 24;
 
             func(src, packedVal, dst, stream);
        }
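As an illustration of the packing above (my own worked example, not part of the patch): for a CV_8UC4 scalar sc = cv::Scalar(1, 2, 3, 4) the chain of ORs produces

    // packedVal == (1) | (2 << 8) | (3 << 16) | (4 << 24) == 0x04030201
    // i.e. the four saturated 8-bit channel values packed little-endian into one
    // 32-bit word, which BitScalar4 then hands to the single-channel 32-bit kernel.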
@@ -2330,7 +2328,7 @@
             oSizeROI.width = src.cols;
             oSizeROI.height = src.rows;
 
-            const npp_t pConstants[] = {static_cast<npp_t>(sc.val[0]), static_cast<npp_t>(sc.val[1]), static_cast<npp_t>(sc.val[2]), static_cast<npp_t>(sc.val[3])};
+            const npp_t pConstants[] = {saturate_cast<npp_t>(sc.val[0]), saturate_cast<npp_t>(sc.val[1]), saturate_cast<npp_t>(sc.val[2]), saturate_cast<npp_t>(sc.val[3])};
 
             nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step), pConstants, dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI) );
@@ -2350,7 +2348,7 @@
             oSizeROI.width = src.cols;
             oSizeROI.height = src.rows;
 
-            nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step), static_cast<npp_t>(sc.val[0]), dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI) );
+            nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step), saturate_cast<npp_t>(sc.val[0]), dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI) );
 
             if (stream == 0)
                 cudaSafeCall( cudaDeviceSynchronize() );
@@ -2365,11 +2363,11 @@ void cv::gpu::bitwise_and(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stre
     typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
     static const func_t funcs[5][4] =
     {
-        {BitScalar< bitScalarAnd<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarAnd<unsigned int> >::call},
+        {BitScalar<unsigned char, bitScalarAnd<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarAnd<unsigned int> >::call},
         {0,0,0,0},
-        {BitScalar< bitScalarAnd<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
+        {BitScalar<unsigned short, bitScalarAnd<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
         {0,0,0,0},
-        {BitScalar< bitScalarAnd<unsigned int> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call}
+        {BitScalar<int, bitScalarAnd<int> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call}
     };
 
     const int depth = src.depth();
@@ -2390,11 +2388,11 @@ void cv::gpu::bitwise_or(const GpuMat& src, const Scalar& sc, GpuMat& dst, Strea
     typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
     static const func_t funcs[5][4] =
     {
-        {BitScalar< bitScalarOr<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOr<unsigned int> >::call},
+        {BitScalar<unsigned char, bitScalarOr<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOr<unsigned int> >::call},
         {0,0,0,0},
-        {BitScalar< bitScalarOr<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
+        {BitScalar<unsigned short, bitScalarOr<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
         {0,0,0,0},
-        {BitScalar< bitScalarOr<unsigned int> >::call , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call}
+        {BitScalar<int, bitScalarOr<int> >::call , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call}
     };
 
     const int depth = src.depth();
@@ -2415,11 +2413,11 @@ void cv::gpu::bitwise_xor(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stre
     typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
     static const func_t funcs[5][4] =
     {
-        {BitScalar< bitScalarXor<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarXor<unsigned int> >::call},
+        {BitScalar<unsigned char, bitScalarXor<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarXor<unsigned int> >::call},
         {0,0,0,0},
-        {BitScalar< bitScalarXor<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
+        {BitScalar<unsigned short, bitScalarXor<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
         {0,0,0,0},
-        {BitScalar< bitScalarXor<unsigned int> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call}
+        {BitScalar<int, bitScalarXor<int> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call}
     };
 
     const int depth = src.depth();
@@ -104,12 +104,12 @@ void cv::gpu::connectivityMask(const GpuMat& image, GpuMat& mask, const cv::Scal
 void cv::gpu::labelComponents(const GpuMat& mask, GpuMat& components, int flags, Stream& s)
 {
-    if (!TargetArchs::builtWith(SHARED_ATOMICS) || !DeviceInfo().supports(SHARED_ATOMICS))
-        CV_Error(CV_StsNotImplemented, "The device doesn't support shared atomics and communicative synchronization!");
-
     CV_Assert(!mask.empty() && mask.type() == CV_8U);
 
-    if (mask.size() != components.size() || components.type() != CV_32SC1)
-        components.create(mask.size(), CV_32SC1);
+    if (!deviceSupports(SHARED_ATOMICS))
+        CV_Error(CV_StsNotImplemented, "The device doesn't support shared atomics and communicative synchronization!");
+
+    components.create(mask.size(), CV_32SC1);
 
     cudaStream_t stream = StreamAccessor::getStream(s);
     device::ccl::labelComponents(mask, components, flags, stream);
@@ -522,6 +522,7 @@ void cv::gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, d
     CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
 
     dst.create(dsize, src.type());
+    dst.setTo(Scalar::all(0));
 
     funcs[src.depth()][src.channels() - 1](src, dst, dsize, angle, xShift, yShift, interpolation, StreamAccessor::getStream(stream));
 }
@@ -382,6 +382,7 @@ void cv::gpu::meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr,
             dstcol[0] = static_cast<uchar>(sumcol[0] / comps.size[parent]);
             dstcol[1] = static_cast<uchar>(sumcol[1] / comps.size[parent]);
             dstcol[2] = static_cast<uchar>(sumcol[2] / comps.size[parent]);
+            dstcol[3] = 255;
         }
     }
 }
@@ -209,6 +209,8 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextI
     ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[0]);
     ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[1]);
     ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[1]);
+    uPyr_[0].setTo(Scalar::all(0));
+    vPyr_[0].setTo(Scalar::all(0));
     uPyr_[1].setTo(Scalar::all(0));
     vPyr_[1].setTo(Scalar::all(0));
@@ -1873,7 +1873,7 @@ PARAM_TEST_CASE(Bitwise_Scalar, cv::gpu::DeviceInfo, cv::Size, MatDepth, Channel
         cv::gpu::setDevice(devInfo.deviceID());
 
         src = randomMat(size, CV_MAKE_TYPE(depth, channels));
-        cv::Scalar_<int> ival = randomScalar(0.0, 255.0);
+        cv::Scalar_<int> ival = randomScalar(0.0, std::numeric_limits<int>::max());
         val = ival;
     }
 };
@@ -252,6 +252,8 @@ PARAM_TEST_CASE(WarpAffineNPP, cv::gpu::DeviceInfo, MatType, Inverse, Interpolat
 GPU_TEST_P(WarpAffineNPP, Accuracy)
 {
     cv::Mat src = readImageType("stereobp/aloe-L.png", type);
+    ASSERT_FALSE(src.empty());
+
     cv::Mat M = createTransfomMatrix(src.size(), CV_PI / 4);
     int flags = interpolation;
     if (inverse)