Commit be8e31f1 authored by Vladislav Vinogradov

minor gpu module refactoring: split big .cu files, disabled unnecessary template instantiation

parent d99f4a2b
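The pattern repeated throughout this commit: the function-pointer dispatch tables keep their shape, but slots for rarely used type combinations are zeroed out, with the old template name left behind in a comment. Because a template function is only instantiated when its address is taken, a null slot means that kernel is never compiled for that type, which trims build time and binary size. A minimal standalone sketch of the idea (all names here are illustrative, not from the OpenCV sources):

#include <cstdio>

// Hypothetical type-templated kernel launcher.
template <typename T> void process_gpu(const void* src, void* dst) { /* launch kernel for T */ }

typedef void (*caller_t)(const void* src, void* dst);

// One slot per depth; disabled instantiations become null slots, and since the
// address of process_gpu<short> etc. is never taken, those templates never compile.
static const caller_t callers[6] =
{
    process_gpu<unsigned char>, 0/*process_gpu<signed char>*/,
    process_gpu<unsigned short>, 0/*process_gpu<short>*/,
    0/*process_gpu<int>*/, process_gpu<float>
};

void process(int depth, const void* src, void* dst)
{
    caller_t func = callers[depth];
    if (!func) { std::fprintf(stderr, "unsupported depth\n"); return; }
    func(src, dst);
}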
@@ -756,12 +756,6 @@ namespace cv
//! computes the proximity map for the raster template and the image where the template is searched for
CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method);
//! downsamples image
CV_EXPORTS void downsample(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
//! upsamples image
CV_EXPORTS void upsample(const GpuMat& src, GpuMat &dst, Stream& stream = Stream::Null());
//! smoothes the source image and downsamples it
CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
@@ -3,7 +3,7 @@
PERF_TEST_P(DevInfo_Size_MatType_KernelSize, boxFilter, testing::Combine(testing::ValuesIn(devices()),
testing::Values(GPU_TYPICAL_MAT_SIZES),
testing::Values(CV_8UC1, CV_8UC4),
testing::Values(3, 5, 7)))
testing::Values(3, 5)))
{
DeviceInfo devInfo = std::tr1::get<0>(GetParam());
Size size = std::tr1::get<1>(GetParam());
@@ -37,7 +37,7 @@ PERF_TEST_P(DevInfo_Size_MatType_MorphOp_KernelSize, morphologyFilter, testing::
testing::Values(GPU_TYPICAL_MAT_SIZES),
testing::Values(CV_8UC1, CV_8UC4),
testing::Values((int)MORPH_ERODE, (int)MORPH_DILATE),
testing::Values(3, 5, 7)))
testing::Values(3, 5)))
{
DeviceInfo devInfo = std::tr1::get<0>(GetParam());
Size size = std::tr1::get<1>(GetParam());
@@ -71,7 +71,7 @@ PERF_TEST_P(DevInfo_Size_MatType_MorphOp_KernelSize, morphologyFilter, testing::
PERF_TEST_P(DevInfo_Size_MatType_KernelSize, linearFilter, testing::Combine(testing::ValuesIn(devices()),
testing::Values(GPU_TYPICAL_MAT_SIZES),
testing::Values(CV_8UC1, CV_8UC4),
testing::Values(3, 5, 7)))
testing::Values(3, 5)))
{
DeviceInfo devInfo = std::tr1::get<0>(GetParam());
Size size = std::tr1::get<1>(GetParam());
@@ -103,8 +103,8 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, linearFilter, testing::Combine(test
PERF_TEST_P(DevInfo_Size_MatType_KernelSize_BorderMode, separableLinearFilter, testing::Combine(testing::ValuesIn(devices()),
testing::Values(GPU_TYPICAL_MAT_SIZES),
testing::Values(CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC3, CV_32FC1),
testing::Values(3, 5, 7),
testing::Values(CV_8UC1, CV_8UC4, CV_16SC3, CV_32FC1),
testing::Values(3, 5),
testing::Values((int)BORDER_REFLECT101, (int)BORDER_CONSTANT)))
{
DeviceInfo devInfo = std::tr1::get<0>(GetParam());
@@ -244,8 +244,8 @@ PERF_TEST_P(DevInfo_Size_MatType, threshold, testing::Combine(testing::ValuesIn(
}
PERF_TEST_P(DevInfo_Size_MatType_Interpolation_SizeCoeff, resize, testing::Combine(testing::ValuesIn(devices()),
testing::Values(GPU_TYPICAL_MAT_SIZES),
testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32FC1, CV_32FC3, CV_32FC4),
testing::Values(szSXGA, sz1080p),
testing::Values(CV_8UC1, CV_8UC4, CV_16UC1, CV_32FC1),
testing::Values((int)INTER_NEAREST, (int)INTER_LINEAR, (int)INTER_CUBIC),
testing::Values(0.5, 2.0)))
{
@@ -53,7 +53,8 @@ typedef TestBaseWithParam< std::tr1::tuple<DeviceInfo, int, int> > DevInfo_K_Des
const cv::Size sz1800x1500 = cv::Size(1800, 1500);
const cv::Size sz4700x3000 = cv::Size(4700, 3000);
#define GPU_TYPICAL_MAT_SIZES szXGA, szSXGA, sz720p, sz1080p, sz1800x1500, sz4700x3000
//#define GPU_TYPICAL_MAT_SIZES szXGA, szSXGA, sz720p, sz1080p, sz1800x1500, sz4700x3000
#define GPU_TYPICAL_MAT_SIZES szSXGA, sz1080p, sz4700x3000
//! read image from testdata folder.
Mat readImage(const string& fileName, int flags = CV_LOAD_IMAGE_COLOR);
@@ -179,18 +179,18 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs,
static const match_caller_t match_callers[3][8] =
{
{
matchSingleL1_gpu<unsigned char>, matchSingleL1_gpu<signed char>,
matchSingleL1_gpu<unsigned char>, 0/*matchSingleL1_gpu<signed char>*/,
matchSingleL1_gpu<unsigned short>, matchSingleL1_gpu<short>,
matchSingleL1_gpu<int>, matchSingleL1_gpu<float>, 0, 0
},
{
matchSingleL2_gpu<unsigned char>, matchSingleL2_gpu<signed char>,
matchSingleL2_gpu<unsigned short>, matchSingleL2_gpu<short>,
matchSingleL2_gpu<int>, matchSingleL2_gpu<float>, 0, 0
0/*matchSingleL2_gpu<unsigned char>*/, 0/*matchSingleL2_gpu<signed char>*/,
0/*matchSingleL2_gpu<unsigned short>*/, 0/*matchSingleL2_gpu<short>*/,
0/*matchSingleL2_gpu<int>*/, matchSingleL2_gpu<float>, 0, 0
},
{
matchSingleHamming_gpu<unsigned char>, matchSingleHamming_gpu<signed char>,
matchSingleHamming_gpu<unsigned short>, matchSingleHamming_gpu<short>,
matchSingleHamming_gpu<unsigned char>, 0/*matchSingleHamming_gpu<signed char>*/,
matchSingleHamming_gpu<unsigned short>, 0/*matchSingleHamming_gpu<short>*/,
matchSingleHamming_gpu<int>, 0, 0, 0
}
};
@@ -318,18 +318,18 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes
static const match_caller_t match_callers[3][8] =
{
{
matchCollectionL1_gpu<unsigned char>, matchCollectionL1_gpu<signed char>,
matchCollectionL1_gpu<unsigned char>, 0/*matchCollectionL1_gpu<signed char>*/,
matchCollectionL1_gpu<unsigned short>, matchCollectionL1_gpu<short>,
matchCollectionL1_gpu<int>, matchCollectionL1_gpu<float>, 0, 0
},
{
matchCollectionL2_gpu<unsigned char>, matchCollectionL2_gpu<signed char>,
matchCollectionL2_gpu<unsigned short>, matchCollectionL2_gpu<short>,
matchCollectionL2_gpu<int>, matchCollectionL2_gpu<float>, 0, 0
0/*matchCollectionL2_gpu<unsigned char>*/, 0/*matchCollectionL2_gpu<signed char>*/,
0/*matchCollectionL2_gpu<unsigned short>*/, 0/*matchCollectionL2_gpu<short>*/,
0/*matchCollectionL2_gpu<int>*/, matchCollectionL2_gpu<float>, 0, 0
},
{
matchCollectionHamming_gpu<unsigned char>, matchCollectionHamming_gpu<signed char>,
matchCollectionHamming_gpu<unsigned short>, matchCollectionHamming_gpu<short>,
matchCollectionHamming_gpu<unsigned char>, 0/*matchCollectionHamming_gpu<signed char>*/,
matchCollectionHamming_gpu<unsigned short>, 0/*matchCollectionHamming_gpu<short>*/,
matchCollectionHamming_gpu<int>, 0, 0, 0
}
};
@@ -427,16 +427,16 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
static const match_caller_t match_callers[3][8] =
{
{
knnMatchL1_gpu<unsigned char>, knnMatchL1_gpu<signed char>, knnMatchL1_gpu<unsigned short>,
knnMatchL1_gpu<unsigned char>, 0/*knnMatchL1_gpu<signed char>*/, knnMatchL1_gpu<unsigned short>,
knnMatchL1_gpu<short>, knnMatchL1_gpu<int>, knnMatchL1_gpu<float>, 0, 0
},
{
knnMatchL2_gpu<unsigned char>, knnMatchL2_gpu<signed char>, knnMatchL2_gpu<unsigned short>,
knnMatchL2_gpu<short>, knnMatchL2_gpu<int>, knnMatchL2_gpu<float>, 0, 0
0/*knnMatchL2_gpu<unsigned char>*/, 0/*knnMatchL2_gpu<signed char>*/, 0/*knnMatchL2_gpu<unsigned short>*/,
0/*knnMatchL2_gpu<short>*/, 0/*knnMatchL2_gpu<int>*/, knnMatchL2_gpu<float>, 0, 0
},
{
knnMatchHamming_gpu<unsigned char>, knnMatchHamming_gpu<signed char>, knnMatchHamming_gpu<unsigned short>,
knnMatchHamming_gpu<short>, knnMatchHamming_gpu<int>, 0, 0, 0
knnMatchHamming_gpu<unsigned char>, 0/*knnMatchHamming_gpu<signed char>*/, knnMatchHamming_gpu<unsigned short>,
0/*knnMatchHamming_gpu<short>*/, knnMatchHamming_gpu<int>, 0, 0, 0
}
};
@@ -605,16 +605,16 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
static const radiusMatch_caller_t radiusMatch_callers[3][8] =
{
{
radiusMatchL1_gpu<unsigned char>, radiusMatchL1_gpu<signed char>, radiusMatchL1_gpu<unsigned short>,
radiusMatchL1_gpu<unsigned char>, 0/*radiusMatchL1_gpu<signed char>*/, radiusMatchL1_gpu<unsigned short>,
radiusMatchL1_gpu<short>, radiusMatchL1_gpu<int>, radiusMatchL1_gpu<float>, 0, 0
},
{
radiusMatchL2_gpu<unsigned char>, radiusMatchL2_gpu<signed char>, radiusMatchL2_gpu<unsigned short>,
radiusMatchL2_gpu<short>, radiusMatchL2_gpu<int>, radiusMatchL2_gpu<float>, 0, 0
0/*radiusMatchL2_gpu<unsigned char>*/, 0/*radiusMatchL2_gpu<signed char>*/, 0/*radiusMatchL2_gpu<unsigned short>*/,
0/*radiusMatchL2_gpu<short>*/, 0/*radiusMatchL2_gpu<int>*/, radiusMatchL2_gpu<float>, 0, 0
},
{
radiusMatchHamming_gpu<unsigned char>, radiusMatchHamming_gpu<signed char>, radiusMatchHamming_gpu<unsigned short>,
radiusMatchHamming_gpu<short>, radiusMatchHamming_gpu<int>, 0, 0, 0
radiusMatchHamming_gpu<unsigned char>, 0/*radiusMatchHamming_gpu<signed char>*/, radiusMatchHamming_gpu<unsigned short>,
0/*radiusMatchHamming_gpu<short>*/, radiusMatchHamming_gpu<int>, 0, 0, 0
}
};
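In each of these [3][8] tables the row selects the distance (L1, L2, Hamming) and the column the descriptor depth, so call sites must now tolerate null slots instead of assuming every depth is available. A hedged sketch of the lookup step, mirroring the assert-on-null style the remap change further down uses (variable names are illustrative):

match_caller_t func = match_callers[distType][queryDescs.depth()];
CV_Assert(func != 0); // disabled instantiations surface here as null slots
func(/* query, train, results, stream ... */);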
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "internal_shared.hpp"
#include "opencv2/gpu/device/limits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace bf_krnls
{
__constant__ float* ctable_color;
__constant__ float* ctable_space;
__constant__ size_t ctable_space_step;
__constant__ int cndisp;
__constant__ int cradius;
__constant__ short cedge_disc;
__constant__ short cmax_disc;
}
namespace cv { namespace gpu { namespace bf
{
void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc)
{
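// Note: ctable_color and ctable_space are device pointers declared in
// __constant__ memory, so the two copies below upload the pointer values,
// not the table contents; the tables themselves already live on the device.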
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_color, &table_color, sizeof(table_color)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.data, sizeof(table_space.data)) );
size_t table_space_step = table_space.step / sizeof(float);
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cndisp, &ndisp, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cradius, &radius, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cedge_disc, &edge_disc, sizeof(short)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cmax_disc, &max_disc, sizeof(short)) );
}
}}}
namespace bf_krnls
{
template <int channels>
struct DistRgbMax
{
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
{
uchar x = abs(a[0] - b[0]);
uchar y = abs(a[1] - b[1]);
uchar z = abs(a[2] - b[2]);
return (max(max(x, y), z));
}
};
template <>
struct DistRgbMax<1>
{
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
{
return abs(a[0] - b[0]);
}
};
template <int channels, typename T>
__global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)
{
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
T dp[5];
if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
{
dp[0] = *(disp + (y ) * disp_step + x + 0);
dp[1] = *(disp + (y-1) * disp_step + x + 0);
dp[2] = *(disp + (y ) * disp_step + x - 1);
dp[3] = *(disp + (y+1) * disp_step + x + 0);
dp[4] = *(disp + (y ) * disp_step + x + 1);
if(abs(dp[1] - dp[0]) >= cedge_disc || abs(dp[2] - dp[0]) >= cedge_disc || abs(dp[3] - dp[0]) >= cedge_disc || abs(dp[4] - dp[0]) >= cedge_disc)
{
const int ymin = max(0, y - cradius);
const int xmin = max(0, x - cradius);
const int ymax = min(h - 1, y + cradius);
const int xmax = min(w - 1, x + cradius);
float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
const uchar* ic = img + y * img_step + channels * x;
for(int yi = ymin; yi <= ymax; yi++)
{
const T* disp_y = disp + yi * disp_step;
for(int xi = xmin; xi <= xmax; xi++)
{
const uchar* in = img + yi * img_step + channels * xi;
uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);
const float weight = ctable_color[dist_rgb] * (ctable_space + abs(y-yi)* ctable_space_step)[abs(x-xi)];
const T disp_reg = disp_y[xi];
cost[0] += min(cmax_disc, abs(disp_reg - dp[0])) * weight;
cost[1] += min(cmax_disc, abs(disp_reg - dp[1])) * weight;
cost[2] += min(cmax_disc, abs(disp_reg - dp[2])) * weight;
cost[3] += min(cmax_disc, abs(disp_reg - dp[3])) * weight;
cost[4] += min(cmax_disc, abs(disp_reg - dp[4])) * weight;
}
}
float minimum = numeric_limits<float>::max();
int id = 0;
if (cost[0] < minimum)
{
minimum = cost[0];
id = 0;
}
if (cost[1] < minimum)
{
minimum = cost[1];
id = 1;
}
if (cost[2] < minimum)
{
minimum = cost[2];
id = 2;
}
if (cost[3] < minimum)
{
minimum = cost[3];
id = 3;
}
if (cost[4] < minimum)
{
minimum = cost[4];
id = 4;
}
*(disp + y * disp_step + x) = dp[id];
}
}
}
}
namespace cv { namespace gpu { namespace bf
{
template <typename T>
void bilateral_filter_caller(const DevMem2D_<T>& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(disp.cols, threads.x << 1);
grid.y = divUp(disp.rows, threads.y);
switch (channels)
{
case 1:
for (int i = 0; i < iters; ++i)
{
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
}
break;
case 3:
for (int i = 0; i < iters; ++i)
{
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
}
break;
default:
cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
}
if (stream != 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void bilateral_filter_gpu(const DevMem2D& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
{
bilateral_filter_caller(disp, img, channels, iters, stream);
}
void bilateral_filter_gpu(const DevMem2D_<short>& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
{
bilateral_filter_caller(disp, img, channels, iters, stream);
}
}}}
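For orientation, the t argument implements a red-black (checkerboard) schedule. Each thread owns two adjacent columns and ((y + t) & 1) picks one of them, so x + y ≡ t (mod 2): the t = 0 launch updates the even-parity cells, t = 1 the odd ones, and the two launches per iteration in bilateral_filter_caller together cover every interior pixel exactly once. A small sketch of that indexing (hypothetical helper, not part of the sources):

// i = blockIdx.x * blockDim.x + threadIdx.x, the linear thread index.
__device__ __forceinline__ int updatedCellParity(int i, int y, int t)
{
    const int x = (i << 1) + ((y + t) & 1); // same x as in bilateral_filter<>
    return (x + y) & 1;                     // always equals t & 1
}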
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace imgproc
{
template <typename T, typename B> __global__ void pyrDown(const PtrStep_<T> src, PtrStep_<T> dst, const B b, int dst_cols)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y;
__shared__ value_type smem[256 + 4];
value_type sum;
const int src_y = 2*y;
sum = VecTraits<value_type>::all(0);
sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y - 1, x, src.data, src.step);
sum = sum + 0.375f * b.at(src_y , x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y + 1, x, src.data, src.step);
sum = sum + 0.0625f * b.at(src_y + 2, x, src.data, src.step);
smem[2 + threadIdx.x] = sum;
if (threadIdx.x < 2)
{
const int left_x = x - 2 + threadIdx.x;
sum = VecTraits<value_type>::all(0);
sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y - 1, left_x, src.data, src.step);
sum = sum + 0.375f * b.at(src_y , left_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y + 1, left_x, src.data, src.step);
sum = sum + 0.0625f * b.at(src_y + 2, left_x, src.data, src.step);
smem[threadIdx.x] = sum;
}
if (threadIdx.x > 253)
{
const int right_x = x + threadIdx.x + 2;
sum = VecTraits<value_type>::all(0);
sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y - 1, right_x, src.data, src.step);
sum = sum + 0.375f * b.at(src_y , right_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y + 1, right_x, src.data, src.step);
sum = sum + 0.0625f * b.at(src_y + 2, right_x, src.data, src.step);
smem[4 + threadIdx.x] = sum;
}
__syncthreads();
if (threadIdx.x < 128)
{
const int tid2 = threadIdx.x * 2;
sum = VecTraits<value_type>::all(0);
sum = sum + 0.0625f * smem[2 + tid2 - 2];
sum = sum + 0.25f * smem[2 + tid2 - 1];
sum = sum + 0.375f * smem[2 + tid2 ];
sum = sum + 0.25f * smem[2 + tid2 + 1];
sum = sum + 0.0625f * smem[2 + tid2 + 2];
const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;
if (dst_x < dst_cols)
dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
}
}
template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
{
const dim3 block(256);
const dim3 grid(divUp(src.cols, block.x), dst.rows);
B<T> b(src.rows, src.cols);
pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, int cn> void pyrDown_gpu(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type type;
typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
static const caller_t callers[] =
{
pyrDown_caller<type, BrdReflect101>, pyrDown_caller<type, BrdReplicate>, pyrDown_caller<type, BrdConstant>, pyrDown_caller<type, BrdReflect>, pyrDown_caller<type, BrdWrap>
};
callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
}
template void pyrDown_gpu<uchar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
}}}
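The weights 0.0625, 0.25 and 0.375 used above, applied first along rows and then along columns at even source coordinates, are the normalized 5-tap binomial kernel standard for Gaussian pyramids; the separable 2-D kernel it realizes is

k = \frac{1}{16}\begin{pmatrix}1 & 4 & 6 & 4 & 1\end{pmatrix}, \qquad K = k^{\top}k = \frac{1}{256}\begin{pmatrix}1&4&6&4&1\\4&16&24&16&4\\6&24&36&24&6\\4&16&24&16&4\\1&4&6&4&1\end{pmatrix},

which sums to 1, so pyrDown smooths and subsamples in a single pass without changing mean brightness.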
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace imgproc
{
template <typename T, typename B> __global__ void pyrUp(const PtrStep_<T> src, DevMem2D_<T> dst, const B b)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
__shared__ T smem1[10][10];
__shared__ value_type smem2[20][16];
value_type sum;
if (threadIdx.x < 10 && threadIdx.y < 10)
smem1[threadIdx.y][threadIdx.x] = b.at(blockIdx.y * blockDim.y / 2 + threadIdx.y - 1, blockIdx.x * blockDim.x / 2 + threadIdx.x - 1, src.data, src.step);
__syncthreads();
const int tidx = threadIdx.x;
sum = VecTraits<value_type>::all(0);
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 2) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[1 + threadIdx.y / 2][1 + ((tidx ) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 2) >> 1)];
smem2[2 + threadIdx.y][tidx] = sum;
if (threadIdx.y < 2)
{
sum = VecTraits<value_type>::all(0);
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx - 2) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[0][1 + ((tidx - 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[0][1 + ((tidx ) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[0][1 + ((tidx + 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx + 2) >> 1)];
smem2[threadIdx.y][tidx] = sum;
}
if (threadIdx.y > 13)
{
sum = VecTraits<value_type>::all(0);
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx - 2) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[9][1 + ((tidx - 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[9][1 + ((tidx ) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[9][1 + ((tidx + 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx + 2) >> 1)];
smem2[4 + threadIdx.y][tidx] = sum;
}
__syncthreads();
sum = VecTraits<value_type>::all(0);
sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y - 2][tidx];
sum = sum + (tidx % 2 != 0) * 0.25f * smem2[2 + threadIdx.y - 1][tidx];
sum = sum + (tidx % 2 == 0) * 0.375f * smem2[2 + threadIdx.y ][tidx];
sum = sum + (tidx % 2 != 0) * 0.25f * smem2[2 + threadIdx.y + 1][tidx];
sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y + 2][tidx];
if (x < dst.cols && y < dst.rows)
dst.ptr(y)[x] = saturate_cast<T>(4.0f * sum);
}
template <typename T, template <typename> class B> void pyrUp_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
{
const dim3 block(16, 16);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<T> b(src.rows, src.cols);
pyrUp<T><<<grid, block, 0, stream>>>(src, dst, b);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, int cn> void pyrUp_gpu(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type type;
typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
static const caller_t callers[] =
{
pyrUp_caller<type, BrdReflect101>, pyrUp_caller<type, BrdReplicate>, pyrUp_caller<type, BrdConstant>, pyrUp_caller<type, BrdReflect>, pyrUp_caller<type, BrdWrap>
};
callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
}
template void pyrUp_gpu<uchar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<uchar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<uchar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<uchar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
}}}
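The closing 4.0f factor is exact, not a fudge: pyrUp conceptually inserts zeros at odd source coordinates and convolves with the same k = (1 4 6 4 1)/16 kernel, and in each dimension the taps that land on nonzero samples sum to 1/2 for both output phases,

\frac{1 + 6 + 1}{16} = \frac{4 + 4}{16} = \frac{1}{2}, \qquad \left(\tfrac{1}{2}\right)^{2} = \tfrac{1}{4},

so the separable 2-D kernel delivers only a quarter of its mass and multiplying the filtered sum by 4 keeps the upsampled image at the original brightness.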
@@ -722,7 +722,7 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
CV_Assert(srcType == CV_8UC1 || srcType == CV_8UC4 || srcType == CV_16SC1 || srcType == CV_16SC2
CV_Assert(srcType == CV_8UC1 || srcType == CV_8UC4 /*|| srcType == CV_16SC1*/ /*|| srcType == CV_16SC2*/
|| srcType == CV_16SC3 || srcType == CV_32SC1 || srcType == CV_32FC1);
CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(srcType) == CV_MAT_CN(bufType));
@@ -747,12 +747,12 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
case CV_8UC4:
func = filters::linearRowFilter_gpu<uchar4, float4>;
break;
case CV_16SC1:
/*case CV_16SC1:
func = filters::linearRowFilter_gpu<short, float>;
break;
case CV_16SC2:
break;*/
/*case CV_16SC2:
func = filters::linearRowFilter_gpu<short2, float2>;
break;
break;*/
case CV_16SC3:
func = filters::linearRowFilter_gpu<short3, float3>;
break;
@@ -837,7 +837,7 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
CV_Assert(dstType == CV_8UC1 || dstType == CV_8UC4 || dstType == CV_16SC1 || dstType == CV_16SC2
CV_Assert(dstType == CV_8UC1 || dstType == CV_8UC4 /*|| dstType == CV_16SC1*/ /*|| dstType == CV_16SC2*/
|| dstType == CV_16SC3 || dstType == CV_32SC1 || dstType == CV_32FC1);
CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(dstType) == CV_MAT_CN(bufType));
@@ -862,12 +862,12 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
case CV_8UC4:
func = filters::linearColumnFilter_gpu<float4, uchar4>;
break;
case CV_16SC1:
/*case CV_16SC1:
func = filters::linearColumnFilter_gpu<float, short>;
break;
case CV_16SC2:
break;*/
/*case CV_16SC2:
func = filters::linearColumnFilter_gpu<float2, short2>;
break;
break;*/
case CV_16SC3:
func = filters::linearColumnFilter_gpu<float3, short3>;
break;
@@ -90,8 +90,6 @@ void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int) { throw_nogpu(); }
void cv::gpu::ConvolveBuf::create(Size, Size) { throw_nogpu(); }
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&) { throw_nogpu(); }
void cv::gpu::downsample(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::upsample(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::pyrDown(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
void cv::gpu::pyrUp(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_nogpu(); }
@@ -120,17 +118,20 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
typedef void (*caller_t)(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
static const caller_t callers[6][4] =
{
{remap_gpu<uchar>, remap_gpu<uchar2>, remap_gpu<uchar3>, remap_gpu<uchar4>},
{remap_gpu<schar>, remap_gpu<char2>, remap_gpu<char3>, remap_gpu<char4>},
{remap_gpu<ushort>, remap_gpu<ushort2>, remap_gpu<ushort3>, remap_gpu<ushort4>},
{remap_gpu<short>, remap_gpu<short2>, remap_gpu<short3>, remap_gpu<short4>},
{remap_gpu<int>, remap_gpu<int2>, remap_gpu<int3>, remap_gpu<int4>},
{remap_gpu<float>, remap_gpu<float2>, remap_gpu<float3>, remap_gpu<float4>}
{remap_gpu<uchar>, 0/*remap_gpu<uchar2>*/, remap_gpu<uchar3>, remap_gpu<uchar4>},
{0/*remap_gpu<schar>*/, 0/*remap_gpu<char2>*/, 0/*remap_gpu<char3>*/, 0/*remap_gpu<char4>*/},
{remap_gpu<ushort>, 0/*remap_gpu<ushort2>*/, remap_gpu<ushort3>, remap_gpu<ushort4>},
{remap_gpu<short>, 0/*remap_gpu<short2>*/, remap_gpu<short3>, remap_gpu<short4>},
{0/*remap_gpu<int>*/, 0/*remap_gpu<int2>*/, 0/*remap_gpu<int3>*/, 0/*remap_gpu<int4>*/},
{remap_gpu<float>, 0/*remap_gpu<float2>*/, remap_gpu<float3>, remap_gpu<float4>}
};
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
CV_Assert(xmap.type() == CV_32F && ymap.type() == CV_32F && xmap.size() == ymap.size());
caller_t func = callers[src.depth()][src.channels() - 1];
CV_Assert(func != 0);
CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
@@ -142,7 +143,7 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
Scalar_<float> borderValueFloat;
borderValueFloat = borderValue;
callers[src.depth()][src.channels() - 1](src, xmap, ymap, dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(stream));
func(src, xmap, ymap, dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(stream));
}
////////////////////////////////////////////////////////////////////////
@@ -279,19 +280,6 @@ namespace cv { namespace gpu { namespace imgproc
void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
{
using namespace cv::gpu::imgproc;
typedef void (*caller_t)(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
static const caller_t callers[6][4] =
{
{resize_gpu<uchar>, resize_gpu<uchar2>, resize_gpu<uchar3>, resize_gpu<uchar4>},
{resize_gpu<schar>, resize_gpu<char2>, resize_gpu<char3>, resize_gpu<char4>},
{resize_gpu<ushort>, resize_gpu<ushort2>, resize_gpu<ushort3>, resize_gpu<ushort4>},
{resize_gpu<short>, resize_gpu<short2>, resize_gpu<short3>, resize_gpu<short4>},
{resize_gpu<int>, resize_gpu<int2>, resize_gpu<int3>, resize_gpu<int4>},
{resize_gpu<float>, resize_gpu<float2>, resize_gpu<float3>, resize_gpu<float4>}
};
CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC );
CV_Assert( !(dsize == Size()) || (fx > 0 && fy > 0) );
@@ -352,6 +340,19 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub
}
else
{
using namespace cv::gpu::imgproc;
typedef void (*caller_t)(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
static const caller_t callers[6][4] =
{
{resize_gpu<uchar>, 0/*resize_gpu<uchar2>*/, resize_gpu<uchar3>, resize_gpu<uchar4>},
{0/*resize_gpu<schar>*/, 0/*resize_gpu<char2>*/, 0/*resize_gpu<char3>*/, 0/*resize_gpu<char4>*/},
{resize_gpu<ushort>, 0/*resize_gpu<ushort2>*/, resize_gpu<ushort3>, resize_gpu<ushort4>},
{resize_gpu<short>, 0/*resize_gpu<short2>*/, resize_gpu<short3>, resize_gpu<short4>},
{0/*resize_gpu<int>*/, 0/*resize_gpu<int2>*/, 0/*resize_gpu<int3>*/, 0/*resize_gpu<int4>*/},
{resize_gpu<float>, 0/*resize_gpu<float2>*/, resize_gpu<float3>, resize_gpu<float4>}
};
callers[src.depth()][src.channels() - 1](src, static_cast<float>(fx), static_cast<float>(fy), dst, interpolation, stream);
}
}
@@ -1589,75 +1590,6 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
cufftSafeCall(cufftDestroy(planC2R));
}
////////////////////////////////////////////////////////////////////
// downsample
namespace cv { namespace gpu { namespace imgproc
{
template <typename T, int cn>
void downsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
}}}
void cv::gpu::downsample(const GpuMat& src, GpuMat& dst, Stream& stream)
{
CV_Assert(src.depth() < CV_64F && src.channels() <= 4);
typedef void (*Caller)(const DevMem2D, DevMem2D, cudaStream_t stream);
static const Caller callers[6][4] =
{{imgproc::downsampleCaller<uchar,1>, imgproc::downsampleCaller<uchar,2>,
imgproc::downsampleCaller<uchar,3>, imgproc::downsampleCaller<uchar,4>},
{0,0,0,0}, {0,0,0,0},
{imgproc::downsampleCaller<short,1>, imgproc::downsampleCaller<short,2>,
imgproc::downsampleCaller<short,3>, imgproc::downsampleCaller<short,4>},
{0,0,0,0},
{imgproc::downsampleCaller<float,1>, imgproc::downsampleCaller<float,2>,
imgproc::downsampleCaller<float,3>, imgproc::downsampleCaller<float,4>}};
Caller caller = callers[src.depth()][src.channels()-1];
if (!caller)
CV_Error(CV_StsUnsupportedFormat, "bad number of channels");
dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());
caller(src, dst.reshape(1), StreamAccessor::getStream(stream));
}
//////////////////////////////////////////////////////////////////////////////
// upsample
namespace cv { namespace gpu { namespace imgproc
{
template <typename T, int cn>
void upsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
}}}
void cv::gpu::upsample(const GpuMat& src, GpuMat& dst, Stream& stream)
{
CV_Assert(src.depth() < CV_64F && src.channels() <= 4);
typedef void (*Caller)(const DevMem2D, DevMem2D, cudaStream_t stream);
static const Caller callers[6][5] =
{{imgproc::upsampleCaller<uchar,1>, imgproc::upsampleCaller<uchar,2>,
imgproc::upsampleCaller<uchar,3>, imgproc::upsampleCaller<uchar,4>},
{0,0,0,0}, {0,0,0,0},
{imgproc::upsampleCaller<short,1>, imgproc::upsampleCaller<short,2>,
imgproc::upsampleCaller<short,3>, imgproc::upsampleCaller<short,4>},
{0,0,0,0},
{imgproc::upsampleCaller<float,1>, imgproc::upsampleCaller<float,2>,
imgproc::upsampleCaller<float,3>, imgproc::upsampleCaller<float,4>}};
Caller caller = callers[src.depth()][src.channels()-1];
if (!caller)
CV_Error(CV_StsUnsupportedFormat, "bad number of channels");
dst.create(src.rows*2, src.cols*2, src.type());
caller(src, dst.reshape(1), StreamAccessor::getStream(stream));
}
//////////////////////////////////////////////////////////////////////////////
// pyrDown
@@ -47,9 +47,6 @@ namespace cv { namespace gpu { namespace device
{
namespace detail
{
///////////////////////////////////////////////////////////////////////////////
// Reduction
template <int n> struct WarpReductor
{
template <typename T, typename Op> static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
@@ -504,72 +501,6 @@ namespace cv { namespace gpu { namespace device
}
}
};
///////////////////////////////////////////////////////////////////////////////
// Vector Distance
template <int THREAD_DIM, int N> struct UnrollVecDiffCached
{
template <typename Dist, typename T1, typename T2>
static __device__ void calcCheck(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int ind)
{
if (ind < len)
{
T1 val1 = *vecCached++;
T2 val2;
ForceGlob<T2>::Load(vecGlob, ind, val2);
dist.reduceIter(val1, val2);
UnrollVecDiffCached<THREAD_DIM, N - 1>::calcCheck(vecCached, vecGlob, len, dist, ind + THREAD_DIM);
}
}
template <typename Dist, typename T1, typename T2>
static __device__ void calcWithoutCheck(const T1* vecCached, const T2* vecGlob, Dist& dist)
{
T1 val1 = *vecCached++;
T2 val2;
ForceGlob<T2>::Load(vecGlob, 0, val2);
vecGlob += THREAD_DIM;
dist.reduceIter(val1, val2);
UnrollVecDiffCached<THREAD_DIM, N - 1>::calcWithoutCheck(vecCached, vecGlob, dist);
}
};
template <int THREAD_DIM> struct UnrollVecDiffCached<THREAD_DIM, 0>
{
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calcCheck(const T1*, const T2*, int, Dist&, int)
{
}
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calcWithoutCheck(const T1*, const T2*, Dist&)
{
}
};
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN> struct VecDiffCachedCalculator;
template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, false>
{
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
{
UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcCheck(vecCached, vecGlob, len, dist, tid);
}
};
template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, true>
{
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
{
UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcWithoutCheck(vecCached, vecGlob + tid, dist);
}
};
}
}}}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
#include "../datamov_utils.hpp"
namespace cv { namespace gpu { namespace device
{
namespace detail
{
template <int THREAD_DIM, int N> struct UnrollVecDiffCached
{
template <typename Dist, typename T1, typename T2>
static __device__ void calcCheck(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int ind)
{
if (ind < len)
{
T1 val1 = *vecCached++;
T2 val2;
ForceGlob<T2>::Load(vecGlob, ind, val2);
dist.reduceIter(val1, val2);
UnrollVecDiffCached<THREAD_DIM, N - 1>::calcCheck(vecCached, vecGlob, len, dist, ind + THREAD_DIM);
}
}
template <typename Dist, typename T1, typename T2>
static __device__ void calcWithoutCheck(const T1* vecCached, const T2* vecGlob, Dist& dist)
{
T1 val1 = *vecCached++;
T2 val2;
ForceGlob<T2>::Load(vecGlob, 0, val2);
vecGlob += THREAD_DIM;
dist.reduceIter(val1, val2);
UnrollVecDiffCached<THREAD_DIM, N - 1>::calcWithoutCheck(vecCached, vecGlob, dist);
}
};
template <int THREAD_DIM> struct UnrollVecDiffCached<THREAD_DIM, 0>
{
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calcCheck(const T1*, const T2*, int, Dist&, int)
{
}
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calcWithoutCheck(const T1*, const T2*, Dist&)
{
}
};
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN> struct VecDiffCachedCalculator;
template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, false>
{
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
{
UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcCheck(vecCached, vecGlob, len, dist, tid);
}
};
template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, true>
{
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
{
UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcWithoutCheck(vecCached, vecGlob + tid, dist);
}
};
}
}}}
#endif // __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
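For orientation, the UnrollVecDiffCached recursion above is compile-time unrolling of a strided loop: THREAD_DIM threads walk one descriptor pair cooperatively, with thread tid handling elements tid, tid + THREAD_DIM, tid + 2*THREAD_DIM, and so on, each accumulating a partial distance that is reduced across threads afterwards. A rolled-up equivalent of calcCheck, valid under the same assumption the unrolled version makes (len <= MAX_LEN, so at most MAX_LEN / THREAD_DIM iterations run):

template <int THREAD_DIM, typename Dist, typename T1, typename T2>
__device__ void calcCheckRolled(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
{
    // vecCached holds this thread's strided elements contiguously (hence the
    // plain increment); vecGlob is indexed by the absolute element position.
    for (int ind = tid; ind < len; ind += THREAD_DIM)
    {
        T1 val1 = *vecCached++;
        T2 val2;
        ForceGlob<T2>::Load(vecGlob, ind, val2);
        dist.reduceIter(val1, val2);
    }
}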
@@ -46,7 +46,6 @@
#include "internal_shared.hpp"
#include "saturate_cast.hpp"
#include "datamov_utils.hpp"
#include "functional.hpp"
#include "detail/utility_detail.hpp"
#define OPENCV_GPU_LOG_WARP_SIZE (5)
@@ -43,7 +43,10 @@
#ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_HPP__
#include "internal_shared.hpp"
#include "utility.hpp"
#include "functional.hpp"
#include "detail/vec_distance_detail.hpp"
namespace cv { namespace gpu { namespace device
{