Commit 0f53f299 authored by Vladislav Vinogradov

removed BEGIN_OPENCV_DEVICE_NAMESPACE macros

parent d9265413
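The whole commit applies one mechanical pattern across the GPU module: the macro-based namespace wrappers are expanded into explicit nested namespaces, and the OPENCV_DEVICE_NAMESPACE_-qualified using-directives are spelled out in full. A minimal sketch of the before/after shape (the macro definitions quoted in the comment are the ones deleted from internal_shared.hpp further down; cartToPolar_gpu stands in for any device function, parameters elided):

// Before: scopes hidden behind preprocessor macros.
//   #define BEGIN_OPENCV_DEVICE_NAMESPACE namespace cv { namespace gpu { namespace device {
//   #define END_OPENCV_DEVICE_NAMESPACE   }}}
//   #define OPENCV_DEVICE_NAMESPACE_      ::cv::gpu::device::
//
//   BEGIN_OPENCV_DEVICE_NAMESPACE
//   namespace mathfunc
//   {
//       void cartToPolar_gpu(/* ... */);
//   }
//   END_OPENCV_DEVICE_NAMESPACE
//
//   using namespace OPENCV_DEVICE_NAMESPACE_ mathfunc;

// After: the same scopes spelled out explicitly.
namespace cv { namespace gpu { namespace device
{
    namespace mathfunc
    {
        void cartToPolar_gpu(/* ... */);
    }
}}}

using namespace ::cv::gpu::device::mathfunc;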
@@ -425,21 +425,20 @@ void cv::gpu::magnitudeSqr(const GpuMat& src, GpuMat& dst, Stream& stream)
////////////////////////////////////////////////////////////////////////
// Polar <-> Cart
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace mathfunc
namespace cv { namespace gpu { namespace device
{
namespace mathfunc
{
void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
}
END_OPENCV_DEVICE_NAMESPACE
}
}}}
namespace
{
inline void cartToPolar_caller(const GpuMat& x, const GpuMat& y, GpuMat* mag, bool magSqr, GpuMat* angle, bool angleInDegrees, cudaStream_t stream)
{
using namespace OPENCV_DEVICE_NAMESPACE_ mathfunc;
using namespace ::cv::gpu::device::mathfunc;
CV_DbgAssert(x.size() == y.size() && x.type() == y.type());
CV_Assert(x.depth() == CV_32F);
@@ -459,7 +458,7 @@ namespace
inline void polarToCart_caller(const GpuMat& mag, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, cudaStream_t stream)
{
using namespace OPENCV_DEVICE_NAMESPACE_ mathfunc;
using namespace ::cv::gpu::device::mathfunc;
CV_DbgAssert((mag.empty() || mag.size() == angle.size()) && mag.type() == angle.type());
CV_Assert(mag.depth() == CV_32F);
......
@@ -55,19 +55,18 @@ void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&,
#else /* !defined (HAVE_CUDA) */
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace bilateral_filter
namespace cv { namespace gpu { namespace device
{
namespace bilateral_filter
{
void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc);
void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);
void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);
}
END_OPENCV_DEVICE_NAMESPACE
}
}}}
using namespace OPENCV_DEVICE_NAMESPACE_ bilateral_filter;
using namespace ::cv::gpu::device::bilateral_filter;
namespace
{
......
@@ -52,19 +52,18 @@ void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const Gpu
#else
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace blend
namespace cv { namespace gpu { namespace device
{
namespace blend
{
template <typename T>
void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream);
void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream);
}
END_OPENCV_DEVICE_NAMESPACE
}
}}}
using namespace OPENCV_DEVICE_NAMESPACE_ blend;
using namespace ::cv::gpu::device::blend;
void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
GpuMat& result, Stream& stream)
......
@@ -82,10 +82,10 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, vector< vec
#else /* !defined (HAVE_CUDA) */
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace bf_match
namespace cv { namespace gpu { namespace device
{
namespace bf_match
{
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance,
int cc, cudaStream_t stream);
@@ -105,10 +105,10 @@ namespace bf_match
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
int cc, cudaStream_t stream);
}
}
namespace bf_knnmatch
{
namespace bf_knnmatch
{
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
int cc, cudaStream_t stream);
@@ -128,10 +128,10 @@ namespace bf_knnmatch
template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
int cc, cudaStream_t stream);
}
}
namespace bf_radius_match
{
namespace bf_radius_match
{
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
int cc, cudaStream_t stream);
@@ -153,9 +153,8 @@ namespace bf_radius_match
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
int cc, cudaStream_t stream);
}
END_OPENCV_DEVICE_NAMESPACE
}
}}}
////////////////////////////////////////////////////////////////////
// Train collection
@@ -199,7 +198,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const
if (query.empty() || train.empty())
return;
using namespace OPENCV_DEVICE_NAMESPACE_ bf_match;
using namespace ::cv::gpu::device::bf_match;
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance,
@@ -341,7 +340,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c
if (query.empty() || trainCollection.empty())
return;
using namespace OPENCV_DEVICE_NAMESPACE_ bf_match;
using namespace ::cv::gpu::device::bf_match;
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
@@ -452,7 +451,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co
if (query.empty() || train.empty())
return;
using namespace OPENCV_DEVICE_NAMESPACE_ bf_knnmatch;
using namespace ::cv::gpu::device::bf_knnmatch;
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
@@ -581,7 +580,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer
if (query.empty() || trainCollection.empty())
return;
using namespace OPENCV_DEVICE_NAMESPACE_ bf_knnmatch;
using namespace ::cv::gpu::device::bf_knnmatch;
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
@@ -762,7 +761,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
if (query.empty() || train.empty())
return;
using namespace OPENCV_DEVICE_NAMESPACE_ bf_radius_match;
using namespace ::cv::gpu::device::bf_radius_match;
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
@@ -893,7 +892,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
if (query.empty() || empty())
return;
using namespace OPENCV_DEVICE_NAMESPACE_ bf_radius_match;
using namespace ::cv::gpu::device::bf_radius_match;
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
......
@@ -56,31 +56,30 @@ void cv::gpu::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&, Mat
#else
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace transform_points
namespace cv { namespace gpu { namespace device
{
namespace transform_points
{
void call(const DevMem2D_<float3> src, const float* rot, const float* transl, DevMem2D_<float3> dst, cudaStream_t stream);
}
}
namespace project_points
{
namespace project_points
{
void call(const DevMem2D_<float3> src, const float* rot, const float* transl, const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);
}
}
namespace solve_pnp_ransac
{
namespace solve_pnp_ransac
{
int maxNumIters();
void computeHypothesisScores(
const int num_hypotheses, const int num_points, const float* rot_matrices,
const float3* transl_vectors, const float3* object, const float2* image,
const float dist_threshold, int* hypothesis_scores);
}
END_OPENCV_DEVICE_NAMESPACE
}
}}}
using namespace OPENCV_DEVICE_NAMESPACE;
using namespace ::cv::gpu::device;
namespace
{
......
This diff is collapsed.
@@ -43,22 +43,22 @@
#include "internal_shared.hpp"
#include "opencv2/gpu/device/limits.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace bilateral_filter {
__constant__ float* ctable_color;
__constant__ float* ctable_space;
__constant__ size_t ctable_space_step;
namespace cv { namespace gpu { namespace device
{
namespace bilateral_filter
{
__constant__ float* ctable_color;
__constant__ float* ctable_space;
__constant__ size_t ctable_space_step;
__constant__ int cndisp;
__constant__ int cradius;
__constant__ int cndisp;
__constant__ int cradius;
__constant__ short cedge_disc;
__constant__ short cmax_disc;
__constant__ short cedge_disc;
__constant__ short cmax_disc;
void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc)
{
void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc)
{
cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(table_color)) );
cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) );
size_t table_space_step = table_space.step / sizeof(float);
@@ -69,11 +69,11 @@ void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int ra
cudaSafeCall( cudaMemcpyToSymbol(cedge_disc, &edge_disc, sizeof(short)) );
cudaSafeCall( cudaMemcpyToSymbol(cmax_disc, &max_disc, sizeof(short)) );
}
}
template <int channels>
struct DistRgbMax
{
template <int channels>
struct DistRgbMax
{
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
{
uchar x = ::abs(a[0] - b[0]);
@@ -81,20 +81,20 @@ struct DistRgbMax
uchar z = ::abs(a[2] - b[2]);
return (::max(::max(x, y), z));
}
};
};
template <>
struct DistRgbMax<1>
{
template <>
struct DistRgbMax<1>
{
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
{
return ::abs(a[0] - b[0]);
}
};
};
template <int channels, typename T>
__global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)
{
template <int channels, typename T>
__global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)
{
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
@@ -173,11 +173,11 @@ __global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar*
*(disp + y * disp_step + x) = dp[id];
}
}
}
}
template <typename T>
void bilateral_filter_caller(DevMem2D_<T> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
{
template <typename T>
void bilateral_filter_caller(DevMem2D_<T> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(disp.cols, threads.x << 1);
@@ -211,18 +211,16 @@ void bilateral_filter_caller(DevMem2D_<T> disp, DevMem2Db img, int channels, int
if (stream != 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
{
void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
{
bilateral_filter_caller(disp, img, channels, iters, stream);
}
}
void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
{
void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
{
bilateral_filter_caller(disp, img, channels, iters, stream);
}
} // namespace bilateral_filter
END_OPENCV_DEVICE_NAMESPACE
}
} // namespace bilateral_filter
}}} // namespace cv { namespace gpu { namespace device
@@ -42,14 +42,14 @@
#include "internal_shared.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace blend {
template <typename T>
__global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2,
const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result)
namespace cv { namespace gpu { namespace device
{
namespace blend
{
template <typename T>
__global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2,
const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -62,11 +62,11 @@ __global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> i
T p2 = img2.ptr(y)[x];
result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);
}
}
}
template <typename T>
void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream)
{
template <typename T>
void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y));
@@ -75,15 +75,15 @@ void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> i
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
}
template void blendLinearCaller<uchar>(int, int, int, PtrStep<uchar>, PtrStep<uchar>, PtrStepf, PtrStepf, PtrStep<uchar>, cudaStream_t stream);
template void blendLinearCaller<float>(int, int, int, PtrStep<float>, PtrStep<float>, PtrStepf, PtrStepf, PtrStep<float>, cudaStream_t stream);
template void blendLinearCaller<uchar>(int, int, int, PtrStep<uchar>, PtrStep<uchar>, PtrStepf, PtrStepf, PtrStep<uchar>, cudaStream_t stream);
template void blendLinearCaller<float>(int, int, int, PtrStep<float>, PtrStep<float>, PtrStepf, PtrStepf, PtrStep<float>, cudaStream_t stream);
__global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2,
__global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2,
const PtrStepf weights1, const PtrStepf weights2, PtrStepb result)
{
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -99,10 +99,10 @@ __global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, c
((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2,
p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2);
}
}
}
void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream)
{
void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
@@ -111,8 +111,6 @@ void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, Ptr
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
} // namespace blend
END_OPENCV_DEVICE_NAMESPACE
}
} // namespace blend
}}} // namespace cv { namespace gpu { namespace device
@@ -44,12 +44,12 @@
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
#define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200
namespace transform_points
namespace cv { namespace gpu { namespace device
{
#define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200
namespace transform_points
{
__constant__ float3 crot0;
__constant__ float3 crot1;
__constant__ float3 crot2;
@@ -74,12 +74,12 @@ namespace transform_points
cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
OPENCV_DEVICE_NAMESPACE_ transform(src, dst, TransformOp(), stream);
::cv::gpu::device::transform(src, dst, TransformOp(), stream);
}
} // namespace transform_points
} // namespace transform_points
namespace project_points
{
namespace project_points
{
__constant__ float3 crot0;
__constant__ float3 crot1;
__constant__ float3 crot2;
@@ -113,12 +113,12 @@ namespace project_points
cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(cproj0, proj, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(cproj1, proj + 3, sizeof(float) * 3));
OPENCV_DEVICE_NAMESPACE_ transform(src, dst, ProjectOp(), stream);
::cv::gpu::device::transform(src, dst, ProjectOp(), stream);
}
} // namespace project_points
} // namespace project_points
namespace solve_pnp_ransac
{
namespace solve_pnp_ransac
{
__constant__ float3 crot_matrices[SOLVE_PNP_RANSAC_MAX_NUM_ITERS * 3];
__constant__ float3 ctransl_vectors[SOLVE_PNP_RANSAC_MAX_NUM_ITERS];
@@ -187,6 +187,5 @@ namespace solve_pnp_ransac
cudaSafeCall( cudaDeviceSynchronize() );
}
} // namespace solve_pnp_ransac
END_OPENCV_DEVICE_NAMESPACE
} // namespace solve_pnp_ransac
}}} // namespace cv { namespace gpu { namespace device
@@ -44,12 +44,12 @@
#include <algorithm>
#include "internal_shared.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace canny {
__global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
namespace cv { namespace gpu { namespace device
{
namespace canny
{
__global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
{
__shared__ int smem[16][18];
const int j = blockIdx.x * blockDim.x + threadIdx.x;
@@ -71,10 +71,10 @@ __global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi d
dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
}
}
}
}
void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
{
void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
@@ -82,26 +82,26 @@ void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int ro
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
}
struct L1
{
struct L1
{
static __device__ __forceinline__ float calc(int x, int y)
{
return ::abs(x) + ::abs(y);
}
};
struct L2
{
};
struct L2
{
static __device__ __forceinline__ float calc(int x, int y)
{
return ::sqrtf(x * x + y * y);
}
};
};
template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,
template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,
PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
{
{
__shared__ int sdx[18][16];
__shared__ int sdy[18][16];
@@ -133,10 +133,10 @@ template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, co
mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);
}
}
}
}
void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
{
void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
@@ -148,19 +148,19 @@ void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi d
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
}
template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
{
template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
{
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
if (i < rows && j < cols)
mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);
}
}
void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
{
void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
......@@ -172,15 +172,15 @@ void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int col
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
}
//////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////
#define CANNY_SHIFT 15
#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
#define CANNY_SHIFT 15
#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
__global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
{
__global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
{
__shared__ float smem[18][18];
const int j = blockIdx.x * 16 + threadIdx.x;
@@ -239,13 +239,13 @@ __global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag
map.ptr(i + 1)[j + 1] = edge_type;
}
}
}
#undef CANNY_SHIFT
#undef TG22
#undef CANNY_SHIFT
#undef TG22
void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
{
void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
@@ -253,14 +253,14 @@ void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows,
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
}
//////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////
__device__ unsigned int counter = 0;
__device__ unsigned int counter = 0;
__global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
{
__global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
{
#if __CUDA_ARCH__ >= 120
__shared__ int smem[18][18];
@@ -335,10 +335,10 @@
}
#endif
}
}
void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)
{
void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
@@ -346,13 +346,13 @@ void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
}
__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
__global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)
{
__global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)
{
#if __CUDA_ARCH__ >= 120
const int stack_size = 512;
......@@ -441,10 +441,10 @@ __global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2,
}
#endif
}
}
void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)
{
void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
@@ -466,19 +466,19 @@ void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int row
std::swap(st1, st2);
}
}
}
__global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols)
{
__global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols)
{
const int j = blockIdx.x * 16 + threadIdx.x;
const int i = blockIdx.y * 16 + threadIdx.y;
if (i < rows && j < cols)
dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1));
}
}
void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols)
{
void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols)
{
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
@@ -486,8 +486,6 @@ void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols)
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
} // namespace canny
END_OPENCV_DEVICE_NAMESPACE
}
} // namespace canny
}}} // namespace cv { namespace gpu { namespace device
This diff is collapsed.
@@ -47,26 +47,26 @@
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
#define MAX_KERNEL_SIZE 16
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 4
#define RESULT_STEPS 8
#define HALO_STEPS 1
namespace column_filter {
namespace cv { namespace gpu { namespace device
{
#define MAX_KERNEL_SIZE 16
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 4
#define RESULT_STEPS 8
#define HALO_STEPS 1
__constant__ float c_kernel[MAX_KERNEL_SIZE];
namespace column_filter
{
__constant__ float c_kernel[MAX_KERNEL_SIZE];
void loadKernel(const float kernel[], int ksize)
{
void loadKernel(const float kernel[], int ksize)
{
cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );
}
}
template <int KERNEL_SIZE, typename T, typename D, typename B>
__global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)
{
template <int KERNEL_SIZE, typename T, typename D, typename B>
__global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
__shared__ T smem[BLOCK_DIM_X][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1];
@@ -111,11 +111,11 @@ __global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, int a
dst.ptr(dstY)[x] = saturate_cast<D>(sum);
}
}
}
}
template <int ksize, typename T, typename D, template<typename> class B>
void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
{
template <int ksize, typename T, typename D, template<typename> class B>
void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
{
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, RESULT_STEPS * BLOCK_DIM_Y));
@@ -126,11 +126,11 @@ void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst,
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
template <typename T, typename D>
void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
{
template <typename T, typename D>
void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
static const caller_t callers[5][17] =
{
@@ -234,16 +234,14 @@ void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const fl
loadKernel(kernel, ksize);
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
}
template void linearColumnFilter_gpu<float , uchar >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
//template void linearColumnFilter_gpu<float , short >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
//template void linearColumnFilter_gpu<float2, short2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float3, short3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float , int >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
} // namespace column_filter
}
END_OPENCV_DEVICE_NAMESPACE
template void linearColumnFilter_gpu<float , uchar >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
//template void linearColumnFilter_gpu<float , short >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
//template void linearColumnFilter_gpu<float2, short2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float3, short3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float , int >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
} // namespace column_filter
}}} // namespace cv { namespace gpu { namespace device
This diff is collapsed.
@@ -45,29 +45,29 @@
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
#define UINT_BITS 32U
//Warps == subhistograms per threadblock
#define WARP_COUNT 6
namespace cv { namespace gpu { namespace device
{
#define UINT_BITS 32U
//Threadblock size
#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * OPENCV_GPU_WARP_SIZE)
#define HISTOGRAM256_BIN_COUNT 256
//Warps == subhistograms per threadblock
#define WARP_COUNT 6
//Shared memory per threadblock
#define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT)
//Threadblock size
#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * OPENCV_GPU_WARP_SIZE)
#define HISTOGRAM256_BIN_COUNT 256
#define PARTIAL_HISTOGRAM256_COUNT 240
//Shared memory per threadblock
#define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT)
#define MERGE_THREADBLOCK_SIZE 256
#define PARTIAL_HISTOGRAM256_COUNT 240
#define USE_SMEM_ATOMICS (__CUDA_ARCH__ >= 120)
#define MERGE_THREADBLOCK_SIZE 256
namespace hist {
#define USE_SMEM_ATOMICS (__CUDA_ARCH__ >= 120)
#if (!USE_SMEM_ATOMICS)
namespace hist
{
#if (!USE_SMEM_ATOMICS)
#define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U )
@@ -82,7 +82,7 @@ namespace hist {
} while (s_WarpHist[data] != count);
}
#else
#else
#define TAG_MASK 0xFFFFFFFFU
@@ -91,20 +91,20 @@ namespace hist {
atomicAdd(s_WarpHist + data, 1);
}
#endif
#endif
__forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)
{
__forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)
{
uint x = pos_x << 2;
if (x + 0 < cols) addByte(s_WarpHist, (data >> 0) & 0xFFU, tag);
if (x + 1 < cols) addByte(s_WarpHist, (data >> 8) & 0xFFU, tag);
if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag);
if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);
}
}
__global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
{
__global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
{
//Per-warp subhistogram storage
__shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;
@@ -138,17 +138,17 @@ __global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistogra
d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;
}
}
}
////////////////////////////////////////////////////////////////////////////////
// Merge histogram256() output
// Run one threadblock per bin; each threadblock adds up the same bin counter
// from every partial histogram. Reads are uncoalesced, but mergeHistogram256
// takes only a fraction of total processing time
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Merge histogram256() output
// Run one threadblock per bin; each threadblock adds up the same bin counter
// from every partial histogram. Reads are uncoalesced, but mergeHistogram256
// takes only a fraction of total processing time
////////////////////////////////////////////////////////////////////////////////
__global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)
{
__global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)
{
uint sum = 0;
#pragma unroll
@@ -167,10 +167,10 @@ __global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histog
if(threadIdx.x == 0)
d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]);
}
}
void histogram256_gpu(DevMem2Db src, int* hist, uint* buf, cudaStream_t stream)
{
void histogram256_gpu(DevMem2Db src, int* hist, uint* buf, cudaStream_t stream)
{
histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(
DevMem2D_<uint>(src),
buf,
@@ -185,12 +185,12 @@ void histogram256_gpu(DevMem2Db src, int* hist, uint* buf, cudaStream_t stream)
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
__constant__ int c_lut[256];
__constant__ int c_lut[256];
__global__ void equalizeHist(const DevMem2Db src, PtrStepb dst)
{
__global__ void equalizeHist(const DevMem2Db src, PtrStepb dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -200,10 +200,10 @@ __global__ void equalizeHist(const DevMem2Db src, PtrStepb dst)
const int lut = c_lut[val];
dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut);
}
}
}
void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream)
{
void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream)
{
dim3 block(16, 16);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
@@ -214,8 +214,6 @@ void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
} // namespace hist
END_OPENCV_DEVICE_NAMESPACE
}
} // namespace hist
}}} // namespace cv { namespace gpu { namespace device
This diff is collapsed.
@@ -50,7 +50,7 @@
#include "safe_call.hpp"
#ifndef CV_PI
#define CV_PI 3.1415926535897932384626433832795f
#define CV_PI 3.1415926535897932384626433832795
#endif
#ifndef CV_PI_F
@@ -61,27 +61,21 @@
#endif
#endif
#define BEGIN_OPENCV_DEVICE_NAMESPACE namespace cv { namespace gpu { namespace device {
#define END_OPENCV_DEVICE_NAMESPACE }}}
#define OPENCV_DEVICE_NAMESPACE ::cv::gpu::device
#define OPENCV_DEVICE_NAMESPACE_ ::cv::gpu::device::
#ifdef __CUDACC__
BEGIN_OPENCV_DEVICE_NAMESPACE
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef signed char schar;
typedef unsigned int uint;
template<class T> static inline void bindTexture(const textureReference* tex, const DevMem2D_<T>& img)
namespace cv { namespace gpu { namespace device
{
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef signed char schar;
typedef unsigned int uint;
template<class T> static inline void bindTexture(const textureReference* tex, const DevMem2D_<T>& img)
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
}
END_OPENCV_DEVICE_NAMESPACE
}
}}}
#endif
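As a side note, a hypothetical usage sketch of the bindTexture helper kept above; tex_src and bindSrc are illustrative names, not OpenCV API, and the snippet assumes a .cu translation unit with internal_shared.hpp included (the helper only exists under __CUDACC__):

// Hypothetical sketch, not part of this commit. A CUDA texture<> object
// derives from textureReference, so its address converts to the
// const textureReference* that bindTexture expects.
texture<unsigned char, 2, cudaReadModeElementType> tex_src;

void bindSrc(const cv::gpu::DevMem2D_<unsigned char>& img)
{
    // bindTexture builds the channel descriptor for the element type and
    // forwards everything to cudaBindTexture2D.
    ::cv::gpu::device::bindTexture(&tex_src, img);
}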
@@ -102,87 +96,6 @@ namespace cv { namespace gpu
static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }
/*template<class T> static inline void uploadConstant(const char* name, const T& value)
{
cudaSafeCall( cudaMemcpyToSymbol(name, &value, sizeof(T)) );
}
template<class T> static inline void uploadConstant(const char* name, const T& value, cudaStream_t stream)
{
cudaSafeCall( cudaMemcpyToSymbolAsync(name, &value, sizeof(T), 0, cudaMemcpyHostToDevice, stream) );
} */
//template<class T> static inline void bindTexture(const char* name, const DevMem2D_<T>& img)
//{
// //!!!! const_cast is disabled!
// //!!!! Please use constructor of 'class texture' instead.
//
// //textureReference* tex;
// //cudaSafeCall( cudaGetTextureReference((const textureReference**)&tex, name) );
// //tex->normalized = normalized;
// //tex->filterMode = filterMode;
// //tex->addressMode[0] = addrMode;
// //tex->addressMode[1] = addrMode;
//
// const textureReference* tex;
// cudaSafeCall( cudaGetTextureReference(&tex, name) );
//
// cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
// cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
//}
//static inline void unbindTexture(const char *name)
//{
// const textureReference* tex;
// cudaSafeCall( cudaGetTextureReference(&tex, name) );
// cudaSafeCall( cudaUnbindTexture(tex) );
//}
//class TextureBinder
//{
//public:
// TextureBinder() : tex_(0) {}
// template <typename T> TextureBinder(const textureReference* tex, const DevMem2D_<T>& img) : tex_(0)
// {
// bind(tex, img);
// }
// template <typename T> TextureBinder(const char* tex_name, const DevMem2D_<T>& img) : tex_(0)
// {
// bind(tex_name, img);
// }
// ~TextureBinder() { unbind(); }
//
// template <typename T> void bind(const textureReference* tex, const DevMem2D_<T>& img)
// {
// unbind();
//
// cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
// cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
//
// tex_ = tex;
// }
// template <typename T> void bind(const char* tex_name, const DevMem2D_<T>& img)
// {
// const textureReference* tex;
// cudaSafeCall( cudaGetTextureReference(&tex, tex_name) );
// bind(tex, img);
// }
//
// void unbind()
// {
// if (tex_)
// {
// cudaUnbindTexture(tex_);
// tex_ = 0;
// }
// }
//
//private:
// const textureReference* tex_;
//};
class NppStreamHandler
{
public:
......
This diff is collapsed.
@@ -42,46 +42,46 @@
#include "internal_shared.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace mathfunc {
//////////////////////////////////////////////////////////////////////////////////////
// Cart <-> Polar
struct Nothing
namespace cv { namespace gpu { namespace device
{
namespace mathfunc
{
//////////////////////////////////////////////////////////////////////////////////////
// Cart <-> Polar
struct Nothing
{
static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float)
{
}
};
struct Magnitude
{
};
struct Magnitude
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
{
dst[y * dst_step + x] = ::sqrtf(x_data * x_data + y_data * y_data);
}
};
struct MagnitudeSqr
{
};
struct MagnitudeSqr
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
{
dst[y * dst_step + x] = x_data * x_data + y_data * y_data;
}
};
struct Atan2
{
};
struct Atan2
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)
{
float angle = ::atan2f(y_data, x_data);
angle += (angle < 0) * 2.0 * CV_PI;
dst[y * dst_step + x] = scale * angle;
}
};
template <typename Mag, typename Angle>
__global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step,
};
template <typename Mag, typename Angle>
__global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step,
float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height)
{
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
@@ -93,26 +93,26 @@ __global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr,
Mag::calc(x, y, x_data, y_data, mag, mag_step, scale);
Angle::calc(x, y, x_data, y_data, angle, angle_step, scale);
}
}
}
struct NonEmptyMag
{
struct NonEmptyMag
{
static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y)
{
return mag[y * mag_step + x];
}
};
struct EmptyMag
{
};
struct EmptyMag
{
static __device__ __forceinline__ float get(const float*, size_t, int, int)
{
return 1.0f;
}
};
template <typename Mag>
__global__ void polarToCart(const float* mag, size_t mag_step, const float* angle, size_t angle_step, float scale,
};
template <typename Mag>
__global__ void polarToCart(const float* mag, size_t mag_step, const float* angle, size_t angle_step, float scale,
float* xptr, size_t x_step, float* yptr, size_t y_step, int width, int height)
{
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
@@ -127,11 +127,11 @@ __global__ void polarToCart(const float* mag, size_t mag_step, const float* angl
xptr[y * x_step + x] = mag_data * cos_a;
yptr[y * y_step + x] = mag_data * sin_a;
}
}
}
template <typename Mag, typename Angle>
void cartToPolar_caller(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)
{
template <typename Mag, typename Angle>
void cartToPolar_caller(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
@@ -147,10 +147,10 @@ void cartToPolar_caller(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)
{
void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)
{
typedef void (*caller_t)(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2][2][2] =
{
@@ -177,11 +177,11 @@ void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMe
};
callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream);
}
}
template <typename Mag>
void polarToCart_caller(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)
{
template <typename Mag>
void polarToCart_caller(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
@@ -196,10 +196,10 @@ void polarToCart_caller(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)
{
void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)
{
typedef void (*caller_t)(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2] =
{
@@ -208,8 +208,6 @@ void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, b
};
callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);
}
} // namespace mathfunc
END_OPENCV_DEVICE_NAMESPACE
}
} // namespace mathfunc
}}} // namespace cv { namespace gpu { namespace device
This diff is collapsed.
@@ -46,12 +46,12 @@
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace imgproc {
template <typename T, typename B> __global__ void pyrDown(const PtrStep<T> src, PtrStep<T> dst, const B b, int dst_cols)
namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
template <typename T, typename B> __global__ void pyrDown(const PtrStep<T> src, PtrStep<T> dst, const B b, int dst_cols)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -122,10 +122,10 @@ template <typename T, typename B> __global__ void pyrDown(const PtrStep<T> src,
if (dst_x < dst_cols)
dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
}
}
}
template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
{
template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
{
const dim3 block(256);
const dim3 grid(divUp(src.cols, block.x), dst.rows);
@@ -136,10 +136,10 @@ template <typename T, template <typename> class B> void pyrDown_caller(const Dev
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)
{
template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type type;
typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
@@ -150,38 +150,36 @@ template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMe
};
callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
}
template void pyrDown_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
} // namespace imgproc
}
END_OPENCV_DEVICE_NAMESPACE
template void pyrDown_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
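A note on the explicit instantiation lists that close this file (and column_filter above): the host-side wrappers are compiled as plain C++ and only see declarations of pyrDown_gpu, so every specialization they call must be instantiated in the CUDA translation unit for the linker to find it. A minimal sketch of that split, using a hypothetical stand-in type rather than OpenCV's real DevMem2Db:

#include <cuda_runtime.h>

struct DevMem2Db { };  // hypothetical stand-in, not OpenCV's type

// pyr_down.cu: the template definition plus explicit instantiations that
// emit linkable symbols for the host side.
template <typename T, int cn>
void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)
{
    // select the kernel for T/cn, configure grid and block, launch ...
}

template void pyrDown_gpu<unsigned char, 1>(const DevMem2Db&, const DevMem2Db&, int, cudaStream_t);

// imgproc.cpp: a declaration is enough here; the linker resolves the call
// against the instantiation emitted in the .cu file.
template <typename T, int cn>
void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);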
This diff is collapsed.