Commit be8e31f1 authored by Vladislav Vinogradov

minor gpu module refactoring: split big .cu files, disabled unnecessary template instantiation

parent d99f4a2b
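The pattern repeated throughout this commit: the function-pointer dispatch tables keep their shape, but slots for rarely used type combinations are zeroed out, with the old template name left behind in a comment. Because a template function is only instantiated when its address is taken, a null slot means that kernel is never compiled for that type, which trims build time and binary size. A minimal standalone sketch of the idea (all names here are illustrative, not from the OpenCV sources):

#include <cstdio>

// Hypothetical type-templated kernel launcher.
template <typename T> void process_gpu(const void* src, void* dst) { /* launch kernel for T */ }

typedef void (*caller_t)(const void* src, void* dst);

// One slot per depth; disabled instantiations become null slots, and since the
// address of process_gpu<short> etc. is never taken, those templates never compile.
static const caller_t callers[6] =
{
    process_gpu<unsigned char>, 0/*process_gpu<signed char>*/,
    process_gpu<unsigned short>, 0/*process_gpu<short>*/,
    0/*process_gpu<int>*/, process_gpu<float>
};

void process(int depth, const void* src, void* dst)
{
    caller_t func = callers[depth];
    if (!func) { std::fprintf(stderr, "unsupported depth\n"); return; }
    func(src, dst);
}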
@@ -756,12 +756,6 @@ namespace cv
//! computes the proximity map for the raster template and the image where the template is searched for
CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method);
//! downsamples image
CV_EXPORTS void downsample(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
//! upsamples image
CV_EXPORTS void upsample(const GpuMat& src, GpuMat &dst, Stream& stream = Stream::Null());
//! smoothes the source image and downsamples it
CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
@@ -3,7 +3,7 @@
PERF_TEST_P(DevInfo_Size_MatType_KernelSize, boxFilter, testing::Combine(testing::ValuesIn(devices()),
testing::Values(GPU_TYPICAL_MAT_SIZES),
testing::Values(CV_8UC1, CV_8UC4),
testing::Values(3, 5, 7)))
testing::Values(3, 5)))
{
DeviceInfo devInfo = std::tr1::get<0>(GetParam());
Size size = std::tr1::get<1>(GetParam());
@@ -37,7 +37,7 @@ PERF_TEST_P(DevInfo_Size_MatType_MorphOp_KernelSize, morphologyFilter, testing::
testing::Values(GPU_TYPICAL_MAT_SIZES),
testing::Values(CV_8UC1, CV_8UC4),
testing::Values((int)MORPH_ERODE, (int)MORPH_DILATE),
testing::Values(3, 5, 7)))
testing::Values(3, 5)))
{
DeviceInfo devInfo = std::tr1::get<0>(GetParam());
Size size = std::tr1::get<1>(GetParam());
@@ -71,7 +71,7 @@ PERF_TEST_P(DevInfo_Size_MatType_MorphOp_KernelSize, morphologyFilter, testing::
PERF_TEST_P(DevInfo_Size_MatType_KernelSize, linearFilter, testing::Combine(testing::ValuesIn(devices()),
testing::Values(GPU_TYPICAL_MAT_SIZES),
testing::Values(CV_8UC1, CV_8UC4),
testing::Values(3, 5, 7)))
testing::Values(3, 5)))
{
DeviceInfo devInfo = std::tr1::get<0>(GetParam());
Size size = std::tr1::get<1>(GetParam());
@@ -103,8 +103,8 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, linearFilter, testing::Combine(test
PERF_TEST_P(DevInfo_Size_MatType_KernelSize_BorderMode, separableLinearFilter, testing::Combine(testing::ValuesIn(devices()),
testing::Values(GPU_TYPICAL_MAT_SIZES),
testing::Values(CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC3, CV_32FC1),
testing::Values(3, 5, 7),
testing::Values(CV_8UC1, CV_8UC4, CV_16SC3, CV_32FC1),
testing::Values(3, 5),
testing::Values((int)BORDER_REFLECT101, (int)BORDER_CONSTANT)))
{
DeviceInfo devInfo = std::tr1::get<0>(GetParam());
@@ -244,8 +244,8 @@ PERF_TEST_P(DevInfo_Size_MatType, threshold, testing::Combine(testing::ValuesIn(
}
PERF_TEST_P(DevInfo_Size_MatType_Interpolation_SizeCoeff, resize, testing::Combine(testing::ValuesIn(devices()),
testing::Values(GPU_TYPICAL_MAT_SIZES),
testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32FC1, CV_32FC3, CV_32FC4),
testing::Values(szSXGA, sz1080p),
testing::Values(CV_8UC1, CV_8UC4, CV_16UC1, CV_32FC1),
testing::Values((int)INTER_NEAREST, (int)INTER_LINEAR, (int)INTER_CUBIC),
testing::Values(0.5, 2.0)))
{
@@ -53,7 +53,8 @@ typedef TestBaseWithParam< std::tr1::tuple<DeviceInfo, int, int> > DevInfo_K_Des
const cv::Size sz1800x1500 = cv::Size(1800, 1500);
const cv::Size sz4700x3000 = cv::Size(4700, 3000);
#define GPU_TYPICAL_MAT_SIZES szXGA, szSXGA, sz720p, sz1080p, sz1800x1500, sz4700x3000
//#define GPU_TYPICAL_MAT_SIZES szXGA, szSXGA, sz720p, sz1080p, sz1800x1500, sz4700x3000
#define GPU_TYPICAL_MAT_SIZES szSXGA, sz1080p, sz4700x3000
//! read image from testdata folder.
Mat readImage(const string& fileName, int flags = CV_LOAD_IMAGE_COLOR);
@@ -179,18 +179,18 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs,
static const match_caller_t match_callers[3][8] =
{
{
matchSingleL1_gpu<unsigned char>, matchSingleL1_gpu<signed char>,
matchSingleL1_gpu<unsigned char>, 0/*matchSingleL1_gpu<signed char>*/,
matchSingleL1_gpu<unsigned short>, matchSingleL1_gpu<short>,
matchSingleL1_gpu<int>, matchSingleL1_gpu<float>, 0, 0
},
{
matchSingleL2_gpu<unsigned char>, matchSingleL2_gpu<signed char>,
matchSingleL2_gpu<unsigned short>, matchSingleL2_gpu<short>,
matchSingleL2_gpu<int>, matchSingleL2_gpu<float>, 0, 0
0/*matchSingleL2_gpu<unsigned char>*/, 0/*matchSingleL2_gpu<signed char>*/,
0/*matchSingleL2_gpu<unsigned short>*/, 0/*matchSingleL2_gpu<short>*/,
0/*matchSingleL2_gpu<int>*/, matchSingleL2_gpu<float>, 0, 0
},
{
matchSingleHamming_gpu<unsigned char>, matchSingleHamming_gpu<signed char>,
matchSingleHamming_gpu<unsigned short>, matchSingleHamming_gpu<short>,
matchSingleHamming_gpu<unsigned char>, 0/*matchSingleHamming_gpu<signed char>*/,
matchSingleHamming_gpu<unsigned short>, 0/*matchSingleHamming_gpu<short>*/,
matchSingleHamming_gpu<int>, 0, 0, 0
}
};
@@ -318,18 +318,18 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes
static const match_caller_t match_callers[3][8] =
{
{
matchCollectionL1_gpu<unsigned char>, matchCollectionL1_gpu<signed char>,
matchCollectionL1_gpu<unsigned char>, 0/*matchCollectionL1_gpu<signed char>*/,
matchCollectionL1_gpu<unsigned short>, matchCollectionL1_gpu<short>,
matchCollectionL1_gpu<int>, matchCollectionL1_gpu<float>, 0, 0
},
{
matchCollectionL2_gpu<unsigned char>, matchCollectionL2_gpu<signed char>,
matchCollectionL2_gpu<unsigned short>, matchCollectionL2_gpu<short>,
matchCollectionL2_gpu<int>, matchCollectionL2_gpu<float>, 0, 0
0/*matchCollectionL2_gpu<unsigned char>*/, 0/*matchCollectionL2_gpu<signed char>*/,
0/*matchCollectionL2_gpu<unsigned short>*/, 0/*matchCollectionL2_gpu<short>*/,
0/*matchCollectionL2_gpu<int>*/, matchCollectionL2_gpu<float>, 0, 0
},
{
matchCollectionHamming_gpu<unsigned char>, matchCollectionHamming_gpu<signed char>,
matchCollectionHamming_gpu<unsigned short>, matchCollectionHamming_gpu<short>,
matchCollectionHamming_gpu<unsigned char>, 0/*matchCollectionHamming_gpu<signed char>*/,
matchCollectionHamming_gpu<unsigned short>, 0/*matchCollectionHamming_gpu<short>*/,
matchCollectionHamming_gpu<int>, 0, 0, 0
}
};
@@ -427,16 +427,16 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
static const match_caller_t match_callers[3][8] =
{
{
knnMatchL1_gpu<unsigned char>, knnMatchL1_gpu<signed char>, knnMatchL1_gpu<unsigned short>,
knnMatchL1_gpu<unsigned char>, 0/*knnMatchL1_gpu<signed char>*/, knnMatchL1_gpu<unsigned short>,
knnMatchL1_gpu<short>, knnMatchL1_gpu<int>, knnMatchL1_gpu<float>, 0, 0
},
{
knnMatchL2_gpu<unsigned char>, knnMatchL2_gpu<signed char>, knnMatchL2_gpu<unsigned short>,
knnMatchL2_gpu<short>, knnMatchL2_gpu<int>, knnMatchL2_gpu<float>, 0, 0
0/*knnMatchL2_gpu<unsigned char>*/, 0/*knnMatchL2_gpu<signed char>*/, 0/*knnMatchL2_gpu<unsigned short>*/,
0/*knnMatchL2_gpu<short>*/, 0/*knnMatchL2_gpu<int>*/, knnMatchL2_gpu<float>, 0, 0
},
{
knnMatchHamming_gpu<unsigned char>, knnMatchHamming_gpu<signed char>, knnMatchHamming_gpu<unsigned short>,
knnMatchHamming_gpu<short>, knnMatchHamming_gpu<int>, 0, 0, 0
knnMatchHamming_gpu<unsigned char>, 0/*knnMatchHamming_gpu<signed char>*/, knnMatchHamming_gpu<unsigned short>,
0/*knnMatchHamming_gpu<short>*/, knnMatchHamming_gpu<int>, 0, 0, 0
}
};
@@ -605,16 +605,16 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
static const radiusMatch_caller_t radiusMatch_callers[3][8] =
{
{
radiusMatchL1_gpu<unsigned char>, radiusMatchL1_gpu<signed char>, radiusMatchL1_gpu<unsigned short>,
radiusMatchL1_gpu<unsigned char>, 0/*radiusMatchL1_gpu<signed char>*/, radiusMatchL1_gpu<unsigned short>,
radiusMatchL1_gpu<short>, radiusMatchL1_gpu<int>, radiusMatchL1_gpu<float>, 0, 0
},
{
radiusMatchL2_gpu<unsigned char>, radiusMatchL2_gpu<signed char>, radiusMatchL2_gpu<unsigned short>,
radiusMatchL2_gpu<short>, radiusMatchL2_gpu<int>, radiusMatchL2_gpu<float>, 0, 0
0/*radiusMatchL2_gpu<unsigned char>*/, 0/*radiusMatchL2_gpu<signed char>*/, 0/*radiusMatchL2_gpu<unsigned short>*/,
0/*radiusMatchL2_gpu<short>*/, 0/*radiusMatchL2_gpu<int>*/, radiusMatchL2_gpu<float>, 0, 0
},
{
radiusMatchHamming_gpu<unsigned char>, radiusMatchHamming_gpu<signed char>, radiusMatchHamming_gpu<unsigned short>,
radiusMatchHamming_gpu<short>, radiusMatchHamming_gpu<int>, 0, 0, 0
radiusMatchHamming_gpu<unsigned char>, 0/*radiusMatchHamming_gpu<signed char>*/, radiusMatchHamming_gpu<unsigned short>,
0/*radiusMatchHamming_gpu<short>*/, radiusMatchHamming_gpu<int>, 0, 0, 0
}
};
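In each of these [3][8] tables the row selects the distance (L1, L2, Hamming) and the column the descriptor depth, so call sites must now tolerate null slots instead of assuming every depth is available. A hedged sketch of the lookup step, mirroring the assert-on-null style the remap change further down uses (variable names are illustrative):

match_caller_t func = match_callers[distType][queryDescs.depth()];
CV_Assert(func != 0); // disabled instantiations surface here as null slots
func(/* query, train, results, stream ... */);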
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "internal_shared.hpp"
#include "opencv2/gpu/device/limits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace bf_krnls
{
__constant__ float* ctable_color;
__constant__ float* ctable_space;
__constant__ size_t ctable_space_step;
__constant__ int cndisp;
__constant__ int cradius;
__constant__ short cedge_disc;
__constant__ short cmax_disc;
}
namespace cv { namespace gpu { namespace bf
{
void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc)
{
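// Note: ctable_color and ctable_space are device pointers declared in
// __constant__ memory, so the two copies below upload the pointer values,
// not the table contents; the tables themselves already live on the device.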
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_color, &table_color, sizeof(table_color)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.data, sizeof(table_space.data)) );
size_t table_space_step = table_space.step / sizeof(float);
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cndisp, &ndisp, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cradius, &radius, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cedge_disc, &edge_disc, sizeof(short)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cmax_disc, &max_disc, sizeof(short)) );
}
}}}
namespace bf_krnls
{
template <int channels>
struct DistRgbMax
{
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
{
uchar x = abs(a[0] - b[0]);
uchar y = abs(a[1] - b[1]);
uchar z = abs(a[2] - b[2]);
return (max(max(x, y), z));
}
};
template <>
struct DistRgbMax<1>
{
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
{
return abs(a[0] - b[0]);
}
};
template <int channels, typename T>
__global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)
{
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
T dp[5];
if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
{
dp[0] = *(disp + (y ) * disp_step + x + 0);
dp[1] = *(disp + (y-1) * disp_step + x + 0);
dp[2] = *(disp + (y ) * disp_step + x - 1);
dp[3] = *(disp + (y+1) * disp_step + x + 0);
dp[4] = *(disp + (y ) * disp_step + x + 1);
if(abs(dp[1] - dp[0]) >= cedge_disc || abs(dp[2] - dp[0]) >= cedge_disc || abs(dp[3] - dp[0]) >= cedge_disc || abs(dp[4] - dp[0]) >= cedge_disc)
{
const int ymin = max(0, y - cradius);
const int xmin = max(0, x - cradius);
const int ymax = min(h - 1, y + cradius);
const int xmax = min(w - 1, x + cradius);
float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
const uchar* ic = img + y * img_step + channels * x;
for(int yi = ymin; yi <= ymax; yi++)
{
const T* disp_y = disp + yi * disp_step;
for(int xi = xmin; xi <= xmax; xi++)
{
const uchar* in = img + yi * img_step + channels * xi;
uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);
const float weight = ctable_color[dist_rgb] * (ctable_space + abs(y-yi)* ctable_space_step)[abs(x-xi)];
const T disp_reg = disp_y[xi];
cost[0] += min(cmax_disc, abs(disp_reg - dp[0])) * weight;
cost[1] += min(cmax_disc, abs(disp_reg - dp[1])) * weight;
cost[2] += min(cmax_disc, abs(disp_reg - dp[2])) * weight;
cost[3] += min(cmax_disc, abs(disp_reg - dp[3])) * weight;
cost[4] += min(cmax_disc, abs(disp_reg - dp[4])) * weight;
}
}
float minimum = numeric_limits<float>::max();
int id = 0;
if (cost[0] < minimum)
{
minimum = cost[0];
id = 0;
}
if (cost[1] < minimum)
{
minimum = cost[1];
id = 1;
}
if (cost[2] < minimum)
{
minimum = cost[2];
id = 2;
}
if (cost[3] < minimum)
{
minimum = cost[3];
id = 3;
}
if (cost[4] < minimum)
{
minimum = cost[4];
id = 4;
}
*(disp + y * disp_step + x) = dp[id];
}
}
}
}
namespace cv { namespace gpu { namespace bf
{
template <typename T>
void bilateral_filter_caller(const DevMem2D_<T>& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(disp.cols, threads.x << 1);
grid.y = divUp(disp.rows, threads.y);
switch (channels)
{
case 1:
for (int i = 0; i < iters; ++i)
{
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
}
break;
case 3:
for (int i = 0; i < iters; ++i)
{
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
}
break;
default:
cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
}
if (stream != 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void bilateral_filter_gpu(const DevMem2D& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
{
bilateral_filter_caller(disp, img, channels, iters, stream);
}
void bilateral_filter_gpu(const DevMem2D_<short>& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
{
bilateral_filter_caller(disp, img, channels, iters, stream);
}
}}}
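For orientation, the t argument implements a red-black (checkerboard) schedule. Each thread owns two adjacent columns and ((y + t) & 1) picks one of them, so x + y ≡ t (mod 2): the t = 0 launch updates the even-parity cells, t = 1 the odd ones, and the two launches per iteration in bilateral_filter_caller together cover every interior pixel exactly once. A small sketch of that indexing (hypothetical helper, not part of the sources):

// i = blockIdx.x * blockDim.x + threadIdx.x, the linear thread index.
__device__ __forceinline__ int updatedCellParity(int i, int y, int t)
{
    const int x = (i << 1) + ((y + t) & 1); // same x as in bilateral_filter<>
    return (x + y) & 1;                     // always equals t & 1
}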
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace imgproc
{
template <typename T, typename B> __global__ void pyrDown(const PtrStep_<T> src, PtrStep_<T> dst, const B b, int dst_cols)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y;
__shared__ value_type smem[256 + 4];
value_type sum;
const int src_y = 2*y;
sum = VecTraits<value_type>::all(0);
sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y - 1, x, src.data, src.step);
sum = sum + 0.375f * b.at(src_y , x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y + 1, x, src.data, src.step);
sum = sum + 0.0625f * b.at(src_y + 2, x, src.data, src.step);
smem[2 + threadIdx.x] = sum;
if (threadIdx.x < 2)
{
const int left_x = x - 2 + threadIdx.x;
sum = VecTraits<value_type>::all(0);
sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y - 1, left_x, src.data, src.step);
sum = sum + 0.375f * b.at(src_y , left_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y + 1, left_x, src.data, src.step);
sum = sum + 0.0625f * b.at(src_y + 2, left_x, src.data, src.step);
smem[threadIdx.x] = sum;
}
if (threadIdx.x > 253)
{
const int right_x = x + threadIdx.x + 2;
sum = VecTraits<value_type>::all(0);
sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y - 1, right_x, src.data, src.step);
sum = sum + 0.375f * b.at(src_y , right_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y + 1, right_x, src.data, src.step);
sum = sum + 0.0625f * b.at(src_y + 2, right_x, src.data, src.step);
smem[4 + threadIdx.x] = sum;
}
__syncthreads();
if (threadIdx.x < 128)
{
const int tid2 = threadIdx.x * 2;
sum = VecTraits<value_type>::all(0);
sum = sum + 0.0625f * smem[2 + tid2 - 2];
sum = sum + 0.25f * smem[2 + tid2 - 1];
sum = sum + 0.375f * smem[2 + tid2 ];
sum = sum + 0.25f * smem[2 + tid2 + 1];
sum = sum + 0.0625f * smem[2 + tid2 + 2];
const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;
if (dst_x < dst_cols)
dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
}
}
template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
{
const dim3 block(256);
const dim3 grid(divUp(src.cols, block.x), dst.rows);
B<T> b(src.rows, src.cols);
pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, int cn> void pyrDown_gpu(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type type;
typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
static const caller_t callers[] =
{
pyrDown_caller<type, BrdReflect101>, pyrDown_caller<type, BrdReplicate>, pyrDown_caller<type, BrdConstant>, pyrDown_caller<type, BrdReflect>, pyrDown_caller<type, BrdWrap>
};
callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
}
template void pyrDown_gpu<uchar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
}}}
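The weights 0.0625, 0.25 and 0.375 used above, applied first along rows and then along columns at even source coordinates, are the normalized 5-tap binomial kernel standard for Gaussian pyramids; the separable 2-D kernel it realizes is

k = \frac{1}{16}\begin{pmatrix}1 & 4 & 6 & 4 & 1\end{pmatrix}, \qquad K = k^{\top}k = \frac{1}{256}\begin{pmatrix}1&4&6&4&1\\4&16&24&16&4\\6&24&36&24&6\\4&16&24&16&4\\1&4&6&4&1\end{pmatrix},

which sums to 1, so pyrDown smooths and subsamples in a single pass without changing mean brightness.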
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace imgproc
{
template <typename T, typename B> __global__ void pyrUp(const PtrStep_<T> src, DevMem2D_<T> dst, const B b)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
__shared__ T smem1[10][10];
__shared__ value_type smem2[20][16];
value_type sum;
if (threadIdx.x < 10 && threadIdx.y < 10)
smem1[threadIdx.y][threadIdx.x] = b.at(blockIdx.y * blockDim.y / 2 + threadIdx.y - 1, blockIdx.x * blockDim.x / 2 + threadIdx.x - 1, src.data, src.step);
__syncthreads();
const int tidx = threadIdx.x;
sum = VecTraits<value_type>::all(0);
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 2) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[1 + threadIdx.y / 2][1 + ((tidx ) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 2) >> 1)];
smem2[2 + threadIdx.y][tidx] = sum;
if (threadIdx.y < 2)
{
sum = VecTraits<value_type>::all(0);
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx - 2) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[0][1 + ((tidx - 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[0][1 + ((tidx ) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[0][1 + ((tidx + 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx + 2) >> 1)];
smem2[threadIdx.y][tidx] = sum;
}
if (threadIdx.y > 13)
{
sum = VecTraits<value_type>::all(0);
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx - 2) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[9][1 + ((tidx - 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[9][1 + ((tidx ) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[9][1 + ((tidx + 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx + 2) >> 1)];
smem2[4 + threadIdx.y][tidx] = sum;
}
__syncthreads();
sum = VecTraits<value_type>::all(0);
sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y - 2][tidx];
sum = sum + (tidx % 2 != 0) * 0.25f * smem2[2 + threadIdx.y - 1][tidx];
sum = sum + (tidx % 2 == 0) * 0.375f * smem2[2 + threadIdx.y ][tidx];
sum = sum + (tidx % 2 != 0) * 0.25f * smem2[2 + threadIdx.y + 1][tidx];
sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y + 2][tidx];
if (x < dst.cols && y < dst.rows)
dst.ptr(y)[x] = saturate_cast<T>(4.0f * sum);
}
template <typename T, template <typename> class B> void pyrUp_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
{
const dim3 block(16, 16);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<T> b(src.rows, src.cols);
pyrUp<T><<<grid, block, 0, stream>>>(src, dst, b);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, int cn> void pyrUp_gpu(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type type;
typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
static const caller_t callers[] =
{
pyrUp_caller<type, BrdReflect101>, pyrUp_caller<type, BrdReplicate>, pyrUp_caller<type, BrdConstant>, pyrUp_caller<type, BrdReflect>, pyrUp_caller<type, BrdWrap>
};
callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
}
template void pyrUp_gpu<uchar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<uchar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<uchar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<uchar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
}}}
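The closing 4.0f factor is exact, not a fudge: pyrUp conceptually inserts zeros at odd source coordinates and convolves with the same k = (1 4 6 4 1)/16 kernel, and in each dimension the taps that land on nonzero samples sum to 1/2 for both output phases,

\frac{1 + 6 + 1}{16} = \frac{4 + 4}{16} = \frac{1}{2}, \qquad \left(\tfrac{1}{2}\right)^{2} = \tfrac{1}{4},

so the separable 2-D kernel delivers only a quarter of its mass and multiplying the filtered sum by 4 keeps the upsampled image at the original brightness.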
@@ -722,7 +722,7 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
CV_Assert(srcType == CV_8UC1 || srcType == CV_8UC4 || srcType == CV_16SC1 || srcType == CV_16SC2
CV_Assert(srcType == CV_8UC1 || srcType == CV_8UC4 /*|| srcType == CV_16SC1*/ /*|| srcType == CV_16SC2*/
|| srcType == CV_16SC3 || srcType == CV_32SC1 || srcType == CV_32FC1);
CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(srcType) == CV_MAT_CN(bufType));
@@ -747,12 +747,12 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
case CV_8UC4:
func = filters::linearRowFilter_gpu<uchar4, float4>;
break;
case CV_16SC1:
/*case CV_16SC1:
func = filters::linearRowFilter_gpu<short, float>;
break;
case CV_16SC2:
break;*/
/*case CV_16SC2:
func = filters::linearRowFilter_gpu<short2, float2>;
break;
break;*/
case CV_16SC3:
func = filters::linearRowFilter_gpu<short3, float3>;
break;
@@ -837,7 +837,7 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
CV_Assert(dstType == CV_8UC1 || dstType == CV_8UC4 || dstType == CV_16SC1 || dstType == CV_16SC2
CV_Assert(dstType == CV_8UC1 || dstType == CV_8UC4 /*|| dstType == CV_16SC1*/ /*|| dstType == CV_16SC2*/
|| dstType == CV_16SC3 || dstType == CV_32SC1 || dstType == CV_32FC1);
CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(dstType) == CV_MAT_CN(bufType));
@@ -862,12 +862,12 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
case CV_8UC4:
func = filters::linearColumnFilter_gpu<float4, uchar4>;
break;
case CV_16SC1:
/*case CV_16SC1:
func = filters::linearColumnFilter_gpu<float, short>;
break;
case CV_16SC2:
break;*/
/*case CV_16SC2:
func = filters::linearColumnFilter_gpu<float2, short2>;
break;
break;*/
case CV_16SC3:
func = filters::linearColumnFilter_gpu<float3, short3>;
break;
@@ -90,8 +90,6 @@ void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int) { throw_nogpu(); }
void cv::gpu::ConvolveBuf::create(Size, Size) { throw_nogpu(); }
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&) { throw_nogpu(); }
void cv::gpu::downsample(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::upsample(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::pyrDown(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
void cv::gpu::pyrUp(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_nogpu(); }
@@ -120,17 +118,20 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
typedef void (*caller_t)(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
static const caller_t callers[6][4] =
{
{remap_gpu<uchar>, remap_gpu<uchar2>, remap_gpu<uchar3>, remap_gpu<uchar4>},
{remap_gpu<schar>, remap_gpu<char2>, remap_gpu<char3>, remap_gpu<char4>},
{remap_gpu<ushort>, remap_gpu<ushort2>, remap_gpu<ushort3>, remap_gpu<ushort4>},
{remap_gpu<short>, remap_gpu<short2>, remap_gpu<short3>, remap_gpu<short4>},
{remap_gpu<int>, remap_gpu<int2>, remap_gpu<int3>, remap_gpu<int4>},
{remap_gpu<float>, remap_gpu<float2>, remap_gpu<float3>, remap_gpu<float4>}
{remap_gpu<uchar>, 0/*remap_gpu<uchar2>*/, remap_gpu<uchar3>, remap_gpu<uchar4>},
{0/*remap_gpu<schar>*/, 0/*remap_gpu<char2>*/, 0/*remap_gpu<char3>*/, 0/*remap_gpu<char4>*/},
{remap_gpu<ushort>, 0/*remap_gpu<ushort2>*/, remap_gpu<ushort3>, remap_gpu<ushort4>},
{remap_gpu<short>, 0/*remap_gpu<short2>*/, remap_gpu<short3>, remap_gpu<short4>},
{0/*remap_gpu<int>*/, 0/*remap_gpu<int2>*/, 0/*remap_gpu<int3>*/, 0/*remap_gpu<int4>*/},
{remap_gpu<float>, 0/*remap_gpu<float2>*/, remap_gpu<float3>, remap_gpu<float4>}
};
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
CV_Assert(xmap.type() == CV_32F && ymap.type() == CV_32F && xmap.size() == ymap.size());
caller_t func = callers[src.depth()][src.channels() - 1];
CV_Assert(func != 0);
CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
@@ -142,7 +143,7 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
Scalar_<float> borderValueFloat;
borderValueFloat = borderValue;
callers[src.depth()][src.channels() - 1](src, xmap, ymap, dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(stream));
func(src, xmap, ymap, dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(stream));
}
////////////////////////////////////////////////////////////////////////
@@ -279,19 +280,6 @@ namespace cv { namespace gpu { namespace imgproc
void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
{
using namespace cv::gpu::imgproc;
typedef void (*caller_t)(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
static const caller_t callers[6][4] =
{
{resize_gpu<uchar>, resize_gpu<uchar2>, resize_gpu<uchar3>, resize_gpu<uchar4>},
{resize_gpu<schar>, resize_gpu<char2>, resize_gpu<char3>, resize_gpu<char4>},
{resize_gpu<ushort>, resize_gpu<ushort2>, resize_gpu<ushort3>, resize_gpu<ushort4>},
{resize_gpu<short>, resize_gpu<short2>, resize_gpu<short3>, resize_gpu<short4>},
{resize_gpu<int>, resize_gpu<int2>, resize_gpu<int3>, resize_gpu<int4>},
{resize_gpu<float>, resize_gpu<float2>, resize_gpu<float3>, resize_gpu<float4>}
};
CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC );
CV_Assert( !(dsize == Size()) || (fx > 0 && fy > 0) );
@@ -352,6 +340,19 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub
}
else
{
using namespace cv::gpu::imgproc;
typedef void (*caller_t)(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
static const caller_t callers[6][4] =
{
{resize_gpu<uchar>, 0/*resize_gpu<uchar2>*/, resize_gpu<uchar3>, resize_gpu<uchar4>},
{0/*resize_gpu<schar>*/, 0/*resize_gpu<char2>*/, 0/*resize_gpu<char3>*/, 0/*resize_gpu<char4>*/},
{resize_gpu<ushort>, 0/*resize_gpu<ushort2>*/, resize_gpu<ushort3>, resize_gpu<ushort4>},
{resize_gpu<short>, 0/*resize_gpu<short2>*/, resize_gpu<short3>, resize_gpu<short4>},
{0/*resize_gpu<int>*/, 0/*resize_gpu<int2>*/, 0/*resize_gpu<int3>*/, 0/*resize_gpu<int4>*/},
{resize_gpu<float>, 0/*resize_gpu<float2>*/, resize_gpu<float3>, resize_gpu<float4>}
};
callers[src.depth()][src.channels() - 1](src, static_cast<float>(fx), static_cast<float>(fy), dst, interpolation, stream);
}
}
@@ -1589,75 +1590,6 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
cufftSafeCall(cufftDestroy(planC2R));
}
////////////////////////////////////////////////////////////////////
// downsample
namespace cv { namespace gpu { namespace imgproc
{
template <typename T, int cn>
void downsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
}}}
void cv::gpu::downsample(const GpuMat& src, GpuMat& dst, Stream& stream)
{
CV_Assert(src.depth() < CV_64F && src.channels() <= 4);
typedef void (*Caller)(const DevMem2D, DevMem2D, cudaStream_t stream);
static const Caller callers[6][4] =
{{imgproc::downsampleCaller<uchar,1>, imgproc::downsampleCaller<uchar,2>,
imgproc::downsampleCaller<uchar,3>, imgproc::downsampleCaller<uchar,4>},
{0,0,0,0}, {0,0,0,0},
{imgproc::downsampleCaller<short,1>, imgproc::downsampleCaller<short,2>,
imgproc::downsampleCaller<short,3>, imgproc::downsampleCaller<short,4>},
{0,0,0,0},
{imgproc::downsampleCaller<float,1>, imgproc::downsampleCaller<float,2>,
imgproc::downsampleCaller<float,3>, imgproc::downsampleCaller<float,4>}};
Caller caller = callers[src.depth()][src.channels()-1];
if (!caller)
CV_Error(CV_StsUnsupportedFormat, "bad number of channels");
dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());
caller(src, dst.reshape(1), StreamAccessor::getStream(stream));
}
//////////////////////////////////////////////////////////////////////////////
// upsample
namespace cv { namespace gpu { namespace imgproc
{
template <typename T, int cn>
void upsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
}}}
void cv::gpu::upsample(const GpuMat& src, GpuMat& dst, Stream& stream)
{
CV_Assert(src.depth() < CV_64F && src.channels() <= 4);
typedef void (*Caller)(const DevMem2D, DevMem2D, cudaStream_t stream);
static const Caller callers[6][5] =
{{imgproc::upsampleCaller<uchar,1>, imgproc::upsampleCaller<uchar,2>,
imgproc::upsampleCaller<uchar,3>, imgproc::upsampleCaller<uchar,4>},
{0,0,0,0}, {0,0,0,0},
{imgproc::upsampleCaller<short,1>, imgproc::upsampleCaller<short,2>,
imgproc::upsampleCaller<short,3>, imgproc::upsampleCaller<short,4>},
{0,0,0,0},
{imgproc::upsampleCaller<float,1>, imgproc::upsampleCaller<float,2>,
imgproc::upsampleCaller<float,3>, imgproc::upsampleCaller<float,4>}};
Caller caller = callers[src.depth()][src.channels()-1];
if (!caller)
CV_Error(CV_StsUnsupportedFormat, "bad number of channels");
dst.create(src.rows*2, src.cols*2, src.type());
caller(src, dst.reshape(1), StreamAccessor::getStream(stream));
}
//////////////////////////////////////////////////////////////////////////////
// pyrDown
@@ -47,9 +47,6 @@ namespace cv { namespace gpu { namespace device
{
namespace detail
{
///////////////////////////////////////////////////////////////////////////////
// Reduction
template <int n> struct WarpReductor
{
template <typename T, typename Op> static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
@@ -504,72 +501,6 @@ namespace cv { namespace gpu { namespace device
}
}
};
///////////////////////////////////////////////////////////////////////////////
// Vector Distance
template <int THREAD_DIM, int N> struct UnrollVecDiffCached
{
template <typename Dist, typename T1, typename T2>
static __device__ void calcCheck(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int ind)
{
if (ind < len)
{
T1 val1 = *vecCached++;
T2 val2;
ForceGlob<T2>::Load(vecGlob, ind, val2);
dist.reduceIter(val1, val2);
UnrollVecDiffCached<THREAD_DIM, N - 1>::calcCheck(vecCached, vecGlob, len, dist, ind + THREAD_DIM);
}
}
template <typename Dist, typename T1, typename T2>
static __device__ void calcWithoutCheck(const T1* vecCached, const T2* vecGlob, Dist& dist)
{
T1 val1 = *vecCached++;
T2 val2;
ForceGlob<T2>::Load(vecGlob, 0, val2);
vecGlob += THREAD_DIM;
dist.reduceIter(val1, val2);
UnrollVecDiffCached<THREAD_DIM, N - 1>::calcWithoutCheck(vecCached, vecGlob, dist);
}
};
template <int THREAD_DIM> struct UnrollVecDiffCached<THREAD_DIM, 0>
{
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calcCheck(const T1*, const T2*, int, Dist&, int)
{
}
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calcWithoutCheck(const T1*, const T2*, Dist&)
{
}
};
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN> struct VecDiffCachedCalculator;
template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, false>
{
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
{
UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcCheck(vecCached, vecGlob, len, dist, tid);
}
};
template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, true>
{
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
{
UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcWithoutCheck(vecCached, vecGlob + tid, dist);
}
};
}
}}}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
#include "../datamov_utils.hpp"
namespace cv { namespace gpu { namespace device
{
namespace detail
{
template <int THREAD_DIM, int N> struct UnrollVecDiffCached
{
template <typename Dist, typename T1, typename T2>
static __device__ void calcCheck(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int ind)
{
if (ind < len)
{
T1 val1 = *vecCached++;
T2 val2;
ForceGlob<T2>::Load(vecGlob, ind, val2);
dist.reduceIter(val1, val2);
UnrollVecDiffCached<THREAD_DIM, N - 1>::calcCheck(vecCached, vecGlob, len, dist, ind + THREAD_DIM);
}
}
template <typename Dist, typename T1, typename T2>
static __device__ void calcWithoutCheck(const T1* vecCached, const T2* vecGlob, Dist& dist)
{
T1 val1 = *vecCached++;
T2 val2;
ForceGlob<T2>::Load(vecGlob, 0, val2);
vecGlob += THREAD_DIM;
dist.reduceIter(val1, val2);
UnrollVecDiffCached<THREAD_DIM, N - 1>::calcWithoutCheck(vecCached, vecGlob, dist);
}
};
template <int THREAD_DIM> struct UnrollVecDiffCached<THREAD_DIM, 0>
{
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calcCheck(const T1*, const T2*, int, Dist&, int)
{
}
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calcWithoutCheck(const T1*, const T2*, Dist&)
{
}
};
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN> struct VecDiffCachedCalculator;
template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, false>
{
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
{
UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcCheck(vecCached, vecGlob, len, dist, tid);
}
};
template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, true>
{
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
{
UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcWithoutCheck(vecCached, vecGlob + tid, dist);
}
};
}
}}}
#endif // __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
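For orientation, the UnrollVecDiffCached recursion above is compile-time unrolling of a strided loop: THREAD_DIM threads walk one descriptor pair cooperatively, with thread tid handling elements tid, tid + THREAD_DIM, tid + 2*THREAD_DIM, and so on, each accumulating a partial distance that is reduced across threads afterwards. A rolled-up equivalent of calcCheck, valid under the same assumption the unrolled version makes (len <= MAX_LEN, so at most MAX_LEN / THREAD_DIM iterations run):

template <int THREAD_DIM, typename Dist, typename T1, typename T2>
__device__ void calcCheckRolled(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
{
    // vecCached holds this thread's strided elements contiguously (hence the
    // plain increment); vecGlob is indexed by the absolute element position.
    for (int ind = tid; ind < len; ind += THREAD_DIM)
    {
        T1 val1 = *vecCached++;
        T2 val2;
        ForceGlob<T2>::Load(vecGlob, ind, val2);
        dist.reduceIter(val1, val2);
    }
}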
@@ -46,7 +46,6 @@
#include "internal_shared.hpp"
#include "saturate_cast.hpp"
#include "datamov_utils.hpp"
#include "functional.hpp"
#include "detail/utility_detail.hpp"
#define OPENCV_GPU_LOG_WARP_SIZE (5)
@@ -43,7 +43,10 @@
#ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_HPP__
#include "internal_shared.hpp"
#include "utility.hpp"
#include "functional.hpp"
#include "detail/vec_distance_detail.hpp"
namespace cv { namespace gpu { namespace device
{