Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
be8e31f1
Commit
be8e31f1
authored
Sep 14, 2011
by
Vladislav Vinogradov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
minor gpu module refactoring: split big .cu files, disabled unnecessary template instantiation
parent
d99f4a2b
Hide whitespace changes
Inline
Side-by-side
Showing
22 changed files
with
2226 additions
and
1959 deletions
+2226
-1959
gpu.hpp
modules/gpu/include/opencv2/gpu/gpu.hpp
+0
-6
perf_filters.cpp
modules/gpu/perf/perf_filters.cpp
+5
-5
perf_imgproc.cpp
modules/gpu/perf/perf_imgproc.cpp
+2
-2
perf_utility.hpp
modules/gpu/perf/perf_utility.hpp
+2
-1
brute_force_matcher.cpp
modules/gpu/src/brute_force_matcher.cpp
+22
-22
bf_knnmatch.cu
modules/gpu/src/cuda/bf_knnmatch.cu
+464
-0
bf_match.cu
modules/gpu/src/cuda/bf_match.cu
+16
-593
bf_radius_match.cu
modules/gpu/src/cuda/bf_radius_match.cu
+202
-0
bilateral_filter.cu
modules/gpu/src/cuda/bilateral_filter.cu
+233
-0
column_filter.cu
modules/gpu/src/cuda/column_filter.cu
+240
-0
imgproc.cu
modules/gpu/src/cuda/imgproc.cu
+2
-775
pyr_down.cu
modules/gpu/src/cuda/pyr_down.cu
+185
-0
pyr_up.cu
modules/gpu/src/cuda/pyr_up.cu
+180
-0
remap.cu
modules/gpu/src/cuda/remap.cu
+249
-0
resize.cu
modules/gpu/src/cuda/resize.cu
+264
-0
row_filter.cu
modules/gpu/src/cuda/row_filter.cu
+7
-384
filtering.cpp
modules/gpu/src/filtering.cpp
+10
-10
imgproc.cpp
modules/gpu/src/imgproc.cpp
+23
-91
utility_detail.hpp
modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp
+0
-69
vec_distance_detail.hpp
...gpu/src/opencv2/gpu/device/detail/vec_distance_detail.hpp
+117
-0
utility.hpp
modules/gpu/src/opencv2/gpu/device/utility.hpp
+0
-1
vec_distance.hpp
modules/gpu/src/opencv2/gpu/device/vec_distance.hpp
+3
-0
No files found.
modules/gpu/include/opencv2/gpu/gpu.hpp
View file @
be8e31f1
...
...
@@ -756,12 +756,6 @@ namespace cv
//! computes the proximity map for the raster template and the image where the template is searched for
CV_EXPORTS
void
matchTemplate
(
const
GpuMat
&
image
,
const
GpuMat
&
templ
,
GpuMat
&
result
,
int
method
);
//! downsamples image
CV_EXPORTS
void
downsample
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
Stream
&
stream
=
Stream
::
Null
());
//! upsamples image
CV_EXPORTS
void
upsample
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
Stream
&
stream
=
Stream
::
Null
());
//! smoothes the source image and downsamples it
CV_EXPORTS
void
pyrDown
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
int
borderType
=
BORDER_DEFAULT
,
Stream
&
stream
=
Stream
::
Null
());
...
...
modules/gpu/perf/perf_filters.cpp
View file @
be8e31f1
...
...
@@ -3,7 +3,7 @@
PERF_TEST_P
(
DevInfo_Size_MatType_KernelSize
,
boxFilter
,
testing
::
Combine
(
testing
::
ValuesIn
(
devices
()),
testing
::
Values
(
GPU_TYPICAL_MAT_SIZES
),
testing
::
Values
(
CV_8UC1
,
CV_8UC4
),
testing
::
Values
(
3
,
5
,
7
)))
testing
::
Values
(
3
,
5
)))
{
DeviceInfo
devInfo
=
std
::
tr1
::
get
<
0
>
(
GetParam
());
Size
size
=
std
::
tr1
::
get
<
1
>
(
GetParam
());
...
...
@@ -37,7 +37,7 @@ PERF_TEST_P(DevInfo_Size_MatType_MorphOp_KernelSize, morphologyFilter, testing::
testing
::
Values
(
GPU_TYPICAL_MAT_SIZES
),
testing
::
Values
(
CV_8UC1
,
CV_8UC4
),
testing
::
Values
((
int
)
MORPH_ERODE
,
(
int
)
MORPH_DILATE
),
testing
::
Values
(
3
,
5
,
7
)))
testing
::
Values
(
3
,
5
)))
{
DeviceInfo
devInfo
=
std
::
tr1
::
get
<
0
>
(
GetParam
());
Size
size
=
std
::
tr1
::
get
<
1
>
(
GetParam
());
...
...
@@ -71,7 +71,7 @@ PERF_TEST_P(DevInfo_Size_MatType_MorphOp_KernelSize, morphologyFilter, testing::
PERF_TEST_P
(
DevInfo_Size_MatType_KernelSize
,
linearFilter
,
testing
::
Combine
(
testing
::
ValuesIn
(
devices
()),
testing
::
Values
(
GPU_TYPICAL_MAT_SIZES
),
testing
::
Values
(
CV_8UC1
,
CV_8UC4
),
testing
::
Values
(
3
,
5
,
7
)))
testing
::
Values
(
3
,
5
)))
{
DeviceInfo
devInfo
=
std
::
tr1
::
get
<
0
>
(
GetParam
());
Size
size
=
std
::
tr1
::
get
<
1
>
(
GetParam
());
...
...
@@ -103,8 +103,8 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, linearFilter, testing::Combine(test
PERF_TEST_P
(
DevInfo_Size_MatType_KernelSize_BorderMode
,
separableLinearFilter
,
testing
::
Combine
(
testing
::
ValuesIn
(
devices
()),
testing
::
Values
(
GPU_TYPICAL_MAT_SIZES
),
testing
::
Values
(
CV_8UC1
,
CV_8UC4
,
CV_16SC
1
,
CV_16SC
3
,
CV_32FC1
),
testing
::
Values
(
3
,
5
,
7
),
testing
::
Values
(
CV_8UC1
,
CV_8UC4
,
CV_16SC3
,
CV_32FC1
),
testing
::
Values
(
3
,
5
),
testing
::
Values
((
int
)
BORDER_REFLECT101
,
(
int
)
BORDER_CONSTANT
)))
{
DeviceInfo
devInfo
=
std
::
tr1
::
get
<
0
>
(
GetParam
());
...
...
modules/gpu/perf/perf_imgproc.cpp
View file @
be8e31f1
...
...
@@ -244,8 +244,8 @@ PERF_TEST_P(DevInfo_Size_MatType, threshold, testing::Combine(testing::ValuesIn(
}
PERF_TEST_P
(
DevInfo_Size_MatType_Interpolation_SizeCoeff
,
resize
,
testing
::
Combine
(
testing
::
ValuesIn
(
devices
()),
testing
::
Values
(
GPU_TYPICAL_MAT_SIZES
),
testing
::
Values
(
CV_8UC1
,
CV_8UC
3
,
CV_8UC4
,
CV_16UC1
,
CV_16UC3
,
CV_16UC4
,
CV_32FC1
,
CV_32FC3
,
CV_32FC4
),
testing
::
Values
(
szSXGA
,
sz1080p
),
testing
::
Values
(
CV_8UC1
,
CV_8UC
4
,
CV_16UC1
,
CV_32FC1
),
testing
::
Values
((
int
)
INTER_NEAREST
,
(
int
)
INTER_LINEAR
,
(
int
)
INTER_CUBIC
),
testing
::
Values
(
0.5
,
2.0
)))
{
...
...
modules/gpu/perf/perf_utility.hpp
View file @
be8e31f1
...
...
@@ -53,7 +53,8 @@ typedef TestBaseWithParam< std::tr1::tuple<DeviceInfo, int, int> > DevInfo_K_Des
const
cv
::
Size
sz1800x1500
=
cv
::
Size
(
1800
,
1500
);
const
cv
::
Size
sz4700x3000
=
cv
::
Size
(
4700
,
3000
);
#define GPU_TYPICAL_MAT_SIZES szXGA, szSXGA, sz720p, sz1080p, sz1800x1500, sz4700x3000
//#define GPU_TYPICAL_MAT_SIZES szXGA, szSXGA, sz720p, sz1080p, sz1800x1500, sz4700x3000
#define GPU_TYPICAL_MAT_SIZES szSXGA, sz1080p, sz4700x3000
//! read image from testdata folder.
Mat
readImage
(
const
string
&
fileName
,
int
flags
=
CV_LOAD_IMAGE_COLOR
);
...
...
modules/gpu/src/brute_force_matcher.cpp
View file @
be8e31f1
...
...
@@ -179,18 +179,18 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs,
static
const
match_caller_t
match_callers
[
3
][
8
]
=
{
{
matchSingleL1_gpu
<
unsigned
char
>
,
matchSingleL1_gpu
<
signed
char
>
,
matchSingleL1_gpu
<
unsigned
char
>
,
0
/*matchSingleL1_gpu<signed char>*/
,
matchSingleL1_gpu
<
unsigned
short
>
,
matchSingleL1_gpu
<
short
>
,
matchSingleL1_gpu
<
int
>
,
matchSingleL1_gpu
<
float
>
,
0
,
0
},
{
matchSingleL2_gpu
<
unsigned
char
>
,
matchSingleL2_gpu
<
signed
char
>
,
matchSingleL2_gpu
<
unsigned
short
>
,
matchSingleL2_gpu
<
short
>
,
matchSingleL2_gpu
<
int
>
,
matchSingleL2_gpu
<
float
>
,
0
,
0
0
/*matchSingleL2_gpu<unsigned char>*/
,
0
/*matchSingleL2_gpu<signed char>*/
,
0
/*matchSingleL2_gpu<unsigned short>*/
,
0
/*matchSingleL2_gpu<short>*/
,
0
/*matchSingleL2_gpu<int>*/
,
matchSingleL2_gpu
<
float
>
,
0
,
0
},
{
matchSingleHamming_gpu
<
unsigned
char
>
,
matchSingleHamming_gpu
<
signed
char
>
,
matchSingleHamming_gpu
<
unsigned
short
>
,
matchSingleHamming_gpu
<
short
>
,
matchSingleHamming_gpu
<
unsigned
char
>
,
0
/*matchSingleHamming_gpu<signed char>*/
,
matchSingleHamming_gpu
<
unsigned
short
>
,
0
/*matchSingleHamming_gpu<short>*/
,
matchSingleHamming_gpu
<
int
>
,
0
,
0
,
0
}
};
...
...
@@ -318,18 +318,18 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes
static
const
match_caller_t
match_callers
[
3
][
8
]
=
{
{
matchCollectionL1_gpu
<
unsigned
char
>
,
matchCollectionL1_gpu
<
signed
char
>
,
matchCollectionL1_gpu
<
unsigned
char
>
,
0
/*matchCollectionL1_gpu<signed char>*/
,
matchCollectionL1_gpu
<
unsigned
short
>
,
matchCollectionL1_gpu
<
short
>
,
matchCollectionL1_gpu
<
int
>
,
matchCollectionL1_gpu
<
float
>
,
0
,
0
},
{
matchCollectionL2_gpu
<
unsigned
char
>
,
matchCollectionL2_gpu
<
signed
char
>
,
matchCollectionL2_gpu
<
unsigned
short
>
,
matchCollectionL2_gpu
<
short
>
,
matchCollectionL2_gpu
<
int
>
,
matchCollectionL2_gpu
<
float
>
,
0
,
0
0
/*matchCollectionL2_gpu<unsigned char>*/
,
0
/*matchCollectionL2_gpu<signed char>*/
,
0
/*matchCollectionL2_gpu<unsigned short>*/
,
0
/*matchCollectionL2_gpu<short>*/
,
0
/*matchCollectionL2_gpu<int>*/
,
matchCollectionL2_gpu
<
float
>
,
0
,
0
},
{
matchCollectionHamming_gpu
<
unsigned
char
>
,
matchCollectionHamming_gpu
<
signed
char
>
,
matchCollectionHamming_gpu
<
unsigned
short
>
,
matchCollectionHamming_gpu
<
short
>
,
matchCollectionHamming_gpu
<
unsigned
char
>
,
0
/*matchCollectionHamming_gpu<signed char>*/
,
matchCollectionHamming_gpu
<
unsigned
short
>
,
0
/*matchCollectionHamming_gpu<short>*/
,
matchCollectionHamming_gpu
<
int
>
,
0
,
0
,
0
}
};
...
...
@@ -427,16 +427,16 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
static
const
match_caller_t
match_callers
[
3
][
8
]
=
{
{
knnMatchL1_gpu
<
unsigned
char
>
,
knnMatchL1_gpu
<
signed
char
>
,
knnMatchL1_gpu
<
unsigned
short
>
,
knnMatchL1_gpu
<
unsigned
char
>
,
0
/*knnMatchL1_gpu<signed char>*/
,
knnMatchL1_gpu
<
unsigned
short
>
,
knnMatchL1_gpu
<
short
>
,
knnMatchL1_gpu
<
int
>
,
knnMatchL1_gpu
<
float
>
,
0
,
0
},
{
knnMatchL2_gpu
<
unsigned
char
>
,
knnMatchL2_gpu
<
signed
char
>
,
knnMatchL2_gpu
<
unsigned
short
>
,
knnMatchL2_gpu
<
short
>
,
knnMatchL2_gpu
<
int
>
,
knnMatchL2_gpu
<
float
>
,
0
,
0
0
/*knnMatchL2_gpu<unsigned char>*/
,
0
/*knnMatchL2_gpu<signed char>*/
,
0
/*knnMatchL2_gpu<unsigned short>*/
,
0
/*knnMatchL2_gpu<short>*/
,
0
/*knnMatchL2_gpu<int>*/
,
knnMatchL2_gpu
<
float
>
,
0
,
0
},
{
knnMatchHamming_gpu
<
unsigned
char
>
,
knnMatchHamming_gpu
<
signed
char
>
,
knnMatchHamming_gpu
<
unsigned
short
>
,
knnMatchHamming_gpu
<
short
>
,
knnMatchHamming_gpu
<
int
>
,
0
,
0
,
0
knnMatchHamming_gpu
<
unsigned
char
>
,
0
/*knnMatchHamming_gpu<signed char>*/
,
knnMatchHamming_gpu
<
unsigned
short
>
,
0
/*knnMatchHamming_gpu<short>*/
,
knnMatchHamming_gpu
<
int
>
,
0
,
0
,
0
}
};
...
...
@@ -605,16 +605,16 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
static
const
radiusMatch_caller_t
radiusMatch_callers
[
3
][
8
]
=
{
{
radiusMatchL1_gpu
<
unsigned
char
>
,
radiusMatchL1_gpu
<
signed
char
>
,
radiusMatchL1_gpu
<
unsigned
short
>
,
radiusMatchL1_gpu
<
unsigned
char
>
,
0
/*radiusMatchL1_gpu<signed char>*/
,
radiusMatchL1_gpu
<
unsigned
short
>
,
radiusMatchL1_gpu
<
short
>
,
radiusMatchL1_gpu
<
int
>
,
radiusMatchL1_gpu
<
float
>
,
0
,
0
},
{
radiusMatchL2_gpu
<
unsigned
char
>
,
radiusMatchL2_gpu
<
signed
char
>
,
radiusMatchL2_gpu
<
unsigned
short
>
,
radiusMatchL2_gpu
<
short
>
,
radiusMatchL2_gpu
<
int
>
,
radiusMatchL2_gpu
<
float
>
,
0
,
0
0
/*radiusMatchL2_gpu<unsigned char>*/
,
0
/*radiusMatchL2_gpu<signed char>*/
,
0
/*radiusMatchL2_gpu<unsigned short>*/
,
0
/*radiusMatchL2_gpu<short>*/
,
0
/*radiusMatchL2_gpu<int>*/
,
radiusMatchL2_gpu
<
float
>
,
0
,
0
},
{
radiusMatchHamming_gpu
<
unsigned
char
>
,
radiusMatchHamming_gpu
<
signed
char
>
,
radiusMatchHamming_gpu
<
unsigned
short
>
,
radiusMatchHamming_gpu
<
short
>
,
radiusMatchHamming_gpu
<
int
>
,
0
,
0
,
0
radiusMatchHamming_gpu
<
unsigned
char
>
,
0
/*radiusMatchHamming_gpu<signed char>*/
,
radiusMatchHamming_gpu
<
unsigned
short
>
,
0
/*radiusMatchHamming_gpu<short>*/
,
radiusMatchHamming_gpu
<
int
>
,
0
,
0
,
0
}
};
...
...
modules/gpu/src/cuda/bf_knnmatch.cu
0 → 100644
View file @
be8e31f1
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or bpied warranties, including, but not limited to, the bpied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "internal_shared.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace bfmatcher
{
template <typename VecDiff, typename Dist, typename T, typename Mask>
__device__ void distanceCalcLoop(const PtrStep_<T>& query, const DevMem2D_<T>& train, const Mask& m, int queryIdx,
typename Dist::result_type& distMin1, typename Dist::result_type& distMin2, int& bestTrainIdx1, int& bestTrainIdx2,
typename Dist::result_type* smem)
{
const VecDiff vecDiff(query.ptr(queryIdx), train.cols, (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x);
typename Dist::result_type* sdiffRow = smem + blockDim.x * threadIdx.y;
distMin1 = numeric_limits<typename Dist::result_type>::max();
distMin2 = numeric_limits<typename Dist::result_type>::max();
bestTrainIdx1 = -1;
bestTrainIdx2 = -1;
for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y)
{
if (m(queryIdx, trainIdx))
{
Dist dist;
const T* trainRow = train.ptr(trainIdx);
vecDiff.calc(trainRow, train.cols, dist, sdiffRow, threadIdx.x);
const typename Dist::result_type val = dist;
if (val < distMin1)
{
distMin1 = val;
bestTrainIdx1 = trainIdx;
}
else if (val < distMin2)
{
distMin2 = val;
bestTrainIdx2 = trainIdx;
}
}
}
}
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename VecDiff, typename Dist, typename T, typename Mask>
__global__ void knnMatch2(const PtrStep_<T> query, const DevMem2D_<T> train, const Mask m, int2* trainIdx, float2* distance)
{
typedef typename Dist::result_type result_type;
typedef typename Dist::value_type value_type;
__shared__ result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y];
const int queryIdx = blockIdx.x;
result_type distMin1;
result_type distMin2;
int bestTrainIdx1;
int bestTrainIdx2;
distanceCalcLoop<VecDiff, Dist>(query, train, m, queryIdx, distMin1, distMin2, bestTrainIdx1, bestTrainIdx2, smem);
__syncthreads();
volatile result_type* sdistMinRow = smem;
volatile int* sbestTrainIdxRow = (int*)(sdistMinRow + 2 * BLOCK_DIM_Y);
if (threadIdx.x == 0)
{
sdistMinRow[threadIdx.y] = distMin1;
sdistMinRow[threadIdx.y + BLOCK_DIM_Y] = distMin2;
sbestTrainIdxRow[threadIdx.y] = bestTrainIdx1;
sbestTrainIdxRow[threadIdx.y + BLOCK_DIM_Y] = bestTrainIdx2;
}
__syncthreads();
if (threadIdx.x == 0 && threadIdx.y == 0)
{
distMin1 = numeric_limits<result_type>::max();
distMin2 = numeric_limits<result_type>::max();
bestTrainIdx1 = -1;
bestTrainIdx2 = -1;
#pragma unroll
for (int i = 0; i < BLOCK_DIM_Y; ++i)
{
result_type val = sdistMinRow[i];
if (val < distMin1)
{
distMin1 = val;
bestTrainIdx1 = sbestTrainIdxRow[i];
}
else if (val < distMin2)
{
distMin2 = val;
bestTrainIdx2 = sbestTrainIdxRow[i];
}
}
#pragma unroll
for (int i = BLOCK_DIM_Y; i < 2 * BLOCK_DIM_Y; ++i)
{
result_type val = sdistMinRow[i];
if (val < distMin2)
{
distMin2 = val;
bestTrainIdx2 = sbestTrainIdxRow[i];
}
}
trainIdx[queryIdx] = make_int2(bestTrainIdx1, bestTrainIdx2);
distance[queryIdx] = make_float2(distMin1, distMin2);
}
}
///////////////////////////////////////////////////////////////////////////////
// Knn 2 Match kernel caller
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
void knnMatch2Simple_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
cudaStream_t stream)
{
const dim3 grid(query.rows, 1, 1);
const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
knnMatch2<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffGlobal<BLOCK_DIM_X, T>, Dist, T>
<<<grid, threads, 0, stream>>>(query, train, mask, trainIdx, distance);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T, typename Mask>
void knnMatch2Cached_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
cudaStream_t stream)
{
StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_LEN>::check(); // block size must be greter than descriptors length
StaticAssert<MAX_LEN % BLOCK_DIM_X == 0>::check(); // max descriptors length must divide to blockDimX
const dim3 grid(query.rows, 1, 1);
const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
knnMatch2<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffCachedRegister<BLOCK_DIM_X, MAX_LEN, LEN_EQ_MAX_LEN, typename Dist::value_type>, Dist, T>
<<<grid, threads, 0, stream>>>(query, train, mask, trainIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Knn 2 Match Dispatcher
template <typename Dist, typename T, typename Mask>
void knnMatch2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
const DevMem2D& trainIdx, const DevMem2D& distance,
int cc, cudaStream_t stream)
{
if (query.cols < 64)
{
knnMatch2Cached_caller<16, 16, 64, false, Dist>(
query, train, mask,
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
stream);
}
else if (query.cols == 64)
{
knnMatch2Cached_caller<16, 16, 64, true, Dist>(
query, train, mask,
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
stream);
}
else if (query.cols < 128)
{
knnMatch2Cached_caller<16, 16, 128, false, Dist>(
query, train, mask,
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
stream);
}
else if (query.cols == 128 && cc >= 12)
{
knnMatch2Cached_caller<16, 16, 128, true, Dist>(
query, train, mask,
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
stream);
}
else if (query.cols < 256 && cc >= 12)
{
knnMatch2Cached_caller<16, 16, 256, false, Dist>(
query, train, mask,
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
stream);
}
else if (query.cols == 256 && cc >= 12)
{
knnMatch2Cached_caller<16, 16, 256, true, Dist>(
query, train, mask,
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
stream);
}
else
{
knnMatch2Simple_caller<16, 16, Dist>(
query, train, mask,
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
stream);
}
}
///////////////////////////////////////////////////////////////////////////////
// Calc distance kernel
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
__global__ void calcDistance(const PtrStep_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf distance)
{
__shared__ typename Dist::result_type sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];
typename Dist::result_type* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;
const int queryIdx = blockIdx.x;
const T* queryDescs = query.ptr(queryIdx);
const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
if (trainIdx < train.rows)
{
const T* trainDescs = train.ptr(trainIdx);
typename Dist::result_type myDist = numeric_limits<typename Dist::result_type>::max();
if (mask(queryIdx, trainIdx))
{
Dist dist;
calcVecDiffGlobal<BLOCK_DIM_X>(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x);
myDist = dist;
}
if (threadIdx.x == 0)
distance.ptr(queryIdx)[trainIdx] = myDist;
}
}
///////////////////////////////////////////////////////////////////////////////
// Calc distance kernel caller
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
void calcDistance_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& distance, cudaStream_t stream)
{
const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1);
calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(query, train, mask, distance);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename Dist, typename T, typename Mask>
void calcDistanceDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2D& allDist, cudaStream_t stream)
{
calcDistance_caller<16, 16, Dist>(query, train, mask, static_cast<DevMem2Df>(allDist), stream);
}
///////////////////////////////////////////////////////////////////////////////
// find knn match kernel
template <int BLOCK_SIZE> __global__ void findBestMatch(DevMem2Df allDist_, int i, PtrStepi trainIdx_, PtrStepf distance_)
{
const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
__shared__ float sdist[SMEM_SIZE];
__shared__ int strainIdx[SMEM_SIZE];
const int queryIdx = blockIdx.x;
float* allDist = allDist_.ptr(queryIdx);
int* trainIdx = trainIdx_.ptr(queryIdx);
float* distance = distance_.ptr(queryIdx);
float dist = numeric_limits<float>::max();
int bestIdx = -1;
for (int i = threadIdx.x; i < allDist_.cols; i += BLOCK_SIZE)
{
float reg = allDist[i];
if (reg < dist)
{
dist = reg;
bestIdx = i;
}
}
sdist[threadIdx.x] = dist;
strainIdx[threadIdx.x] = bestIdx;
__syncthreads();
reducePredVal<BLOCK_SIZE>(sdist, dist, strainIdx, bestIdx, threadIdx.x, less<volatile float>());
if (threadIdx.x == 0)
{
if (dist < numeric_limits<float>::max())
{
allDist[bestIdx] = numeric_limits<float>::max();
trainIdx[i] = bestIdx;
distance[i] = dist;
}
}
}
///////////////////////////////////////////////////////////////////////////////
// find knn match kernel caller
template <int BLOCK_SIZE> void findKnnMatch_caller(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
{
const dim3 threads(BLOCK_SIZE, 1, 1);
const dim3 grid(trainIdx.rows, 1, 1);
for (int i = 0; i < k; ++i)
{
findBestMatch<BLOCK_SIZE><<<grid, threads, 0, stream>>>(allDist, i, trainIdx, distance);
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void findKnnMatchDispatcher(int k, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, cudaStream_t stream)
{
findKnnMatch_caller<256>(k, static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), static_cast<DevMem2Df>(allDist), stream);
}
///////////////////////////////////////////////////////////////////////////////
// knn match Dispatcher
template <typename Dist, typename T>
void knnMatchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, int k, const DevMem2D& mask,
const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
int cc, cudaStream_t stream)
{
if (mask.data)
{
if (k == 2)
{
knnMatch2Dispatcher<Dist>(query, train, SingleMask(mask), trainIdx, distance, cc, stream);
return;
}
calcDistanceDispatcher<Dist>(query, train, SingleMask(mask), allDist, stream);
}
else
{
if (k == 2)
{
knnMatch2Dispatcher<Dist>(query, train, WithOutMask(), trainIdx, distance, cc, stream);
return;
}
calcDistanceDispatcher<Dist>(query, train, WithOutMask(), allDist, stream);
}
findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream);
}
///////////////////////////////////////////////////////////////////////////////
// knn match caller
template <typename T> void knnMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
int cc, cudaStream_t stream)
{
knnMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream);
}
template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
//template void knnMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void knnMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void knnMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template <typename T> void knnMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
int cc, cudaStream_t stream)
{
knnMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream);
}
//template void knnMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
//template void knnMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
//template void knnMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
//template void knnMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
//template void knnMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template <typename T> void knnMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
int cc, cudaStream_t stream)
{
knnMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream);
}
template void knnMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
//template void knnMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void knnMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
//template void knnMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void knnMatchHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
}}}
modules/gpu/src/cuda/b
rute_force_matcher
.cu
→
modules/gpu/src/cuda/b
f_match
.cu
View file @
be8e31f1
...
...
@@ -49,11 +49,6 @@ using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace bfmatcher
{
///////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////// Match //////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
template <int BLOCK_DIM_Y, typename T>
__device__ void findBestMatch(T& myDist, int2& myIdx, T* smin, int2* sIdx)
{
...
...
@@ -312,7 +307,7 @@ namespace cv { namespace gpu { namespace bfmatcher
}
template void matchSingleL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchSingleL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//
template void matchSingleL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchSingleL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchSingleL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchSingleL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
...
...
@@ -329,11 +324,11 @@ namespace cv { namespace gpu { namespace bfmatcher
matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, DevMem2D(), distance, cc, stream);
}
template void matchSingleL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchSingleL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchSingleL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchSingleL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchSingleL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//
template void matchSingleL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//
template void matchSingleL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//
template void matchSingleL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//
template void matchSingleL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//
template void matchSingleL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchSingleL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template <typename T> void matchSingleHamming_gpu(const DevMem2D& query, const DevMem2D& train_, const DevMem2D& mask,
...
...
@@ -348,9 +343,9 @@ namespace cv { namespace gpu { namespace bfmatcher
}
template void matchSingleHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchSingleHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//
template void matchSingleHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchSingleHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchSingleHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//
template void matchSingleHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchSingleHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template <typename T> void matchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
...
...
@@ -365,7 +360,7 @@ namespace cv { namespace gpu { namespace bfmatcher
}
template void matchCollectionL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchCollectionL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//
template void matchCollectionL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchCollectionL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchCollectionL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchCollectionL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
...
...
@@ -382,11 +377,11 @@ namespace cv { namespace gpu { namespace bfmatcher
matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
}
template void matchCollectionL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchCollectionL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchCollectionL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchCollectionL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchCollectionL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//
template void matchCollectionL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//
template void matchCollectionL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//
template void matchCollectionL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//
template void matchCollectionL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//
template void matchCollectionL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchCollectionL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template <typename T> void matchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
...
...
@@ -401,580 +396,8 @@ namespace cv { namespace gpu { namespace bfmatcher
}
template void matchCollectionHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchCollectionHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//
template void matchCollectionHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchCollectionHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchCollectionHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//
template void matchCollectionHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void matchCollectionHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
///////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////// Knn Match ////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
template <typename VecDiff, typename Dist, typename T, typename Mask>
__device__ void distanceCalcLoop(const PtrStep_<T>& query, const DevMem2D_<T>& train, const Mask& m, int queryIdx,
typename Dist::result_type& distMin1, typename Dist::result_type& distMin2, int& bestTrainIdx1, int& bestTrainIdx2,
typename Dist::result_type* smem)
{
const VecDiff vecDiff(query.ptr(queryIdx), train.cols, (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x);
typename Dist::result_type* sdiffRow = smem + blockDim.x * threadIdx.y;
distMin1 = numeric_limits<typename Dist::result_type>::max();
distMin2 = numeric_limits<typename Dist::result_type>::max();
bestTrainIdx1 = -1;
bestTrainIdx2 = -1;
for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y)
{
if (m(queryIdx, trainIdx))
{
Dist dist;
const T* trainRow = train.ptr(trainIdx);
vecDiff.calc(trainRow, train.cols, dist, sdiffRow, threadIdx.x);
const typename Dist::result_type val = dist;
if (val < distMin1)
{
distMin1 = val;
bestTrainIdx1 = trainIdx;
}
else if (val < distMin2)
{
distMin2 = val;
bestTrainIdx2 = trainIdx;
}
}
}
}
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename VecDiff, typename Dist, typename T, typename Mask>
__global__ void knnMatch2(const PtrStep_<T> query, const DevMem2D_<T> train, const Mask m, int2* trainIdx, float2* distance)
{
typedef typename Dist::result_type result_type;
typedef typename Dist::value_type value_type;
__shared__ result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y];
const int queryIdx = blockIdx.x;
result_type distMin1;
result_type distMin2;
int bestTrainIdx1;
int bestTrainIdx2;
distanceCalcLoop<VecDiff, Dist>(query, train, m, queryIdx, distMin1, distMin2, bestTrainIdx1, bestTrainIdx2, smem);
__syncthreads();
volatile result_type* sdistMinRow = smem;
volatile int* sbestTrainIdxRow = (int*)(sdistMinRow + 2 * BLOCK_DIM_Y);
if (threadIdx.x == 0)
{
sdistMinRow[threadIdx.y] = distMin1;
sdistMinRow[threadIdx.y + BLOCK_DIM_Y] = distMin2;
sbestTrainIdxRow[threadIdx.y] = bestTrainIdx1;
sbestTrainIdxRow[threadIdx.y + BLOCK_DIM_Y] = bestTrainIdx2;
}
__syncthreads();
if (threadIdx.x == 0 && threadIdx.y == 0)
{
distMin1 = numeric_limits<result_type>::max();
distMin2 = numeric_limits<result_type>::max();
bestTrainIdx1 = -1;
bestTrainIdx2 = -1;
#pragma unroll
for (int i = 0; i < BLOCK_DIM_Y; ++i)
{
result_type val = sdistMinRow[i];
if (val < distMin1)
{
distMin1 = val;
bestTrainIdx1 = sbestTrainIdxRow[i];
}
else if (val < distMin2)
{
distMin2 = val;
bestTrainIdx2 = sbestTrainIdxRow[i];
}
}
#pragma unroll
for (int i = BLOCK_DIM_Y; i < 2 * BLOCK_DIM_Y; ++i)
{
result_type val = sdistMinRow[i];
if (val < distMin2)
{
distMin2 = val;
bestTrainIdx2 = sbestTrainIdxRow[i];
}
}
trainIdx[queryIdx] = make_int2(bestTrainIdx1, bestTrainIdx2);
distance[queryIdx] = make_float2(distMin1, distMin2);
}
}
///////////////////////////////////////////////////////////////////////////////
// Knn 2 Match kernel caller
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
void knnMatch2Simple_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
cudaStream_t stream)
{
const dim3 grid(query.rows, 1, 1);
const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
knnMatch2<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffGlobal<BLOCK_DIM_X, T>, Dist, T>
<<<grid, threads, 0, stream>>>(query, train, mask, trainIdx, distance);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T, typename Mask>
void knnMatch2Cached_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
cudaStream_t stream)
{
StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_LEN>::check(); // block size must be greter than descriptors length
StaticAssert<MAX_LEN % BLOCK_DIM_X == 0>::check(); // max descriptors length must divide to blockDimX
const dim3 grid(query.rows, 1, 1);
const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
knnMatch2<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffCachedRegister<BLOCK_DIM_X, MAX_LEN, LEN_EQ_MAX_LEN, typename Dist::value_type>, Dist, T>
<<<grid, threads, 0, stream>>>(query, train, mask, trainIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Knn 2 Match Dispatcher
template <typename Dist, typename T, typename Mask>
void knnMatch2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
const DevMem2D& trainIdx, const DevMem2D& distance,
int cc, cudaStream_t stream)
{
if (query.cols < 64)
{
knnMatch2Cached_caller<16, 16, 64, false, Dist>(
query, train, mask,
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
stream);
}
else if (query.cols == 64)
{
knnMatch2Cached_caller<16, 16, 64, true, Dist>(
query, train, mask,
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
stream);
}
else if (query.cols < 128)
{
knnMatch2Cached_caller<16, 16, 128, false, Dist>(
query, train, mask,
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
stream);
}
else if (query.cols == 128 && cc >= 12)
{
knnMatch2Cached_caller<16, 16, 128, true, Dist>(
query, train, mask,
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
stream);
}
else if (query.cols < 256 && cc >= 12)
{
knnMatch2Cached_caller<16, 16, 256, false, Dist>(
query, train, mask,
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
stream);
}
else if (query.cols == 256 && cc >= 12)
{
knnMatch2Cached_caller<16, 16, 256, true, Dist>(
query, train, mask,
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
stream);
}
else
{
knnMatch2Simple_caller<16, 16, Dist>(
query, train, mask,
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
stream);
}
}
///////////////////////////////////////////////////////////////////////////////
// Calc distance kernel
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
__global__ void calcDistance(const PtrStep_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf distance)
{
__shared__ typename Dist::result_type sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];
typename Dist::result_type* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;
const int queryIdx = blockIdx.x;
const T* queryDescs = query.ptr(queryIdx);
const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
if (trainIdx < train.rows)
{
const T* trainDescs = train.ptr(trainIdx);
typename Dist::result_type myDist = numeric_limits<typename Dist::result_type>::max();
if (mask(queryIdx, trainIdx))
{
Dist dist;
calcVecDiffGlobal<BLOCK_DIM_X>(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x);
myDist = dist;
}
if (threadIdx.x == 0)
distance.ptr(queryIdx)[trainIdx] = myDist;
}
}
///////////////////////////////////////////////////////////////////////////////
// Calc distance kernel caller
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
void calcDistance_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& distance, cudaStream_t stream)
{
const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1);
calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(query, train, mask, distance);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename Dist, typename T, typename Mask>
void calcDistanceDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2D& allDist, cudaStream_t stream)
{
calcDistance_caller<16, 16, Dist>(query, train, mask, static_cast<DevMem2Df>(allDist), stream);
}
///////////////////////////////////////////////////////////////////////////////
// find knn match kernel
template <int BLOCK_SIZE> __global__ void findBestMatch(DevMem2Df allDist_, int i, PtrStepi trainIdx_, PtrStepf distance_)
{
const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
__shared__ float sdist[SMEM_SIZE];
__shared__ int strainIdx[SMEM_SIZE];
const int queryIdx = blockIdx.x;
float* allDist = allDist_.ptr(queryIdx);
int* trainIdx = trainIdx_.ptr(queryIdx);
float* distance = distance_.ptr(queryIdx);
float dist = numeric_limits<float>::max();
int bestIdx = -1;
for (int i = threadIdx.x; i < allDist_.cols; i += BLOCK_SIZE)
{
float reg = allDist[i];
if (reg < dist)
{
dist = reg;
bestIdx = i;
}
}
sdist[threadIdx.x] = dist;
strainIdx[threadIdx.x] = bestIdx;
__syncthreads();
reducePredVal<BLOCK_SIZE>(sdist, dist, strainIdx, bestIdx, threadIdx.x, less<volatile float>());
if (threadIdx.x == 0)
{
if (dist < numeric_limits<float>::max())
{
allDist[bestIdx] = numeric_limits<float>::max();
trainIdx[i] = bestIdx;
distance[i] = dist;
}
}
}
///////////////////////////////////////////////////////////////////////////////
// find knn match kernel caller
template <int BLOCK_SIZE> void findKnnMatch_caller(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
{
const dim3 threads(BLOCK_SIZE, 1, 1);
const dim3 grid(trainIdx.rows, 1, 1);
for (int i = 0; i < k; ++i)
{
findBestMatch<BLOCK_SIZE><<<grid, threads, 0, stream>>>(allDist, i, trainIdx, distance);
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void findKnnMatchDispatcher(int k, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, cudaStream_t stream)
{
findKnnMatch_caller<256>(k, static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), static_cast<DevMem2Df>(allDist), stream);
}
///////////////////////////////////////////////////////////////////////////////
// knn match Dispatcher
template <typename Dist, typename T>
void knnMatchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, int k, const DevMem2D& mask,
const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
int cc, cudaStream_t stream)
{
if (mask.data)
{
if (k == 2)
{
knnMatch2Dispatcher<Dist>(query, train, SingleMask(mask), trainIdx, distance, cc, stream);
return;
}
calcDistanceDispatcher<Dist>(query, train, SingleMask(mask), allDist, stream);
}
else
{
if (k == 2)
{
knnMatch2Dispatcher<Dist>(query, train, WithOutMask(), trainIdx, distance, cc, stream);
return;
}
calcDistanceDispatcher<Dist>(query, train, WithOutMask(), allDist, stream);
}
findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream);
}
///////////////////////////////////////////////////////////////////////////////
// knn match caller
template <typename T> void knnMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
int cc, cudaStream_t stream)
{
knnMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream);
}
template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void knnMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void knnMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void knnMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template <typename T> void knnMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
int cc, cudaStream_t stream)
{
knnMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream);
}
template void knnMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void knnMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void knnMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void knnMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void knnMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template <typename T> void knnMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
int cc, cudaStream_t stream)
{
knnMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream);
}
template void knnMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void knnMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void knnMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void knnMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void knnMatchHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
///////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////// Radius Match //////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
__global__ void radiusMatch(const PtrStep_<T> query, const DevMem2D_<T> train, float maxDistance, const Mask mask,
DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance)
{
#if __CUDA_ARCH__ >= 110
__shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y];
typename Dist::result_type* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y;
const int queryIdx = blockIdx.x;
const T* queryDescs = query.ptr(queryIdx);
const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
if (trainIdx < train.rows)
{
const T* trainDescs = train.ptr(trainIdx);
if (mask(queryIdx, trainIdx))
{
Dist dist;
calcVecDiffGlobal<BLOCK_DIM_X>(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x);
if (threadIdx.x == 0)
{
if (dist < maxDistance)
{
unsigned int i = atomicInc(nMatches + queryIdx, (unsigned int) -1);
if (i < trainIdx_.cols)
{
distance.ptr(queryIdx)[i] = dist;
trainIdx_.ptr(queryIdx)[i] = trainIdx;
}
}
}
}
}
#endif
}
///////////////////////////////////////////////////////////////////////////////
// Radius Match kernel caller
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
void radiusMatch_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
const DevMem2Di& trainIdx, const DevMem2D_<unsigned int>& nMatches, const DevMem2Df& distance,
cudaStream_t stream)
{
const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1);
radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(query, train, maxDistance, mask, trainIdx, nMatches.data, distance);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Radius Match Dispatcher
template <typename Dist, typename T, typename Mask>
void radiusMatchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
cudaStream_t stream)
{
radiusMatch_caller<16, 16, Dist>(query, train, maxDistance, mask,
static_cast<DevMem2Di>(trainIdx), static_cast< const DevMem2D_<unsigned int> >(nMatches), static_cast<DevMem2Df>(distance),
stream);
}
///////////////////////////////////////////////////////////////////////////////
// Radius Match caller
template <typename T> void radiusMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
cudaStream_t stream)
{
if (mask.data)
{
radiusMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
trainIdx, nMatches, distance,
stream);
}
else
{
radiusMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
trainIdx, nMatches, distance,
stream);
}
}
template void radiusMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
template void radiusMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
template void radiusMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
template void radiusMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
template void radiusMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
template void radiusMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
template <typename T> void radiusMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
cudaStream_t stream)
{
if (mask.data)
{
radiusMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
trainIdx, nMatches, distance,
stream);
}
else
{
radiusMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
trainIdx, nMatches, distance,
stream);
}
}
template void radiusMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
template void radiusMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
template void radiusMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
template void radiusMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
template void radiusMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
template void radiusMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
template <typename T> void radiusMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
cudaStream_t stream)
{
if (mask.data)
{
radiusMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
trainIdx, nMatches, distance,
stream);
}
else
{
radiusMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
trainIdx, nMatches, distance,
stream);
}
}
template void radiusMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
template void radiusMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
template void radiusMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
template void radiusMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
template void radiusMatchHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
}}}
modules/gpu/src/cuda/bf_radius_match.cu
0 → 100644
View file @
be8e31f1
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or bpied warranties, including, but not limited to, the bpied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "internal_shared.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace bfmatcher
{
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
__global__ void radiusMatch(const PtrStep_<T> query, const DevMem2D_<T> train, float maxDistance, const Mask mask,
DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance)
{
#if __CUDA_ARCH__ >= 110
__shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y];
typename Dist::result_type* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y;
const int queryIdx = blockIdx.x;
const T* queryDescs = query.ptr(queryIdx);
const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
if (trainIdx < train.rows)
{
const T* trainDescs = train.ptr(trainIdx);
if (mask(queryIdx, trainIdx))
{
Dist dist;
calcVecDiffGlobal<BLOCK_DIM_X>(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x);
if (threadIdx.x == 0)
{
if (dist < maxDistance)
{
unsigned int i = atomicInc(nMatches + queryIdx, (unsigned int) -1);
if (i < trainIdx_.cols)
{
distance.ptr(queryIdx)[i] = dist;
trainIdx_.ptr(queryIdx)[i] = trainIdx;
}
}
}
}
}
#endif
}
///////////////////////////////////////////////////////////////////////////////
// Radius Match kernel caller
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
void radiusMatch_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
const DevMem2Di& trainIdx, const DevMem2D_<unsigned int>& nMatches, const DevMem2Df& distance,
cudaStream_t stream)
{
const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1);
radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(query, train, maxDistance, mask, trainIdx, nMatches.data, distance);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Radius Match Dispatcher
template <typename Dist, typename T, typename Mask>
void radiusMatchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
cudaStream_t stream)
{
radiusMatch_caller<16, 16, Dist>(query, train, maxDistance, mask,
static_cast<DevMem2Di>(trainIdx), static_cast< const DevMem2D_<unsigned int> >(nMatches), static_cast<DevMem2Df>(distance),
stream);
}
///////////////////////////////////////////////////////////////////////////////
// Radius Match caller
template <typename T> void radiusMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
cudaStream_t stream)
{
if (mask.data)
{
radiusMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
trainIdx, nMatches, distance,
stream);
}
else
{
radiusMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
trainIdx, nMatches, distance,
stream);
}
}
template void radiusMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
//template void radiusMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
template void radiusMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
template void radiusMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
template void radiusMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
template void radiusMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
template <typename T> void radiusMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
cudaStream_t stream)
{
if (mask.data)
{
radiusMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
trainIdx, nMatches, distance,
stream);
}
else
{
radiusMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
trainIdx, nMatches, distance,
stream);
}
}
//template void radiusMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
//template void radiusMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
//template void radiusMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
//template void radiusMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
//template void radiusMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
template void radiusMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
template <typename T> void radiusMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
cudaStream_t stream)
{
if (mask.data)
{
radiusMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
trainIdx, nMatches, distance,
stream);
}
else
{
radiusMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
trainIdx, nMatches, distance,
stream);
}
}
template void radiusMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
//template void radiusMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
template void radiusMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
//template void radiusMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
template void radiusMatchHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance, cudaStream_t stream);
}}}
modules/gpu/src/cuda/bilateral_filter.cu
0 → 100644
View file @
be8e31f1
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "internal_shared.hpp"
#include "opencv2/gpu/device/limits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace bf_krnls
{
__constant__ float* ctable_color;
__constant__ float* ctable_space;
__constant__ size_t ctable_space_step;
__constant__ int cndisp;
__constant__ int cradius;
__constant__ short cedge_disc;
__constant__ short cmax_disc;
}
namespace cv { namespace gpu { namespace bf
{
void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc)
{
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_color, &table_color, sizeof(table_color)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.data, sizeof(table_space.data)) );
size_t table_space_step = table_space.step / sizeof(float);
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cndisp, &ndisp, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cradius, &radius, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cedge_disc, &edge_disc, sizeof(short)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cmax_disc, &max_disc, sizeof(short)) );
}
}}}
namespace bf_krnls
{
template <int channels>
struct DistRgbMax
{
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
{
uchar x = abs(a[0] - b[0]);
uchar y = abs(a[1] - b[1]);
uchar z = abs(a[2] - b[2]);
return (max(max(x, y), z));
}
};
template <>
struct DistRgbMax<1>
{
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
{
return abs(a[0] - b[0]);
}
};
template <int channels, typename T>
__global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)
{
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
T dp[5];
if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
{
dp[0] = *(disp + (y ) * disp_step + x + 0);
dp[1] = *(disp + (y-1) * disp_step + x + 0);
dp[2] = *(disp + (y ) * disp_step + x - 1);
dp[3] = *(disp + (y+1) * disp_step + x + 0);
dp[4] = *(disp + (y ) * disp_step + x + 1);
if(abs(dp[1] - dp[0]) >= cedge_disc || abs(dp[2] - dp[0]) >= cedge_disc || abs(dp[3] - dp[0]) >= cedge_disc || abs(dp[4] - dp[0]) >= cedge_disc)
{
const int ymin = max(0, y - cradius);
const int xmin = max(0, x - cradius);
const int ymax = min(h - 1, y + cradius);
const int xmax = min(w - 1, x + cradius);
float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
const uchar* ic = img + y * img_step + channels * x;
for(int yi = ymin; yi <= ymax; yi++)
{
const T* disp_y = disp + yi * disp_step;
for(int xi = xmin; xi <= xmax; xi++)
{
const uchar* in = img + yi * img_step + channels * xi;
uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);
const float weight = ctable_color[dist_rgb] * (ctable_space + abs(y-yi)* ctable_space_step)[abs(x-xi)];
const T disp_reg = disp_y[xi];
cost[0] += min(cmax_disc, abs(disp_reg - dp[0])) * weight;
cost[1] += min(cmax_disc, abs(disp_reg - dp[1])) * weight;
cost[2] += min(cmax_disc, abs(disp_reg - dp[2])) * weight;
cost[3] += min(cmax_disc, abs(disp_reg - dp[3])) * weight;
cost[4] += min(cmax_disc, abs(disp_reg - dp[4])) * weight;
}
}
float minimum = numeric_limits<float>::max();
int id = 0;
if (cost[0] < minimum)
{
minimum = cost[0];
id = 0;
}
if (cost[1] < minimum)
{
minimum = cost[1];
id = 1;
}
if (cost[2] < minimum)
{
minimum = cost[2];
id = 2;
}
if (cost[3] < minimum)
{
minimum = cost[3];
id = 3;
}
if (cost[4] < minimum)
{
minimum = cost[4];
id = 4;
}
*(disp + y * disp_step + x) = dp[id];
}
}
}
}
namespace cv { namespace gpu { namespace bf
{
template <typename T>
void bilateral_filter_caller(const DevMem2D_<T>& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(disp.cols, threads.x << 1);
grid.y = divUp(disp.rows, threads.y);
switch (channels)
{
case 1:
for (int i = 0; i < iters; ++i)
{
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
}
break;
case 3:
for (int i = 0; i < iters; ++i)
{
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
}
break;
default:
cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
}
if (stream != 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void bilateral_filter_gpu(const DevMem2D& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
{
bilateral_filter_caller(disp, img, channels, iters, stream);
}
void bilateral_filter_gpu(const DevMem2D_<short>& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
{
bilateral_filter_caller(disp, img, channels, iters, stream);
}
}}}
modules/gpu/src/cuda/column_filter.cu
0 → 100644
View file @
be8e31f1
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "internal_shared.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
#define MAX_KERNEL_SIZE 16
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 16
namespace filter_krnls_column
{
__constant__ float cLinearKernel[MAX_KERNEL_SIZE];
void loadLinearKernel(const float kernel[], int ksize)
{
cudaSafeCall( cudaMemcpyToSymbol(cLinearKernel, kernel, ksize * sizeof(float)) );
}
template <int ksize, typename T, typename D, typename B>
__global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b)
{
__shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];
const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;
T* sDataColumn = smem + threadIdx.x;
if (x < src.cols)
{
const T* srcCol = src.ptr() + x;
sDataColumn[ threadIdx.y * BLOCK_DIM_X] = b.at_low(y - BLOCK_DIM_Y, srcCol, src.step);
sDataColumn[(threadIdx.y + BLOCK_DIM_Y) * BLOCK_DIM_X] = b.at_high(y, srcCol, src.step);
sDataColumn[(threadIdx.y + BLOCK_DIM_Y * 2) * BLOCK_DIM_X] = b.at_high(y + BLOCK_DIM_Y, srcCol, src.step);
__syncthreads();
if (y < src.rows)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
sum_t sum = VecTraits<sum_t>::all(0);
sDataColumn += (threadIdx.y + BLOCK_DIM_Y - anchor) * BLOCK_DIM_X;
#pragma unroll
for(int i = 0; i < ksize; ++i)
sum = sum + sDataColumn[i * BLOCK_DIM_X] * cLinearKernel[i];
dst.ptr(y)[x] = saturate_cast<D>(sum);
}
}
}
}
namespace cv { namespace gpu { namespace filters
{
template <int ksize, typename T, typename D, template<typename> class B>
void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
{
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
B<T> b(src.rows);
if (!b.is_range_safe(-BLOCK_DIM_Y, (grid.y + 1) * BLOCK_DIM_Y - 1))
{
cv::gpu::error("linearColumnFilter: can't use specified border extrapolation, image is too small, "
"try bigger image or another border extrapolation mode", __FILE__, __LINE__);
}
filter_krnls_column::linearColumnFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, typename D>
void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
static const caller_t callers[5][17] =
{
{
0,
linearColumnFilter_caller<1 , T, D, BrdColReflect101>,
linearColumnFilter_caller<2 , T, D, BrdColReflect101>,
linearColumnFilter_caller<3 , T, D, BrdColReflect101>,
linearColumnFilter_caller<4 , T, D, BrdColReflect101>,
linearColumnFilter_caller<5 , T, D, BrdColReflect101>,
linearColumnFilter_caller<6 , T, D, BrdColReflect101>,
linearColumnFilter_caller<7 , T, D, BrdColReflect101>,
linearColumnFilter_caller<8 , T, D, BrdColReflect101>,
linearColumnFilter_caller<9 , T, D, BrdColReflect101>,
linearColumnFilter_caller<10, T, D, BrdColReflect101>,
linearColumnFilter_caller<11, T, D, BrdColReflect101>,
linearColumnFilter_caller<12, T, D, BrdColReflect101>,
linearColumnFilter_caller<13, T, D, BrdColReflect101>,
linearColumnFilter_caller<14, T, D, BrdColReflect101>,
linearColumnFilter_caller<15, T, D, BrdColReflect101>,
linearColumnFilter_caller<16, T, D, BrdColReflect101>
},
{
0,
linearColumnFilter_caller<1 , T, D, BrdColReplicate>,
linearColumnFilter_caller<2 , T, D, BrdColReplicate>,
linearColumnFilter_caller<3 , T, D, BrdColReplicate>,
linearColumnFilter_caller<4 , T, D, BrdColReplicate>,
linearColumnFilter_caller<5 , T, D, BrdColReplicate>,
linearColumnFilter_caller<6 , T, D, BrdColReplicate>,
linearColumnFilter_caller<7 , T, D, BrdColReplicate>,
linearColumnFilter_caller<8 , T, D, BrdColReplicate>,
linearColumnFilter_caller<9 , T, D, BrdColReplicate>,
linearColumnFilter_caller<10, T, D, BrdColReplicate>,
linearColumnFilter_caller<11, T, D, BrdColReplicate>,
linearColumnFilter_caller<12, T, D, BrdColReplicate>,
linearColumnFilter_caller<13, T, D, BrdColReplicate>,
linearColumnFilter_caller<14, T, D, BrdColReplicate>,
linearColumnFilter_caller<15, T, D, BrdColReplicate>,
linearColumnFilter_caller<16, T, D, BrdColReplicate>
},
{
0,
linearColumnFilter_caller<1 , T, D, BrdColConstant>,
linearColumnFilter_caller<2 , T, D, BrdColConstant>,
linearColumnFilter_caller<3 , T, D, BrdColConstant>,
linearColumnFilter_caller<4 , T, D, BrdColConstant>,
linearColumnFilter_caller<5 , T, D, BrdColConstant>,
linearColumnFilter_caller<6 , T, D, BrdColConstant>,
linearColumnFilter_caller<7 , T, D, BrdColConstant>,
linearColumnFilter_caller<8 , T, D, BrdColConstant>,
linearColumnFilter_caller<9 , T, D, BrdColConstant>,
linearColumnFilter_caller<10, T, D, BrdColConstant>,
linearColumnFilter_caller<11, T, D, BrdColConstant>,
linearColumnFilter_caller<12, T, D, BrdColConstant>,
linearColumnFilter_caller<13, T, D, BrdColConstant>,
linearColumnFilter_caller<14, T, D, BrdColConstant>,
linearColumnFilter_caller<15, T, D, BrdColConstant>,
linearColumnFilter_caller<16, T, D, BrdColConstant>
},
{
0,
linearColumnFilter_caller<1 , T, D, BrdColReflect>,
linearColumnFilter_caller<2 , T, D, BrdColReflect>,
linearColumnFilter_caller<3 , T, D, BrdColReflect>,
linearColumnFilter_caller<4 , T, D, BrdColReflect>,
linearColumnFilter_caller<5 , T, D, BrdColReflect>,
linearColumnFilter_caller<6 , T, D, BrdColReflect>,
linearColumnFilter_caller<7 , T, D, BrdColReflect>,
linearColumnFilter_caller<8 , T, D, BrdColReflect>,
linearColumnFilter_caller<9 , T, D, BrdColReflect>,
linearColumnFilter_caller<10, T, D, BrdColReflect>,
linearColumnFilter_caller<11, T, D, BrdColReflect>,
linearColumnFilter_caller<12, T, D, BrdColReflect>,
linearColumnFilter_caller<13, T, D, BrdColReflect>,
linearColumnFilter_caller<14, T, D, BrdColReflect>,
linearColumnFilter_caller<15, T, D, BrdColReflect>,
linearColumnFilter_caller<16, T, D, BrdColReflect>
},
{
0,
linearColumnFilter_caller<1 , T, D, BrdColWrap>,
linearColumnFilter_caller<2 , T, D, BrdColWrap>,
linearColumnFilter_caller<3 , T, D, BrdColWrap>,
linearColumnFilter_caller<4 , T, D, BrdColWrap>,
linearColumnFilter_caller<5 , T, D, BrdColWrap>,
linearColumnFilter_caller<6 , T, D, BrdColWrap>,
linearColumnFilter_caller<7 , T, D, BrdColWrap>,
linearColumnFilter_caller<8 , T, D, BrdColWrap>,
linearColumnFilter_caller<9 , T, D, BrdColWrap>,
linearColumnFilter_caller<10, T, D, BrdColWrap>,
linearColumnFilter_caller<11, T, D, BrdColWrap>,
linearColumnFilter_caller<12, T, D, BrdColWrap>,
linearColumnFilter_caller<13, T, D, BrdColWrap>,
linearColumnFilter_caller<14, T, D, BrdColWrap>,
linearColumnFilter_caller<15, T, D, BrdColWrap>,
linearColumnFilter_caller<16, T, D, BrdColWrap>,
}
};
filter_krnls_column::loadLinearKernel(kernel, ksize);
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
}
template void linearColumnFilter_gpu<float , uchar >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
//template void linearColumnFilter_gpu<float , short >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
//template void linearColumnFilter_gpu<float2, short2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float3, short3>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float , int >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
}}}
modules/gpu/src/cuda/imgproc.cu
View file @
be8e31f1
...
...
@@ -41,433 +41,16 @@
//M*/
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/
filters
.hpp"
#include "opencv2/gpu/device/
border_interpolate
.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
/////////////////////////////////// Remap ///////////////////////////////////////////////
namespace cv { namespace gpu { namespace imgproc
{
template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, DevMem2D_<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
const float xcoo = mapx.ptr(y)[x];
const float ycoo = mapy.ptr(y)[x];
dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
}
}
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
{
static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue, cudaStream_t stream)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep_<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep_<T>, B<work_type> > > filter_src(brdSrc);
remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
{
static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep_<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep_<T>, B<work_type> > > filter_src(brdSrc);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
#define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) \
texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_remap_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_remap_ ## type , x, y); \
} \
}; \
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
{ \
static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float* borderValue) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
TextureBinder texHandler(&tex_remap_ ## type , src); \
tex_remap_ ## type ##_reader texSrc; \
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
{ \
static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float*) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
TextureBinder texHandler(&tex_remap_ ## type , src); \
tex_remap_ ## type ##_reader texSrc; \
Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)
#undef OPENCV_GPU_IMPLEMENT_REMAP_TEX
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
{
static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue, cudaStream_t stream)
{
if (stream == 0)
RemapDispatcherNonStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue);
else
RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream);
}
};
template <typename T> void remap_gpu(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D_<T>& dst, const float* borderValue, cudaStream_t stream);
static const caller_t callers[3][5] =
{
{
RemapDispatcher<PointFilter, BrdReflect101, T>::call,
RemapDispatcher<PointFilter, BrdReplicate, T>::call,
RemapDispatcher<PointFilter, BrdConstant, T>::call,
RemapDispatcher<PointFilter, BrdReflect, T>::call,
RemapDispatcher<PointFilter, BrdWrap, T>::call
},
{
RemapDispatcher<LinearFilter, BrdReflect101, T>::call,
RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
RemapDispatcher<LinearFilter, BrdConstant, T>::call,
RemapDispatcher<LinearFilter, BrdReflect, T>::call,
RemapDispatcher<LinearFilter, BrdWrap, T>::call
},
{
RemapDispatcher<CubicFilter, BrdReflect101, T>::call,
RemapDispatcher<CubicFilter, BrdReplicate, T>::call,
RemapDispatcher<CubicFilter, BrdConstant, T>::call,
RemapDispatcher<CubicFilter, BrdReflect, T>::call,
RemapDispatcher<CubicFilter, BrdWrap, T>::call
}
};
callers[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), xmap, ymap, static_cast< DevMem2D_<T> >(dst), borderValue, stream);
}
template void remap_gpu<uchar >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<uchar2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<uchar3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<uchar4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<schar>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<char2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<char3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<char4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<ushort >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<ushort2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<ushort3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<ushort4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<short >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<short2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<short3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<short4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<uint >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<uint2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<uint3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<uint4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<int >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<int2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<int3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<int4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<float >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<float2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<float3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<float4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
/////////////////////////////////// Resize ///////////////////////////////////////////////
template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
const float xcoo = x / fx;
const float ycoo = y / fy;
dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
}
}
template <typename Ptr2D, typename T> __global__ void resizeNN(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
const float xcoo = x / fx;
const float ycoo = y / fy;
dst.ptr(y)[x] = src(__float2int_rd(ycoo), __float2int_rd(xcoo));
}
}
template <template <typename> class Filter, typename T> struct ResizeDispatcherStream
{
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep_<T>, BrdReplicate<T> > brdSrc(src, brd);
Filter< BorderReader< PtrStep_<T>, BrdReplicate<T> > > filter_src(brdSrc);
resize<<<grid, block, 0, stream>>>(filter_src, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <typename T> struct ResizeDispatcherStream<PointFilter, T>
{
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep_<T>, BrdReplicate<T> > brdSrc(src, brd);
resizeNN<<<grid, block, 0, stream>>>(brdSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream
{
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep_<T>, BrdReplicate<T> > brdSrc(src, brd);
Filter< BorderReader< PtrStep_<T>, BrdReplicate<T> > > filter_src(brdSrc);
resize<<<grid, block>>>(filter_src, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename T> struct ResizeDispatcherNonStream<PointFilter, T>
{
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep_<T>, BrdReplicate<T> > brdSrc(src, brd);
resizeNN<<<grid, block>>>(brdSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
#define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \
texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_resize_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_resize_ ## type , x, y); \
} \
}; \
template <template <typename> class Filter> struct ResizeDispatcherNonStream<Filter, type> \
{ \
static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
TextureBinder texHandler(&tex_resize_ ## type , src); \
tex_resize_ ## type ##_reader texSrc; \
Filter< tex_resize_ ## type ##_reader > filter_src(texSrc); \
resize<<<grid, block>>>(filter_src, fx, fy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <> struct ResizeDispatcherNonStream<PointFilter, type> \
{ \
static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
TextureBinder texHandler(&tex_resize_ ## type , src); \
tex_resize_ ## type ##_reader texSrc; \
resizeNN<<<grid, block>>>(texSrc, fx, fy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar2)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char2)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort2)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short2)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int2)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float2)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)
#undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX
template <template <typename> class Filter, typename T> struct ResizeDispatcher
{
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
{
if (stream == 0)
ResizeDispatcherNonStream<Filter, T>::call(src, fx, fy, dst);
else
ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);
}
};
template <typename T> void resize_gpu(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream);
static const caller_t callers[3] =
{
ResizeDispatcher<PointFilter, T>::call, ResizeDispatcher<LinearFilter, T>::call, ResizeDispatcher<CubicFilter, T>::call
};
callers[interpolation](static_cast< DevMem2D_<T> >(src), fx, fy, static_cast< DevMem2D_<T> >(dst), stream);
}
template void resize_gpu<uchar >(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uchar2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uchar3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uchar4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<schar>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<char2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<char3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<char4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort >(const DevMem2D& src,float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort2>(const DevMem2D& src,float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort3>(const DevMem2D& src,float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort4>(const DevMem2D& src,float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short >(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uint >(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uint2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uint3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uint4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<int >(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<int2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<int3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<int4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float >(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
/////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
texture<uchar4, 2> tex_meanshift;
...
...
@@ -1199,363 +782,7 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
/////////////////////////////////////////////////////////////////////////
// downsample
template <typename T, int cn>
__global__ void downsampleKernel(const PtrStep_<T> src, DevMem2D_<T> dst)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
int ch_x = x / cn;
dst.ptr(y)[x] = src.ptr(y*2)[ch_x*2*cn + x - ch_x*cn];
}
}
template <typename T, int cn>
void downsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
downsampleKernel<T,cn><<<grid, threads, 0, stream>>>(DevMem2D_<T>(src), DevMem2D_<T>(dst));
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template void downsampleCaller<uchar,1>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void downsampleCaller<uchar,2>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void downsampleCaller<uchar,3>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void downsampleCaller<uchar,4>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void downsampleCaller<short,1>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void downsampleCaller<short,2>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void downsampleCaller<short,3>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void downsampleCaller<short,4>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void downsampleCaller<float,1>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void downsampleCaller<float,2>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void downsampleCaller<float,3>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void downsampleCaller<float,4>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
//////////////////////////////////////////////////////////////////////////
// upsample
template <typename T, int cn>
__global__ void upsampleKernel(const PtrStep_<T> src, DevMem2D_<T> dst)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
int ch_x = x / cn;
T val = ((ch_x & 1) || (y & 1)) ? 0 : src.ptr(y/2)[ch_x/2*cn + x - ch_x*cn];
dst.ptr(y)[x] = val;
}
}
template <typename T, int cn>
void upsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
upsampleKernel<T,cn><<<grid, threads, 0, stream>>>(DevMem2D_<T>(src), DevMem2D_<T>(dst));
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template void upsampleCaller<uchar,1>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void upsampleCaller<uchar,2>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void upsampleCaller<uchar,3>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void upsampleCaller<uchar,4>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void upsampleCaller<short,1>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void upsampleCaller<short,2>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void upsampleCaller<short,3>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void upsampleCaller<short,4>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void upsampleCaller<float,1>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void upsampleCaller<float,2>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void upsampleCaller<float,3>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void upsampleCaller<float,4>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
//////////////////////////////////////////////////////////////////////////
// pyrDown
template <typename T, typename B> __global__ void pyrDown(const PtrStep_<T> src, PtrStep_<T> dst, const B b, int dst_cols)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y;
__shared__ value_type smem[256 + 4];
value_type sum;
const int src_y = 2*y;
sum = VecTraits<value_type>::all(0);
sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y - 1, x, src.data, src.step);
sum = sum + 0.375f * b.at(src_y , x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y + 1, x, src.data, src.step);
sum = sum + 0.0625f * b.at(src_y + 2, x, src.data, src.step);
smem[2 + threadIdx.x] = sum;
if (threadIdx.x < 2)
{
const int left_x = x - 2 + threadIdx.x;
sum = VecTraits<value_type>::all(0);
sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y - 1, left_x, src.data, src.step);
sum = sum + 0.375f * b.at(src_y , left_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y + 1, left_x, src.data, src.step);
sum = sum + 0.0625f * b.at(src_y + 2, left_x, src.data, src.step);
smem[threadIdx.x] = sum;
}
if (threadIdx.x > 253)
{
const int right_x = x + threadIdx.x + 2;
sum = VecTraits<value_type>::all(0);
sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y - 1, right_x, src.data, src.step);
sum = sum + 0.375f * b.at(src_y , right_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y + 1, right_x, src.data, src.step);
sum = sum + 0.0625f * b.at(src_y + 2, right_x, src.data, src.step);
smem[4 + threadIdx.x] = sum;
}
__syncthreads();
if (threadIdx.x < 128)
{
const int tid2 = threadIdx.x * 2;
sum = VecTraits<value_type>::all(0);
sum = sum + 0.0625f * smem[2 + tid2 - 2];
sum = sum + 0.25f * smem[2 + tid2 - 1];
sum = sum + 0.375f * smem[2 + tid2 ];
sum = sum + 0.25f * smem[2 + tid2 + 1];
sum = sum + 0.0625f * smem[2 + tid2 + 2];
const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;
if (dst_x < dst_cols)
dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
}
}
template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
{
const dim3 block(256);
const dim3 grid(divUp(src.cols, block.x), dst.rows);
B<T> b(src.rows, src.cols);
pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, int cn> void pyrDown_gpu(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type type;
typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
static const caller_t callers[] =
{
pyrDown_caller<type, BrdReflect101>, pyrDown_caller<type, BrdReplicate>, pyrDown_caller<type, BrdConstant>, pyrDown_caller<type, BrdReflect>, pyrDown_caller<type, BrdWrap>
};
callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
}
template void pyrDown_gpu<uchar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
//////////////////////////////////////////////////////////////////////////
// pyrUp
template <typename T, typename B> __global__ void pyrUp(const PtrStep_<T> src, DevMem2D_<T> dst, const B b)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
__shared__ T smem1[10][10];
__shared__ value_type smem2[20][16];
value_type sum;
if (threadIdx.x < 10 && threadIdx.y < 10)
smem1[threadIdx.y][threadIdx.x] = b.at(blockIdx.y * blockDim.y / 2 + threadIdx.y - 1, blockIdx.x * blockDim.x / 2 + threadIdx.x - 1, src.data, src.step);
__syncthreads();
const int tidx = threadIdx.x;
sum = VecTraits<value_type>::all(0);
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 2) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[1 + threadIdx.y / 2][1 + ((tidx ) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 2) >> 1)];
smem2[2 + threadIdx.y][tidx] = sum;
if (threadIdx.y < 2)
{
sum = VecTraits<value_type>::all(0);
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx - 2) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[0][1 + ((tidx - 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[0][1 + ((tidx ) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[0][1 + ((tidx + 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx + 2) >> 1)];
smem2[threadIdx.y][tidx] = sum;
}
if (threadIdx.y > 13)
{
sum = VecTraits<value_type>::all(0);
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx - 2) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[9][1 + ((tidx - 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[9][1 + ((tidx ) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[9][1 + ((tidx + 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx + 2) >> 1)];
smem2[4 + threadIdx.y][tidx] = sum;
}
__syncthreads();
sum = VecTraits<value_type>::all(0);
sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y - 2][tidx];
sum = sum + (tidx % 2 != 0) * 0.25f * smem2[2 + threadIdx.y - 1][tidx];
sum = sum + (tidx % 2 == 0) * 0.375f * smem2[2 + threadIdx.y ][tidx];
sum = sum + (tidx % 2 != 0) * 0.25f * smem2[2 + threadIdx.y + 1][tidx];
sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y + 2][tidx];
if (x < dst.cols && y < dst.rows)
dst.ptr(y)[x] = saturate_cast<T>(4.0f * sum);
}
template <typename T, template <typename> class B> void pyrUp_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
{
const dim3 block(16, 16);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<T> b(src.rows, src.cols);
pyrUp<T><<<grid, block, 0, stream>>>(src, dst, b);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, int cn> void pyrUp_gpu(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type type;
typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
static const caller_t callers[] =
{
pyrUp_caller<type, BrdReflect101>, pyrUp_caller<type, BrdReplicate>, pyrUp_caller<type, BrdConstant>, pyrUp_caller<type, BrdReflect>, pyrUp_caller<type, BrdWrap>
};
callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
}
template void pyrUp_gpu<uchar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<uchar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<uchar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<uchar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
}
//////////////////////////////////////////////////////////////////////////
// buildWarpMaps
...
...
modules/gpu/src/cuda/pyr_down.cu
0 → 100644
View file @
be8e31f1
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace imgproc
{
template <typename T, typename B> __global__ void pyrDown(const PtrStep_<T> src, PtrStep_<T> dst, const B b, int dst_cols)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y;
__shared__ value_type smem[256 + 4];
value_type sum;
const int src_y = 2*y;
sum = VecTraits<value_type>::all(0);
sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y - 1, x, src.data, src.step);
sum = sum + 0.375f * b.at(src_y , x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y + 1, x, src.data, src.step);
sum = sum + 0.0625f * b.at(src_y + 2, x, src.data, src.step);
smem[2 + threadIdx.x] = sum;
if (threadIdx.x < 2)
{
const int left_x = x - 2 + threadIdx.x;
sum = VecTraits<value_type>::all(0);
sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y - 1, left_x, src.data, src.step);
sum = sum + 0.375f * b.at(src_y , left_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y + 1, left_x, src.data, src.step);
sum = sum + 0.0625f * b.at(src_y + 2, left_x, src.data, src.step);
smem[threadIdx.x] = sum;
}
if (threadIdx.x > 253)
{
const int right_x = x + threadIdx.x + 2;
sum = VecTraits<value_type>::all(0);
sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y - 1, right_x, src.data, src.step);
sum = sum + 0.375f * b.at(src_y , right_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y + 1, right_x, src.data, src.step);
sum = sum + 0.0625f * b.at(src_y + 2, right_x, src.data, src.step);
smem[4 + threadIdx.x] = sum;
}
__syncthreads();
if (threadIdx.x < 128)
{
const int tid2 = threadIdx.x * 2;
sum = VecTraits<value_type>::all(0);
sum = sum + 0.0625f * smem[2 + tid2 - 2];
sum = sum + 0.25f * smem[2 + tid2 - 1];
sum = sum + 0.375f * smem[2 + tid2 ];
sum = sum + 0.25f * smem[2 + tid2 + 1];
sum = sum + 0.0625f * smem[2 + tid2 + 2];
const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;
if (dst_x < dst_cols)
dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
}
}
template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
{
const dim3 block(256);
const dim3 grid(divUp(src.cols, block.x), dst.rows);
B<T> b(src.rows, src.cols);
pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, int cn> void pyrDown_gpu(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type type;
typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
static const caller_t callers[] =
{
pyrDown_caller<type, BrdReflect101>, pyrDown_caller<type, BrdReplicate>, pyrDown_caller<type, BrdConstant>, pyrDown_caller<type, BrdReflect>, pyrDown_caller<type, BrdWrap>
};
callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
}
template void pyrDown_gpu<uchar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
}}}
modules/gpu/src/cuda/pyr_up.cu
0 → 100644
View file @
be8e31f1
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace imgproc
{
template <typename T, typename B> __global__ void pyrUp(const PtrStep_<T> src, DevMem2D_<T> dst, const B b)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
__shared__ T smem1[10][10];
__shared__ value_type smem2[20][16];
value_type sum;
if (threadIdx.x < 10 && threadIdx.y < 10)
smem1[threadIdx.y][threadIdx.x] = b.at(blockIdx.y * blockDim.y / 2 + threadIdx.y - 1, blockIdx.x * blockDim.x / 2 + threadIdx.x - 1, src.data, src.step);
__syncthreads();
const int tidx = threadIdx.x;
sum = VecTraits<value_type>::all(0);
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 2) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[1 + threadIdx.y / 2][1 + ((tidx ) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 2) >> 1)];
smem2[2 + threadIdx.y][tidx] = sum;
if (threadIdx.y < 2)
{
sum = VecTraits<value_type>::all(0);
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx - 2) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[0][1 + ((tidx - 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[0][1 + ((tidx ) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[0][1 + ((tidx + 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx + 2) >> 1)];
smem2[threadIdx.y][tidx] = sum;
}
if (threadIdx.y > 13)
{
sum = VecTraits<value_type>::all(0);
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx - 2) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[9][1 + ((tidx - 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.375f * smem1[9][1 + ((tidx ) >> 1)];
sum = sum + (tidx % 2 != 0) * 0.25f * smem1[9][1 + ((tidx + 1) >> 1)];
sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx + 2) >> 1)];
smem2[4 + threadIdx.y][tidx] = sum;
}
__syncthreads();
sum = VecTraits<value_type>::all(0);
sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y - 2][tidx];
sum = sum + (tidx % 2 != 0) * 0.25f * smem2[2 + threadIdx.y - 1][tidx];
sum = sum + (tidx % 2 == 0) * 0.375f * smem2[2 + threadIdx.y ][tidx];
sum = sum + (tidx % 2 != 0) * 0.25f * smem2[2 + threadIdx.y + 1][tidx];
sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y + 2][tidx];
if (x < dst.cols && y < dst.rows)
dst.ptr(y)[x] = saturate_cast<T>(4.0f * sum);
}
template <typename T, template <typename> class B> void pyrUp_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
{
const dim3 block(16, 16);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<T> b(src.rows, src.cols);
pyrUp<T><<<grid, block, 0, stream>>>(src, dst, b);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, int cn> void pyrUp_gpu(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type type;
typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
static const caller_t callers[] =
{
pyrUp_caller<type, BrdReflect101>, pyrUp_caller<type, BrdReplicate>, pyrUp_caller<type, BrdConstant>, pyrUp_caller<type, BrdReflect>, pyrUp_caller<type, BrdWrap>
};
callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
}
template void pyrUp_gpu<uchar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<uchar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<uchar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<uchar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
}}}
modules/gpu/src/cuda/remap.cu
0 → 100644
View file @
be8e31f1
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/filters.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace imgproc
{
template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, DevMem2D_<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
const float xcoo = mapx.ptr(y)[x];
const float ycoo = mapy.ptr(y)[x];
dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
}
}
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
{
static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue, cudaStream_t stream)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep_<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep_<T>, B<work_type> > > filter_src(brdSrc);
remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
{
static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep_<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep_<T>, B<work_type> > > filter_src(brdSrc);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
#define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) \
texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_remap_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_remap_ ## type , x, y); \
} \
}; \
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
{ \
static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float* borderValue) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
TextureBinder texHandler(&tex_remap_ ## type , src); \
tex_remap_ ## type ##_reader texSrc; \
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
{ \
static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float*) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
TextureBinder texHandler(&tex_remap_ ## type , src); \
tex_remap_ ## type ##_reader texSrc; \
Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)
#undef OPENCV_GPU_IMPLEMENT_REMAP_TEX
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
{
static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue, cudaStream_t stream)
{
if (stream == 0)
RemapDispatcherNonStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue);
else
RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream);
}
};
template <typename T> void remap_gpu(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D_<T>& dst, const float* borderValue, cudaStream_t stream);
static const caller_t callers[3][5] =
{
{
RemapDispatcher<PointFilter, BrdReflect101, T>::call,
RemapDispatcher<PointFilter, BrdReplicate, T>::call,
RemapDispatcher<PointFilter, BrdConstant, T>::call,
RemapDispatcher<PointFilter, BrdReflect, T>::call,
RemapDispatcher<PointFilter, BrdWrap, T>::call
},
{
RemapDispatcher<LinearFilter, BrdReflect101, T>::call,
RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
RemapDispatcher<LinearFilter, BrdConstant, T>::call,
RemapDispatcher<LinearFilter, BrdReflect, T>::call,
RemapDispatcher<LinearFilter, BrdWrap, T>::call
},
{
RemapDispatcher<CubicFilter, BrdReflect101, T>::call,
RemapDispatcher<CubicFilter, BrdReplicate, T>::call,
RemapDispatcher<CubicFilter, BrdConstant, T>::call,
RemapDispatcher<CubicFilter, BrdReflect, T>::call,
RemapDispatcher<CubicFilter, BrdWrap, T>::call
}
};
callers[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), xmap, ymap, static_cast< DevMem2D_<T> >(dst), borderValue, stream);
}
template void remap_gpu<uchar >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
//template void remap_gpu<uchar2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<uchar3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<uchar4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
//template void remap_gpu<schar>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
//template void remap_gpu<char2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
//template void remap_gpu<char3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
//template void remap_gpu<char4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<ushort >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
//template void remap_gpu<ushort2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<ushort3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<ushort4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<short >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
//template void remap_gpu<short2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<short3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<short4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
//template void remap_gpu<int >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
//template void remap_gpu<int2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
//template void remap_gpu<int3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
//template void remap_gpu<int4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<float >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
//template void remap_gpu<float2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<float3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
template void remap_gpu<float4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream);
}}}
modules/gpu/src/cuda/resize.cu
0 → 100644
View file @
be8e31f1
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/filters.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace imgproc
{
template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
const float xcoo = x / fx;
const float ycoo = y / fy;
dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
}
}
template <typename Ptr2D, typename T> __global__ void resizeNN(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
const float xcoo = x / fx;
const float ycoo = y / fy;
dst.ptr(y)[x] = src(__float2int_rd(ycoo), __float2int_rd(xcoo));
}
}
template <template <typename> class Filter, typename T> struct ResizeDispatcherStream
{
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep_<T>, BrdReplicate<T> > brdSrc(src, brd);
Filter< BorderReader< PtrStep_<T>, BrdReplicate<T> > > filter_src(brdSrc);
resize<<<grid, block, 0, stream>>>(filter_src, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <typename T> struct ResizeDispatcherStream<PointFilter, T>
{
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep_<T>, BrdReplicate<T> > brdSrc(src, brd);
resizeNN<<<grid, block, 0, stream>>>(brdSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream
{
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep_<T>, BrdReplicate<T> > brdSrc(src, brd);
Filter< BorderReader< PtrStep_<T>, BrdReplicate<T> > > filter_src(brdSrc);
resize<<<grid, block>>>(filter_src, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename T> struct ResizeDispatcherNonStream<PointFilter, T>
{
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep_<T>, BrdReplicate<T> > brdSrc(src, brd);
resizeNN<<<grid, block>>>(brdSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
#define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \
texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_resize_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_resize_ ## type , x, y); \
} \
}; \
template <template <typename> class Filter> struct ResizeDispatcherNonStream<Filter, type> \
{ \
static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
TextureBinder texHandler(&tex_resize_ ## type , src); \
tex_resize_ ## type ##_reader texSrc; \
Filter< tex_resize_ ## type ##_reader > filter_src(texSrc); \
resize<<<grid, block>>>(filter_src, fx, fy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <> struct ResizeDispatcherNonStream<PointFilter, type> \
{ \
static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
TextureBinder texHandler(&tex_resize_ ## type , src); \
tex_resize_ ## type ##_reader texSrc; \
resizeNN<<<grid, block>>>(texSrc, fx, fy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar2)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char2)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort2)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short2)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int2)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float2)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)
#undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX
template <template <typename> class Filter, typename T> struct ResizeDispatcher
{
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
{
if (stream == 0)
ResizeDispatcherNonStream<Filter, T>::call(src, fx, fy, dst);
else
ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);
}
};
template <typename T> void resize_gpu(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream);
static const caller_t callers[3] =
{
ResizeDispatcher<PointFilter, T>::call, ResizeDispatcher<LinearFilter, T>::call, ResizeDispatcher<CubicFilter, T>::call
};
callers[interpolation](static_cast< DevMem2D_<T> >(src), fx, fy, static_cast< DevMem2D_<T> >(dst), stream);
}
template void resize_gpu<uchar >(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<uchar2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uchar3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uchar4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<schar>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort >(const DevMem2D& src,float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<ushort2>(const DevMem2D& src,float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort3>(const DevMem2D& src,float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort4>(const DevMem2D& src,float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short >(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<short2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int >(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float >(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<float2>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float3>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float4>(const DevMem2D& src, float fx, float fy, const DevMem2D& dst, int interpolation, cudaStream_t stream);
}}}
modules/gpu/src/cuda/
filters
.cu
→
modules/gpu/src/cuda/
row_filter
.cu
View file @
be8e31f1
...
...
@@ -49,28 +49,19 @@
using namespace cv::gpu;
using namespace cv::gpu::device;
/////////////////////////////////////////////////////////////////////////////////////////////////
// Linear filters
#define MAX_KERNEL_SIZE 16
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 16
namespace filter_krnls
namespace filter_krnls
_row
{
__constant__ float cLinearKernel[MAX_KERNEL_SIZE];
}
namespace cv { namespace gpu { namespace filters
{
void loadLinearKernel(const float kernel[], int ksize)
{
cudaSafeCall( cudaMemcpyToSymbol(
filter_krnls::
cLinearKernel, kernel, ksize * sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cLinearKernel, kernel, ksize * sizeof(float)) );
}
}}}
namespace filter_krnls
{
template <typename T, size_t size> struct SmemType_
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t;
...
...
@@ -131,7 +122,7 @@ namespace cv { namespace gpu { namespace filters
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
typedef typename filter_krnls::SmemType<T>::smem_t smem_t;
typedef typename filter_krnls
_row
::SmemType<T>::smem_t smem_t;
B<smem_t> b(src.cols);
if (!b.is_range_safe(-BLOCK_DIM_X, (grid.x + 1) * BLOCK_DIM_X - 1))
...
...
@@ -140,7 +131,7 @@ namespace cv { namespace gpu { namespace filters
"try bigger image or another border extrapolation mode", __FILE__, __LINE__);
}
filter_krnls::linearRowFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b);
filter_krnls
_row
::linearRowFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
...
...
@@ -250,384 +241,16 @@ namespace cv { namespace gpu { namespace filters
}
};
loadLinearKernel(kernel, ksize);
filter_krnls_row::
loadLinearKernel(kernel, ksize);
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
}
template void linearRowFilter_gpu<uchar , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<uchar4, float4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<short , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<short2, float2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
//
template void linearRowFilter_gpu<short , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
//
template void linearRowFilter_gpu<short2, float2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<short3, float3>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<int , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
}}}
namespace filter_krnls
{
template <int ksize, typename T, typename D, typename B>
__global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b)
{
__shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];
const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;
T* sDataColumn = smem + threadIdx.x;
if (x < src.cols)
{
const T* srcCol = src.ptr() + x;
sDataColumn[ threadIdx.y * BLOCK_DIM_X] = b.at_low(y - BLOCK_DIM_Y, srcCol, src.step);
sDataColumn[(threadIdx.y + BLOCK_DIM_Y) * BLOCK_DIM_X] = b.at_high(y, srcCol, src.step);
sDataColumn[(threadIdx.y + BLOCK_DIM_Y * 2) * BLOCK_DIM_X] = b.at_high(y + BLOCK_DIM_Y, srcCol, src.step);
__syncthreads();
if (y < src.rows)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
sum_t sum = VecTraits<sum_t>::all(0);
sDataColumn += (threadIdx.y + BLOCK_DIM_Y - anchor) * BLOCK_DIM_X;
#pragma unroll
for(int i = 0; i < ksize; ++i)
sum = sum + sDataColumn[i * BLOCK_DIM_X] * cLinearKernel[i];
dst.ptr(y)[x] = saturate_cast<D>(sum);
}
}
}
}
namespace cv { namespace gpu { namespace filters
{
template <int ksize, typename T, typename D, template<typename> class B>
void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
{
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
B<T> b(src.rows);
if (!b.is_range_safe(-BLOCK_DIM_Y, (grid.y + 1) * BLOCK_DIM_Y - 1))
{
cv::gpu::error("linearColumnFilter: can't use specified border extrapolation, image is too small, "
"try bigger image or another border extrapolation mode", __FILE__, __LINE__);
}
filter_krnls::linearColumnFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, typename D>
void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
static const caller_t callers[5][17] =
{
{
0,
linearColumnFilter_caller<1 , T, D, BrdColReflect101>,
linearColumnFilter_caller<2 , T, D, BrdColReflect101>,
linearColumnFilter_caller<3 , T, D, BrdColReflect101>,
linearColumnFilter_caller<4 , T, D, BrdColReflect101>,
linearColumnFilter_caller<5 , T, D, BrdColReflect101>,
linearColumnFilter_caller<6 , T, D, BrdColReflect101>,
linearColumnFilter_caller<7 , T, D, BrdColReflect101>,
linearColumnFilter_caller<8 , T, D, BrdColReflect101>,
linearColumnFilter_caller<9 , T, D, BrdColReflect101>,
linearColumnFilter_caller<10, T, D, BrdColReflect101>,
linearColumnFilter_caller<11, T, D, BrdColReflect101>,
linearColumnFilter_caller<12, T, D, BrdColReflect101>,
linearColumnFilter_caller<13, T, D, BrdColReflect101>,
linearColumnFilter_caller<14, T, D, BrdColReflect101>,
linearColumnFilter_caller<15, T, D, BrdColReflect101>,
linearColumnFilter_caller<16, T, D, BrdColReflect101>
},
{
0,
linearColumnFilter_caller<1 , T, D, BrdColReplicate>,
linearColumnFilter_caller<2 , T, D, BrdColReplicate>,
linearColumnFilter_caller<3 , T, D, BrdColReplicate>,
linearColumnFilter_caller<4 , T, D, BrdColReplicate>,
linearColumnFilter_caller<5 , T, D, BrdColReplicate>,
linearColumnFilter_caller<6 , T, D, BrdColReplicate>,
linearColumnFilter_caller<7 , T, D, BrdColReplicate>,
linearColumnFilter_caller<8 , T, D, BrdColReplicate>,
linearColumnFilter_caller<9 , T, D, BrdColReplicate>,
linearColumnFilter_caller<10, T, D, BrdColReplicate>,
linearColumnFilter_caller<11, T, D, BrdColReplicate>,
linearColumnFilter_caller<12, T, D, BrdColReplicate>,
linearColumnFilter_caller<13, T, D, BrdColReplicate>,
linearColumnFilter_caller<14, T, D, BrdColReplicate>,
linearColumnFilter_caller<15, T, D, BrdColReplicate>,
linearColumnFilter_caller<16, T, D, BrdColReplicate>
},
{
0,
linearColumnFilter_caller<1 , T, D, BrdColConstant>,
linearColumnFilter_caller<2 , T, D, BrdColConstant>,
linearColumnFilter_caller<3 , T, D, BrdColConstant>,
linearColumnFilter_caller<4 , T, D, BrdColConstant>,
linearColumnFilter_caller<5 , T, D, BrdColConstant>,
linearColumnFilter_caller<6 , T, D, BrdColConstant>,
linearColumnFilter_caller<7 , T, D, BrdColConstant>,
linearColumnFilter_caller<8 , T, D, BrdColConstant>,
linearColumnFilter_caller<9 , T, D, BrdColConstant>,
linearColumnFilter_caller<10, T, D, BrdColConstant>,
linearColumnFilter_caller<11, T, D, BrdColConstant>,
linearColumnFilter_caller<12, T, D, BrdColConstant>,
linearColumnFilter_caller<13, T, D, BrdColConstant>,
linearColumnFilter_caller<14, T, D, BrdColConstant>,
linearColumnFilter_caller<15, T, D, BrdColConstant>,
linearColumnFilter_caller<16, T, D, BrdColConstant>
},
{
0,
linearColumnFilter_caller<1 , T, D, BrdColReflect>,
linearColumnFilter_caller<2 , T, D, BrdColReflect>,
linearColumnFilter_caller<3 , T, D, BrdColReflect>,
linearColumnFilter_caller<4 , T, D, BrdColReflect>,
linearColumnFilter_caller<5 , T, D, BrdColReflect>,
linearColumnFilter_caller<6 , T, D, BrdColReflect>,
linearColumnFilter_caller<7 , T, D, BrdColReflect>,
linearColumnFilter_caller<8 , T, D, BrdColReflect>,
linearColumnFilter_caller<9 , T, D, BrdColReflect>,
linearColumnFilter_caller<10, T, D, BrdColReflect>,
linearColumnFilter_caller<11, T, D, BrdColReflect>,
linearColumnFilter_caller<12, T, D, BrdColReflect>,
linearColumnFilter_caller<13, T, D, BrdColReflect>,
linearColumnFilter_caller<14, T, D, BrdColReflect>,
linearColumnFilter_caller<15, T, D, BrdColReflect>,
linearColumnFilter_caller<16, T, D, BrdColReflect>
},
{
0,
linearColumnFilter_caller<1 , T, D, BrdColWrap>,
linearColumnFilter_caller<2 , T, D, BrdColWrap>,
linearColumnFilter_caller<3 , T, D, BrdColWrap>,
linearColumnFilter_caller<4 , T, D, BrdColWrap>,
linearColumnFilter_caller<5 , T, D, BrdColWrap>,
linearColumnFilter_caller<6 , T, D, BrdColWrap>,
linearColumnFilter_caller<7 , T, D, BrdColWrap>,
linearColumnFilter_caller<8 , T, D, BrdColWrap>,
linearColumnFilter_caller<9 , T, D, BrdColWrap>,
linearColumnFilter_caller<10, T, D, BrdColWrap>,
linearColumnFilter_caller<11, T, D, BrdColWrap>,
linearColumnFilter_caller<12, T, D, BrdColWrap>,
linearColumnFilter_caller<13, T, D, BrdColWrap>,
linearColumnFilter_caller<14, T, D, BrdColWrap>,
linearColumnFilter_caller<15, T, D, BrdColWrap>,
linearColumnFilter_caller<16, T, D, BrdColWrap>,
}
};
loadLinearKernel(kernel, ksize);
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
}
template void linearColumnFilter_gpu<float , uchar >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float , short >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float2, short2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float3, short3>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float , int >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
}}}
/////////////////////////////////////////////////////////////////////////////////////////////////
// Bilateral filters
namespace bf_krnls
{
__constant__ float* ctable_color;
__constant__ float* ctable_space;
__constant__ size_t ctable_space_step;
__constant__ int cndisp;
__constant__ int cradius;
__constant__ short cedge_disc;
__constant__ short cmax_disc;
}
namespace cv { namespace gpu { namespace bf
{
void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc)
{
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_color, &table_color, sizeof(table_color)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.data, sizeof(table_space.data)) );
size_t table_space_step = table_space.step / sizeof(float);
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cndisp, &ndisp, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cradius, &radius, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cedge_disc, &edge_disc, sizeof(short)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cmax_disc, &max_disc, sizeof(short)) );
}
}}}
namespace bf_krnls
{
template <int channels>
struct DistRgbMax
{
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
{
uchar x = abs(a[0] - b[0]);
uchar y = abs(a[1] - b[1]);
uchar z = abs(a[2] - b[2]);
return (max(max(x, y), z));
}
};
template <>
struct DistRgbMax<1>
{
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
{
return abs(a[0] - b[0]);
}
};
template <int channels, typename T>
__global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)
{
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
T dp[5];
if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
{
dp[0] = *(disp + (y ) * disp_step + x + 0);
dp[1] = *(disp + (y-1) * disp_step + x + 0);
dp[2] = *(disp + (y ) * disp_step + x - 1);
dp[3] = *(disp + (y+1) * disp_step + x + 0);
dp[4] = *(disp + (y ) * disp_step + x + 1);
if(abs(dp[1] - dp[0]) >= cedge_disc || abs(dp[2] - dp[0]) >= cedge_disc || abs(dp[3] - dp[0]) >= cedge_disc || abs(dp[4] - dp[0]) >= cedge_disc)
{
const int ymin = max(0, y - cradius);
const int xmin = max(0, x - cradius);
const int ymax = min(h - 1, y + cradius);
const int xmax = min(w - 1, x + cradius);
float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
const uchar* ic = img + y * img_step + channels * x;
for(int yi = ymin; yi <= ymax; yi++)
{
const T* disp_y = disp + yi * disp_step;
for(int xi = xmin; xi <= xmax; xi++)
{
const uchar* in = img + yi * img_step + channels * xi;
uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);
const float weight = ctable_color[dist_rgb] * (ctable_space + abs(y-yi)* ctable_space_step)[abs(x-xi)];
const T disp_reg = disp_y[xi];
cost[0] += min(cmax_disc, abs(disp_reg - dp[0])) * weight;
cost[1] += min(cmax_disc, abs(disp_reg - dp[1])) * weight;
cost[2] += min(cmax_disc, abs(disp_reg - dp[2])) * weight;
cost[3] += min(cmax_disc, abs(disp_reg - dp[3])) * weight;
cost[4] += min(cmax_disc, abs(disp_reg - dp[4])) * weight;
}
}
float minimum = numeric_limits<float>::max();
int id = 0;
if (cost[0] < minimum)
{
minimum = cost[0];
id = 0;
}
if (cost[1] < minimum)
{
minimum = cost[1];
id = 1;
}
if (cost[2] < minimum)
{
minimum = cost[2];
id = 2;
}
if (cost[3] < minimum)
{
minimum = cost[3];
id = 3;
}
if (cost[4] < minimum)
{
minimum = cost[4];
id = 4;
}
*(disp + y * disp_step + x) = dp[id];
}
}
}
}
namespace cv { namespace gpu { namespace bf
{
template <typename T>
void bilateral_filter_caller(const DevMem2D_<T>& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(disp.cols, threads.x << 1);
grid.y = divUp(disp.rows, threads.y);
switch (channels)
{
case 1:
for (int i = 0; i < iters; ++i)
{
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
}
break;
case 3:
for (int i = 0; i < iters; ++i)
{
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
}
break;
default:
cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
}
if (stream != 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void bilateral_filter_gpu(const DevMem2D& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
{
bilateral_filter_caller(disp, img, channels, iters, stream);
}
void bilateral_filter_gpu(const DevMem2D_<short>& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
{
bilateral_filter_caller(disp, img, channels, iters, stream);
}
}}}
modules/gpu/src/filtering.cpp
View file @
be8e31f1
...
...
@@ -722,7 +722,7 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
int
gpuBorderType
;
CV_Assert
(
tryConvertToGpuBorderType
(
borderType
,
gpuBorderType
));
CV_Assert
(
srcType
==
CV_8UC1
||
srcType
==
CV_8UC4
||
srcType
==
CV_16SC1
||
srcType
==
CV_16SC2
CV_Assert
(
srcType
==
CV_8UC1
||
srcType
==
CV_8UC4
/*|| srcType == CV_16SC1*/
/*|| srcType == CV_16SC2*/
||
srcType
==
CV_16SC3
||
srcType
==
CV_32SC1
||
srcType
==
CV_32FC1
);
CV_Assert
(
CV_MAT_DEPTH
(
bufType
)
==
CV_32F
&&
CV_MAT_CN
(
srcType
)
==
CV_MAT_CN
(
bufType
));
...
...
@@ -747,12 +747,12 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
case
CV_8UC4
:
func
=
filters
::
linearRowFilter_gpu
<
uchar4
,
float4
>
;
break
;
case
CV_16SC1
:
/*
case CV_16SC1:
func = filters::linearRowFilter_gpu<short, float>;
break
;
case
CV_16SC2
:
break;
*/
/*
case CV_16SC2:
func = filters::linearRowFilter_gpu<short2, float2>;
break
;
break;
*/
case
CV_16SC3
:
func
=
filters
::
linearRowFilter_gpu
<
short3
,
float3
>
;
break
;
...
...
@@ -837,7 +837,7 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
int
gpuBorderType
;
CV_Assert
(
tryConvertToGpuBorderType
(
borderType
,
gpuBorderType
));
CV_Assert
(
dstType
==
CV_8UC1
||
dstType
==
CV_8UC4
||
dstType
==
CV_16SC1
||
dstType
==
CV_16SC2
CV_Assert
(
dstType
==
CV_8UC1
||
dstType
==
CV_8UC4
/*|| dstType == CV_16SC1*/
/*|| dstType == CV_16SC2*/
||
dstType
==
CV_16SC3
||
dstType
==
CV_32SC1
||
dstType
==
CV_32FC1
);
CV_Assert
(
CV_MAT_DEPTH
(
bufType
)
==
CV_32F
&&
CV_MAT_CN
(
dstType
)
==
CV_MAT_CN
(
bufType
));
...
...
@@ -862,12 +862,12 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
case
CV_8UC4
:
func
=
filters
::
linearColumnFilter_gpu
<
float4
,
uchar4
>
;
break
;
case
CV_16SC1
:
/*
case CV_16SC1:
func = filters::linearColumnFilter_gpu<float, short>;
break
;
case
CV_16SC2
:
break;
*/
/*
case CV_16SC2:
func = filters::linearColumnFilter_gpu<float2, short2>;
break
;
break;
*/
case
CV_16SC3
:
func
=
filters
::
linearColumnFilter_gpu
<
float3
,
short3
>
;
break
;
...
...
modules/gpu/src/imgproc.cpp
View file @
be8e31f1
...
...
@@ -90,8 +90,6 @@ void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int) { throw_nogpu(); }
void
cv
::
gpu
::
ConvolveBuf
::
create
(
Size
,
Size
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
convolve
(
const
GpuMat
&
,
const
GpuMat
&
,
GpuMat
&
,
bool
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
convolve
(
const
GpuMat
&
,
const
GpuMat
&
,
GpuMat
&
,
bool
,
ConvolveBuf
&
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
downsample
(
const
GpuMat
&
,
GpuMat
&
,
Stream
&
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
upsample
(
const
GpuMat
&
,
GpuMat
&
,
Stream
&
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
pyrDown
(
const
GpuMat
&
,
GpuMat
&
,
int
,
Stream
&
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
pyrUp
(
const
GpuMat
&
,
GpuMat
&
,
int
,
Stream
&
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
Canny
(
const
GpuMat
&
,
GpuMat
&
,
double
,
double
,
int
,
bool
)
{
throw_nogpu
();
}
...
...
@@ -120,17 +118,20 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
typedef
void
(
*
caller_t
)(
const
DevMem2D
&
src
,
const
DevMem2Df
&
xmap
,
const
DevMem2Df
&
ymap
,
const
DevMem2D
&
dst
,
int
interpolation
,
int
borderMode
,
const
float
*
borderValue
,
cudaStream_t
stream
);
static
const
caller_t
callers
[
6
][
4
]
=
{
{
remap_gpu
<
uchar
>
,
remap_gpu
<
uchar2
>
,
remap_gpu
<
uchar3
>
,
remap_gpu
<
uchar4
>
},
{
remap_gpu
<
schar
>
,
remap_gpu
<
char2
>
,
remap_gpu
<
char3
>
,
remap_gpu
<
char4
>
},
{
remap_gpu
<
ushort
>
,
remap_gpu
<
ushort2
>
,
remap_gpu
<
ushort3
>
,
remap_gpu
<
ushort4
>
},
{
remap_gpu
<
short
>
,
remap_gpu
<
short2
>
,
remap_gpu
<
short3
>
,
remap_gpu
<
short4
>
},
{
remap_gpu
<
int
>
,
remap_gpu
<
int2
>
,
remap_gpu
<
int3
>
,
remap_gpu
<
int4
>
},
{
remap_gpu
<
float
>
,
remap_gpu
<
float2
>
,
remap_gpu
<
float3
>
,
remap_gpu
<
float4
>
}
{
remap_gpu
<
uchar
>
,
0
/*remap_gpu<uchar2>*/
,
remap_gpu
<
uchar3
>
,
remap_gpu
<
uchar4
>
},
{
0
/*remap_gpu<schar>*/
,
0
/*remap_gpu<char2>*/
,
0
/*remap_gpu<char3>*/
,
0
/*remap_gpu<char4>*/
},
{
remap_gpu
<
ushort
>
,
0
/*remap_gpu<ushort2>*/
,
remap_gpu
<
ushort3
>
,
remap_gpu
<
ushort4
>
},
{
remap_gpu
<
short
>
,
0
/*remap_gpu<short2>*/
,
remap_gpu
<
short3
>
,
remap_gpu
<
short4
>
},
{
0
/*remap_gpu<int>*/
,
0
/*remap_gpu<int2>*/
,
0
/*remap_gpu<int3>*/
,
0
/*remap_gpu<int4>*/
},
{
remap_gpu
<
float
>
,
0
/*remap_gpu<float2>*/
,
remap_gpu
<
float3
>
,
remap_gpu
<
float4
>
}
};
CV_Assert
(
src
.
depth
()
<=
CV_32F
&&
src
.
channels
()
<=
4
);
CV_Assert
(
xmap
.
type
()
==
CV_32F
&&
ymap
.
type
()
==
CV_32F
&&
xmap
.
size
()
==
ymap
.
size
());
caller_t
func
=
callers
[
src
.
depth
()][
src
.
channels
()
-
1
];
CV_Assert
(
func
!=
0
);
CV_Assert
(
interpolation
==
INTER_NEAREST
||
interpolation
==
INTER_LINEAR
||
interpolation
==
INTER_CUBIC
);
CV_Assert
(
borderMode
==
BORDER_REFLECT101
||
borderMode
==
BORDER_REPLICATE
||
borderMode
==
BORDER_CONSTANT
||
borderMode
==
BORDER_REFLECT
||
borderMode
==
BORDER_WRAP
);
...
...
@@ -142,7 +143,7 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
Scalar_
<
float
>
borderValueFloat
;
borderValueFloat
=
borderValue
;
callers
[
src
.
depth
()][
src
.
channels
()
-
1
]
(
src
,
xmap
,
ymap
,
dst
,
interpolation
,
gpuBorderType
,
borderValueFloat
.
val
,
StreamAccessor
::
getStream
(
stream
));
func
(
src
,
xmap
,
ymap
,
dst
,
interpolation
,
gpuBorderType
,
borderValueFloat
.
val
,
StreamAccessor
::
getStream
(
stream
));
}
////////////////////////////////////////////////////////////////////////
...
...
@@ -279,19 +280,6 @@ namespace cv { namespace gpu { namespace imgproc
void
cv
::
gpu
::
resize
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
Size
dsize
,
double
fx
,
double
fy
,
int
interpolation
,
Stream
&
s
)
{
using
namespace
cv
::
gpu
::
imgproc
;
typedef
void
(
*
caller_t
)(
const
DevMem2D
&
src
,
float
fx
,
float
fy
,
const
DevMem2D
&
dst
,
int
interpolation
,
cudaStream_t
stream
);
static
const
caller_t
callers
[
6
][
4
]
=
{
{
resize_gpu
<
uchar
>
,
resize_gpu
<
uchar2
>
,
resize_gpu
<
uchar3
>
,
resize_gpu
<
uchar4
>
},
{
resize_gpu
<
schar
>
,
resize_gpu
<
char2
>
,
resize_gpu
<
char3
>
,
resize_gpu
<
char4
>
},
{
resize_gpu
<
ushort
>
,
resize_gpu
<
ushort2
>
,
resize_gpu
<
ushort3
>
,
resize_gpu
<
ushort4
>
},
{
resize_gpu
<
short
>
,
resize_gpu
<
short2
>
,
resize_gpu
<
short3
>
,
resize_gpu
<
short4
>
},
{
resize_gpu
<
int
>
,
resize_gpu
<
int2
>
,
resize_gpu
<
int3
>
,
resize_gpu
<
int4
>
},
{
resize_gpu
<
float
>
,
resize_gpu
<
float2
>
,
resize_gpu
<
float3
>
,
resize_gpu
<
float4
>
}
};
CV_Assert
(
src
.
depth
()
<=
CV_32F
&&
src
.
channels
()
<=
4
);
CV_Assert
(
interpolation
==
INTER_NEAREST
||
interpolation
==
INTER_LINEAR
||
interpolation
==
INTER_CUBIC
);
CV_Assert
(
!
(
dsize
==
Size
())
||
(
fx
>
0
&&
fy
>
0
)
);
...
...
@@ -352,6 +340,19 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub
}
else
{
using
namespace
cv
::
gpu
::
imgproc
;
typedef
void
(
*
caller_t
)(
const
DevMem2D
&
src
,
float
fx
,
float
fy
,
const
DevMem2D
&
dst
,
int
interpolation
,
cudaStream_t
stream
);
static
const
caller_t
callers
[
6
][
4
]
=
{
{
resize_gpu
<
uchar
>
,
0
/*resize_gpu<uchar2>*/
,
resize_gpu
<
uchar3
>
,
resize_gpu
<
uchar4
>
},
{
0
/*resize_gpu<schar>*/
,
0
/*resize_gpu<char2>*/
,
0
/*resize_gpu<char3>*/
,
0
/*resize_gpu<char4>*/
},
{
resize_gpu
<
ushort
>
,
0
/*resize_gpu<ushort2>*/
,
resize_gpu
<
ushort3
>
,
resize_gpu
<
ushort4
>
},
{
resize_gpu
<
short
>
,
0
/*resize_gpu<short2>*/
,
resize_gpu
<
short3
>
,
resize_gpu
<
short4
>
},
{
0
/*resize_gpu<int>*/
,
0
/*resize_gpu<int2>*/
,
0
/*resize_gpu<int3>*/
,
0
/*resize_gpu<int4>*/
},
{
resize_gpu
<
float
>
,
0
/*resize_gpu<float2>*/
,
resize_gpu
<
float3
>
,
resize_gpu
<
float4
>
}
};
callers
[
src
.
depth
()][
src
.
channels
()
-
1
](
src
,
static_cast
<
float
>
(
fx
),
static_cast
<
float
>
(
fy
),
dst
,
interpolation
,
stream
);
}
}
...
...
@@ -1589,75 +1590,6 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
cufftSafeCall
(
cufftDestroy
(
planC2R
));
}
////////////////////////////////////////////////////////////////////
// downsample
namespace
cv
{
namespace
gpu
{
namespace
imgproc
{
template
<
typename
T
,
int
cn
>
void
downsampleCaller
(
const
DevMem2D
src
,
DevMem2D
dst
,
cudaStream_t
stream
);
}}}
void
cv
::
gpu
::
downsample
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
Stream
&
stream
)
{
CV_Assert
(
src
.
depth
()
<
CV_64F
&&
src
.
channels
()
<=
4
);
typedef
void
(
*
Caller
)(
const
DevMem2D
,
DevMem2D
,
cudaStream_t
stream
);
static
const
Caller
callers
[
6
][
4
]
=
{{
imgproc
::
downsampleCaller
<
uchar
,
1
>
,
imgproc
::
downsampleCaller
<
uchar
,
2
>
,
imgproc
::
downsampleCaller
<
uchar
,
3
>
,
imgproc
::
downsampleCaller
<
uchar
,
4
>
},
{
0
,
0
,
0
,
0
},
{
0
,
0
,
0
,
0
},
{
imgproc
::
downsampleCaller
<
short
,
1
>
,
imgproc
::
downsampleCaller
<
short
,
2
>
,
imgproc
::
downsampleCaller
<
short
,
3
>
,
imgproc
::
downsampleCaller
<
short
,
4
>
},
{
0
,
0
,
0
,
0
},
{
imgproc
::
downsampleCaller
<
float
,
1
>
,
imgproc
::
downsampleCaller
<
float
,
2
>
,
imgproc
::
downsampleCaller
<
float
,
3
>
,
imgproc
::
downsampleCaller
<
float
,
4
>
}};
Caller
caller
=
callers
[
src
.
depth
()][
src
.
channels
()
-
1
];
if
(
!
caller
)
CV_Error
(
CV_StsUnsupportedFormat
,
"bad number of channels"
);
dst
.
create
((
src
.
rows
+
1
)
/
2
,
(
src
.
cols
+
1
)
/
2
,
src
.
type
());
caller
(
src
,
dst
.
reshape
(
1
),
StreamAccessor
::
getStream
(
stream
));
}
//////////////////////////////////////////////////////////////////////////////
// upsample
namespace
cv
{
namespace
gpu
{
namespace
imgproc
{
template
<
typename
T
,
int
cn
>
void
upsampleCaller
(
const
DevMem2D
src
,
DevMem2D
dst
,
cudaStream_t
stream
);
}}}
void
cv
::
gpu
::
upsample
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
Stream
&
stream
)
{
CV_Assert
(
src
.
depth
()
<
CV_64F
&&
src
.
channels
()
<=
4
);
typedef
void
(
*
Caller
)(
const
DevMem2D
,
DevMem2D
,
cudaStream_t
stream
);
static
const
Caller
callers
[
6
][
5
]
=
{{
imgproc
::
upsampleCaller
<
uchar
,
1
>
,
imgproc
::
upsampleCaller
<
uchar
,
2
>
,
imgproc
::
upsampleCaller
<
uchar
,
3
>
,
imgproc
::
upsampleCaller
<
uchar
,
4
>
},
{
0
,
0
,
0
,
0
},
{
0
,
0
,
0
,
0
},
{
imgproc
::
upsampleCaller
<
short
,
1
>
,
imgproc
::
upsampleCaller
<
short
,
2
>
,
imgproc
::
upsampleCaller
<
short
,
3
>
,
imgproc
::
upsampleCaller
<
short
,
4
>
},
{
0
,
0
,
0
,
0
},
{
imgproc
::
upsampleCaller
<
float
,
1
>
,
imgproc
::
upsampleCaller
<
float
,
2
>
,
imgproc
::
upsampleCaller
<
float
,
3
>
,
imgproc
::
upsampleCaller
<
float
,
4
>
}};
Caller
caller
=
callers
[
src
.
depth
()][
src
.
channels
()
-
1
];
if
(
!
caller
)
CV_Error
(
CV_StsUnsupportedFormat
,
"bad number of channels"
);
dst
.
create
(
src
.
rows
*
2
,
src
.
cols
*
2
,
src
.
type
());
caller
(
src
,
dst
.
reshape
(
1
),
StreamAccessor
::
getStream
(
stream
));
}
//////////////////////////////////////////////////////////////////////////////
// pyrDown
...
...
modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp
View file @
be8e31f1
...
...
@@ -47,9 +47,6 @@ namespace cv { namespace gpu { namespace device
{
namespace
detail
{
///////////////////////////////////////////////////////////////////////////////
// Reduction
template
<
int
n
>
struct
WarpReductor
{
template
<
typename
T
,
typename
Op
>
static
__device__
__forceinline__
void
reduce
(
volatile
T
*
data
,
T
&
partial_reduction
,
int
tid
,
const
Op
&
op
)
...
...
@@ -504,72 +501,6 @@ namespace cv { namespace gpu { namespace device
}
}
};
///////////////////////////////////////////////////////////////////////////////
// Vector Distance
template
<
int
THREAD_DIM
,
int
N
>
struct
UnrollVecDiffCached
{
template
<
typename
Dist
,
typename
T1
,
typename
T2
>
static
__device__
void
calcCheck
(
const
T1
*
vecCached
,
const
T2
*
vecGlob
,
int
len
,
Dist
&
dist
,
int
ind
)
{
if
(
ind
<
len
)
{
T1
val1
=
*
vecCached
++
;
T2
val2
;
ForceGlob
<
T2
>::
Load
(
vecGlob
,
ind
,
val2
);
dist
.
reduceIter
(
val1
,
val2
);
UnrollVecDiffCached
<
THREAD_DIM
,
N
-
1
>::
calcCheck
(
vecCached
,
vecGlob
,
len
,
dist
,
ind
+
THREAD_DIM
);
}
}
template
<
typename
Dist
,
typename
T1
,
typename
T2
>
static
__device__
void
calcWithoutCheck
(
const
T1
*
vecCached
,
const
T2
*
vecGlob
,
Dist
&
dist
)
{
T1
val1
=
*
vecCached
++
;
T2
val2
;
ForceGlob
<
T2
>::
Load
(
vecGlob
,
0
,
val2
);
vecGlob
+=
THREAD_DIM
;
dist
.
reduceIter
(
val1
,
val2
);
UnrollVecDiffCached
<
THREAD_DIM
,
N
-
1
>::
calcWithoutCheck
(
vecCached
,
vecGlob
,
dist
);
}
};
template
<
int
THREAD_DIM
>
struct
UnrollVecDiffCached
<
THREAD_DIM
,
0
>
{
template
<
typename
Dist
,
typename
T1
,
typename
T2
>
static
__device__
__forceinline__
void
calcCheck
(
const
T1
*
,
const
T2
*
,
int
,
Dist
&
,
int
)
{
}
template
<
typename
Dist
,
typename
T1
,
typename
T2
>
static
__device__
__forceinline__
void
calcWithoutCheck
(
const
T1
*
,
const
T2
*
,
Dist
&
)
{
}
};
template
<
int
THREAD_DIM
,
int
MAX_LEN
,
bool
LEN_EQ_MAX_LEN
>
struct
VecDiffCachedCalculator
;
template
<
int
THREAD_DIM
,
int
MAX_LEN
>
struct
VecDiffCachedCalculator
<
THREAD_DIM
,
MAX_LEN
,
false
>
{
template
<
typename
Dist
,
typename
T1
,
typename
T2
>
static
__device__
__forceinline__
void
calc
(
const
T1
*
vecCached
,
const
T2
*
vecGlob
,
int
len
,
Dist
&
dist
,
int
tid
)
{
UnrollVecDiffCached
<
THREAD_DIM
,
MAX_LEN
/
THREAD_DIM
>::
calcCheck
(
vecCached
,
vecGlob
,
len
,
dist
,
tid
);
}
};
template
<
int
THREAD_DIM
,
int
MAX_LEN
>
struct
VecDiffCachedCalculator
<
THREAD_DIM
,
MAX_LEN
,
true
>
{
template
<
typename
Dist
,
typename
T1
,
typename
T2
>
static
__device__
__forceinline__
void
calc
(
const
T1
*
vecCached
,
const
T2
*
vecGlob
,
int
len
,
Dist
&
dist
,
int
tid
)
{
UnrollVecDiffCached
<
THREAD_DIM
,
MAX_LEN
/
THREAD_DIM
>::
calcWithoutCheck
(
vecCached
,
vecGlob
+
tid
,
dist
);
}
};
}
}}}
...
...
modules/gpu/src/opencv2/gpu/device/detail/vec_distance_detail.hpp
0 → 100644
View file @
be8e31f1
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
#include "../datamov_utils.hpp"
namespace
cv
{
namespace
gpu
{
namespace
device
{
namespace
detail
{
template
<
int
THREAD_DIM
,
int
N
>
struct
UnrollVecDiffCached
{
template
<
typename
Dist
,
typename
T1
,
typename
T2
>
static
__device__
void
calcCheck
(
const
T1
*
vecCached
,
const
T2
*
vecGlob
,
int
len
,
Dist
&
dist
,
int
ind
)
{
if
(
ind
<
len
)
{
T1
val1
=
*
vecCached
++
;
T2
val2
;
ForceGlob
<
T2
>::
Load
(
vecGlob
,
ind
,
val2
);
dist
.
reduceIter
(
val1
,
val2
);
UnrollVecDiffCached
<
THREAD_DIM
,
N
-
1
>::
calcCheck
(
vecCached
,
vecGlob
,
len
,
dist
,
ind
+
THREAD_DIM
);
}
}
template
<
typename
Dist
,
typename
T1
,
typename
T2
>
static
__device__
void
calcWithoutCheck
(
const
T1
*
vecCached
,
const
T2
*
vecGlob
,
Dist
&
dist
)
{
T1
val1
=
*
vecCached
++
;
T2
val2
;
ForceGlob
<
T2
>::
Load
(
vecGlob
,
0
,
val2
);
vecGlob
+=
THREAD_DIM
;
dist
.
reduceIter
(
val1
,
val2
);
UnrollVecDiffCached
<
THREAD_DIM
,
N
-
1
>::
calcWithoutCheck
(
vecCached
,
vecGlob
,
dist
);
}
};
template
<
int
THREAD_DIM
>
struct
UnrollVecDiffCached
<
THREAD_DIM
,
0
>
{
template
<
typename
Dist
,
typename
T1
,
typename
T2
>
static
__device__
__forceinline__
void
calcCheck
(
const
T1
*
,
const
T2
*
,
int
,
Dist
&
,
int
)
{
}
template
<
typename
Dist
,
typename
T1
,
typename
T2
>
static
__device__
__forceinline__
void
calcWithoutCheck
(
const
T1
*
,
const
T2
*
,
Dist
&
)
{
}
};
template
<
int
THREAD_DIM
,
int
MAX_LEN
,
bool
LEN_EQ_MAX_LEN
>
struct
VecDiffCachedCalculator
;
template
<
int
THREAD_DIM
,
int
MAX_LEN
>
struct
VecDiffCachedCalculator
<
THREAD_DIM
,
MAX_LEN
,
false
>
{
template
<
typename
Dist
,
typename
T1
,
typename
T2
>
static
__device__
__forceinline__
void
calc
(
const
T1
*
vecCached
,
const
T2
*
vecGlob
,
int
len
,
Dist
&
dist
,
int
tid
)
{
UnrollVecDiffCached
<
THREAD_DIM
,
MAX_LEN
/
THREAD_DIM
>::
calcCheck
(
vecCached
,
vecGlob
,
len
,
dist
,
tid
);
}
};
template
<
int
THREAD_DIM
,
int
MAX_LEN
>
struct
VecDiffCachedCalculator
<
THREAD_DIM
,
MAX_LEN
,
true
>
{
template
<
typename
Dist
,
typename
T1
,
typename
T2
>
static
__device__
__forceinline__
void
calc
(
const
T1
*
vecCached
,
const
T2
*
vecGlob
,
int
len
,
Dist
&
dist
,
int
tid
)
{
UnrollVecDiffCached
<
THREAD_DIM
,
MAX_LEN
/
THREAD_DIM
>::
calcWithoutCheck
(
vecCached
,
vecGlob
+
tid
,
dist
);
}
};
}
}}}
#endif // __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
modules/gpu/src/opencv2/gpu/device/utility.hpp
View file @
be8e31f1
...
...
@@ -46,7 +46,6 @@
#include "internal_shared.hpp"
#include "saturate_cast.hpp"
#include "datamov_utils.hpp"
#include "functional.hpp"
#include "detail/utility_detail.hpp"
#define OPENCV_GPU_LOG_WARP_SIZE (5)
...
...
modules/gpu/src/opencv2/gpu/device/vec_distance.hpp
View file @
be8e31f1
...
...
@@ -43,7 +43,10 @@
#ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_HPP__
#include "internal_shared.hpp"
#include "utility.hpp"
#include "functional.hpp"
#include "detail/vec_distance_detail.hpp"
namespace
cv
{
namespace
gpu
{
namespace
device
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment