Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
9da6d789
Commit
9da6d789
authored
Sep 28, 2011
by
Vladislav Vinogradov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Optimized GPU filters; added buffered versions of several filters
parent
340e23a4
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
165 additions
and
107 deletions
+165
-107
gpu.hpp
modules/gpu/include/opencv2/gpu/gpu.hpp
+27
-5
perf_filters.cpp
modules/gpu/perf/perf_filters.cpp
+5
-7
column_filter.cu
modules/gpu/src/cuda/column_filter.cu
+42
-26
row_filter.cu
modules/gpu/src/cuda/row_filter.cu
+58
-35
filtering.cpp
modules/gpu/src/filtering.cpp
+0
-0
tests.cpp
samples/gpu/performance/tests.cpp
+33
-34
No files found.
modules/gpu/include/opencv2/gpu/gpu.hpp
View file @
9da6d789
...
...
@@ -340,6 +340,8 @@ namespace cv
//! returns the separable filter engine with the specified filters
CV_EXPORTS
Ptr
<
FilterEngine_GPU
>
createSeparableFilter_GPU
(
const
Ptr
<
BaseRowFilter_GPU
>&
rowFilter
,
const
Ptr
<
BaseColumnFilter_GPU
>&
columnFilter
,
int
srcType
,
int
bufType
,
int
dstType
);
CV_EXPORTS
Ptr
<
FilterEngine_GPU
>
createSeparableFilter_GPU
(
const
Ptr
<
BaseRowFilter_GPU
>&
rowFilter
,
const
Ptr
<
BaseColumnFilter_GPU
>&
columnFilter
,
int
srcType
,
int
bufType
,
int
dstType
,
GpuMat
&
buf
);
//! returns horizontal 1D box filter
//! supports only CV_8UC1 source type and CV_32FC1 sum type
...
...
@@ -367,6 +369,8 @@ namespace cv
//! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported.
CV_EXPORTS
Ptr
<
FilterEngine_GPU
>
createMorphologyFilter_GPU
(
int
op
,
int
type
,
const
Mat
&
kernel
,
const
Point
&
anchor
=
Point
(
-
1
,
-
1
),
int
iterations
=
1
);
CV_EXPORTS
Ptr
<
FilterEngine_GPU
>
createMorphologyFilter_GPU
(
int
op
,
int
type
,
const
Mat
&
kernel
,
GpuMat
&
buf
,
const
Point
&
anchor
=
Point
(
-
1
,
-
1
),
int
iterations
=
1
);
//! returns 2D filter with the specified kernel
//! supports CV_8UC1 and CV_8UC4 types
...
...
@@ -386,7 +390,7 @@ namespace cv
//! The OpenCV version supports only CV_32F as the buffer depth and only the
//! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
CV_EXPORTS
Ptr
<
BaseRowFilter_GPU
>
getLinearRowFilter_GPU
(
int
srcType
,
int
bufType
,
const
Mat
&
rowKernel
,
int
anchor
=
-
1
,
int
borderType
=
BORDER_
CONSTAN
T
);
int
anchor
=
-
1
,
int
borderType
=
BORDER_
DEFAUL
T
);
//! returns the primitive column filter with the specified kernel.
//! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 dst type.
...
...
@@ -397,20 +401,27 @@ namespace cv
//! The OpenCV version supports only CV_32F as the buffer depth and only the
//! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
CV_EXPORTS
Ptr
<
BaseColumnFilter_GPU
>
getLinearColumnFilter_GPU
(
int
bufType
,
int
dstType
,
const
Mat
&
columnKernel
,
int
anchor
=
-
1
,
int
borderType
=
BORDER_
CONSTAN
T
);
int
anchor
=
-
1
,
int
borderType
=
BORDER_
DEFAUL
T
);
//! returns the separable linear filter engine
CV_EXPORTS
Ptr
<
FilterEngine_GPU
>
createSeparableLinearFilter_GPU
(
int
srcType
,
int
dstType
,
const
Mat
&
rowKernel
,
const
Mat
&
columnKernel
,
const
Point
&
anchor
=
Point
(
-
1
,
-
1
),
int
rowBorderType
=
BORDER_DEFAULT
,
int
columnBorderType
=
-
1
);
CV_EXPORTS
Ptr
<
FilterEngine_GPU
>
createSeparableLinearFilter_GPU
(
int
srcType
,
int
dstType
,
const
Mat
&
rowKernel
,
const
Mat
&
columnKernel
,
GpuMat
&
buf
,
const
Point
&
anchor
=
Point
(
-
1
,
-
1
),
int
rowBorderType
=
BORDER_DEFAULT
,
int
columnBorderType
=
-
1
);
//! returns filter engine for the generalized Sobel operator
CV_EXPORTS
Ptr
<
FilterEngine_GPU
>
createDerivFilter_GPU
(
int
srcType
,
int
dstType
,
int
dx
,
int
dy
,
int
ksize
,
int
rowBorderType
=
BORDER_DEFAULT
,
int
columnBorderType
=
-
1
);
CV_EXPORTS
Ptr
<
FilterEngine_GPU
>
createDerivFilter_GPU
(
int
srcType
,
int
dstType
,
int
dx
,
int
dy
,
int
ksize
,
GpuMat
&
buf
,
int
rowBorderType
=
BORDER_DEFAULT
,
int
columnBorderType
=
-
1
);
//! returns the Gaussian filter engine
CV_EXPORTS
Ptr
<
FilterEngine_GPU
>
createGaussianFilter_GPU
(
int
type
,
Size
ksize
,
double
sigma1
,
double
sigma2
=
0
,
int
rowBorderType
=
BORDER_DEFAULT
,
int
columnBorderType
=
-
1
);
CV_EXPORTS
Ptr
<
FilterEngine_GPU
>
createGaussianFilter_GPU
(
int
type
,
Size
ksize
,
GpuMat
&
buf
,
double
sigma1
,
double
sigma2
=
0
,
int
rowBorderType
=
BORDER_DEFAULT
,
int
columnBorderType
=
-
1
);
//! returns maximum filter
CV_EXPORTS
Ptr
<
BaseFilter_GPU
>
getMaxFilter_GPU
(
int
srcType
,
int
dstType
,
const
Size
&
ksize
,
Point
anchor
=
Point
(
-
1
,
-
1
));
...
...
@@ -426,31 +437,42 @@ namespace cv
static
inline
void
blur
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
Size
ksize
,
Point
anchor
=
Point
(
-
1
,
-
1
),
Stream
&
stream
=
Stream
::
Null
())
{
boxFilter
(
src
,
dst
,
-
1
,
ksize
,
anchor
,
stream
);
}
//! erodes the image (applies the local minimum operator)
CV_EXPORTS
void
erode
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
const
Mat
&
kernel
,
Point
anchor
=
Point
(
-
1
,
-
1
),
int
iterations
=
1
,
Stream
&
stream
=
Stream
::
Null
());
CV_EXPORTS
void
erode
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
const
Mat
&
kernel
,
Point
anchor
=
Point
(
-
1
,
-
1
),
int
iterations
=
1
);
CV_EXPORTS
void
erode
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
const
Mat
&
kernel
,
GpuMat
&
buf
,
Point
anchor
=
Point
(
-
1
,
-
1
),
int
iterations
=
1
,
Stream
&
stream
=
Stream
::
Null
());
//! dilates the image (applies the local maximum operator)
CV_EXPORTS
void
dilate
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
const
Mat
&
kernel
,
Point
anchor
=
Point
(
-
1
,
-
1
),
int
iterations
=
1
,
Stream
&
stream
=
Stream
::
Null
());
CV_EXPORTS
void
dilate
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
const
Mat
&
kernel
,
Point
anchor
=
Point
(
-
1
,
-
1
),
int
iterations
=
1
);
CV_EXPORTS
void
dilate
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
const
Mat
&
kernel
,
GpuMat
&
buf
,
Point
anchor
=
Point
(
-
1
,
-
1
),
int
iterations
=
1
,
Stream
&
stream
=
Stream
::
Null
());
//! applies an advanced morphological operation to the image
CV_EXPORTS
void
morphologyEx
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
int
op
,
const
Mat
&
kernel
,
Point
anchor
=
Point
(
-
1
,
-
1
),
int
iterations
=
1
,
Stream
&
stream
=
Stream
::
Null
());
CV_EXPORTS
void
morphologyEx
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
int
op
,
const
Mat
&
kernel
,
Point
anchor
=
Point
(
-
1
,
-
1
),
int
iterations
=
1
);
CV_EXPORTS
void
morphologyEx
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
int
op
,
const
Mat
&
kernel
,
GpuMat
&
buf1
,
GpuMat
&
buf2
,
Point
anchor
=
Point
(
-
1
,
-
1
),
int
iterations
=
1
,
Stream
&
stream
=
Stream
::
Null
());
//! applies non-separable 2D linear filter to the image
CV_EXPORTS
void
filter2D
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
int
ddepth
,
const
Mat
&
kernel
,
Point
anchor
=
Point
(
-
1
,
-
1
),
Stream
&
stream
=
Stream
::
Null
());
//! applies separable 2D linear filter to the image
CV_EXPORTS
void
sepFilter2D
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
int
ddepth
,
const
Mat
&
kernelX
,
const
Mat
&
kernelY
,
Point
anchor
=
Point
(
-
1
,
-
1
),
int
rowBorderType
=
BORDER_DEFAULT
,
int
columnBorderType
=
-
1
);
CV_EXPORTS
void
sepFilter2D
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
int
ddepth
,
const
Mat
&
kernelX
,
const
Mat
&
kernelY
,
GpuMat
&
buf
,
Point
anchor
=
Point
(
-
1
,
-
1
),
int
rowBorderType
=
BORDER_DEFAULT
,
int
columnBorderType
=
-
1
,
Stream
&
stream
=
Stream
::
Null
());
//! applies generalized Sobel operator to the image
CV_EXPORTS
void
Sobel
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
int
ddepth
,
int
dx
,
int
dy
,
int
ksize
=
3
,
double
scale
=
1
,
int
rowBorderType
=
BORDER_DEFAULT
,
int
columnBorderType
=
-
1
);
CV_EXPORTS
void
Sobel
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
int
ddepth
,
int
dx
,
int
dy
,
GpuMat
&
buf
,
int
ksize
=
3
,
double
scale
=
1
,
int
rowBorderType
=
BORDER_DEFAULT
,
int
columnBorderType
=
-
1
,
Stream
&
stream
=
Stream
::
Null
());
//! applies the vertical or horizontal Scharr operator to the image
CV_EXPORTS
void
Scharr
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
int
ddepth
,
int
dx
,
int
dy
,
double
scale
=
1
,
int
rowBorderType
=
BORDER_DEFAULT
,
int
columnBorderType
=
-
1
);
CV_EXPORTS
void
Scharr
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
int
ddepth
,
int
dx
,
int
dy
,
GpuMat
&
buf
,
double
scale
=
1
,
int
rowBorderType
=
BORDER_DEFAULT
,
int
columnBorderType
=
-
1
,
Stream
&
stream
=
Stream
::
Null
());
//! smooths the image using Gaussian filter.
CV_EXPORTS
void
GaussianBlur
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
Size
ksize
,
double
sigma1
,
double
sigma2
=
0
,
int
rowBorderType
=
BORDER_DEFAULT
,
int
columnBorderType
=
-
1
);
CV_EXPORTS
void
GaussianBlur
(
const
GpuMat
&
src
,
GpuMat
&
dst
,
Size
ksize
,
GpuMat
&
buf
,
double
sigma1
,
double
sigma2
=
0
,
int
rowBorderType
=
BORDER_DEFAULT
,
int
columnBorderType
=
-
1
,
Stream
&
stream
=
Stream
::
Null
());
//! applies Laplacian operator to the image
...
...
modules/gpu/perf/perf_filters.cpp
View file @
9da6d789
...
...
@@ -101,17 +101,15 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, linearFilter, testing::Combine(test
SANITY_CHECK
(
dst_host
);
}
PERF_TEST_P
(
DevInfo_Size_MatType_KernelSize_BorderMode
,
separableLinearFilter
,
testing
::
Combine
(
testing
::
ValuesIn
(
devices
()),
testing
::
Values
(
GPU_TYPICAL_MAT_SIZES
),
testing
::
Values
(
CV_8UC1
,
CV_8UC4
,
CV_16SC3
,
CV_32FC1
),
testing
::
Values
(
3
,
5
),
testing
::
Values
((
int
)
BORDER_REFLECT101
,
(
int
)
BORDER_CONSTANT
)))
PERF_TEST_P
(
DevInfo_Size_MatType_KernelSize
,
separableLinearFilter
,
testing
::
Combine
(
testing
::
ValuesIn
(
devices
()),
testing
::
Values
(
GPU_TYPICAL_MAT_SIZES
),
testing
::
Values
(
CV_8UC1
,
CV_8UC4
,
CV_32FC1
),
testing
::
Values
(
3
,
5
)))
{
DeviceInfo
devInfo
=
std
::
tr1
::
get
<
0
>
(
GetParam
());
Size
size
=
std
::
tr1
::
get
<
1
>
(
GetParam
());
int
type
=
std
::
tr1
::
get
<
2
>
(
GetParam
());
int
ksize
=
std
::
tr1
::
get
<
3
>
(
GetParam
());
int
borderMode
=
std
::
tr1
::
get
<
4
>
(
GetParam
());
setDevice
(
devInfo
.
deviceID
());
...
...
@@ -123,7 +121,7 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize_BorderMode, separableLinearFilter, t
GpuMat
dst
(
size
,
type
);
Mat
kernel
=
getGaussianKernel
(
ksize
,
0.5
,
CV_32F
);
Ptr
<
FilterEngine_GPU
>
filter
=
createSeparableLinearFilter_GPU
(
type
,
type
,
kernel
,
kernel
,
Point
(
-
1
,
-
1
)
,
borderMode
);
Ptr
<
FilterEngine_GPU
>
filter
=
createSeparableLinearFilter_GPU
(
type
,
type
,
kernel
,
kernel
,
Point
(
-
1
,
-
1
));
declare
.
time
(
1.0
).
iterations
(
100
);
...
...
modules/gpu/src/cuda/column_filter.cu
View file @
9da6d789
...
...
@@ -12,6 +12,7 @@
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
...
...
@@ -51,49 +52,64 @@ using namespace cv::gpu::device;
#define MAX_KERNEL_SIZE 16
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 16
#define BLOCK_DIM_Y 8
#define RESULT_STEPS 8
#define HALO_STEPS 1
namespace filter_
krnls_
column
namespace filter_column
{
__constant__ float c
LinearK
ernel[MAX_KERNEL_SIZE];
__constant__ float c
_k
ernel[MAX_KERNEL_SIZE];
void load
Linear
Kernel(const float kernel[], int ksize)
void loadKernel(const float kernel[], int ksize)
{
cudaSafeCall( cudaMemcpyToSymbol(c
LinearK
ernel, kernel, ksize * sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(c
_k
ernel, kernel, ksize * sizeof(float)) );
}
template <int
ksize
, typename T, typename D, typename B>
template <int
KERNEL_SIZE
, typename T, typename D, typename B>
__global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b)
{
__shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3]
;
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t
;
const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;
__shared__ T smem[BLOCK_DIM_X][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1];
T* sDataColumn = smem + threadIdx.x;
//Offset to the upper halo edge
const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;
const int y = (blockIdx.y * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_Y + threadIdx.y;
if (x < src.cols)
{
const T* src
C
ol = src.ptr() + x;
const T* src
_c
ol = src.ptr() + x;
sDataColumn[ threadIdx.y * BLOCK_DIM_X] = b.at_low(y - BLOCK_DIM_Y, srcCol, src.step);
sDataColumn[(threadIdx.y + BLOCK_DIM_Y) * BLOCK_DIM_X] = b.at_high(y, srcCol, src.step);
sDataColumn[(threadIdx.y + BLOCK_DIM_Y * 2) * BLOCK_DIM_X] = b.at_high(y + BLOCK_DIM_Y, srcCol, src.step);
//Main data
#pragma unroll
for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);
//Upper halo
#pragma unroll
for(int i = 0; i < HALO_STEPS; ++i)
smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_low(y + i * BLOCK_DIM_Y, src_col, src.step);
//Lower halo
#pragma unroll
for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)
smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y]= b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);
__syncthreads();
if (y < src.rows)
#pragma unroll
for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
sum_t sum = VecTraits<sum_t>::all(0);
sDataColumn += (threadIdx.y + BLOCK_DIM_Y - anchor) * BLOCK_DIM_X;
#pragma unroll
for(int
i = 0; i < ksize; ++i
)
sum = sum + s
DataColumn[i * BLOCK_DIM_X] * cLinearKernel[i
];
for(int
j = 0; j < KERNEL_SIZE; ++j
)
sum = sum + s
mem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y + j - anchor] * c_kernel[j
];
dst.ptr(y)[x] = saturate_cast<D>(sum);
int dstY = y + i * BLOCK_DIM_Y;
if (dstY < src.rows)
dst.ptr(dstY)[x] = saturate_cast<D>(sum);
}
}
}
...
...
@@ -103,13 +119,13 @@ namespace cv { namespace gpu { namespace filters
{
template <int ksize, typename T, typename D, template<typename> class B>
void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
{
dim3 threads
(BLOCK_DIM_X, BLOCK_DIM_Y);
dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows,
BLOCK_DIM_Y));
{
const dim3 block
(BLOCK_DIM_X, BLOCK_DIM_Y);
const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, RESULT_STEPS *
BLOCK_DIM_Y));
B<T> b(src.rows);
filter_
krnls_column::linearColumnFilter<ksize, T, D><<<grid, threads
, 0, stream>>>(src, dst, anchor, b);
filter_
column::linearColumnFilter<ksize, T, D><<<grid, block
, 0, stream>>>(src, dst, anchor, b);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
...
...
@@ -219,7 +235,7 @@ namespace cv { namespace gpu { namespace filters
}
};
filter_
krnls_column::loadLinear
Kernel(kernel, ksize);
filter_
column::load
Kernel(kernel, ksize);
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
}
...
...
modules/gpu/src/cuda/row_filter.cu
View file @
9da6d789
...
...
@@ -12,6 +12,7 @@
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
...
...
@@ -51,64 +52,85 @@ using namespace cv::gpu::device;
#define MAX_KERNEL_SIZE 16
#define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 16
#define BLOCK_DIM_Y 4
#define RESULT_STEPS 8
#define HALO_STEPS 1
namespace filter_
krnls_
row
namespace filter_row
{
__constant__ float c
LinearK
ernel[MAX_KERNEL_SIZE];
__constant__ float c
_k
ernel[MAX_KERNEL_SIZE];
void load
Linear
Kernel(const float kernel[], int ksize)
void loadKernel(const float kernel[], int ksize)
{
cudaSafeCall( cudaMemcpyToSymbol(c
LinearK
ernel, kernel, ksize * sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(c
_k
ernel, kernel, ksize * sizeof(float)) );
}
template <typename T, size_t size> struct SmemType_
namespace detail
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t;
};
template <typename T> struct SmemType_<T, 4>
{
typedef T smem_t;
};
template <typename T, size_t size> struct SmemType
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t;
};
template <typename T> struct SmemType<T, 4>
{
typedef T smem_t;
};
}
template <typename T> struct SmemType
{
typedef typename
SmemType_
<T, sizeof(T)>::smem_t smem_t;
typedef typename
detail::SmemType
<T, sizeof(T)>::smem_t smem_t;
};
template <int
ksize
, typename T, typename D, typename B>
template <int
KERNEL_SIZE
, typename T, typename D, typename B>
__global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b)
{
typedef typename SmemType<T>::smem_t smem_t;
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
__shared__ smem_t smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];
const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x;
const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;
__shared__ smem_t smem[BLOCK_DIM_Y][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_X];
smem_t* sDataRow = smem + threadIdx.y * BLOCK_DIM_X * 3;
//Offset to the left halo edge
const int x = (blockIdx.x * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_X + threadIdx.x;
const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
if (y < src.rows)
{
const T* rowSrc = src.ptr(y);
const T* src_row = src.ptr(y);
//Load main data
#pragma unroll
for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);
//Load left halo
#pragma unroll
for(int i = 0; i < HALO_STEPS; ++i)
smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_low(i * BLOCK_DIM_X + x, src_row);
sDataRow[threadIdx.x ] = b.at_low(x - BLOCK_DIM_X, rowSrc);
sDataRow[threadIdx.x + BLOCK_DIM_X ] = b.at_high(x, rowSrc);
sDataRow[threadIdx.x + BLOCK_DIM_X * 2] = b.at_high(x + BLOCK_DIM_X, rowSrc);
//Load right halo
#pragma unroll
for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)
smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);
__syncthreads();
if (x < src.cols)
D* dst_row = dst.ptr(y);
#pragma unroll
for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
sum_t sum = VecTraits<sum_t>::all(0);
sDataRow += threadIdx.x + BLOCK_DIM_X - anchor;
#pragma unroll
for
(int i = 0; i < ksize; ++i
)
sum = sum + s
DataRow[i] * cLinearKernel[i
];
for
(int j = 0; j < KERNEL_SIZE; ++j
)
sum = sum + s
mem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X + j - anchor] * c_kernel[j
];
dst.ptr(y)[x] = saturate_cast<D>(sum);
int dstX = x + i * BLOCK_DIM_X;
if (dstX < src.cols)
dst_row[dstX] = saturate_cast<D>(sum);
}
}
}
...
...
@@ -119,13 +141,14 @@ namespace cv { namespace gpu { namespace filters
template <int ksize, typename T, typename D, template<typename> class B>
void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
{
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
typedef typename filter_row::SmemType<T>::smem_t smem_t;
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
const dim3 grid(divUp(src.cols, RESULT_STEPS * BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
typedef typename filter_krnls_row::SmemType<T>::smem_t smem_t;
B<smem_t> b(src.cols);
filter_
krnls_row::linearRowFilter<ksize, T, D><<<grid, threads
, 0, stream>>>(src, dst, anchor, b);
filter_
row::linearRowFilter<ksize, T, D><<<grid, block
, 0, stream>>>(src, dst, anchor, b);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
...
...
@@ -235,7 +258,7 @@ namespace cv { namespace gpu { namespace filters
}
};
filter_
krnls_row::loadLinear
Kernel(kernel, ksize);
filter_
row::load
Kernel(kernel, ksize);
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
}
...
...
modules/gpu/src/filtering.cpp
View file @
9da6d789
This diff is collapsed.
Click to expand it.
samples/gpu/performance/tests.cpp
View file @
9da6d789
...
...
@@ -737,32 +737,6 @@ TEST(resize)
}
TEST
(
Sobel
)
{
Mat
src
,
dst
;
gpu
::
GpuMat
d_src
,
d_dst
;
for
(
int
size
=
2000
;
size
<=
4000
;
size
+=
1000
)
{
SUBTEST
<<
"size "
<<
size
<<
", 32F"
;
gen
(
src
,
size
,
size
,
CV_32F
,
0
,
1
);
dst
.
create
(
size
,
size
,
CV_32F
);
CPU_ON
;
Sobel
(
src
,
dst
,
dst
.
depth
(),
1
,
1
);
CPU_OFF
;
d_src
=
src
;
d_dst
.
create
(
size
,
size
,
CV_32F
);
GPU_ON
;
gpu
::
Sobel
(
d_src
,
d_dst
,
d_dst
.
depth
(),
1
,
1
);
GPU_OFF
;
}
}
TEST
(
cvtColor
)
{
Mat
src
,
dst
;
...
...
@@ -1068,26 +1042,28 @@ TEST(solvePnPRansac)
TEST
(
GaussianBlur
)
{
for
(
int
size
=
1000
;
size
<
10000
;
size
+=
3
000
)
for
(
int
size
=
1000
;
size
<
=
4000
;
size
+=
1
000
)
{
SUBTEST
<<
"
16SC3
, size "
<<
size
;
SUBTEST
<<
"
8UC1
, size "
<<
size
;
Mat
src
;
gen
(
src
,
size
,
size
,
CV_
16SC3
,
0
,
256
);
Mat
src
;
gen
(
src
,
size
,
size
,
CV_
8UC1
,
0
,
256
);
Mat
dst
(
src
.
size
(),
src
.
type
());
CPU_ON
;
GaussianBlur
(
src
,
dst
,
Size
(
5
,
5
),
0
);
GaussianBlur
(
src
,
dst
,
Size
(
3
,
3
),
1
);
CPU_OFF
;
gpu
::
GpuMat
d_src
(
src
);
gpu
::
GpuMat
d_dst
(
src
.
size
(),
src
.
type
());
gpu
::
GpuMat
d_buf
;
gpu
::
GaussianBlur
(
d_src
,
d_dst
,
Size
(
3
,
3
),
d_buf
,
1
);
GPU_ON
;
gpu
::
GaussianBlur
(
d_src
,
d_dst
,
Size
(
5
,
5
),
0
);
gpu
::
GaussianBlur
(
d_src
,
d_dst
,
Size
(
3
,
3
),
d_buf
,
1
);
GPU_OFF
;
}
for
(
int
size
=
1000
;
size
<
10000
;
size
+=
3
000
)
for
(
int
size
=
1000
;
size
<
=
4000
;
size
+=
1
000
)
{
SUBTEST
<<
"8UC4, size "
<<
size
;
...
...
@@ -1095,14 +1071,37 @@ TEST(GaussianBlur)
Mat
dst
(
src
.
size
(),
src
.
type
());
CPU_ON
;
GaussianBlur
(
src
,
dst
,
Size
(
5
,
5
),
0
);
GaussianBlur
(
src
,
dst
,
Size
(
3
,
3
),
1
);
CPU_OFF
;
gpu
::
GpuMat
d_src
(
src
);
gpu
::
GpuMat
d_dst
(
src
.
size
(),
src
.
type
());
gpu
::
GpuMat
d_buf
;
gpu
::
GaussianBlur
(
d_src
,
d_dst
,
Size
(
3
,
3
),
d_buf
,
1
);
GPU_ON
;
gpu
::
GaussianBlur
(
d_src
,
d_dst
,
Size
(
3
,
3
),
d_buf
,
1
);
GPU_OFF
;
}
for
(
int
size
=
1000
;
size
<=
4000
;
size
+=
1000
)
{
SUBTEST
<<
"32FC1, size "
<<
size
;
Mat
src
;
gen
(
src
,
size
,
size
,
CV_32FC1
,
0
,
1
);
Mat
dst
(
src
.
size
(),
src
.
type
());
CPU_ON
;
GaussianBlur
(
src
,
dst
,
Size
(
3
,
3
),
1
);
CPU_OFF
;
gpu
::
GpuMat
d_src
(
src
);
gpu
::
GpuMat
d_dst
(
src
.
size
(),
src
.
type
());
gpu
::
GpuMat
d_buf
;
gpu
::
GaussianBlur
(
d_src
,
d_dst
,
Size
(
3
,
3
),
d_buf
,
1
);
GPU_ON
;
gpu
::
GaussianBlur
(
d_src
,
d_dst
,
Size
(
5
,
5
),
0
);
gpu
::
GaussianBlur
(
d_src
,
d_dst
,
Size
(
3
,
3
),
d_buf
,
1
);
GPU_OFF
;
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment