Commit 7bf29e14 authored Jul 22, 2010 by Andrey Morozov
optimized gpumat::setTo(), ~ 30 speedup
parent 8bb987e4
Showing 3 changed files with 58 additions and 80 deletions (+58 -80):

    modules/gpu/src/cuda/cuda_shared.hpp      (+0  -1)
    modules/gpu/src/cuda/matrix_operations.cu (+51 -75)
    tests/gpu/src/operator_set_to.cpp         (+7  -4)
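The speedup comes from the kernel rewrite in matrix_operations.cu: the old code launched one thread per pixel and filled its channels through a recursive unroll<> template, recomputing a row/column index with a division and a modulo per pixel, while the new code uses a 16x16 2D launch in which each thread writes exactly one channel element at mat[y * (step / sizeof(T)) + x]. The following is a minimal standalone sketch of that addressing scheme, not the committed code; the float 3-channel matrix, the 129x127 size (borrowed from the test below), and the set_without_mask name are illustrative assumptions.

#include <cuda_runtime.h>

// Value to broadcast into every pixel; mirrors the commit's __constant__ scalar_d[4].
__constant__ float scalar_d[4];

// One thread per channel element: x runs over cols*channels, y over rows.
// 'step' is the row pitch in bytes, so step/sizeof(T) is the pitch in elements.
template <typename T, int channels>
__global__ void set_without_mask(T* mat, int cols, int rows, int step)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < cols * channels && y < rows)
        mat[y * (step / (int)sizeof(T)) + x] = (T)scalar_d[x % channels];
}

int main()
{
    const int rows = 129, cols = 127, channels = 3;   // sizes borrowed from the test below
    size_t pitch = 0;
    float* d_mat = 0;
    cudaMallocPitch((void**)&d_mat, &pitch, cols * channels * sizeof(float), rows);

    const float val[4] = {128.f, 128.f, 128.f, 128.f};
    cudaMemcpyToSymbol(scalar_d, val, sizeof(val));

    dim3 block(16, 16, 1);                             // same block shape as the commit
    dim3 grid(cols * channels / block.x + 1, rows / block.y + 1, 1);
    set_without_mask<float, channels><<<grid, block>>>(d_mat, cols, rows, (int)pitch);
    cudaDeviceSynchronize();

    cudaFree(d_mat);
    return 0;
}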
modules/gpu/src/cuda/cuda_shared.hpp

@@ -59,7 +59,6 @@ namespace cv
    namespace impl
    {
        static inline int divUp(int a, int b) { return (a % b == 0) ? a/b : a/b + 1; }

        extern "C" void set_to_without_mask (const DevMem2D& mat, const double * scalar, int depth, int channels);
        extern "C" void set_to_with_mask    (const DevMem2D& mat, const double * scalar, const DevMem2D& mask, int depth, int channels);
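For reference, divUp is the usual helper for sizing a CUDA grid: integer division rounded up. A hedged sketch follows (makeGrid is a hypothetical helper, not part of the commit) of how it could produce the same 16x16 launch that matrix_operations.cu builds with the "n / threadsPerBlock.x + 1" form; that form allocates one spare, fully-masked block whenever n divides evenly, which divUp avoids.

#include <cuda_runtime.h>

// divUp as declared in cuda_shared.hpp: integer division rounded up.
static inline int divUp(int a, int b) { return (a % b == 0) ? a / b : a / b + 1; }

// Hypothetical helper: grid size for the 16x16 launch used in this commit.
// divUp(n, b) == ceil(n / b), so no extra block is launched when n is an exact multiple of b.
static dim3 makeGrid(int cols, int rows, int channels, const dim3& block)
{
    return dim3(divUp(cols * channels, block.x), divUp(rows, block.y), 1);
}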
modules/gpu/src/cuda/matrix_operations.cu

@@ -50,62 +50,36 @@ __constant__ __align__(16) float scalar_d[4];
 namespace mat_operators
 {
-    template <typename T, int channels, int count = channels>
-    struct unroll
-    {
-        __device__ static void unroll_set(T * mat, size_t i)
-        {
-            mat[i] = static_cast<T>(scalar_d[channels - count]);
-            unroll<T, channels, count - 1>::unroll_set(mat, i+1);
-        }
-
-        __device__ static void unroll_set_with_mask(T * mat, unsigned char mask, size_t i)
-        {
-            if ( mask != 0 )
-                mat[i] = static_cast<T>(scalar_d[channels - count]);
-            unroll<T, channels, count - 1>::unroll_set_with_mask(mat, mask, i+1);
-        }
-    };
-
-    template <typename T, int channels>
-    struct unroll<T, channels, 0>
-    {
-        __device__ static void unroll_set(T * , size_t){}
-        __device__ static void unroll_set_with_mask(T * , unsigned char, size_t){}
-    };
-
-    template <typename T, int channels>
-    __device__ size_t GetIndex(size_t i, int cols, int step)
-    {
-        size_t ret = (i / static_cast<size_t>(cols)) * static_cast<size_t>(step) / static_cast<size_t>(sizeof(T)) +
-                     (i % static_cast<size_t>(cols)) * static_cast<size_t>(channels);
-        return ret;
-    }
-
-    template <typename T, int channels>
+    template<typename T, int channels>
     __global__ void kernel_set_to_without_mask(T * mat, int cols, int rows, int step)
     {
-        size_t i = (blockIdx.x * blockDim.x + threadIdx.x);
-        if (i < cols * rows)
+        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
+        size_t y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if ((x < cols * channels ) && (y < rows))
         {
-            unroll<T, channels>::unroll_set(mat, GetIndex<T,channels>(i, cols, step));
+            size_t idx = y * (step / sizeof(T)) + x;
+            mat[idx] = scalar_d[ x % channels ];
         }
     }

-    template <typename T, int channels>
-    __global__ void kernel_set_to_with_mask(T * mat, const unsigned char * mask, int cols, int rows, int step)
+    template<typename T, int channels>
+    __global__ void kernel_set_to_with_mask(T * mat, const unsigned char * mask, int cols, int rows, int step, int step_mask)
     {
-        size_t i = (blockIdx.x * blockDim.x + threadIdx.x);
-        if (i < cols * rows)
-            unroll<T, channels>::unroll_set_with_mask(mat, mask[i], GetIndex<T,channels>(i, cols, step));
+        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
+        size_t y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (mask[y * step_mask + x] != 0)
+            if ((x < cols * channels ) && (y < rows))
+            {
+                size_t idx = y * (step / sizeof(T)) + x;
+                mat[idx] = scalar_d[ x % channels ];
+            }
     }
 }

-extern "C" void cv::gpu::impl::set_to_with_mask(const DevMem2D& mat, const double * scalar, const DevMem2D& mask, int elemSize1, int channels)
+extern "C" void cv::gpu::impl::set_to_without_mask(const DevMem2D& mat, const double * scalar, int elemSize1, int channels)
 {
     // download scalar to constant memory
     float data[4];
     data[0] = static_cast<float>(scalar[0]);
     data[1] = static_cast<float>(scalar[1]);
@@ -113,37 +87,38 @@ extern "C" void cv::gpu::impl::set_to_with_mask(const DevMem2D& mat, const doubl
     data[3] = static_cast<float>(scalar[3]);
     cudaSafeCall( cudaMemcpyToSymbol(scalar_d, &data, sizeof(data)));

-    dim3 threadsPerBlock(256, 1, 1);
-    dim3 numBlocks (mat.rows * mat.cols / threadsPerBlock.x + 1, 1, 1);
+    dim3 threadsPerBlock(16, 16, 1);
+    dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);

     if (channels == 1)
     {
-        if (elemSize1 == 1) ::mat_operators::kernel_set_to_with_mask<unsigned char, 1><<<numBlocks,threadsPerBlock>>>(mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step);
-        if (elemSize1 == 2) ::mat_operators::kernel_set_to_with_mask<unsigned short, 1><<<numBlocks,threadsPerBlock>>>((unsigned short *)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step);
-        if (elemSize1 == 4) ::mat_operators::kernel_set_to_with_mask<float , 1><<<numBlocks,threadsPerBlock>>>((float *)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step);
+        if (elemSize1 == 1) ::mat_operators::kernel_set_to_without_mask<unsigned char, 1><<<numBlocks,threadsPerBlock>>>(mat.ptr, mat.cols, mat.rows, mat.step);
+        if (elemSize1 == 2) ::mat_operators::kernel_set_to_without_mask<unsigned short, 1><<<numBlocks,threadsPerBlock>>>((unsigned short *)mat.ptr, mat.cols, mat.rows, mat.step);
+        if (elemSize1 == 4) ::mat_operators::kernel_set_to_without_mask<float, 1><<<numBlocks,threadsPerBlock>>>((float *)mat.ptr, mat.cols, mat.rows, mat.step);
     }
     if (channels == 2)
     {
-        if (elemSize1 == 1) ::mat_operators::kernel_set_to_with_mask<unsigned char, 2><<<numBlocks,threadsPerBlock>>>(mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step);
-        if (elemSize1 == 2) ::mat_operators::kernel_set_to_with_mask<unsigned short, 2><<<numBlocks,threadsPerBlock>>>((unsigned short *)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step);
-        if (elemSize1 == 4) ::mat_operators::kernel_set_to_with_mask<float , 2><<<numBlocks,threadsPerBlock>>>((float *)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step);
+        if (elemSize1 == 1) ::mat_operators::kernel_set_to_without_mask<unsigned char, 2><<<numBlocks,threadsPerBlock>>>(mat.ptr, mat.cols, mat.rows, mat.step);
+        if (elemSize1 == 2) ::mat_operators::kernel_set_to_without_mask<unsigned short, 2><<<numBlocks,threadsPerBlock>>>((unsigned short *)mat.ptr, mat.cols, mat.rows, mat.step);
+        if (elemSize1 == 4) ::mat_operators::kernel_set_to_without_mask<float, 2><<<numBlocks,threadsPerBlock>>>((float *)mat.ptr, mat.cols, mat.rows, mat.step);
     }
     if (channels == 3)
     {
-        if (elemSize1 == 1) ::mat_operators::kernel_set_to_with_mask<unsigned char, 3><<<numBlocks,threadsPerBlock>>>(mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step);
-        if (elemSize1 == 2) ::mat_operators::kernel_set_to_with_mask<unsigned short, 3><<<numBlocks,threadsPerBlock>>>((unsigned short *)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step);
-        if (elemSize1 == 4) ::mat_operators::kernel_set_to_with_mask<float , 3><<<numBlocks,threadsPerBlock>>>((float *)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step);
+        if (elemSize1 == 1) ::mat_operators::kernel_set_to_without_mask<unsigned char, 3><<<numBlocks,threadsPerBlock>>>(mat.ptr, mat.cols, mat.rows, mat.step);
+        if (elemSize1 == 2) ::mat_operators::kernel_set_to_without_mask<unsigned short, 3><<<numBlocks,threadsPerBlock>>>((unsigned short *)mat.ptr, mat.cols, mat.rows, mat.step);
+        if (elemSize1 == 4) ::mat_operators::kernel_set_to_without_mask<float, 3><<<numBlocks,threadsPerBlock>>>((float *)mat.ptr, mat.cols, mat.rows, mat.step);
     }
     if (channels == 4)
     {
-        if (elemSize1 == 1) ::mat_operators::kernel_set_to_with_mask<unsigned char, 4><<<numBlocks,threadsPerBlock>>>(mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step);
-        if (elemSize1 == 2) ::mat_operators::kernel_set_to_with_mask<unsigned short, 4><<<numBlocks,threadsPerBlock>>>((unsigned short *)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step);
-        if (elemSize1 == 4) ::mat_operators::kernel_set_to_with_mask<float , 4><<<numBlocks,threadsPerBlock>>>((float *)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step);
+        if (elemSize1 == 1) ::mat_operators::kernel_set_to_without_mask<unsigned char, 4><<<numBlocks,threadsPerBlock>>>(mat.ptr, mat.cols, mat.rows, mat.step);
+        if (elemSize1 == 2) ::mat_operators::kernel_set_to_without_mask<unsigned short, 4><<<numBlocks,threadsPerBlock>>>((unsigned short *)mat.ptr, mat.cols, mat.rows, mat.step);
+        if (elemSize1 == 4) ::mat_operators::kernel_set_to_without_mask<float, 4><<<numBlocks,threadsPerBlock>>>((float *)mat.ptr, mat.cols, mat.rows, mat.step);
     }

-    cudaSafeCall( cudaThreadSynchronize() );
+    cudaSafeCall ( cudaThreadSynchronize() );
 }

-extern "C" void cv::gpu::impl::set_to_without_mask(const DevMem2D& mat, const double * scalar, int elemSize1, int channels)
+extern "C" void cv::gpu::impl::set_to_with_mask(const DevMem2D& mat, const double * scalar, const DevMem2D& mask, int elemSize1, int channels)
 {
     float data[4];
     data[0] = static_cast<float>(scalar[0]);
@@ -152,33 +127,34 @@ extern "C" void cv::gpu::impl::set_to_without_mask(const DevMem2D& mat, const do
     data[3] = static_cast<float>(scalar[3]);
     cudaSafeCall( cudaMemcpyToSymbol(scalar_d, &data, sizeof(data)));

-    dim3 threadsPerBlock(256, 1, 1);
-    dim3 numBlocks (mat.rows * mat.cols / threadsPerBlock.x + 1, 1, 1);
+    dim3 threadsPerBlock(16, 16, 1);
+    dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);

     if (channels == 1)
     {
-        if (elemSize1 == 1) ::mat_operators::kernel_set_to_without_mask<unsigned char, 1><<<numBlocks,threadsPerBlock>>>(mat.ptr, mat.cols, mat.rows, mat.step);
-        if (elemSize1 == 2) ::mat_operators::kernel_set_to_without_mask<unsigned short, 1><<<numBlocks,threadsPerBlock>>>((unsigned short *)mat.ptr, mat.cols, mat.rows, mat.step);
-        if (elemSize1 == 4) ::mat_operators::kernel_set_to_without_mask<float, 1><<<numBlocks,threadsPerBlock>>>((float *)mat.ptr, mat.cols, mat.rows, mat.step);
+        if (elemSize1 == 1) ::mat_operators::kernel_set_to_with_mask<unsigned char, 1><<<numBlocks,threadsPerBlock>>>(mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, mask.step);
+        if (elemSize1 == 2) ::mat_operators::kernel_set_to_with_mask<unsigned short, 1><<<numBlocks,threadsPerBlock>>>((unsigned short *)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, mask.step);
+        if (elemSize1 == 4) ::mat_operators::kernel_set_to_with_mask<float, 1><<<numBlocks,threadsPerBlock>>>((float *)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, mask.step);
     }
     if (channels == 2)
     {
-        if (elemSize1 == 1) ::mat_operators::kernel_set_to_without_mask<unsigned char, 2><<<numBlocks,threadsPerBlock>>>(mat.ptr, mat.cols, mat.rows, mat.step);
-        if (elemSize1 == 2) ::mat_operators::kernel_set_to_without_mask<unsigned short, 2><<<numBlocks,threadsPerBlock>>>((unsigned short *)mat.ptr, mat.cols, mat.rows, mat.step);
-        if (elemSize1 == 4) ::mat_operators::kernel_set_to_without_mask<float , 2><<<numBlocks,threadsPerBlock>>>((float *)mat.ptr, mat.cols, mat.rows, mat.step);
+        if (elemSize1 == 1) ::mat_operators::kernel_set_to_with_mask<unsigned char, 2><<<numBlocks,threadsPerBlock>>>(mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, mask.step);
+        if (elemSize1 == 2) ::mat_operators::kernel_set_to_with_mask<unsigned short, 2><<<numBlocks,threadsPerBlock>>>((unsigned short *)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, mask.step);
+        if (elemSize1 == 4) ::mat_operators::kernel_set_to_with_mask<float, 2><<<numBlocks,threadsPerBlock>>>((float *)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, mask.step);
     }
     if (channels == 3)
     {
-        if (elemSize1 == 1) ::mat_operators::kernel_set_to_without_mask<unsigned char, 3><<<numBlocks,threadsPerBlock>>>(mat.ptr, mat.cols, mat.rows, mat.step);
-        if (elemSize1 == 2) ::mat_operators::kernel_set_to_without_mask<unsigned short, 3><<<numBlocks,threadsPerBlock>>>((unsigned short *)mat.ptr, mat.cols, mat.rows, mat.step);
-        if (elemSize1 == 4) ::mat_operators::kernel_set_to_without_mask<float, 3><<<numBlocks,threadsPerBlock>>>((float *)mat.ptr, mat.cols, mat.rows, mat.step);
+        if (elemSize1 == 1) ::mat_operators::kernel_set_to_with_mask<unsigned char, 3><<<numBlocks,threadsPerBlock>>>(mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, mask.step);
+        if (elemSize1 == 2) ::mat_operators::kernel_set_to_with_mask<unsigned short, 3><<<numBlocks,threadsPerBlock>>>((unsigned short *)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, mask.step);
+        if (elemSize1 == 4) ::mat_operators::kernel_set_to_with_mask<float, 3><<<numBlocks,threadsPerBlock>>>((float *)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, mask.step);
     }
     if (channels == 4)
     {
-        if (elemSize1 == 1) ::mat_operators::kernel_set_to_without_mask<unsigned char, 4><<<numBlocks,threadsPerBlock>>>(mat.ptr, mat.cols, mat.rows, mat.step);
-        if (elemSize1 == 2) ::mat_operators::kernel_set_to_without_mask<unsigned short, 4><<<numBlocks,threadsPerBlock>>>((unsigned short *)mat.ptr, mat.cols, mat.rows, mat.step);
-        if (elemSize1 == 4) ::mat_operators::kernel_set_to_without_mask<float, 4><<<numBlocks,threadsPerBlock>>>((float *)mat.ptr, mat.cols, mat.rows, mat.step);
+        if (elemSize1 == 1) ::mat_operators::kernel_set_to_with_mask<unsigned char, 4><<<numBlocks,threadsPerBlock>>>(mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, mask.step);
+        if (elemSize1 == 2) ::mat_operators::kernel_set_to_with_mask<unsigned short, 4><<<numBlocks,threadsPerBlock>>>((unsigned short *)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, mask.step);
+        if (elemSize1 == 4) ::mat_operators::kernel_set_to_with_mask<float, 4><<<numBlocks,threadsPerBlock>>>((float *)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, mask.step);
     }

-    cudaSafeCall( cudaThreadSynchronize() );
+    cudaSafeCall ( cudaThreadSynchronize() );
 }
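As a worked check of the new launch configuration (not part of the commit): for a 129x127 three-channel matrix like the ones the test below allocates, cols * channels = 381, so numBlocks = (381 / 16 + 1, 129 / 16 + 1, 1) = (24, 9, 1). That is 216 blocks of 256 threads, or 55,296 threads covering the 129 x 381 = 49,149 channel elements, with the excess threads filtered out by the kernel's bounds check.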
tests/gpu/src/operator_set_to.cpp

@@ -6,6 +6,7 @@
 #include <iterator>
 #include <limits>
 #include <numeric>
+#include <iomanip> // for cout << setw()

 using namespace cv;
 using namespace std;

@@ -35,6 +36,7 @@ class CV_GpuMatOpSetTo : public CvTest
         bool test_cv_32f_c3();
+        bool test_cv_32f_c4();

     private:
         int rows;
         int cols;

@@ -43,8 +45,8 @@ class CV_GpuMatOpSetTo : public CvTest
 CV_GpuMatOpSetTo::CV_GpuMatOpSetTo(): CvTest("GpuMatOperatorSetTo", "setTo")
 {
-    rows = 127;
-    cols = 129;
+    rows = 129;
+    cols = 127;

     s.val[0] = 128.0;
     s.val[1] = 128.0;

@@ -75,8 +77,9 @@ bool CV_GpuMatOpSetTo::compare_matrix(cv::Mat & cpumat, gpu::GpuMat & gpumat)
     //int64 time1 = getTickCount();
     gpumat.setTo(s);
     //int64 time2 = getTickCount();

-    //std::cout << "\ntime cpu:" << double((time1 - time) / getTickFrequency());
-    //std::cout << "\ntime gpu:" << double((time2 - time1) / getTickFrequency());
+    //std::cout << "\ntime cpu: " << std::fixed << std::setprecision(12) << double((time1 - time) / (double)getTickFrequency());
+    //std::cout << "\ntime gpu: " << std::fixed << std::setprecision(12) << double((time2 - time1) / (double)getTickFrequency());
+    //std::cout << "\n";

 #ifdef PRINT_MATRIX