Commit 769564c1 authored by Andrey Morozov's avatar Andrey Morozov

implemented asynchronous call for gpumat::setTo(), gpumat::copyTo(), gpumat::converTo()

parent 1ead3a5b
......@@ -64,7 +64,7 @@ namespace cv
CV_EXPORTS int getNumberOfSMs(int device);
//////////////////////////////// GpuMat ////////////////////////////////
class CudaStrem;
class CudaStream;
//! Smart pointer for GPU memory with reference counting. Its interface is mostly similar with cv::Mat.
class CV_EXPORTS GpuMat
......
......@@ -61,12 +61,12 @@ namespace cv
{
static inline int divUp(int a, int b) { return (a % b == 0) ? a/b : a/b + 1; }
extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels);
extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
extern "C" void set_to_without_mask (const DevMem2D& mat, int depth, const double * scalar, int channels);
extern "C" void set_to_with_mask (const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels);
extern "C" void set_to_without_mask (const DevMem2D& mat, int depth, const double * scalar, int channels, const cudaStream_t & stream = 0);
extern "C" void set_to_with_mask (const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta);
extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream = 0);
}
}
}
......
......@@ -42,7 +42,6 @@
#include <stddef.h>
#include <stdio.h>
//#include <iostream>
#include "cuda_shared.hpp"
#include "cuda_runtime.h"
......@@ -239,19 +238,27 @@ namespace cv
////////////////////////////////// CopyTo /////////////////////////////////
///////////////////////////////////////////////////////////////////////////
typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels);
typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream);
template<typename T>
void copy_to_with_mask_run(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels)
void copy_to_with_mask_run(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{
dim3 threadsPerBlock(16,16, 1);
dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);
if (stream == 0)
{
::mat_operators::kernel_copy_to_with_mask<T><<<numBlocks,threadsPerBlock>>>
((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
cudaSafeCall ( cudaThreadSynchronize() );
}
else
{
::mat_operators::kernel_copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>
((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
}
}
extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels)
extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{
static CopyToFunc tab[8] =
{
......@@ -269,7 +276,7 @@ namespace cv
if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
func(mat_src, mat_dst, mask, channels);
func(mat_src, mat_dst, mask, channels, stream);
}
......@@ -277,28 +284,43 @@ namespace cv
////////////////////////////////// SetTo //////////////////////////////////
///////////////////////////////////////////////////////////////////////////
typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels);
typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels);
typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream);
typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels, const cudaStream_t & stream);
template <typename T>
void set_to_with_mask_run(const DevMem2D& mat, const DevMem2D& mask, int channels)
void set_to_with_mask_run(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{
dim3 threadsPerBlock(32, 8, 1);
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
if (stream == 0)
{
::mat_operators::kernel_set_to_with_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
cudaSafeCall ( cudaThreadSynchronize() );
}
else
{
::mat_operators::kernel_set_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
}
}
template <typename T>
void set_to_without_mask_run(const DevMem2D& mat, int channels)
void set_to_without_mask_run(const DevMem2D& mat, int channels, const cudaStream_t & stream)
{
dim3 threadsPerBlock(32, 8, 1);
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
if (stream == 0)
{
::mat_operators::kernel_set_to_without_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
cudaSafeCall ( cudaThreadSynchronize() );
}
else
{
::mat_operators::kernel_set_to_without_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
}
}
extern "C" void set_to_without_mask(const DevMem2D& mat, int depth, const double * scalar, int channels)
extern "C" void set_to_without_mask(const DevMem2D& mat, int depth, const double * scalar, int channels, const cudaStream_t & stream)
{
double data[4];
data[0] = scalar[0];
......@@ -323,11 +345,11 @@ namespace cv
if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
func(mat, channels);
func(mat, channels, stream);
}
extern "C" void set_to_with_mask(const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels)
extern "C" void set_to_with_mask(const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{
double data[4];
data[0] = scalar[0];
......@@ -352,7 +374,7 @@ namespace cv
if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
func(mat, mask, channels);
func(mat, mask, channels, stream);
}
......@@ -360,22 +382,27 @@ namespace cv
//////////////////////////////// ConvertTo ////////////////////////////////
///////////////////////////////////////////////////////////////////////////
typedef void (*CvtFunc)(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta);
typedef void (*CvtFunc)(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream);
template<typename T, typename DT>
void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta)
void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream)
{
const int shift = ::mat_operators::ReadWriteTraits<T, DT, sizeof(T), sizeof(DT)>::shift;
dim3 block(32, 8);
dim3 grid(divUp(width, block.x * shift), divUp(height, block.y));
if (stream == 0)
{
::mat_operators::kernel_convert_to<T, DT><<<grid, block>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
cudaSafeCall( cudaThreadSynchronize() );
}
else
{
::mat_operators::kernel_convert_to<T, DT><<<grid, block, 0, stream>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
}
}
extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta)
extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream)
{
static CvtFunc tab[8][8] =
{
......@@ -406,7 +433,7 @@ namespace cv
CvtFunc func = tab[sdepth][ddepth];
if (func == 0)
cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
func(src, dst, width, height, alpha, beta);
func(src, dst, width, height, alpha, beta, stream);
}
} // namespace impl
} // namespace gpu
......
......@@ -74,6 +74,7 @@ struct CudaStream::Impl
cudaStream_t stream;
int ref_counter;
};
namespace
{
template<class S, class D> void devcopy(const S& src, D& dst, cudaStream_t s, cudaMemcpyKind k)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment