Unverified commit 35f66340, authored by Claudio

Add cuda::Stream capability to cuda::HOG::compute

In the previous version, HOG::compute() could only be used with the default
stream, i.e. cv::cuda::Stream::Null().

With this change, HOG::compute() runs on the cuda::Stream passed by the
caller, so calls issued on different cuda::Streams can run in parallel.

The code has been reordered so that all data allocation is completed
first; all the kernels are then launched on the given stream.

Fix #8177
parent f109c013
This diff is collapsed.
......@@ -66,15 +66,18 @@ namespace cv { namespace cuda { namespace device
{
void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
int nblocks_win_x, int nblocks_win_y,
int ncells_block_x, int ncells_block_y);
int ncells_block_x, int ncells_block_y,
const cudaStream_t& stream);
void compute_hists(int nbins, int block_stride_x, int block_stride_y,
int height, int width, const PtrStepSzf& grad,
const PtrStepSzb& qangle, float sigma, float* block_hists,
int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y);
int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y,
const cudaStream_t& stream);
void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y);
int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y,
const cudaStream_t& stream);
void classify_hists(int win_height, int win_width, int block_stride_y,
int block_stride_x, int win_stride_y, int win_stride_x, int height,
......@@ -90,12 +93,14 @@ namespace cv { namespace cuda { namespace device
cv::cuda::PtrStepSzf descriptors);
void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x,
cv::cuda::PtrStepSzf descriptors);
cv::cuda::PtrStepSzf descriptors,
const cudaStream_t& stream);
void compute_gradients_8UC1(int nbins, int height, int width, const cv::cuda::PtrStepSzb& img,
float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma);
void compute_gradients_8UC4(int nbins, int height, int width, const cv::cuda::PtrStepSzb& img,
float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma);
float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma,
const cudaStream_t& stream);
void resize_8UC1(const cv::cuda::PtrStepSzb& src, cv::cuda::PtrStepSzb dst);
void resize_8UC4(const cv::cuda::PtrStepSzb& src, cv::cuda::PtrStepSzb dst);
......@@ -182,8 +187,8 @@ namespace
private:
int getTotalHistSize(Size img_size) const;
void computeBlockHistograms(const GpuMat& img, GpuMat& block_hists);
void computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle);
void computeBlockHistograms(const GpuMat& img, GpuMat& block_hists, Stream& stream);
// void computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle, Stream& stream);
// Coefficients of the separating plane
float free_coef_;
......@@ -310,7 +315,7 @@ namespace
BufferPool pool(Stream::Null());
GpuMat block_hists = pool.getBuffer(1, getTotalHistSize(img.size()), CV_32FC1);
computeBlockHistograms(img, block_hists);
computeBlockHistograms(img, block_hists, Stream::Null());
Size wins_per_img = numPartsWithin(img.size(), win_size_, win_stride_);
......@@ -458,19 +463,16 @@ namespace
CV_Assert( img.type() == CV_8UC1 || img.type() == CV_8UC4 );
CV_Assert( win_stride_.width % block_stride_.width == 0 && win_stride_.height % block_stride_.height == 0 );
CV_Assert( !stream );
BufferPool pool(stream);
GpuMat block_hists = pool.getBuffer(1, getTotalHistSize(img.size()), CV_32FC1);
computeBlockHistograms(img, block_hists);
BufferPool pool(stream);
GpuMat block_hists = pool.getBuffer(1, getTotalHistSize(img.size()), CV_32FC1);
Size wins_per_img = numPartsWithin(img.size(), win_size_, win_stride_);
Size blocks_per_win = numPartsWithin(win_size_, block_size_, block_stride_);
const size_t block_hist_size = getBlockHistogramSize();
Size blocks_per_win = numPartsWithin(win_size_, block_size_, block_stride_);
Size wins_per_img = numPartsWithin(img.size(), win_size_, win_stride_);
_descriptors.create(wins_per_img.area(), static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32FC1);
GpuMat descriptors = _descriptors.getGpuMat();
GpuMat descriptors = _descriptors.getGpuMat();
computeBlockHistograms(img, block_hists, stream);
switch (descr_format_)
{
......@@ -490,7 +492,8 @@ namespace
img.rows, img.cols,
block_hists.ptr<float>(),
cell_size_.width, cells_per_block_.width,
descriptors);
descriptors,
StreamAccessor::getStream(stream));
break;
default:
CV_Error(cv::Error::StsBadArg, "Unknown descriptor format");
......@@ -504,18 +507,25 @@ namespace
return static_cast<int>(block_hist_size * blocks_per_img.area());
}
void HOG_Impl::computeBlockHistograms(const GpuMat& img, GpuMat& block_hists)
void HOG_Impl::computeBlockHistograms(const GpuMat& img, GpuMat& block_hists, Stream& stream)
{
BufferPool pool(stream);
cv::Size blocks_per_win = numPartsWithin(win_size_, block_size_, block_stride_);
hog::set_up_constants(nbins_, block_stride_.width, block_stride_.height, blocks_per_win.width, blocks_per_win.height, cells_per_block_.width, cells_per_block_.height);
BufferPool pool(Stream::Null());
float angleScale = static_cast<float>(nbins_ / CV_PI);
GpuMat grad = pool.getBuffer(img.size(), CV_32FC2);
GpuMat qangle = pool.getBuffer(img.size(), CV_8UC2);
GpuMat grad = pool.getBuffer(img.size(), CV_32FC2);
GpuMat qangle = pool.getBuffer(img.size(), CV_8UC2);
computeGradient(img, grad, qangle);
hog::set_up_constants(nbins_, block_stride_.width, block_stride_.height, blocks_per_win.width, blocks_per_win.height, cells_per_block_.width, cells_per_block_.height, StreamAccessor::getStream(stream));
block_hists.create(1, getTotalHistSize(img.size()), CV_32FC1);
switch (img.type())
{
case CV_8UC1:
hog::compute_gradients_8UC1(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_);
break;
case CV_8UC4:
hog::compute_gradients_8UC4(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_, StreamAccessor::getStream(stream));
break;
}
hog::compute_hists(nbins_,
block_stride_.width, block_stride_.height,
......@@ -524,7 +534,8 @@ namespace
(float)getWinSigma(),
block_hists.ptr<float>(),
cell_size_.width, cell_size_.height,
cells_per_block_.width, cells_per_block_.height);
cells_per_block_.width, cells_per_block_.height,
StreamAccessor::getStream(stream));
hog::normalize_hists(nbins_,
block_stride_.width, block_stride_.height,
......@@ -532,24 +543,8 @@ namespace
block_hists.ptr<float>(),
(float)threshold_L2hys_,
cell_size_.width, cell_size_.height,
cells_per_block_.width, cells_per_block_.height);
}
void HOG_Impl::computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle)
{
grad.create(img.size(), CV_32FC2);
qangle.create(img.size(), CV_8UC2);
float angleScale = (float)(nbins_ / CV_PI);
switch (img.type())
{
case CV_8UC1:
hog::compute_gradients_8UC1(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_);
break;
case CV_8UC4:
hog::compute_gradients_8UC4(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_);
break;
}
cells_per_block_.width, cells_per_block_.height,
StreamAccessor::getStream(stream));
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment