Unverified commit 35f66340, authored by Claudio

Add cuda::Stream capability to cuda::HOG::compute

In the previous version only the default stream could be used, i.e. cv::cuda::Stream::Null().

With this change, HOG::compute() will now run in parallel over different streams.

The code has been reordered so that all data allocation is completed
first, then all the kernels are run in parallel over streams.

Fix #8177
parent f109c013
This diff is collapsed.
......@@ -66,15 +66,18 @@ namespace cv { namespace cuda { namespace device
void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
int nblocks_win_x, int nblocks_win_y,
int ncells_block_x, int ncells_block_y);
int ncells_block_x, int ncells_block_y,
const cudaStream_t& stream);
void compute_hists(int nbins, int block_stride_x, int block_stride_y,
int height, int width, const PtrStepSzf& grad,
const PtrStepSzb& qangle, float sigma, float* block_hists,
int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y);
int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y,
const cudaStream_t& stream);
void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y);
int height, int width, float* block_hists, float threshold, int cell_size_x, int cell_size_y, int ncells_block_x, int ncells_block_y,
const cudaStream_t& stream);
void classify_hists(int win_height, int win_width, int block_stride_y,
int block_stride_x, int win_stride_y, int win_stride_x, int height,
......@@ -90,12 +93,14 @@ namespace cv { namespace cuda { namespace device
cv::cuda::PtrStepSzf descriptors);
void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int cell_size_x, int ncells_block_x,
cv::cuda::PtrStepSzf descriptors);
cv::cuda::PtrStepSzf descriptors,
const cudaStream_t& stream);
void compute_gradients_8UC1(int nbins, int height, int width, const cv::cuda::PtrStepSzb& img,
float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma);
void compute_gradients_8UC4(int nbins, int height, int width, const cv::cuda::PtrStepSzb& img,
float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma);
float angle_scale, cv::cuda::PtrStepSzf grad, cv::cuda::PtrStepSzb qangle, bool correct_gamma,
const cudaStream_t& stream);
void resize_8UC1(const cv::cuda::PtrStepSzb& src, cv::cuda::PtrStepSzb dst);
void resize_8UC4(const cv::cuda::PtrStepSzb& src, cv::cuda::PtrStepSzb dst);
......@@ -182,8 +187,8 @@ namespace
int getTotalHistSize(Size img_size) const;
void computeBlockHistograms(const GpuMat& img, GpuMat& block_hists);
void computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle);
void computeBlockHistograms(const GpuMat& img, GpuMat& block_hists, Stream& stream);
// void computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle, Stream& stream);
// Coefficients of the separating plane
float free_coef_;
......@@ -310,7 +315,7 @@ namespace
BufferPool pool(Stream::Null());
GpuMat block_hists = pool.getBuffer(1, getTotalHistSize(img.size()), CV_32FC1);
computeBlockHistograms(img, block_hists);
computeBlockHistograms(img, block_hists, Stream::Null());
Size wins_per_img = numPartsWithin(img.size(), win_size_, win_stride_);
......@@ -458,19 +463,16 @@ namespace
CV_Assert( img.type() == CV_8UC1 || img.type() == CV_8UC4 );
CV_Assert( win_stride_.width % block_stride_.width == 0 && win_stride_.height % block_stride_.height == 0 );
CV_Assert( !stream );
BufferPool pool(stream);
GpuMat block_hists = pool.getBuffer(1, getTotalHistSize(img.size()), CV_32FC1);
computeBlockHistograms(img, block_hists);
BufferPool pool(stream);
GpuMat block_hists = pool.getBuffer(1, getTotalHistSize(img.size()), CV_32FC1);
Size wins_per_img = numPartsWithin(img.size(), win_size_, win_stride_);
Size blocks_per_win = numPartsWithin(win_size_, block_size_, block_stride_);
const size_t block_hist_size = getBlockHistogramSize();
Size blocks_per_win = numPartsWithin(win_size_, block_size_, block_stride_);
Size wins_per_img = numPartsWithin(img.size(), win_size_, win_stride_);
_descriptors.create(wins_per_img.area(), static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32FC1);
GpuMat descriptors = _descriptors.getGpuMat();
GpuMat descriptors = _descriptors.getGpuMat();
computeBlockHistograms(img, block_hists, stream);
switch (descr_format_)
......@@ -490,7 +492,8 @@ namespace
img.rows, img.cols,
cell_size_.width, cells_per_block_.width,
CV_Error(cv::Error::StsBadArg, "Unknown descriptor format");
......@@ -504,18 +507,25 @@ namespace
return static_cast<int>(block_hist_size * blocks_per_img.area());
void HOG_Impl::computeBlockHistograms(const GpuMat& img, GpuMat& block_hists)
void HOG_Impl::computeBlockHistograms(const GpuMat& img, GpuMat& block_hists, Stream& stream)
BufferPool pool(stream);
cv::Size blocks_per_win = numPartsWithin(win_size_, block_size_, block_stride_);
hog::set_up_constants(nbins_, block_stride_.width, block_stride_.height, blocks_per_win.width, blocks_per_win.height, cells_per_block_.width, cells_per_block_.height);
BufferPool pool(Stream::Null());
float angleScale = static_cast<float>(nbins_ / CV_PI);
GpuMat grad = pool.getBuffer(img.size(), CV_32FC2);
GpuMat qangle = pool.getBuffer(img.size(), CV_8UC2);
GpuMat grad = pool.getBuffer(img.size(), CV_32FC2);
GpuMat qangle = pool.getBuffer(img.size(), CV_8UC2);
computeGradient(img, grad, qangle);
hog::set_up_constants(nbins_, block_stride_.width, block_stride_.height, blocks_per_win.width, blocks_per_win.height, cells_per_block_.width, cells_per_block_.height, StreamAccessor::getStream(stream));
block_hists.create(1, getTotalHistSize(img.size()), CV_32FC1);
switch (img.type())
case CV_8UC1:
hog::compute_gradients_8UC1(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_);
case CV_8UC4:
hog::compute_gradients_8UC4(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_, StreamAccessor::getStream(stream));
block_stride_.width, block_stride_.height,
......@@ -524,7 +534,8 @@ namespace
cell_size_.width, cell_size_.height,
cells_per_block_.width, cells_per_block_.height);
cells_per_block_.width, cells_per_block_.height,
block_stride_.width, block_stride_.height,
......@@ -532,24 +543,8 @@ namespace
cell_size_.width, cell_size_.height,
cells_per_block_.width, cells_per_block_.height);
void HOG_Impl::computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle)
grad.create(img.size(), CV_32FC2);
qangle.create(img.size(), CV_8UC2);
float angleScale = (float)(nbins_ / CV_PI);
switch (img.type())
case CV_8UC1:
hog::compute_gradients_8UC1(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_);
case CV_8UC4:
hog::compute_gradients_8UC4(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_);
cells_per_block_.width, cells_per_block_.height,
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment