submodule / opencv / Commits / 2ced568d

Commit 2ced568d authored Jan 29, 2020 by Alexander Alekhin
Merge pull request #16220 from YashasSamaga:cuda4dnn-roi-pooling-test_fix-optim
parents e037800e fd369a50
Showing 3 changed files with 91 additions and 29 deletions

modules/dnn/src/cuda/roi_pooling.cu                +85  -28
modules/dnn/src/cuda4dnn/kernels/roi_pooling.hpp   +1   -1
modules/dnn/test/test_layers.cpp                   +5   -0
modules/dnn/src/cuda/roi_pooling.cu
...
...
@@ -24,49 +24,75 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
     namespace raw {
-        template <class T>
+        template <class T, std::size_t CHANNELS_PER_ITER>
         __global__ void roi_pooling(
             Span<T> output, size_type pooled_height, size_type pooled_width,
             View<T> input, size_type in_height, size_type in_width,
-            View<T> rois, size_type num_channels, T spatial_scale)
+            View<T> rois, size_type num_channels, float spatial_scale)
         {
             // input: [1, num_channels, in_height, in_width]
+            const auto in_image_size = in_height * in_width;
+
             // rois: [num_rois, 5]
             auto num_rois = rois.size() / 5;

             // output: [num_rois, num_channels, pooled_height, pooled_width]
             const auto out_spatial_size = pooled_height * pooled_width;
             const auto out_roi_size = num_channels * out_spatial_size;

-            /* every element in the output is mapped to a window in the input and each thread processes several windows */
-            for (auto idx : grid_stride_range(output.size()))
+            /* we have to compute the output value for every combination of (roi, c, y, x) in the output
+             *
+             * the computation involving (y, x) are identical for all non-spatial dimensions
+             * the computation and memory requests involving the roi are identical for remaining three axes
+             *
+             * we process multiple channels every iteration to reuse the identical computation
+             * and memory requests involved with the roi and spatial dimensions
+             */
+
+            /* if we are processing `CHANNELS_PER_ITER` channels per iteration, we will need
+             * (num_channels / CHANNELS_PER_ITER) iterations per (roi, x, y)
+             */
+            auto num_channel_iters_per_roi_xy = num_channels / CHANNELS_PER_ITER;
+
+            /* we need `num_channel_iters_per_roi_xy` iterations per (roi, x, y) and there are
+             * `num_rois` rois and `out_spatial_size` combinations of (x, y)
+             */
+            auto iters_per_roi = num_channel_iters_per_roi_xy * out_spatial_size;
+            auto iters_required = num_rois * iters_per_roi;
+
+            for (auto iter : grid_stride_range(iters_required))
             {
-                const auto n = idx / out_roi_size;
-                const auto c = (idx % out_roi_size) / out_spatial_size;
-                const auto y = (idx % out_spatial_size) / pooled_width;
-                const auto x = idx % pooled_width;
+                const index_type roi_no = iter / iters_per_roi;
+                const index_type c_start = ((iter % iters_per_roi) / out_spatial_size) * CHANNELS_PER_ITER;
+
+                /* note here that consecutive `iter` values will often have consecutive `x` values
+                 * => stores into output will be coalesced across threads
+                 */
+                const index_type y = (iter % out_spatial_size) / pooled_width;
+                const index_type x = iter % pooled_width;

-                const index_type roi_offset = n * 5;
+                const index_type roi_offset = roi_no * 5;

                 using device::round;
                 const index_type batch_id = rois[roi_offset + 0];
-                const index_type x_start_roi = round(rois[roi_offset + 1] * spatial_scale);
-                const index_type y_start_roi = round(rois[roi_offset + 2] * spatial_scale);
-                const index_type x_end_roi = round(rois[roi_offset + 3] * spatial_scale);
-                const index_type y_end_roi = round(rois[roi_offset + 4] * spatial_scale);
+                const index_type x_start_roi = round(static_cast<float>(rois[roi_offset + 1]) * spatial_scale);
+                const index_type y_start_roi = round(static_cast<float>(rois[roi_offset + 2]) * spatial_scale);
+                const index_type x_end_roi = round(static_cast<float>(rois[roi_offset + 3]) * spatial_scale);
+                const index_type y_end_roi = round(static_cast<float>(rois[roi_offset + 4]) * spatial_scale);

                 using device::max;
                 const auto roi_width = max<index_type>(x_end_roi - x_start_roi + 1, 1);
                 const auto roi_height = max<index_type>(y_end_roi - y_start_roi + 1, 1);

-                const auto roi_width_ratio = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
-                const auto roi_height_ratio = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+                const auto roi_width_ratio = static_cast<float>(roi_width) / pooled_width;
+                const auto roi_height_ratio = static_cast<float>(roi_height) / pooled_height;

-                auto x_start = x_start_roi + static_cast<index_type>(static_cast<T>(x) * roi_width_ratio);
-                auto y_start = y_start_roi + static_cast<index_type>(static_cast<T>(y) * roi_height_ratio);
+                auto x_start = x_start_roi + static_cast<index_type>(x * roi_width_ratio);
+                auto y_start = y_start_roi + static_cast<index_type>(y * roi_height_ratio);

                 using device::ceil;
-                auto x_end = x_start_roi + static_cast<index_type>(ceil(static_cast<T>(x + 1) * roi_width_ratio));
-                auto y_end = y_start_roi + static_cast<index_type>(ceil(static_cast<T>(y + 1) * roi_height_ratio));
+                auto x_end = x_start_roi + static_cast<index_type>(ceil((x + 1) * roi_width_ratio));
+                auto y_end = y_start_roi + static_cast<index_type>(ceil((y + 1) * roi_height_ratio));

                 using device::max;
                 x_start = max<index_type>(x_start, 0);
...
...
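The comments added to the kernel explain how a single flat grid-stride index now encodes a (roi, channel block, y, x) tuple. The host-side sketch below is not part of the commit; it only mirrors that index arithmetic with made-up sizes so the decomposition can be checked in isolation. Because `x` varies fastest in `iter`, consecutive threads store to consecutive output elements, which is the coalescing the kernel comment refers to.

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Host-side mirror of the index arithmetic used by the updated kernel. All sizes
    // below are made up for illustration; CHANNELS_PER_ITER must divide num_channels,
    // which the dispatcher in roi_pooling.cu guarantees.
    int main()
    {
        const std::size_t CHANNELS_PER_ITER = 4;
        const std::size_t num_rois = 3, num_channels = 8;
        const std::size_t pooled_height = 2, pooled_width = 7;

        const std::size_t out_spatial_size = pooled_height * pooled_width;
        const std::size_t num_channel_iters_per_roi_xy = num_channels / CHANNELS_PER_ITER;
        const std::size_t iters_per_roi = num_channel_iters_per_roi_xy * out_spatial_size;
        const std::size_t iters_required = num_rois * iters_per_roi;

        // mark every (roi, channel block, y, x) combination that the loop visits
        std::vector<int> visits(iters_required, 0);
        for (std::size_t iter = 0; iter < iters_required; iter++)
        {
            const std::size_t roi_no  = iter / iters_per_roi;
            const std::size_t c_start = ((iter % iters_per_roi) / out_spatial_size) * CHANNELS_PER_ITER;
            const std::size_t y = (iter % out_spatial_size) / pooled_width;
            const std::size_t x = iter % pooled_width;

            const std::size_t block = c_start / CHANNELS_PER_ITER;
            visits[((roi_no * num_channel_iters_per_roi_xy + block) * pooled_height + y) * pooled_width + x]++;
        }
        for (int count : visits)
            assert(count == 1);  // each combination is produced exactly once
        return 0;
    }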
@@ -76,29 +102,48 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
                 x_end = min<index_type>(x_end, in_width);
                 y_end = min<index_type>(y_end, in_height);

-                /* We have to set the output to zero if (x_start >= x_end) or (y_start >= y_end). If either
-                 * condition is true, the loops below won't execute even a single iteration. Hence, by setting
-                 * `max_val` to zero in this case, we can combine it with the `else` code.
-                 */
-                T max_val = (x_start >= x_end || y_start >= y_end) ? T(0) : device::numeric_limits<T>::lowest();
+                index_type in_offset = (batch_id * num_channels + c_start) * in_height * in_width;
+                index_type out_idx = roi_no * out_roi_size + c_start * out_spatial_size + y * pooled_width + x;

-                const index_type in_offset = (batch_id * num_channels + c) * in_height * in_width;
-                for (auto iy = y_start; iy < y_end; iy++)
+                for (int i = 0; i < CHANNELS_PER_ITER; i++)
                 {
-                    for (auto ix = x_start; ix < x_end; ix++)
+                    /* We have to set the output to zero if (x_start >= x_end) or (y_start >= y_end). If either
+                     * condition is true, the loops below won't execute even a single iteration. Hence, by setting
+                     * `max_val` to zero in this case, we can combine it with the `else` code.
+                     */
+                    T max_val = (x_start >= x_end || y_start >= y_end) ? T(0) : device::numeric_limits<T>::lowest();
+
+                    for (auto iy = y_start; iy < y_end; iy++)
                     {
-                        const auto in_idx = in_offset + iy * in_width + ix;
-                        max_val = max(max_val, input[in_idx]);
+                        const auto in_idx = in_offset + iy * in_width;
+                        for (auto ix = x_start; ix < x_end; ix++)
+                        {
+                            max_val = max(max_val, input[in_idx + ix]);
+                        }
                     }
-                }

-                output[idx] = max_val;
+                    output[out_idx] = max_val;
+
+                    in_offset += in_image_size;
+                    out_idx += out_spatial_size;
+                }
             }
         }
     }

+    template <class T, std::size_t CHANNELS_PER_ITER> static
+    void launch_multichannel_roi_pooling(const Stream& stream,
+        Span<T> output, size_type pooled_height, size_type pooled_width,
+        View<T> input, size_type in_height, size_type in_width,
+        View<T> rois, size_type num_channels, float spatial_scale)
+    {
+        auto kernel = raw::roi_pooling<T, CHANNELS_PER_ITER>;
+        auto policy = make_policy(kernel, output.size() / CHANNELS_PER_ITER, 0, stream);
+        launch_kernel(kernel, policy, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
+    }
+
     template <class T>
-    void roi_pooling(const Stream& stream, TensorSpan<T> output, TensorView<T> input, View<T> rois, T spatial_scale)
+    void roi_pooling(const Stream& stream, TensorSpan<T> output, TensorView<T> input, View<T> rois, float spatial_scale)
     {
         CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
...
...
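For readers unfamiliar with the inner reduction: each output cell holds the maximum of its input window, and an empty window (x_start >= x_end or y_start >= y_end) produces zero. The CPU-only sketch below is not from the commit; the helper name pool_window is invented purely to illustrate the same reduction on a plain float plane.

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <limits>
    #include <vector>

    // Take the max of plane[iy][ix] over [y_start, y_end) x [x_start, x_end);
    // an empty window yields 0, matching the kernel's max_val initialization.
    static float pool_window(const std::vector<float>& plane, std::size_t in_width,
                             std::size_t y_start, std::size_t y_end,
                             std::size_t x_start, std::size_t x_end)
    {
        if (x_start >= x_end || y_start >= y_end)
            return 0.f;
        float max_val = std::numeric_limits<float>::lowest();
        for (std::size_t iy = y_start; iy < y_end; iy++)
            for (std::size_t ix = x_start; ix < x_end; ix++)
                max_val = std::max(max_val, plane[iy * in_width + ix]);
        return max_val;
    }

    int main()
    {
        // 2x4 plane; the 1x2 window covering {5, 7} should reduce to 7
        std::vector<float> plane = { 1.f, 5.f, 7.f, 2.f,
                                     0.f, 3.f, 4.f, 6.f };
        assert(pool_window(plane, 4, 0, 1, 1, 3) == 7.f);
        assert(pool_window(plane, 4, 1, 1, 0, 4) == 0.f);  // empty window
        return 0;
    }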
@@ -110,13 +155,25 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
         size_type in_height = input.get_axis_size(2);
         size_type in_width = input.get_axis_size(3);

-        auto kernel = raw::roi_pooling<T>;
-        auto policy = make_policy(kernel, output.size(), 0, stream);
-        launch_kernel(kernel, policy, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
+        if (num_channels % 64 == 0) {
+            launch_multichannel_roi_pooling<T, 64>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
+        } else if (num_channels % 32 == 0) {
+            launch_multichannel_roi_pooling<T, 32>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
+        } else if (num_channels % 16 == 0) {
+            launch_multichannel_roi_pooling<T, 16>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
+        } else if (num_channels % 8 == 0) {
+            launch_multichannel_roi_pooling<T, 8>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
+        } else if (num_channels % 4 == 0) {
+            launch_multichannel_roi_pooling<T, 4>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
+        } else if (num_channels % 2 == 0) {
+            launch_multichannel_roi_pooling<T, 2>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
+        } else {
+            launch_multichannel_roi_pooling<T, 1>(stream, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
+        }
     }

 #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
-    template void roi_pooling(const Stream& stream, TensorSpan<__half> output, TensorView<__half> input, View<__half> rois, __half spatial_scale);
+    template void roi_pooling(const Stream& stream, TensorSpan<__half> output, TensorView<__half> input, View<__half> rois, float spatial_scale);
 #endif
     template void roi_pooling(const Stream& stream, TensorSpan<float> output, TensorView<float> input, View<float> rois, float spatial_scale);
...
...
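The new dispatch code selects the largest block size from {64, 32, 16, 8, 4, 2, 1} that evenly divides the channel count, so the kernel's assumption that CHANNELS_PER_ITER divides num_channels always holds. Below is a minimal host-side sketch of that selection; the helper name choose_channels_per_iter is hypothetical and not part of the commit.

    #include <cassert>
    #include <cstddef>

    // Mirrors the if/else chain in roi_pooling(): pick the largest block size
    // from {64, 32, 16, 8, 4, 2, 1} that divides num_channels evenly.
    static std::size_t choose_channels_per_iter(std::size_t num_channels)
    {
        for (std::size_t block : { 64, 32, 16, 8, 4, 2 })
            if (num_channels % block == 0)
                return block;
        return 1;  // always a valid fallback
    }

    int main()
    {
        assert(choose_channels_per_iter(256) == 64);
        assert(choose_channels_per_iter(96)  == 32);
        assert(choose_channels_per_iter(21)  == 1);
        return 0;
    }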
modules/dnn/src/cuda4dnn/kernels/roi_pooling.hpp
...
...
@@ -12,7 +12,7 @@
 namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

     template <class T>
-    void roi_pooling(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, csl::View<T> rois, T spatial_scale);
+    void roi_pooling(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, csl::View<T> rois, float spatial_scale);

 }}}} /* namespace cv::dnn::cuda4dnn::kernels */
...
...
modules/dnn/test/test_layers.cpp
...
...
@@ -600,6 +600,11 @@ TEST_P(Test_Caffe_layers, ROIPooling_Accuracy)
     double l1 = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 1e-3 : 1e-5;
     double lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 1e-3 : 1e-4;
+    if (target == DNN_TARGET_CUDA_FP16)
+    {
+        l1 = 2e-4;
+        lInf = 9e-4;
+    }
     normAssert(out, ref, "", l1, lInf);
 }
...
...