Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
b505cf84
Commit
b505cf84
authored
Dec 09, 2019
by
Alexander Alekhin
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #16096 from YashasSamaga:cuda4dnn-region-optimize
parents
476a0273
dd3f517f
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
117 additions
and
146 deletions
+117
-146
region.cu
modules/dnn/src/cuda/region.cu
+104
-126
region.hpp
modules/dnn/src/cuda4dnn/kernels/region.hpp
+4
-11
region.hpp
modules/dnn/src/cuda4dnn/primitives/region.hpp
+9
-9
No files found.
modules/dnn/src/cuda/region.cu
View file @
b505cf84
...
...
@@ -24,176 +24,154 @@ using namespace cv::dnn::cuda4dnn::csl::device;
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
namespace raw {
template <class T>
__global__ void sigmoid_strided(Span<T> output, View<T> input, size_type n, size_type stride, size_type offset) {
/* - the input is divided into equal blocks strided by `stride`
* - we must apply sigmoid to a continuous range of `n` values starting from `offset` in every block
*/
for (auto i : grid_stride_range(n * output.size() / stride)) {
auto block_idx = i / n;
auto index = block_idx * stride + offset + (i % n);
using device::sigmoid;
output[index] = sigmoid(input[index]);
}
}
template <class T>
__global__ void softmax_strided(Span<T> output, View<T> input, size_type n, size_type stride, size_type offset_) {
for (auto idx : grid_stride_range(output.size() / stride)) {
index_type offset = idx * stride + offset_;
auto largest = numeric_limits<T>::lowest();
for (int i = 0; i < n; i++) {
using device::max;
largest = max(largest, output[offset + i]);
}
auto sum = T(0);
for (int i = 0; i < n; i++) {
using device::exp;
auto temp = exp(output[offset + i] - largest);
sum += temp;
output[offset + i] = temp;
}
for (int i = 0; i < n; i++) {
output[offset + i] /= sum;
}
}
}
template <class T>
__global__ void region_finalize(Span<T> output, View<T> input, View<T> bias,
T object_prob_cutoff, T class_prob_cutoff,
size_type height_norm, size_type width_norm,
__global__ void region_box(
Span<T> output, View<T> input, View<T> bias,
size_type boxes_per_cell, size_type box_size,
size_type rows, size_type cols,
size_type boxes_per_cell,
size_type box_size,
size_type classes)
size_type height_norm, size_type width_norm,
T object_prob_cutoff)
{
using vector2_type = get_vector_type_t<T, 2>;
auto bias_vPtr = vector2_type::get_pointer(bias.data());
for (auto box_index : grid_stride_range(output.size() / box_size)) {
auto box_of_the_cell = box_index % boxes_per_cell; /* box number within a cell */
auto box_offset = box_index * box_size;
const
auto box_of_the_cell = box_index % boxes_per_cell; /* box number within a cell */
const
auto box_offset = box_index * box_size;
auto batch_inner_size = rows * cols * boxes_per_cell;
auto row_inner_size = cols * boxes_per_cell;
auto col_inner_size = boxes_per_cell;
const
auto batch_inner_size = rows * cols * boxes_per_cell;
const
auto row_inner_size = cols * boxes_per_cell;
const
auto col_inner_size = boxes_per_cell;
auto y = (box_index % batch_inner_size) / row_inner_size;
auto x = (box_index % row_inner_size) / col_inner_size;
const
auto y = (box_index % batch_inner_size) / row_inner_size;
const
auto x = (box_index % row_inner_size) / col_inner_size;
using device::sigmoid;
using device::exp;
output[box_offset + 0] = (T(x) + sigmoid(input[box_offset + 0])) / T(cols);
output[box_offset + 1] = (T(y) + sigmoid(input[box_offset + 1])) / T(rows);
output[box_offset + 2] = exp(input[box_offset + 2]) * bias[2 * box_of_the_cell + 0] / T(width_norm);
output[box_offset + 3] = exp(input[box_offset + 3]) * bias[2 * box_of_the_cell + 1] / T(height_norm);
vector2_type bias_xy;
v_load(bias_xy, bias_vPtr[box_of_the_cell]);
using device::exp;
output[box_offset + 2] = exp(input[box_offset + 2]) * bias_xy.data[0] / T(width_norm);
output[box_offset + 3] = exp(input[box_offset + 3]) * bias_xy.data[1] / T(height_norm);
/* squash objectness score into a probability */
using device::sigmoid;
T objectness_prob = sigmoid(output[box_offset + 4]);
output[box_offset + 4] = objectness_prob;
T objectness_prob = sigmoid(input[box_offset + 4]);
/* ignore prediction if the objectness probability is less than the cutoff */
if (objectness_prob < object_prob_cutoff)
objectness_prob = 0;
/* the class probabilities we have currently are conditional class probabilities
output[box_offset + 4] = objectness_prob;
}
}
template <class T>
__global__ void region_sigmoid_class_score(Span<T> output, View<T> input, T class_prob_cutoff, size_type box_size)
{
for (auto idx : grid_stride_range(output.size())) {
const index_type box_no = idx / box_size;
const index_type start_of_box = box_no * box_size;
const index_type box_offset = idx % box_size;
if (box_offset < 5) {
/* continue as we have already processed these in region_box */
continue;
}
auto objectness_prob = output[start_of_box + 4];
/* the class probabilities we currently have are conditional class probabilities
* given the object
*
* to obtain the actual class probability, we multiply the conditional probability
* with the object probability
*/
const index_type class_begin = box_offset + 5; /* 4 box coordinates, 1 obj prob, class probs... */
const index_type class_end = class_begin + classes;
index_type offset = class_begin;
auto actual_class_prob = objectness_prob * sigmoid(input[idx]);
if (actual_class_prob <= class_prob_cutoff)
actual_class_prob = T(0);
output[idx] = actual_class_prob;
}
}
using vector_type = get_vector_type_t<T, 4>;
template <class T>
__global__ void region_softmax_class_score(Span<T> output, View<T> input, T class_prob_cutoff, size_type box_size) {
for (auto box_no : grid_stride_range(output.size() / box_size)) {
const index_type start_of_box = box_no * box_size;
const index_type start_idx = start_of_box + 5;
const index_type end_idx = start_of_box + box_size;
/* process each class independently until the offset is aligned to an n-element boundary */
while (offset % vector_type::size() != 0 && offset < class_end) {
T actual_class_prob = objectness_prob * output[offset];
if (actual_class_prob <= class_prob_cutoff)
actual_class_prob = T(0);
output[offset] = actual_class_prob;
offset++;
auto largest = numeric_limits<T>::lowest();
for (int idx = start_idx; idx < end_idx; idx++) {
using device::max;
largest = max(largest, input[idx]);
}
auto output_vPtr = vector_type::get_pointer(output.data() + offset);
auto input_vPtr = vector_type::get_pointer(input.data() + offset);
for (int i = 0; (offset + vector_type::size()) < class_end; i++) {
vector_type vec;
v_load(vec, output_vPtr[i]);
for (int j = 0; j < vector_type::size(); j++) {
T actual_class_prob = objectness_prob * vec.data[j];
if (actual_class_prob <= class_prob_cutoff)
actual_class_prob = T(0);
vec.data[j] = actual_class_prob;
}
v_store(output_vPtr[i], vec);
offset += vector_type::size();
auto sum = T(0);
for (int idx = start_idx; idx < end_idx; idx++) {
using device::exp;
auto temp = exp(input[idx] - largest);
sum += temp;
output[idx] = temp;
}
/* process the remaining classes */
while (offset < class_end) {
T actual_class_prob = objectness_prob * output[offset];
for (int idx = start_idx; idx < end_idx; idx++) {
auto softmax_score = output[idx] / sum;
/* the class probabilities we currently have are conditional class probabilities
* given the object
*
* to obtain the actual class probability, we multiply the conditional probability
* with the object probability
*/
auto objectness_prob = output[start_of_box + 4];
auto actual_class_prob = objectness_prob * softmax_score;
if (actual_class_prob <= class_prob_cutoff)
actual_class_prob = T(0);
output[offset] = actual_class_prob;
offset++;
output[idx] = actual_class_prob;
}
}
}
}
template <class T>
void sigmoid_strided(const Stream& stream, Span<T> output, View<T> input, std::size_t n, std::size_t stride, std::size_t offset) {
CV_Assert(output.size() % stride == 0);
auto kernel = raw::sigmoid_strided<T>;
auto policy = make_policy(kernel, n * output.size() / stride, 0, stream);
launch_kernel(kernel, policy, output, input, n, stride, offset);
}
template void sigmoid_strided(const Stream&, Span<__half>, View<__half>, std::size_t, std::size_t, std::size_t);
template void sigmoid_strided(const Stream&, Span<float>, View<float>, std::size_t, std::size_t, std::size_t);
template <class T>
void softmax_strided(const Stream& stream, Span<T> output, View<T> input, std::size_t n, std::size_t stride, std::size_t offset) {
CV_Assert(output.size() % stride == 0);
auto kernel = raw::softmax_strided<T>;
auto policy = make_policy(kernel, output.size() / stride, 0, stream);
launch_kernel(kernel, policy, output, input, n, stride, offset);
}
template void softmax_strided(const Stream&, Span<__half>, View<__half>, std::size_t, std::size_t, std::size_t);
template void softmax_strided(const Stream&, Span<float>, View<float>, std::size_t, std::size_t, std::size_t);
template <class T>
void region_finalize(const Stream& stream, Span<T> output, View<T> input, View<T> bias,
void region(const Stream& stream, Span<T> output, View<T> input, View<T> bias,
T object_prob_cutoff, T class_prob_cutoff,
std::size_t
height_norm, std::size_t width_norm
,
std::size_t
boxes_per_cell, std::size_t box_size
,
std::size_t rows, std::size_t cols,
std::size_t boxes_per_cell,
std::size_t box_size,
std::size_t classes)
std::size_t height_norm, std::size_t width_norm,
bool if_true_sigmoid_else_softmax /* true = sigmoid, false = softmax */)
{
CV_Assert(output.size() == input.size());
CV_Assert(output.size() % box_size == 0);
auto kernel = raw::region_finalize<T>;
auto policy = make_policy(kernel, output.size() / box_size, 0, stream);
launch_kernel(kernel, policy, output, input, bias,
object_prob_cutoff, class_prob_cutoff,
height_norm, width_norm,
rows, cols, boxes_per_cell, box_size, classes);
CV_Assert(is_fully_aligned(bias, 2));
auto box_kernel = raw::region_box<T>;
auto box_policy = make_policy(box_kernel, output.size() / box_size, 0, stream);
launch_kernel(box_kernel, box_policy,
output, input, bias, boxes_per_cell, box_size,
rows, cols, height_norm, width_norm,
object_prob_cutoff);
if (if_true_sigmoid_else_softmax) {
auto kernel_score = raw::region_sigmoid_class_score<T>;
auto policy_score = make_policy(kernel_score, output.size(), 0, stream);
launch_kernel(kernel_score, policy_score, output, input, class_prob_cutoff, box_size);
} else {
auto kernel_score = raw::region_softmax_class_score<T>;
auto policy_score = make_policy(kernel_score, output.size(), 0, stream);
launch_kernel(kernel_score, policy_score, output, input, class_prob_cutoff, box_size);
}
}
template void region
_finalize
(const Stream&, Span<__half>, View<__half>, View<__half>,
__half, __half, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t,
std::size_t
);
template void region(const Stream&, Span<__half>, View<__half>, View<__half>,
__half, __half, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t,
bool
);
template void region
_finalize
(const Stream&, Span<float>, View<float>, View<float>,
float, float, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t,
std::size_t
);
template void region(const Stream&, Span<float>, View<float>, View<float>,
float, float, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t,
bool
);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
modules/dnn/src/cuda4dnn/kernels/region.hpp
View file @
b505cf84
...
...
@@ -13,19 +13,12 @@
namespace
cv
{
namespace
dnn
{
namespace
cuda4dnn
{
namespace
kernels
{
template
<
class
T
>
void
sigmoid_strided
(
const
csl
::
Stream
&
stream
,
csl
::
Span
<
T
>
output
,
csl
::
View
<
T
>
input
,
std
::
size_t
n
,
std
::
size_t
stride
,
std
::
size_t
offset
);
template
<
class
T
>
void
softmax_strided
(
const
csl
::
Stream
&
stream
,
csl
::
Span
<
T
>
output
,
csl
::
View
<
T
>
input
,
std
::
size_t
n
,
std
::
size_t
stride
,
std
::
size_t
offset
);
template
<
class
T
>
void
region_finalize
(
const
csl
::
Stream
&
stream
,
csl
::
Span
<
T
>
output
,
csl
::
View
<
T
>
input
,
csl
::
View
<
T
>
bias
,
void
region
(
const
csl
::
Stream
&
stream
,
csl
::
Span
<
T
>
output
,
csl
::
View
<
T
>
input
,
csl
::
View
<
T
>
bias
,
T
object_prob_cutoff
,
T
class_prob_cutoff
,
std
::
size_t
height_norm
,
std
::
size_t
width_norm
,
std
::
size_t
boxes_per_cell
,
std
::
size_t
box_size
,
std
::
size_t
rows
,
std
::
size_t
cols
,
std
::
size_t
boxes_per_cell
,
std
::
size_t
box_size
,
std
::
size_t
classes
);
std
::
size_t
height_norm
,
std
::
size_t
width_norm
,
bool
if_true_sigmoid_else_softmax
);
}}}}
/* namespace cv::dnn::cuda4dnn::kernels */
...
...
modules/dnn/src/cuda4dnn/primitives/region.hpp
View file @
b505cf84
...
...
@@ -102,21 +102,21 @@ namespace cv { namespace dnn { namespace cuda4dnn {
auto
output_wrapper
=
outputs
[
0
].
dynamicCast
<
wrapper_type
>
();
auto
output
=
output_wrapper
->
getSpan
();
csl
::
memcpy
<
T
>
(
output
.
get
(),
input
.
get
(),
output
.
size
(),
stream
);
auto
rows
=
input
.
get_axis_size
(
1
);
auto
cols
=
input
.
get_axis_size
(
2
);
auto
cell_box_size
=
classes
+
4
+
1
;
/* we squash class scores into probabilities using softmax or sigmoid */
if
(
squash_type
==
SquashMethod
::
SOFTMAX
)
kernels
::
softmax_strided
<
T
>
(
stream
,
output
,
input
,
classes
,
cell_box_size
,
5
);
else
if
(
squash_type
==
SquashMethod
::
SIGMOID
)
kernels
::
sigmoid_strided
<
T
>
(
stream
,
output
,
input
,
classes
,
cell_box_size
,
5
);
kernels
::
region_finalize
<
T
>
(
stream
,
output
,
input
,
biasTensor
,
object_prob_cutoff
,
class_prob_cutoff
,
height_norm
,
width_norm
,
rows
,
cols
,
boxes_per_cell
,
cell_box_size
,
classes
);
bool
if_true_sigmoid_else_softmax
=
(
squash_type
==
SquashMethod
::
SIGMOID
);
kernels
::
region
<
T
>
(
stream
,
output
,
input
,
biasTensor
,
object_prob_cutoff
,
class_prob_cutoff
,
boxes_per_cell
,
cell_box_size
,
rows
,
cols
,
height_norm
,
width_norm
,
if_true_sigmoid_else_softmax
);
if
(
nms_iou_threshold
>
0
)
{
auto
output_mat
=
output_wrapper
->
getMutableHostMat
();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment