Merge pull request #16096 from YashasSamaga:cuda4dnn-region-optimize

b505cf84 · Alexander Alekhin · 476a0273 · dd3f517f
Commit b505cf84 authored Dec 09, 2019 by Alexander Alekhin
Showing with 13 additions and 20 deletions

region.cu modules/dnn/src/cuda/region.cu +0 -0

region.hpp modules/dnn/src/cuda4dnn/kernels/region.hpp +4 -11

region.hpp modules/dnn/src/cuda4dnn/primitives/region.hpp +9 -9

No files found.
--- a/modules/dnn/src/cuda/region.cu
+++ b/modules/dnn/src/cuda/region.cu
--- a/modules/dnn/src/cuda4dnn/kernels/region.hpp
+++ b/modules/dnn/src/cuda4dnn/kernels/region.hpp
@@ -13,19 +13,12 @@
 namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
    template <class T>
-    void sigmoid_strided(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, std::size_t n, std::size_t stride, std::size_t offset);
+    void region(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, csl::View<T> bias,
-    template <class T>
-    void softmax_strided(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, std::size_t n, std::size_t stride, std::size_t offset);
-    template <class T>
-    void region_finalize(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, csl::View<T> bias,
        T object_prob_cutoff, T class_prob_cutoff,
-        std::size_t height_norm, std::size_t width_norm,
+        std::size_t boxes_per_cell, std::size_t box_size,
        std::size_t rows, std::size_t cols,
-        std::size_t boxes_per_cell,
+        std::size_t height_norm, std::size_t width_norm,
-        std::size_t box_size,
+        bool if_true_sigmoid_else_softmax);
-        std::size_t classes);
 }}}} /* namespace cv::dnn::cuda4dnn::kernels */

--- a/modules/dnn/src/cuda4dnn/primitives/region.hpp
+++ b/modules/dnn/src/cuda4dnn/primitives/region.hpp
@@ -102,21 +102,21 @@ namespace cv { namespace dnn { namespace cuda4dnn {
            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();
-            csl::memcpy<T>(output.get(), input.get(), output.size(), stream);
            auto rows = input.get_axis_size(1);
            auto cols = input.get_axis_size(2);
            auto cell_box_size = classes + 4 + 1;
            /* we squash class scores into probabilities using softmax or sigmoid */
-            if (squash_type == SquashMethod::SOFTMAX)
+            bool if_true_sigmoid_else_softmax = (squash_type == SquashMethod::SIGMOID);
-                kernels::softmax_strided<T>(stream, output, input, classes, cell_box_size, 5);
-            else if (squash_type == SquashMethod::SIGMOID)
+            kernels::region<T>(stream, output, input, biasTensor,
-                kernels::sigmoid_strided<T>(stream, output, input, classes, cell_box_size, 5);
+                object_prob_cutoff, class_prob_cutoff,
+                boxes_per_cell, cell_box_size,
-            kernels::region_finalize<T>(stream, output, input, biasTensor, object_prob_cutoff, class_prob_cutoff,
+                rows, cols,
-                height_norm, width_norm, rows, cols, boxes_per_cell, cell_box_size, classes);
+                height_norm, width_norm,
+                if_true_sigmoid_else_softmax
+            );
            if (nms_iou_threshold > 0) {
                auto output_mat = output_wrapper->getMutableHostMat();