Commit 16bc505d authored by YashasSamaga's avatar YashasSamaga

improve reduction logic and add fast transpose kernel

parent ee4feb4b
......@@ -21,7 +21,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
template <class T, std::size_t N>
__global__ void fill_vec(Span<T> output, T value) {
using vector_type = get_vector_type_t<T, N>;
auto output_vPtr = vector_type::get_pointer(output.data());
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
vector_type vec;
......@@ -30,6 +29,18 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
v_store(output_vPtr[i], vec);
}
}
template <class T, std::size_t N>
__global__ void copy_vec(Span<T> output, View<T> input) {
using vector_type = get_vector_type_t<T, N>;
auto input_vPtr = vector_type::get_pointer(input.data());
auto output_vPtr = vector_type::get_pointer(output.data());
for (auto i : grid_stride_range(output.size() / vector_type::size())) {
vector_type vec;
v_load(vec, input_vPtr[i]);
v_store(output_vPtr[i], vec);
}
}
}
template <class T, std::size_t N> static
......@@ -55,4 +66,28 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
template void fill(const Stream&, Span<__half>, __half);
template void fill(const Stream&, Span<float>, float);
template <class T, std::size_t N> static
void launch_vectorized_copy(const Stream& stream, Span<T> output, View<T> input) {
CV_Assert(is_fully_aligned<T>(output, N));
CV_Assert(is_fully_aligned<T>(input, N));
auto kernel = raw::copy_vec<T, N>;
auto policy = make_policy(kernel, output.size() / N, 0, stream);
launch_kernel(kernel, policy, output, input);
}
template <class T>
void copy(const Stream& stream, Span<T> output, View<T> input) {
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
launch_vectorized_copy<T, 4>(stream, output, input);
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
launch_vectorized_copy<T, 2>(stream, output, input);
} else {
launch_vectorized_copy<T, 1>(stream, output, input);
}
}
template void copy(const Stream&, Span<__half>, View<__half>);
template void copy(const Stream&, Span<float>, View<float>);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
......@@ -16,7 +16,7 @@
#include "../cuda4dnn/csl/tensor.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/kernels/fill.hpp"
#include "../cuda4dnn/kernels/fill_copy.hpp"
#include <opencv2/core.hpp>
......
......@@ -15,7 +15,7 @@
#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/kernels/fill.hpp"
#include "../cuda4dnn/kernels/fill_copy.hpp"
#include "../cuda4dnn/kernels/scale_shift.hpp"
#include <opencv2/core.hpp>
......
This diff is collapsed.
......@@ -2,8 +2,8 @@
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_HPP
#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_COPY_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_COPY_HPP
#include "../csl/stream.hpp"
#include "../csl/span.hpp"
......@@ -13,6 +13,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
template <class T>
void fill(const csl::Stream& stream, csl::Span<T> output, T value);
template <class T>
void copy(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_HPP */
#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_COPY_HPP */
......@@ -16,6 +16,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
template <class T>
void permute(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, std::vector<std::size_t> order);
template <class T>
void transpose(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, std::size_t in_width, std::size_t out_width);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PERMUTE_HPP */
......@@ -10,7 +10,7 @@
#include "../csl/stream.hpp"
#include "../csl/pointer.hpp"
#include "../kernels/fill.hpp"
#include "../kernels/fill_copy.hpp"
#include "../kernels/concat.hpp"
#include <opencv2/core.hpp>
......
......@@ -10,7 +10,7 @@
#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"
#include "../kernels/fill.hpp"
#include "../kernels/fill_copy.hpp"
#include "../kernels/concat.hpp"
#include "../kernels/padding.hpp"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment