Commit 1f2b2c52 authored Jan 05, 2020 by Alexander Alekhin
Merge pull request #16230 from YashasSamaga:cuda4dnn-fp-conversion
parents 43a91f82 01f97f15
Showing 3 changed files with 172 additions and 3 deletions (+172 / -3)
modules/dnn/src/cuda/fp_conversion.cu                 +102 -0
modules/dnn/src/cuda4dnn/kernels/fp_conversion.hpp    +18 -0
modules/dnn/src/op_cuda.hpp                           +52 -3
modules/dnn/src/cuda/fp_conversion.cu  (new file, mode 0 → 100644)
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "vector_traits.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    namespace raw {
        template <std::size_t N>
        __global__ void fp32_to_fp16(Span<__half> output, View<float> input) {
            using output_vector_type = get_vector_type_t<__half, N>;
            using input_vector_type = get_vector_type_t<float, N>;

            auto output_vPtr = output_vector_type::get_pointer(output.data());
            auto input_vPtr = input_vector_type::get_pointer(input.data());

            for (auto i : grid_stride_range(output.size() / output_vector_type::size())) {
                input_vector_type in_vec;
                v_load(in_vec, input_vPtr[i]);

                output_vector_type out_vec;
                for (int j = 0; j < output_vector_type::size(); j++)
                    out_vec.data[j] = __float2half(in_vec.data[j]);

                v_store(output_vPtr[i], out_vec);
            }
        }

        template <std::size_t N>
        __global__ void fp16_to_fp32(Span<float> output, View<__half> input) {
            using output_vector_type = get_vector_type_t<float, N>;
            using input_vector_type = get_vector_type_t<__half, N>;

            auto output_vPtr = output_vector_type::get_pointer(output.data());
            auto input_vPtr = input_vector_type::get_pointer(input.data());

            for (auto i : grid_stride_range(output.size() / output_vector_type::size())) {
                input_vector_type in_vec;
                v_load(in_vec, input_vPtr[i]);

                output_vector_type out_vec;
                for (int j = 0; j < output_vector_type::size(); j++)
                    out_vec.data[j] = __half2float(in_vec.data[j]);

                v_store(output_vPtr[i], out_vec);
            }
        }
    }

    template <std::size_t N> static
    void launch_vectorized_fp32_to_fp16(const Stream& stream, Span<__half> output, View<float> input) {
        CV_Assert(is_fully_aligned<__half>(output, N));
        CV_Assert(is_fully_aligned<float>(input, N));

        auto kernel = raw::fp32_to_fp16<N>;
        auto policy = make_policy(kernel, output.size() / N, 0, stream);
        launch_kernel(kernel, policy, output, input);
    }

    void fp32_to_fp16(const Stream& stream, Span<__half> output, View<float> input) {
        if (is_fully_aligned<__half>(output, 4) && is_fully_aligned<float>(input, 4)) {
            launch_vectorized_fp32_to_fp16<4>(stream, output, input);
        } else if (is_fully_aligned<__half>(output, 2) && is_fully_aligned<float>(input, 2)) {
            launch_vectorized_fp32_to_fp16<2>(stream, output, input);
        } else {
            launch_vectorized_fp32_to_fp16<1>(stream, output, input);
        }
    }

    template <std::size_t N> static
    void launch_vectorized_fp16_to_fp32(const Stream& stream, Span<float> output, View<__half> input) {
        CV_Assert(is_fully_aligned<float>(output, N));
        CV_Assert(is_fully_aligned<__half>(input, N));

        auto kernel = raw::fp16_to_fp32<N>;
        auto policy = make_policy(kernel, output.size() / N, 0, stream);
        launch_kernel(kernel, policy, output, input);
    }

    void fp16_to_fp32(const Stream& stream, Span<float> output, View<__half> input) {
        if (is_fully_aligned<float>(output, 4) && is_fully_aligned<__half>(input, 4)) {
            launch_vectorized_fp16_to_fp32<4>(stream, output, input);
        } else if (is_fully_aligned<float>(output, 2) && is_fully_aligned<__half>(input, 2)) {
            launch_vectorized_fp16_to_fp32<2>(stream, output, input);
        } else {
            launch_vectorized_fp16_to_fp32<1>(stream, output, input);
        }
    }

}}}} /* namespace cv::dnn::cuda4dnn::kernels */
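
The kernels above rely on OpenCV's internal CSL abstractions (Span, View, grid_stride_range, vector_traits), so they cannot be compiled in isolation. As a point of reference only, a minimal standalone sketch of the same grid-stride conversion pattern, stripped of those wrappers, might look like the following; the kernel and buffer names here are illustrative and not part of OpenCV.

    // Standalone sketch of a grid-stride fp32 -> fp16 conversion (illustrative only,
    // not OpenCV code); compiles with nvcc as-is.
    #include <cuda_runtime.h>
    #include <cuda_fp16.h>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    __global__ void convert_fp32_to_fp16(__half* output, const float* input, std::size_t n) {
        // each thread strides over the buffer, mirroring what grid_stride_range() provides
        for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
            output[i] = __float2half(input[i]);
    }

    int main() {
        const std::size_t n = 1 << 20;
        std::vector<float> host_in(n, 1.5f);

        float* d_in = nullptr;
        __half* d_out = nullptr;
        cudaMalloc(&d_in, n * sizeof(float));
        cudaMalloc(&d_out, n * sizeof(__half));
        cudaMemcpy(d_in, host_in.data(), n * sizeof(float), cudaMemcpyHostToDevice);

        convert_fp32_to_fp16<<<256, 256>>>(d_out, d_in, n);
        cudaDeviceSynchronize();

        std::vector<__half> host_out(n);
        cudaMemcpy(host_out.data(), d_out, n * sizeof(__half), cudaMemcpyDeviceToHost);
        std::printf("first converted value: %f\n", __half2float(host_out[0]));

        cudaFree(d_in);
        cudaFree(d_out);
        return 0;
    }
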
modules/dnn/src/cuda4dnn/kernels/fp_conversion.hpp  (new file, mode 0 → 100644)
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FP_CONVERSION_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FP_CONVERSION_HPP
#include "../csl/stream.hpp"
#include "../csl/span.hpp"
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    void fp32_to_fp16(const csl::Stream& stream, csl::Span<half> output, csl::View<float> input);

    void fp16_to_fp32(const csl::Stream& stream, csl::Span<float> output, csl::View<half> input);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FP_CONVERSION_HPP */
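
These entry points, implemented in fp_conversion.cu above, pick a vector width of 4, 2, or 1 depending on pointer alignment so that each thread can move several elements per memory transaction. As a standalone illustration of the 4-wide case, using plain float4 loads and packed __half2 stores instead of the CSL vector_traits helpers (the kernel name and the alignment assumption are illustrative, not OpenCV's API):

    // Illustrative 4-wide variant (not OpenCV code): one float4 load feeds two
    // packed __half2 stores; n_vec4 is the element count divided by 4 and both
    // buffers are assumed to be suitably aligned for float4/__half2 access.
    #include <cuda_runtime.h>
    #include <cuda_fp16.h>
    #include <cstddef>

    __global__ void convert_fp32_to_fp16_vec4(__half2* output, const float4* input, std::size_t n_vec4) {
        for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n_vec4; i += gridDim.x * blockDim.x) {
            float4 v = input[i];                               // vectorized 128-bit load
            output[2 * i]     = __floats2half2_rn(v.x, v.y);   // two halves per store
            output[2 * i + 1] = __floats2half2_rn(v.z, v.w);
        }
    }
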
modules/dnn/src/op_cuda.hpp
@@ -13,6 +13,7 @@
#include "cuda4dnn/csl/memory.hpp"
#include "cuda4dnn/csl/fp16.hpp"
#include "cuda4dnn/csl/workspace.hpp"
#include "cuda4dnn/kernels/fp_conversion.hpp"
#endif
#include <opencv2/dnn/shape_utils.hpp>
@@ -149,7 +150,6 @@ namespace cv { namespace dnn {
        if (temp.data != destMat.data)
            temp.copyTo(destMat);
    }

}} /* namespace cuda4dnn::csl */
/** base class for CUDA operation nodes (for all supported targets) */
@@ -219,6 +219,45 @@ namespace cv { namespace dnn {
        virtual void setStream(cuda4dnn::csl::Stream stream) noexcept = 0;
    };

    namespace cuda4dnn { namespace detail {

        template <class U>
        void convert_D2H(const cv::Mat& mat, cuda4dnn::csl::View<U> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream);

        template <> inline
        void convert_D2H<half>(const cv::Mat& mat, cuda4dnn::csl::View<half> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream) {
            if (device_temp.size() < view.size())
                device_temp.reset(view.size());
            auto temp_span = cuda4dnn::csl::Span<float>(device_temp.get(), view.size());

            cuda4dnn::kernels::fp16_to_fp32(stream, temp_span, view);
            cuda4dnn::csl::memcpy<float>(reinterpret_cast<float*>(mat.data), temp_span.data(), view.size(), stream);
        }

        template <> inline
        void convert_D2H<float>(const cv::Mat& mat, cuda4dnn::csl::View<float> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream) {
            cuda4dnn::csl::memcpy<float>(reinterpret_cast<float*>(mat.data), view.data(), view.size(), stream);
        }

        template <class U>
        void convert_H2D(cuda4dnn::csl::Span<U> span, const cv::Mat& mat, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream);

        template <> inline
        void convert_H2D<half>(cuda4dnn::csl::Span<half> span, const cv::Mat& mat, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream) {
            if (device_temp.size() < span.size())
                device_temp.reset(span.size());
            auto temp_span = cuda4dnn::csl::Span<float>(device_temp.get(), span.size());

            cuda4dnn::csl::memcpy<float>(temp_span.data(), reinterpret_cast<float*>(mat.data), span.size(), stream);
            cuda4dnn::kernels::fp32_to_fp16(stream, span, temp_span);
        }

        template <> inline
        void convert_H2D<float>(cuda4dnn::csl::Span<float> span, const cv::Mat& mat, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream) {
            cuda4dnn::csl::memcpy<float>(span.data(), reinterpret_cast<float*>(mat.data), span.size(), stream);
        }

    }} /* namespace cuda4dnn::detail */
    template <class T, int TargetID>
    class GenericCUDABackendWrapper final : public CUDABackendWrapper {
    public:
@@ -283,8 +322,12 @@ namespace cv { namespace dnn {
* We use a view to ensure that only the required region of memory is copied.
*/
                auto view = tensor_view_type(shared_block->device.get(), std::begin(shape), std::end(shape));
                cuda4dnn::csl::copyTensorToMat<T>(view, shared_block->host, shared_block->stream);

                auto& mat = shared_block->host;
                CV_Assert(mat.isContinuous());
                CV_Assert(mat.type() == CV_32F);

                cuda4dnn::detail::convert_D2H<T>(mat, view, shared_block->device_temp, shared_block->stream);
                shared_block->stream.synchronize();
            }
        }
@@ -300,7 +343,12 @@ namespace cv { namespace dnn {
            shared_block->device_dirty = false;

            auto span = tensor_span_type(shared_block->device.get(), std::begin(shape), std::end(shape));
            cuda4dnn::csl::copyMatToTensor<T>(shared_block->host, span, shared_block->stream);

            auto& mat = shared_block->host;
            CV_Assert(mat.isContinuous());
            CV_Assert(mat.type() == CV_32F);

            cuda4dnn::detail::convert_H2D<T>(span, mat, shared_block->device_temp, shared_block->stream);
        }
    }
@@ -368,6 +416,7 @@ namespace cv { namespace dnn {
            cuda4dnn::csl::MemoryLockGuard memGuard; /* keeps host memory page-locked if possible */

            cuda4dnn::csl::ManagedPtr<T> device;
            cuda4dnn::csl::ManagedPtr<float> device_temp; /* use for conversions */
            cuda4dnn::csl::Stream stream;
        };
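
The device_temp member above is the fp32 staging buffer that convert_D2H<half> and convert_H2D<half> grow on demand: fp16 tensors are converted through it because the host-side Mat always holds CV_32F data. A standalone sketch of the same host-to-device path, using raw CUDA calls in place of ManagedPtr/Span/Stream (the helper and variable names are hypothetical):

    // Standalone sketch (not OpenCV code) of the convert_H2D<half> idea: host fp32
    // data is first copied into a device fp32 scratch buffer, then converted to
    // fp16 on the device. upload_as_fp16, device_temp and temp_capacity are
    // hypothetical names.
    #include <cuda_runtime.h>
    #include <cuda_fp16.h>
    #include <cstddef>
    #include <vector>

    __global__ void fp32_to_fp16_kernel(__half* dst, const float* src, std::size_t n) {
        for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
            dst[i] = __float2half(src[i]);
    }

    void upload_as_fp16(__half* device_dst, const std::vector<float>& host_src,
                        float*& device_temp, std::size_t& temp_capacity, cudaStream_t stream) {
        const std::size_t n = host_src.size();
        if (temp_capacity < n) {                         // grow the scratch buffer on demand,
            cudaFree(device_temp);                       // playing the role of ManagedPtr<float>::reset()
            cudaMalloc(&device_temp, n * sizeof(float));
            temp_capacity = n;
        }
        // host fp32 -> device fp32 staging buffer
        cudaMemcpyAsync(device_temp, host_src.data(), n * sizeof(float),
                        cudaMemcpyHostToDevice, stream);
        // device fp32 -> device fp16, queued on the same stream
        fp32_to_fp16_kernel<<<256, 256, 0, stream>>>(device_dst, device_temp, n);
    }

The download path (convert_D2H<half>) is the mirror image: the conversion kernel runs first into the fp32 scratch buffer, then an fp32 memcpy brings the result back into the Mat.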