Commit c46d4546 authored by Fenglei's avatar Fenglei Committed by Robert Kimball

gpu replace slice optimize (#1411)

* optimize replace slice

* fix bugs

* fix bug

* optimize pad dynamic

* fix bug

* fix bug

* fix bug

* remove *

* add gpu_assignment to pass

* refactor cuda replace slice.

* fix bug

* refactor replace slice

* working version

* clang format

* use layout instead of assignment

* use layout instead of assignment in CMakeLists

* update gpu_layout

* fix bugs

* resolve conflict

* GPUShape to NVShape

* using kernel args

* using kernel args

* fix bugs

* fix bugs

* fix bug, remove mkldnn.h from gpu_layout.cpp

* fix bug for pad_below

* remove cast to rep_slice

* fix bugs

* clang format

* change add_in_place_oi_pair({0, 0, false}) to add_in_place_oi_pair({0, 0, true});
parent 7d3323c9
...@@ -38,6 +38,7 @@ set(SRC ...@@ -38,6 +38,7 @@ set(SRC
gpu_tensor_view.cpp gpu_tensor_view.cpp
gpu_util.cpp gpu_util.cpp
type_info.cpp type_info.cpp
pass/gpu_layout.cpp
pass/tensor_memory_reservation.cpp pass/tensor_memory_reservation.cpp
gpu_kernel_args.cpp gpu_kernel_args.cpp
) )
......
This diff is collapsed.
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include "ngraph/op/convolution.hpp" #include "ngraph/op/convolution.hpp"
#include "ngraph/op/max_pool.hpp" #include "ngraph/op/max_pool.hpp"
#include "ngraph/op/replace_slice.hpp"
#include "ngraph/op/softmax.hpp" #include "ngraph/op/softmax.hpp"
namespace ngraph namespace ngraph
...@@ -46,6 +47,7 @@ namespace ngraph ...@@ -46,6 +47,7 @@ namespace ngraph
size_t build_primitive(const op::Softmax* node); size_t build_primitive(const op::Softmax* node);
size_t build_primitive(const op::Convolution* node); size_t build_primitive(const op::Convolution* node);
size_t build_primitive(const op::MaxPool* node); size_t build_primitive(const op::MaxPool* node);
size_t build_primitive(const op::ReplaceSlice* node, bool in_place_op);
public: public:
size_t build_pad(const std::array<std::string, 2>& dtypes, size_t build_pad(const std::array<std::string, 2>& dtypes,
...@@ -130,13 +132,6 @@ namespace ngraph ...@@ -130,13 +132,6 @@ namespace ngraph
save_elementwise); save_elementwise);
} }
size_t build_replace_slice(const std::array<std::string, 3>& dtypes,
NVShape tensor_shape,
NVShape source_shape,
NVShape lower_bounds,
NVShape upper_bounds,
NVShape slice_stride);
size_t build_broadcast(const std::array<std::string, 2>& dtypes, size_t build_broadcast(const std::array<std::string, 2>& dtypes,
NVShape result_shape, NVShape result_shape,
const std::set<size_t>& bcast_axes); const std::set<size_t>& bcast_axes);
......
...@@ -306,12 +306,11 @@ void runtime::gpu::CudaKernelBuilder::get_concat_op(codegen::CodeWriter& writer, ...@@ -306,12 +306,11 @@ void runtime::gpu::CudaKernelBuilder::get_concat_op(codegen::CodeWriter& writer,
void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op( void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op(
codegen::CodeWriter& writer, codegen::CodeWriter& writer,
const std::string& name, const std::string& name,
const std::array<std::string, 2>& data_types) GPUKernelArgs& args,
const std::array<std::string, 2>& data_types,
size_t rank)
{ {
writer << "extern \"C\" __global__ void cuda_" << name << "(" << data_types[0] << "* in, " writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
<< data_types[1] << "* out, uint32_t* input_strides, uint32_t* output_strides, "
"uint32_t* padding_below, uint32_t* "
"padding_interior, uint32_t rank, uint32_t n)\n";
writer.block_begin(); writer.block_begin();
{ {
writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n"; writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
...@@ -319,17 +318,19 @@ void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op( ...@@ -319,17 +318,19 @@ void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op(
writer.block_begin(); writer.block_begin();
{ {
writer << "uint32_t output_idx = 0;\n"; writer << "uint32_t output_idx = 0;\n";
writer << "uint32_t input_idx = tid;\n";
writer << "for(uint32_t i = 0; i < rank; i++)\n"; if (rank > 0)
writer.block_begin();
{ {
writer << "output_idx += (input_idx / input_strides[i] * padding_interior[i] + " writer << "uint32_t input_idx = tid;\n";
"padding_below[i]) " }
"* output_strides[i];\n"; for (size_t i = 0; i < rank; i++)
writer << "input_idx %= input_strides[i];\n"; {
writer << "output_idx += (input_idx / input_strides" << i << " * padding_interior"
<< i << " + "
"padding_below"
<< i << ") * output_strides" << i << ";\n";
writer << "input_idx %= input_strides" << i << ";\n";
} }
writer.block_end();
writer << "out[output_idx] = in[tid];\n"; writer << "out[output_idx] = in[tid];\n";
} }
writer.block_end(); writer.block_end();
......
...@@ -93,7 +93,9 @@ namespace ngraph ...@@ -93,7 +93,9 @@ namespace ngraph
static void get_pad_dynamic_op(codegen::CodeWriter& writer, static void get_pad_dynamic_op(codegen::CodeWriter& writer,
const std::string& name, const std::string& name,
const std::array<std::string, 2>& data_types); GPUKernelArgs& args,
const std::array<std::string, 2>& data_types,
size_t rank);
static void get_ew_collective_op(codegen::CodeWriter& writer, static void get_ew_collective_op(codegen::CodeWriter& writer,
const std::string& name, const std::string& name,
......
...@@ -1475,49 +1475,20 @@ namespace ngraph ...@@ -1475,49 +1475,20 @@ namespace ngraph
{ {
// assumes NC{d1,d2,...} format // assumes NC{d1,d2,...} format
auto rep_slice = static_cast<const ngraph::op::ReplaceSlice*>(node); auto rep_slice = static_cast<const ngraph::op::ReplaceSlice*>(node);
bool in_place_op = (args[0].get_name() == out[0].get_name());
writer.block_begin(); writer.block_begin();
{
auto& input_shape = args[0].get_shape();
auto& source_shape = args[1].get_shape();
auto& lower_bounds = rep_slice->get_lower_bounds();
auto& upper_bounds = rep_slice->get_upper_bounds();
auto& strides = rep_slice->get_strides();
Shape slice_shape(upper_bounds.size(), 0);
std::transform(upper_bounds.begin(),
upper_bounds.end(),
lower_bounds.begin(),
slice_shape.begin(),
std::minus<size_t>());
std::transform(slice_shape.begin(),
slice_shape.end(),
strides.begin(),
slice_shape.begin(),
std::divides<size_t>());
// replace the input with the source if the slice shape and input shape are equal
if (input_shape == slice_shape)
{
kernel::emit_memcpyDtD(writer, out[0], args[1]);
}
else
{ {
auto& cuda_emitter = auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter(); external_function->get_primitive_emitter()->get_cuda_emitter();
auto replace_slice_index = cuda_emitter->build_replace_slice( auto index = cuda_emitter->build_primitive(rep_slice, in_place_op);
{{args[0].get_type(), args[1].get_type(), out[0].get_type()}},
input_shape,
source_shape,
lower_bounds,
upper_bounds,
rep_slice->get_strides());
writer << "gpu::invoke_primitive(ctx, " << replace_slice_index << ", "; writer << "gpu::invoke_primitive(ctx, " << index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << ", " writer << "std::vector<void*>{" << args[0].get_name() << ", "
<< args[1].get_name() << "}.data(), "; << args[1].get_name() << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()"; writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n"; writer << ");\n";
} }
}
writer.block_end(); writer.block_end();
} }
......
...@@ -105,6 +105,7 @@ ...@@ -105,6 +105,7 @@
#include "ngraph/runtime/gpu/gpu_external_function.hpp" #include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp" #include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp" #include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
#include "ngraph/runtime/gpu/pass/gpu_layout.hpp"
#include "ngraph/runtime/gpu/pass/tensor_memory_reservation.hpp" #include "ngraph/runtime/gpu/pass/tensor_memory_reservation.hpp"
using namespace std; using namespace std;
...@@ -647,6 +648,7 @@ void runtime::gpu::GPU_ExternalFunction::compile() ...@@ -647,6 +648,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
m_pass_manager m_pass_manager
.register_pass<ngraph::pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>(); .register_pass<ngraph::pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>();
m_pass_manager.register_pass<runtime::gpu::pass::GPULayout>(this);
m_pass_manager.register_pass<ngraph::pass::Liveness>(); m_pass_manager.register_pass<ngraph::pass::Liveness>();
m_pass_manager.register_pass<ngraph::pass::MemoryLayout>(s_memory_pool_alignment); m_pass_manager.register_pass<ngraph::pass::MemoryLayout>(s_memory_pool_alignment);
......
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/op/util/op_annotations.hpp"
namespace ngraph
{
    namespace runtime
    {
        namespace gpu
        {
            /// \brief Annotation object that GPU backend passes attach to graph ops.
            ///
            /// Adds no state or behaviour of its own; everything is inherited from
            /// ngraph::op::util::OpAnnotations.
            class GPUOpAnnotations : public ngraph::op::util::OpAnnotations
            {
            public:
                /// \brief Creates an annotation object with nothing GPU-specific set.
                GPUOpAnnotations() = default;
            };
        }
    }
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <algorithm>
#include <memory>
#include <string>
#include <typeindex>
#include <typeinfo>
#include "gpu_layout.hpp"
#include "ngraph/op/replace_slice.hpp"
#include "ngraph/runtime/gpu/gpu_op_annotations.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
    namespace runtime
    {
        namespace gpu
        {
            namespace pass
            {
                /// \brief Layout handler for ReplaceSlice nodes.
                ///
                /// Registers the in-place pair {0, 0, true} on the node's op annotations.
                /// If the node has no annotations yet, a GPUOpAnnotations object is created,
                /// filled in, and then attached to the node.
                ///
                /// NOTE(review): the triple is presumably (output index, input index,
                /// destructive flag) -- confirm against OpAnnotations.
                ///
                /// \param external_function Not used by this handler.
                /// \param node Node to annotate; the dispatcher only routes ReplaceSlice
                ///             nodes here, which is what makes the static_cast valid.
                template <>
                void GPULayout::LAYOUT_DECL(ngraph::op::ReplaceSlice)
                {
                    auto replace_slice = static_cast<ngraph::op::ReplaceSlice*>(node.get());
                    auto annotations = replace_slice->get_op_annotations();

                    // Remember whether the node already carried annotations so that a
                    // freshly created object is attached only after it has been populated.
                    const bool had_annotations = static_cast<bool>(annotations);
                    if (!had_annotations)
                    {
                        annotations =
                            std::make_shared<ngraph::runtime::gpu::GPUOpAnnotations>();
                    }

                    // pass-through
                    annotations->add_in_place_oi_pair({0, 0, true});

                    if (!had_annotations)
                    {
                        replace_slice->set_op_annotations(annotations);
                    }
                }
            }
        }
    }
}
// Builds a std::type_index lookup key from a type name or an expression via typeid.
#define TI(x) type_index(typeid(x))
// Dispatch table mapping an op's type to the GPULayout handler for that op.
// ReplaceSlice is the only op with a handler registered here.
static const runtime::gpu::pass::LayoutOpMap s_dispatcher{
{TI(ngraph::op::ReplaceSlice),
&runtime::gpu::pass::GPULayout::layout<ngraph::op::ReplaceSlice>},
};
bool runtime::gpu::pass::GPULayout::run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes)
{
for (const auto& node : nodes)
{
auto& n = *node;
auto handler = s_dispatcher.find(TI(n));
if (handler != s_dispatcher.end())
{
handler->second(m_external_function, node);
}
}
return false;
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/pass/pass.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
// Expands to the name and parameter list of GPULayout::layout<op_type>, so that a
// handler specialization can be written as
//     template <> void GPULayout::LAYOUT_DECL(SomeOp) { ... }
// with `external_function` and `node` available inside the body.
#define LAYOUT_DECL(op_type) \
layout<op_type>(ngraph::runtime::gpu::GPU_ExternalFunction * external_function, \
std::shared_ptr<ngraph::Node> node)
namespace ngraph
{
namespace runtime
{
namespace gpu
{
namespace pass
{
using LayoutFunction =
std::function<void(GPU_ExternalFunction*, std::shared_ptr<ngraph::Node>)>;
using LayoutOpMap = std::unordered_map<std::type_index, LayoutFunction>;
class GPULayout : public ngraph::pass::CallGraphPass
{
public:
GPULayout(GPU_ExternalFunction* external_function)
: m_external_function(external_function)
{
}
virtual bool
run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes) override;
template <typename OP>
static void
layout(ngraph::runtime::gpu::GPU_ExternalFunction* external_function,
std::shared_ptr<ngraph::Node> node);
private:
GPU_ExternalFunction* m_external_function;
};
}
}
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment