Commit c46d4546 authored by Fenglei's avatar Fenglei Committed by Robert Kimball

gpu replace slice optimize (#1411)

* optimize replace slice

* fix bugs

* fix bug

* optimize pad dynamic

* fix bug

* fix bug

* fix bug

* remove *

* add gpu_assignment to pass

* refactor cuda replace slice.

* fix bug

* refactor replace slice

* working version

* clang format

* use layout instead of assignment

* use layout instead of assignment in cmakelist

* update gpu_layout

* fix bugs

* resolve conflict

* GPUShape to NVShape

* using kernel args

* using kernel args

* fix bugs

* fix bugs

* fix bug, remove mkldnn.h from gpu_layout.cpp

* fix bug for pad_below

* remove cast to rep_slice

* fix bugs

* clang format

* change add_in_place_oi_pair({0, 0, false}) to add_in_place_oi_pair({0, 0, true})
parent 7d3323c9
...@@ -38,6 +38,7 @@ set(SRC ...@@ -38,6 +38,7 @@ set(SRC
gpu_tensor_view.cpp gpu_tensor_view.cpp
gpu_util.cpp gpu_util.cpp
type_info.cpp type_info.cpp
pass/gpu_layout.cpp
pass/tensor_memory_reservation.cpp pass/tensor_memory_reservation.cpp
gpu_kernel_args.cpp gpu_kernel_args.cpp
) )
......
This diff is collapsed.
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include "ngraph/op/convolution.hpp" #include "ngraph/op/convolution.hpp"
#include "ngraph/op/max_pool.hpp" #include "ngraph/op/max_pool.hpp"
#include "ngraph/op/replace_slice.hpp"
#include "ngraph/op/softmax.hpp" #include "ngraph/op/softmax.hpp"
namespace ngraph namespace ngraph
...@@ -46,6 +47,7 @@ namespace ngraph ...@@ -46,6 +47,7 @@ namespace ngraph
size_t build_primitive(const op::Softmax* node); size_t build_primitive(const op::Softmax* node);
size_t build_primitive(const op::Convolution* node); size_t build_primitive(const op::Convolution* node);
size_t build_primitive(const op::MaxPool* node); size_t build_primitive(const op::MaxPool* node);
size_t build_primitive(const op::ReplaceSlice* node, bool in_place_op);
public: public:
size_t build_pad(const std::array<std::string, 2>& dtypes, size_t build_pad(const std::array<std::string, 2>& dtypes,
...@@ -130,13 +132,6 @@ namespace ngraph ...@@ -130,13 +132,6 @@ namespace ngraph
save_elementwise); save_elementwise);
} }
size_t build_replace_slice(const std::array<std::string, 3>& dtypes,
NVShape tensor_shape,
NVShape source_shape,
NVShape lower_bounds,
NVShape upper_bounds,
NVShape slice_stride);
size_t build_broadcast(const std::array<std::string, 2>& dtypes, size_t build_broadcast(const std::array<std::string, 2>& dtypes,
NVShape result_shape, NVShape result_shape,
const std::set<size_t>& bcast_axes); const std::set<size_t>& bcast_axes);
......
...@@ -306,12 +306,11 @@ void runtime::gpu::CudaKernelBuilder::get_concat_op(codegen::CodeWriter& writer, ...@@ -306,12 +306,11 @@ void runtime::gpu::CudaKernelBuilder::get_concat_op(codegen::CodeWriter& writer,
void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op( void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op(
codegen::CodeWriter& writer, codegen::CodeWriter& writer,
const std::string& name, const std::string& name,
const std::array<std::string, 2>& data_types) GPUKernelArgs& args,
const std::array<std::string, 2>& data_types,
size_t rank)
{ {
writer << "extern \"C\" __global__ void cuda_" << name << "(" << data_types[0] << "* in, " writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
<< data_types[1] << "* out, uint32_t* input_strides, uint32_t* output_strides, "
"uint32_t* padding_below, uint32_t* "
"padding_interior, uint32_t rank, uint32_t n)\n";
writer.block_begin(); writer.block_begin();
{ {
writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n"; writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
...@@ -319,17 +318,19 @@ void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op( ...@@ -319,17 +318,19 @@ void runtime::gpu::CudaKernelBuilder::get_pad_dynamic_op(
writer.block_begin(); writer.block_begin();
{ {
writer << "uint32_t output_idx = 0;\n"; writer << "uint32_t output_idx = 0;\n";
writer << "uint32_t input_idx = tid;\n";
writer << "for(uint32_t i = 0; i < rank; i++)\n"; if (rank > 0)
writer.block_begin();
{ {
writer << "output_idx += (input_idx / input_strides[i] * padding_interior[i] + " writer << "uint32_t input_idx = tid;\n";
"padding_below[i]) " }
"* output_strides[i];\n"; for (size_t i = 0; i < rank; i++)
writer << "input_idx %= input_strides[i];\n"; {
writer << "output_idx += (input_idx / input_strides" << i << " * padding_interior"
<< i << " + "
"padding_below"
<< i << ") * output_strides" << i << ";\n";
writer << "input_idx %= input_strides" << i << ";\n";
} }
writer.block_end();
writer << "out[output_idx] = in[tid];\n"; writer << "out[output_idx] = in[tid];\n";
} }
writer.block_end(); writer.block_end();
......
...@@ -93,7 +93,9 @@ namespace ngraph ...@@ -93,7 +93,9 @@ namespace ngraph
static void get_pad_dynamic_op(codegen::CodeWriter& writer, static void get_pad_dynamic_op(codegen::CodeWriter& writer,
const std::string& name, const std::string& name,
const std::array<std::string, 2>& data_types); GPUKernelArgs& args,
const std::array<std::string, 2>& data_types,
size_t rank);
static void get_ew_collective_op(codegen::CodeWriter& writer, static void get_ew_collective_op(codegen::CodeWriter& writer,
const std::string& name, const std::string& name,
......
...@@ -1475,48 +1475,19 @@ namespace ngraph ...@@ -1475,48 +1475,19 @@ namespace ngraph
{ {
// assumes NC{d1,d2,...} format // assumes NC{d1,d2,...} format
auto rep_slice = static_cast<const ngraph::op::ReplaceSlice*>(node); auto rep_slice = static_cast<const ngraph::op::ReplaceSlice*>(node);
bool in_place_op = (args[0].get_name() == out[0].get_name());
writer.block_begin(); writer.block_begin();
{ {
auto& input_shape = args[0].get_shape(); auto& cuda_emitter =
auto& source_shape = args[1].get_shape(); external_function->get_primitive_emitter()->get_cuda_emitter();
auto& lower_bounds = rep_slice->get_lower_bounds();
auto& upper_bounds = rep_slice->get_upper_bounds();
auto& strides = rep_slice->get_strides();
Shape slice_shape(upper_bounds.size(), 0);
std::transform(upper_bounds.begin(),
upper_bounds.end(),
lower_bounds.begin(),
slice_shape.begin(),
std::minus<size_t>());
std::transform(slice_shape.begin(),
slice_shape.end(),
strides.begin(),
slice_shape.begin(),
std::divides<size_t>());
// replace the input with the source if the slice shape and input shape are equal
if (input_shape == slice_shape)
{
kernel::emit_memcpyDtD(writer, out[0], args[1]);
}
else
{
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto replace_slice_index = cuda_emitter->build_replace_slice( auto index = cuda_emitter->build_primitive(rep_slice, in_place_op);
{{args[0].get_type(), args[1].get_type(), out[0].get_type()}},
input_shape,
source_shape,
lower_bounds,
upper_bounds,
rep_slice->get_strides());
writer << "gpu::invoke_primitive(ctx, " << replace_slice_index << ", "; writer << "gpu::invoke_primitive(ctx, " << index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << ", " writer << "std::vector<void*>{" << args[0].get_name() << ", "
<< args[1].get_name() << "}.data(), "; << args[1].get_name() << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()"; writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n"; writer << ");\n";
}
} }
writer.block_end(); writer.block_end();
} }
......
...@@ -105,6 +105,7 @@ ...@@ -105,6 +105,7 @@
#include "ngraph/runtime/gpu/gpu_external_function.hpp" #include "ngraph/runtime/gpu/gpu_external_function.hpp"
#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp" #include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_runtime_context.hpp" #include "ngraph/runtime/gpu/gpu_runtime_context.hpp"
#include "ngraph/runtime/gpu/pass/gpu_layout.hpp"
#include "ngraph/runtime/gpu/pass/tensor_memory_reservation.hpp" #include "ngraph/runtime/gpu/pass/tensor_memory_reservation.hpp"
using namespace std; using namespace std;
...@@ -647,6 +648,7 @@ void runtime::gpu::GPU_ExternalFunction::compile() ...@@ -647,6 +648,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
m_pass_manager m_pass_manager
.register_pass<ngraph::pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>(); .register_pass<ngraph::pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>();
m_pass_manager.register_pass<runtime::gpu::pass::GPULayout>(this);
m_pass_manager.register_pass<ngraph::pass::Liveness>(); m_pass_manager.register_pass<ngraph::pass::Liveness>();
m_pass_manager.register_pass<ngraph::pass::MemoryLayout>(s_memory_pool_alignment); m_pass_manager.register_pass<ngraph::pass::MemoryLayout>(s_memory_pool_alignment);
......
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/op/util/op_annotations.hpp"
namespace ngraph
{
    namespace runtime
    {
        namespace gpu
        {
            /// \brief Annotations added to graph ops by GPU backend passes
            ///        (e.g. in-place input/output buffer reuse pairs recorded
            ///        via the base OpAnnotations interface).
            class GPUOpAnnotations : public ngraph::op::util::OpAnnotations
            {
            public:
                // `= default` instead of `{}`: identical behavior, but lets the
                // compiler treat the constructor as trivial where possible
                // (clang-tidy modernize-use-equals-default).
                GPUOpAnnotations() = default;
            };
        }
    }
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <algorithm>
#include <memory>
#include <string>
#include <typeindex>
#include <typeinfo>
#include "gpu_layout.hpp"
#include "ngraph/op/replace_slice.hpp"
#include "ngraph/runtime/gpu/gpu_op_annotations.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
    namespace runtime
    {
        namespace gpu
        {
            namespace pass
            {
                /// \brief Layout handler for ReplaceSlice.
                ///
                /// Annotates the op so that output 0 may reuse input 0's buffer
                /// (an in-place, destructive pass-through). The original code
                /// duplicated the add_in_place_oi_pair call in both branches of
                /// the if/else; here annotations are created once if missing and
                /// the pair is added in a single place.
                template <>
                void GPULayout::LAYOUT_DECL(ngraph::op::ReplaceSlice)
                {
                    auto rep_slice = static_cast<ngraph::op::ReplaceSlice*>(node.get());
                    auto op_annotations = rep_slice->get_op_annotations();
                    if (!op_annotations)
                    {
                        op_annotations =
                            std::make_shared<ngraph::runtime::gpu::GPUOpAnnotations>();
                        rep_slice->set_op_annotations(op_annotations);
                    }
                    // pass-through: output 0 can be computed in place over input 0
                    op_annotations->add_in_place_oi_pair({0, 0, true});
                }
            }
        }
    }
}
#define TI(x) type_index(typeid(x))
static const runtime::gpu::pass::LayoutOpMap s_dispatcher{
{TI(ngraph::op::ReplaceSlice),
&runtime::gpu::pass::GPULayout::layout<ngraph::op::ReplaceSlice>},
};
bool runtime::gpu::pass::GPULayout::run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes)
{
for (const auto& node : nodes)
{
auto& n = *node;
auto handler = s_dispatcher.find(TI(n));
if (handler != s_dispatcher.end())
{
handler->second(m_external_function, node);
}
}
return false;
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/pass/pass.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
// Expands to the shared signature of a per-op layout handler, so that
// specializations in gpu_layout.cpp read: template <> void GPULayout::LAYOUT_DECL(Op)
#define LAYOUT_DECL(op_type)                                                                       \
    layout<op_type>(ngraph::runtime::gpu::GPU_ExternalFunction * external_function,                \
                    std::shared_ptr<ngraph::Node> node)

namespace ngraph
{
    namespace runtime
    {
        namespace gpu
        {
            namespace pass
            {
                // Signature shared by all per-op layout handlers.
                using LayoutFunction =
                    std::function<void(GPU_ExternalFunction*, std::shared_ptr<ngraph::Node>)>;

                // Maps an op's type_index to the handler that processes it.
                using LayoutOpMap = std::unordered_map<std::type_index, LayoutFunction>;

                /// \brief Call-graph pass that attaches GPU-specific layout
                ///        information/annotations to supported ops.
                class GPULayout : public ngraph::pass::CallGraphPass
                {
                public:
                    GPULayout(GPU_ExternalFunction* external_function)
                        : m_external_function(external_function)
                    {
                    }

                    /// Dispatches each node to its registered handler; returns false
                    /// (the pass does not modify the graph structure).
                    virtual bool
                        run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes) override;

                    /// Per-op handler; specialized (e.g. for ReplaceSlice) in gpu_layout.cpp.
                    template <typename OP>
                    static void
                        layout(ngraph::runtime::gpu::GPU_ExternalFunction* external_function,
                               std::shared_ptr<ngraph::Node> node);

                private:
                    GPU_ExternalFunction* m_external_function;
                };
            }
        }
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment