Commit e2e7042a authored by shssf's avatar shssf Committed by Scott Cyphers

IntelGPU backend: Pad operation (#1267)

* IntelGPU backend: Pad operation

* PR1267. Comments addressed
parent 1c2e5b7a
......@@ -20,6 +20,8 @@ set(SRC
intelgpu_layout.cpp
intelgpu_op_batchnorm.cpp
intelgpu_op_broadcast.cpp
intelgpu_op_custom_kernels.cpp
code_writer.cpp
)
if (NGRAPH_INTELGPU_ENABLE)
......
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "code_writer.hpp"
using namespace std;
using namespace ngraph;
codegen::CodeWriter::CodeWriter()
: indent(0)
, m_pending_indent(true)
, m_temporary_name_count(0)
{
}
// Return everything written to this writer so far as one string.
string codegen::CodeWriter::get_code() const
{
    string code = m_ss.str();
    return code;
}
// Append a string; routes through operator<< so the pending-indent
// bookkeeping stays in exactly one place.
void codegen::CodeWriter::operator+=(const std::string& s)
{
    CodeWriter& self = *this;
    self << s;
}
// Produce a unique identifier by appending a monotonically increasing
// counter to the given prefix (default "tempvar" via the declaration).
std::string codegen::CodeWriter::generate_temporary_name(std::string prefix)
{
    // std::to_string replaces the original stringstream round-trip for a
    // single integer conversion — simpler and allocation-light.
    return prefix + std::to_string(m_temporary_name_count++);
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include <sstream>
#include <string>
namespace ngraph
{
    namespace codegen
    {
        class CodeWriter;
    }
}

// Accumulates generated source text while tracking an indentation level
// that is applied at the start of every line streamed through operator<<.
class ngraph::codegen::CodeWriter
{
public:
    CodeWriter();

    // Full text written so far.
    std::string get_code() const;

    // Append a string, honoring any pending indentation.
    void operator+=(const std::string&);

    // Current indentation level; each level contributes four spaces.
    size_t indent;

    // Stream any value into the writer. The value is formatted first,
    // then copied character by character so the indentation prefix can
    // be inserted at the start of each new line.
    template <typename T>
    friend CodeWriter& operator<<(CodeWriter& out, const T& obj)
    {
        std::stringstream formatted;
        formatted << obj;
        for (const char ch : formatted.str())
        {
            if (ch != '\n' && out.m_pending_indent)
            {
                // First visible character of a line: emit the indent.
                out.m_pending_indent = false;
                out.m_ss << std::string(out.indent * 4, ' ');
            }
            if (ch == '\n')
            {
                // Defer indentation until the next non-newline character,
                // so blank lines carry no trailing spaces.
                out.m_pending_indent = true;
            }
            out.m_ss << ch;
        }
        return out;
    }

    // Return a fresh identifier such as "tempvar0", "tempvar1", ...
    std::string generate_temporary_name(std::string prefix = "tempvar");

    // Emit "{\n" and deepen the indentation by one level.
    void block_begin()
    {
        *this << "{\n";
        ++indent;
    }

    // Shallow the indentation by one level and emit "}\n".
    void block_end()
    {
        --indent;
        *this << "}\n";
    }

private:
    std::stringstream m_ss;
    bool m_pending_indent;
    size_t m_temporary_name_count;
};
......@@ -24,6 +24,7 @@
#include <CPP/permute.hpp>
#include <CPP/pooling.hpp>
#include <CPP/reorder.hpp>
#include <CPP/reshape.hpp>
#include <CPP/scale.hpp>
#include <CPP/topology.hpp>
......@@ -31,6 +32,7 @@
#include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_batchnorm.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_broadcast.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_tensor_view.hpp"
#include "ngraph/node.hpp"
......@@ -38,9 +40,12 @@
#include "ngraph/op/broadcast.hpp"
#include "ngraph/op/constant.hpp"
#include "ngraph/op/convolution.hpp"
#include "ngraph/op/dot.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/max_pool.hpp"
#include "ngraph/op/pad.hpp"
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/sum.hpp"
#include "ngraph/util.hpp"
using namespace std;
......@@ -327,6 +332,33 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
{
do_eltwise_operation(topology, op, cldnn::eltwise_mode::sub);
}
else if ("Pad" == op->description())
{
arguments_check(op, 2, 1);
const string& input_name = op->get_inputs().at(0).get_tensor().get_name();
const Shape& input_shape = op->get_inputs().at(0).get_shape();
const string& scalar_name = op->get_inputs().at(1).get_tensor().get_name();
const string& output_name = op->get_outputs().begin()->get_tensor().get_name();
const Shape& output_shape = op->get_outputs().begin()->get_shape();
const element::Type& output_type =
op->get_outputs().begin()->get_tensor().get_element_type();
const shared_ptr<op::Pad> pad = static_pointer_cast<op::Pad>(op);
const Shape& pad_above = pad->get_padding_above();
const Shape& pad_below = pad->get_padding_below();
const Shape& pad_interior = pad->get_padding_interior();
do_pad_kernel(topology,
input_name,
input_shape,
scalar_name,
output_name,
output_shape,
output_type,
pad_below,
pad_interior);
}
else if ("BatchNorm" == op->description())
{
const shared_ptr<op::BatchNorm> batch_norm = static_pointer_cast<op::BatchNorm>(op);
......
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <CPP/custom_gpu_primitive.hpp>
#include <CPP/reshape.hpp>
#include "ngraph/runtime/intelgpu/code_writer.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
// Kernel-argument map shared by custom kernels that read two input
// buffers and write one output buffer, bound positionally.
static vector<cldnn_arg> parameters_2inp_1out = {{arg_input, 0}, {arg_input, 1}, {arg_output, 0}};
// Render a shape as C-style array dimensions, e.g. {2, 3} -> "[2][3]".
static string array_dims(const Shape& dims)
{
    string result;
    for (const size_t extent : dims)
    {
        result.append("[").append(to_string(extent)).append("]");
    }
    return result;
}
// Render subscript expressions "[i0][i1]..." — one per dimension —
// omitting any dimension whose axis index appears in 'axis'.
static string access_dims(const Shape& dims, const AxisSet& axis = {})
{
    string result;
    for (size_t idx = 0; idx < dims.size(); ++idx)
    {
        if (axis.find(idx) == axis.end())
        {
            result += "[i" + to_string(idx) + "]";
        }
    }
    return result;
}
// Render strided subscript expressions for writing into a padded tensor:
// input index i<k> maps to output position
//   i<k> * (pad_interior[k] + 1) + pad_below[k].
static string
    access_dims_strided(const Shape& dims, const Shape& pad_below, const Shape& pad_interior)
{
    string result;
    for (size_t idx = 0; idx < dims.size(); ++idx)
    {
        result += "[i" + to_string(idx) + " * (" + to_string(pad_interior.at(idx)) + " + 1) + " +
                  to_string(pad_below.at(idx)) + "]";
    }
    return result;
}
// Generate and register a custom OpenCL kernel implementing nGraph Pad:
// the output tensor is first filled entirely with the pad scalar, then
// the input tensor is copied in at positions offset by pad_below with
// interior spacing pad_interior.
//
// NOTE(review): pad_above is not a parameter here — presumably the
// caller-provided output_shape already accounts for it, so the scalar
// fill covers those elements; confirm against the Pad op definition.
void runtime::intelgpu::do_pad_kernel(cldnn::topology& topology,
                                      const string& input_name,
                                      const Shape& input_shape,
                                      const string& scalar_name,
                                      const string& output_name,
                                      const Shape& output_shape,
                                      const element::Type& output_type,
                                      const Shape& pad_below,
                                      const Shape& pad_interior)
{
    const string entry_point_name = "op_pad_kernel";
    codegen::CodeWriter writer;

    // Kernel signature: input and output are declared as typed
    // multidimensional arrays so the body can use plain subscripts.
    writer << "__kernel void " << entry_point_name << "(const __global float input"
           << array_dims(input_shape) << ", const __global float scalar[1], __global float output"
           << array_dims(output_shape) << ")\n";
    writer.block_begin();
    {
        // Nested loops filling the whole output tensor with the scalar.
        size_t var_idx = 0;
        for (auto i = output_shape.cbegin(); i != output_shape.cend(); ++i, ++var_idx)
        {
            writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << *i << "; ++i"
                   << var_idx << ")\n";
            writer.block_begin();
        }
        writer << "output" << access_dims(output_shape) << " = scalar[0];\n";

        // Close the fill loops (one bracket per output dimension).
        for (size_t idx = 0; idx < output_shape.size(); ++idx)
        {
            writer.block_end();
        }

        // Nested loops copying the input tensor into the output with
        // padding; offsets follow "pad_below" and "pad_interior" per the
        // nGraph Pad documentation.
        var_idx = 0;
        for (auto i = input_shape.cbegin(); i != input_shape.cend(); ++i, ++var_idx)
        {
            writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << *i << "; ++i"
                   << var_idx << ")\n";
            writer.block_begin();
        }
        writer << "output" << access_dims_strided(input_shape, pad_below, pad_interior)
               << " = input" << access_dims(input_shape) << ";\n";

        // Close the copy loops (one bracket per input dimension).
        for (size_t idx = 0; idx < input_shape.size(); ++idx)
        {
            writer.block_end();
        }
    } // End of function bracket
    writer.block_end();

    // Register the generated kernel: it consumes the input tensor plus
    // the pad scalar and produces the single output tensor.
    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
    const cldnn::custom_gpu_primitive op_pad(output_name,
                                             {input_name, scalar_name},
                                             {writer.get_code()},
                                             entry_point_name,
                                             parameters_2inp_1out,
                                             "",
                                             layout,
                                             {1});
    topology.add(op_pad);
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include <CPP/topology.hpp>
#include "ngraph/axis_set.hpp"
#include "ngraph/shape.hpp"
#include "ngraph/type/element_type.hpp"
namespace ngraph
{
    namespace runtime
    {
        namespace intelgpu
        {
            // Generates a custom OpenCL kernel implementing the Pad
            // operation and adds it to 'topology': the output tensor is
            // filled with the scalar named 'scalar_name', then the input
            // tensor is copied in at offsets 'pad_below' with interior
            // spacing 'pad_interior'.
            void do_pad_kernel(cldnn::topology& topology,
                               const std::string& input_name,
                               const Shape& input_shape,
                               const std::string& scalar_name,
                               const std::string& output_name,
                               const Shape& output_shape,
                               const element::Type& output_type,
                               const Shape& pad_below,
                               const Shape& pad_interior);
        }
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment