IntelGPUBackend: const, div, maxpool and max operations (#1234)

* IntelGPUBackend: const, div, maxpool and max operations * IntelGPUBackend: negative, abs, relu, sqrt, tanh and subtract operations * Update intelgpu_backend.cpp

IntelGPUBackend: const, div, maxpool and max operations (#1234)
* IntelGPUBackend: const, div, maxpool and max operations * IntelGPUBackend: negative, abs, relu, sqrt, tanh and subtract operations * Update intelgpu_backend.cpp
8908c9df · shssf · Robert Kimball · e2255fbd · 8908c9df · 8908c9df
Commit 8908c9df authored Jul 19, 2018 by shssf Committed by Robert Kimball Jul 19, 2018
7 changed files
--- a/src/ngraph/runtime/intelgpu/intelgpu_backend.cpp
+++ b/src/ngraph/runtime/intelgpu/intelgpu_backend.cpp
@@ -14,23 +14,39 @@
 * limitations under the License.
 *******************************************************************************/
+#include <CPP/activation.hpp>
+#include <CPP/batch_norm.hpp>
 #include <CPP/concatenation.hpp>
+#include <CPP/convolution.hpp>
+#include <CPP/data.hpp>
 #include <CPP/eltwise.hpp>
 #include <CPP/input_layout.hpp>
 #include <CPP/layout.hpp>
-#include <CPP/network.hpp>
+#include <CPP/permute.hpp>
+#include <CPP/pooling.hpp>
 #include <CPP/reorder.hpp>
+#include <CPP/reshape.hpp>
 #include <CPP/scale.hpp>
 #include <CPP/topology.hpp>
 #include "ngraph/runtime/intelgpu/intelgpu_backend.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
+#include "ngraph/runtime/intelgpu/intelgpu_op_batchnorm.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_tensor_view.hpp"
+#include "ngraph/node.hpp"
+#include "ngraph/op/batch_norm.hpp"
+#include "ngraph/op/broadcast.hpp"
+#include "ngraph/op/constant.hpp"
+#include "ngraph/op/convolution.hpp"
+#include "ngraph/op/get_output_element.hpp"
+#include "ngraph/op/max_pool.hpp"
+#include "ngraph/op/reshape.hpp"
 using namespace std;
 using namespace ngraph;
-void arguments_check(const shared_ptr<Node>& op, size_t input, size_t output)
+static void arguments_check(const shared_ptr<Node>& op, size_t input, size_t output)
 {
    if (op->get_input_size() != input || op->get_output_size() != output)
    {
@@ -42,25 +58,39 @@ void arguments_check(const shared_ptr<Node>& op, size_t input, size_t output)
    }
 }
-void do_eltwise_operation(cldnn::topology& topology,
+static void do_eltwise_operation(cldnn::topology& topology,
-                          const shared_ptr<Node>& op,
+                                 const shared_ptr<Node>& op,
-                          cldnn::eltwise_mode mode)
+                                 cldnn::eltwise_mode mode)
 {
    arguments_check(op, 2, 1);
-    std::vector<cldnn::primitive_id> op_add_inputs;
+    vector<cldnn::primitive_id> op_add_inputs;
    for (const descriptor::Input& op_input : op->get_inputs())
    {
-        const std::string& element_name = op_input.get_tensor().get_name();
+        const string& element_name = op_input.get_tensor().get_name();
        op_add_inputs.push_back(element_name);
    }
-    const std::string& output_name = op->get_outputs().begin()->get_tensor().get_name();
+    const string& output_name = op->get_outputs().begin()->get_tensor().get_name();
    const cldnn::eltwise op_add(output_name, op_add_inputs, mode);
    topology.add(op_add);
 }
+// Emits one cldnn::activation primitive for a single-input, single-output node.
+// The primitive is named after the node's first output tensor and reads from the
+// node's first input tensor. `mode` selects the activation function; `param`
+// carries its two additional parameters and defaults to {0.f, 0.f}.
+// arguments_check() is called first to verify the node has exactly 1 input and 1 output.
+static void do_unary_operation(cldnn::topology& topology,
+                               const shared_ptr<Node>& op,
+                               cldnn_activation_func mode,
+                               const cldnn_activation_additional_params& param = {0.f, 0.f})
+{
+    arguments_check(op, 1, 1);
+
+    const descriptor::Tensor& source_tensor = op->get_inputs().begin()->get_tensor();
+    const descriptor::Tensor& result_tensor = op->get_outputs().begin()->get_tensor();
+
+    const cldnn::activation activation_primitive(
+        result_tensor.get_name(), source_tensor.get_name(), mode, param);
+    topology.add(activation_primitive);
+}
 extern "C" const char* get_ngraph_version_string()
 {
    return NGRAPH_VERSION;
@@ -78,7 +108,7 @@ extern "C" void delete_backend(runtime::Backend* backend)
 runtime::intelgpu::IntelGPUBackend::IntelGPUBackend()
 {
-    ocl_engine = std::make_shared<cldnn::engine>();
+    ocl_engine = make_shared<cldnn::engine>();
 }
 shared_ptr<runtime::TensorView>
@@ -111,7 +141,7 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
        {
            arguments_check(op, 0, 1);
-            const std::string& element_name = op->get_output_tensor_view()->get_tensor().get_name();
+            const string& element_name = op->get_output_tensor_view()->get_tensor().get_name();
            const cldnn::layout element_layout =
                IntelGPULayout::create_cldnn_layout(op->get_element_type(), op->get_shape());
@@ -124,8 +154,8 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
            const descriptor::Tensor& input_tensor = op->get_inputs().begin()->get_tensor();
            const descriptor::Tensor& output_tensor = op->get_outputs().begin()->get_tensor();
-            const std::string& input_name = input_tensor.get_name();
+            const string& input_name = input_tensor.get_name();
-            const std::string& output_name = output_tensor.get_name();
+            const string& output_name = output_tensor.get_name();
            const cldnn::layout input_layout = IntelGPULayout::create_cldnn_layout(
                input_tensor.get_element_type(), op->get_inputs().begin()->get_shape());
@@ -140,6 +170,122 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
        {
            do_eltwise_operation(topology, op, cldnn::eltwise_mode::prod);
        }
+        else if ("Divide" == op->description())
+        {
+            do_eltwise_operation(topology, op, cldnn::eltwise_mode::div);
+        }
+        else if ("Maximum" == op->description())
+        {
+            do_eltwise_operation(topology, op, cldnn::eltwise_mode::max);
+        }
+        else if ("Constant" == op->description())
+        {
+            arguments_check(op, 0, 1);
+            auto input_it = op->get_outputs().cbegin();
+            const descriptor::Tensor& output_tensor = input_it->get_tensor();
+            const string& output_name = output_tensor.get_name();
+            const shared_ptr<op::Constant> constant_inst = static_pointer_cast<op::Constant>(op);
+            void* memory_pointer = const_cast<void*>(constant_inst->get_data_ptr());
+            const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(
+                output_tensor.get_element_type(), input_it->get_shape());
+            const cldnn::memory mem(
+                cldnn::memory::attach<void>(layout, memory_pointer, layout.bytes_count()));
+            const cldnn::data op_const(output_name, mem);
+            topology.add(op_const);
+        }
+        else if ("MaxPool" == op->description())
+        {
+            arguments_check(op, 1, 1);
+            const string& input_name = op->get_inputs().begin()->get_tensor().get_name();
+            const string& output_name = op->get_outputs().begin()->get_tensor().get_name();
+            const Shape& out_shape = op->get_outputs().begin()->get_shape();
+            const cldnn::tensor output_size =
+                runtime::intelgpu::IntelGPULayout::create_cldnn_tensor(out_shape);
+            const shared_ptr<op::MaxPool> max_pool = static_pointer_cast<op::MaxPool>(op);
+            const Shape& pool_shape = max_pool->get_window_shape();
+            const Strides& pool_strides = max_pool->get_window_movement_strides();
+            const Shape& pad = max_pool->get_padding_below();
+            vector<cldnn::tensor::value_type> offset({0, 0, 0, 0}); // No action by default
+            size_t ridx = 4;
+            for (auto i = pad.rbegin(); i != pad.rend() && ridx > 0; ++i, --ridx)
+            {
+                offset.at(ridx - 1) = -(*i);
+            }
+            const cldnn::tensor input_offset(
+                offset.at(0), offset.at(1), offset.at(3), offset.at(2));
+            const cldnn::tensor size =
+                runtime::intelgpu::IntelGPULayout::create_cldnn_tensor(pool_shape);
+            const cldnn::tensor strides =
+                runtime::intelgpu::IntelGPULayout::create_cldnn_tensor(pool_strides);
+            const cldnn::pooling cldd_pooling(output_name,
+                                              input_name,
+                                              cldnn::pooling_mode::max,
+                                              size,
+                                              strides,
+                                              input_offset,
+                                              output_size);
+            topology.add(cldd_pooling);
+        }
+        else if ("Reshape" == op->description())
+        {
+            arguments_check(op, 1, 1);
+            const string& input_name = op->get_inputs().begin()->get_tensor().get_name();
+            const string& output_name = op->get_outputs().begin()->get_tensor().get_name();
+            const shared_ptr<op::Reshape> op_broadcast = static_pointer_cast<op::Reshape>(op);
+            const AxisVector& broadcast_axes = op_broadcast->get_input_order();
+            vector<uint16_t> permute_order({0, 1, 2, 3}); // No action by default
+            const size_t max_dim = 4;
+            const size_t scale =
+                broadcast_axes.size() < max_dim ? max_dim - broadcast_axes.size() : 0;
+            // Need to scale indexes up according on array rank.
+            // For example, in 2D array, indexes are 0,1 but in 4D array it should be 2,3
+            // because cldnn::tensor is always 4D assuming cldnn::bfyx model
+            size_t rindex = max_dim;
+            for (auto i = broadcast_axes.rbegin(); i != broadcast_axes.rend() && rindex > 0;
+                 ++i, --rindex)
+            {
+                permute_order.at(rindex - 1) = *i + scale;
+            }
+            const cldnn::permute cldnn_permute(output_name, input_name, permute_order);
+            topology.add(cldnn_permute);
+        }
+        else if ("Negative" == op->description())
+        {
+            const cldnn_activation_additional_params param = {-1.f, 0.f};
+            do_unary_operation(topology, op, activation_linear, param);
+        }
+        else if ("Relu" == op->description())
+        {
+            do_unary_operation(topology, op, activation_relu);
+        }
+        else if ("Abs" == op->description())
+        {
+            do_unary_operation(topology, op, activation_abs);
+        }
+        else if ("Sqrt" == op->description())
+        {
+            do_unary_operation(topology, op, activation_sqrt);
+        }
+        else if ("Tanh" == op->description())
+        {
+            do_unary_operation(topology, op, activation_hyperbolic_tan);
+        }
+        else if ("Subtract" == op->description())
+        {
+            do_eltwise_operation(topology, op, cldnn::eltwise_mode::sub);
+        }
        else if ("BatchNorm" == op->description())
        {
            const shared_ptr<op::BatchNorm> batch_norm = static_pointer_cast<op::BatchNorm>(op);
@@ -189,11 +335,11 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
        {
            ostringstream os;
            os << "Unsupported operation \"" << op->description() << '\"';
-            throw std::invalid_argument(os.str());
+            throw invalid_argument(os.str());
        }
    }
-    instance.ocl_network = std::make_shared<cldnn::network>(*ocl_engine, topology);
+    instance.ocl_network = make_shared<cldnn::network>(*ocl_engine, topology);
    return true;
 }
@@ -214,7 +360,7 @@ bool runtime::intelgpu::IntelGPUBackend::call(
        }
    }
-    std::shared_ptr<cldnn::network> network = instance.ocl_network;
+    shared_ptr<cldnn::network> network = instance.ocl_network;
    // Process input parameters. Correctness of parameters was validated by validate_call.
    // Since we have no correlation between Function::m_parameters and inputs, there is
@@ -224,12 +370,12 @@ bool runtime::intelgpu::IntelGPUBackend::call(
        shared_ptr<runtime::intelgpu::IntelGPUTensorView> tv =
            static_pointer_cast<runtime::intelgpu::IntelGPUTensorView>(inputs[i]);
        const op::ParameterVector& input_params = func->get_parameters();
-        network->set_input_data(input_params[i]->get_output_tensor().get_name(),
+        const string& tensor_name = input_params[i]->get_output_tensor().get_name();
-                                *tv->get_data_ptr());
+        network->set_input_data(tensor_name, *tv->get_data_ptr());
    }
    // Execute network
-    std::map<cldnn::primitive_id, cldnn::network_output> result = network->execute();
+    map<cldnn::primitive_id, cldnn::network_output> result = network->execute();
    // Process output parameters. Correctness of parameters was validated by validate_call.
    // Since we have no correlation between Function::m_results and outputs, there is
@@ -238,7 +384,7 @@ bool runtime::intelgpu::IntelGPUBackend::call(
    {
        shared_ptr<runtime::intelgpu::IntelGPUTensorView> ngraph_res =
            static_pointer_cast<runtime::intelgpu::IntelGPUTensorView>(outputs[i]);
-        const std::string& tensor_name = func->get_output_op(i)->get_output_tensor().get_name();
+        const string& tensor_name = func->get_output_op(i)->get_output_tensor().get_name();
        auto result_memory = result.at(tensor_name).get_memory().pointer<char>();
        ngraph_res->write(result_memory.data(), 0, result_memory.size());

--- a/src/ngraph/runtime/intelgpu/intelgpu_backend.hpp
+++ b/src/ngraph/runtime/intelgpu/intelgpu_backend.hpp
@@ -20,6 +20,7 @@
 #include <memory>
 #include <CPP/engine.hpp>
+#include <CPP/network.hpp>
 #include "ngraph/runtime/backend.hpp"

--- a/src/ngraph/runtime/intelgpu/intelgpu_layout.cpp
+++ b/src/ngraph/runtime/intelgpu/intelgpu_layout.cpp
@@ -73,18 +73,39 @@ cldnn::data_types
    else
    {
        ostringstream os;
-        os << "IntelGPUTensorView::get_cldnn_type: Unknown type " << element_type;
+        os << "IntelGPULayout::get_cldnn_type: Unknown type " << element_type;
        throw std::invalid_argument(os.str());
    }
 }
+// Builds a 4D cldnn::tensor from an nGraph shape of any rank.
+// The three innermost dimensions of element_shape are mapped (innermost first) to
+// spatial_x, spatial_y and feature, following the constructor argument order noted
+// below. All remaining outer dimensions are multiplied together into batch.
+// Dimensions the shape does not provide default to 1.
+cldnn::tensor runtime::intelgpu::IntelGPULayout::create_cldnn_tensor(const Shape& element_shape)
+{
+    std::vector<size_t> idx(4, 1);
+    size_t index = 0;
+    // Walk the shape from the innermost dimension outwards, filling slots 0..2.
+    for (auto i = element_shape.rbegin(); i != element_shape.rend() && index < 3; ++i, ++index)
+    {
+        idx.at(index) = *i;
+    }
+    if (element_shape.size() > 3)
+    {
+        // accumulate's accumulator has the type of the initial value, so it must be
+        // size_t; a plain `1` would narrow the running product to int at every step.
+        idx.at(3) = accumulate(element_shape.rbegin() + 3,
+                               element_shape.rend(),
+                               static_cast<size_t>(1),
+                               multiplies<size_t>());
+    }
+    //Parameters for this ctor: batch, feature, spatial_x, spatial_y
+    const cldnn::tensor tns(idx.at(3), idx.at(2), idx.at(0), idx.at(1));
+    return tns;
+}
 cldnn::layout runtime::intelgpu::IntelGPULayout::create_cldnn_layout(
    const ngraph::element::Type& element_type, const Shape& element_shape)
 {
-    const size_t mem_size = shape_size(element_shape);
    const cldnn::data_types data_type = get_cldnn_type(element_type);
-    const cldnn::tensor tensor(1, mem_size, 1, 1);
+    const cldnn::format::type format = cldnn::format::bfyx;
-    const cldnn::format::type format = cldnn::format::yxfb;
+    const cldnn::tensor tensor = create_cldnn_tensor(element_shape);
    return cldnn::layout(data_type, format, tensor);
 }
--- a/src/ngraph/runtime/intelgpu/intelgpu_layout.hpp
+++ b/src/ngraph/runtime/intelgpu/intelgpu_layout.hpp
@@ -17,6 +17,7 @@
 #pragma once
 #include <CPP/layout.hpp>
+#include <CPP/tensor.hpp>
 #include "ngraph/descriptor/layout/tensor_view_layout.hpp"
@@ -48,6 +49,7 @@ public:
    static cldnn::data_types get_cldnn_type(const ngraph::element::Type& element_type);
    static cldnn::layout create_cldnn_layout(const ngraph::element::Type& element_type,
                                             const Shape& element_shape);
+    static cldnn::tensor create_cldnn_tensor(const Shape& element_shape);
 private:
    Strides strides;

--- a/src/ngraph/runtime/intelgpu/intelgpu_tensor_view.cpp
+++ b/src/ngraph/runtime/intelgpu/intelgpu_tensor_view.cpp
@@ -41,7 +41,7 @@ runtime::intelgpu::IntelGPUTensorView::IntelGPUTensorView(const ngraph::element:
    if (nullptr != memory_pointer)
    {
        ocl_memory = make_shared<cldnn::memory>(
-            cldnn::memory::attach<void>(layout, memory_pointer, layout.get_linear_size()));
+            cldnn::memory::attach<void>(layout, memory_pointer, layout.bytes_count()));
    }
    else
    {

--- a/src/ngraph/runtime/intelgpu/intelgpu_tensor_view.hpp
+++ b/src/ngraph/runtime/intelgpu/intelgpu_tensor_view.hpp
@@ -17,11 +17,9 @@
 #pragma once
 #include <CPP/engine.hpp>
-#include <CPP/layout.hpp>
 #include <CPP/memory.hpp>
 #include "ngraph/runtime/tensor_view.hpp"
-#include "ngraph/type/element_type.hpp"
 namespace ngraph
 {
@@ -56,7 +54,5 @@ public:
    cldnn::memory* get_data_ptr() { return ocl_memory.get(); }
 private:
-    cldnn::data_types get_cldnn_type(const ngraph::element::Type& element_type) const;
    std::shared_ptr<cldnn::memory> ocl_memory;
 };
--- a/src/ngraph/runtime/intelgpu/unit_test.manifest
+++ b/src/ngraph/runtime/intelgpu/unit_test.manifest
-ab
-abc
 abc_int64
 abs
 acos
@@ -18,11 +16,10 @@ avg_pool_2d_2channel_2image_padded_3x3_strided
 avg_pool_2d_2channel_2image_padded_3x3_strided_uneven
 avg_pool_2d_2channel_2image_padded_only_above
 avg_pool_2d_2channel_2image_padded_only_below
-backwards_abc
+avg_pool_3d
 backwards_abs
 backwards_acos
 backwards_add
-backwards_add_nested
 backwards_asin
 backwards_atan
 backwards_avgpool_n1_c1_hw2x2
@@ -56,18 +53,17 @@ backwards_maxpool_n2_c1_hw5_3x3_str2_max
 backwards_maxpool_n4c1h4w4_kh2kw2_sh1sw1
 backwards_maxpool_n4_c1_hw4_2x2_max
 backwards_minimum
-backwards_multiply
 backwards_negative
 backwards_parameter
 backwards_power
 backwards_relu
 backwards_replace_slice
-backwards_reshape
 backwards_reverse_3d_02
 backwards_reverse_sequence_n3_c2_h3
 backwards_reverse_sequence_n4d2c3h2w2
 backwards_select
 backwards_select_nested
+backwards_sigmoid
 backwards_sign
 backwards_sin
 backwards_sinh
@@ -152,9 +148,7 @@ convolution_4d_4items_strided_dilated_padded_same
 convolution_outlining
 cos
 cosh
-divide
 divide_adjoint_stability
-divide_by_zero_float32
 divide_by_zero_int32
 dot_0_0
 dot1d
@@ -176,8 +170,6 @@ equal
 exp
 floor
 function_call
-function_name
-fuse_max_with_constant_zero_input_as_relu
 greater
 greatereq
 kahan_sum_3d_to_vector
@@ -193,20 +185,16 @@ max_3d_to_matrix_least_sig
 max_3d_to_matrix_most_sig
 max_3d_to_scalar
 max_3d_to_vector
-maximum
 max_matrix_cols_zero
 max_matrix_columns
 max_matrix_rows
 max_matrix_rows_zero
 max_matrix_to_scalar_zero_by_zero
-max_pool_1d_1channel_1image
-max_pool_1d_1channel_2image
-max_pool_1d_2channel_2image
 max_pool_2d_1channel_1image_overpadded
 max_pool_2d_1channel_1image_padded
 max_pool_2d_1channel_1image_padded_negative_values
 max_pool_2d_1channel_1image_strided
-max_pool_2d_2channel_2image
+max_pool_3d
 max_to_scalar
 max_trivial
 max_trivial_5d
@@ -226,10 +214,10 @@ min_to_scalar
 min_trivial
 min_trivial_5d
 min_vector_zero
+mkldnn_layouts
 multiple_backends
 multiple_result
 negative
-node_name
 not
 notequal
 numeric_double_inf
@@ -257,7 +245,6 @@ pad_interior_1d
 pad_interior_exterior_1d
 pad_interior_exterior_2d
 pad_interior_exterior_4d_2x0x3x2
-parameter_as_output
 power
 product_3d_eliminate_zero_dim
 product_3d_to_matrix_least_sig
@@ -297,19 +284,7 @@ replace_slice_3d_strided_different_strides
 replace_slice_matrix
 replace_slice_scalar
 replace_slice_vector
-reshape_3d_transpose
-reshape_4d_transpose
 reshape_6d
-reshape_m2m_dim_change_transpose
-reshape_m2m_same
-reshape_m2m_transpose
-reshape_s2t
-reshape_t2s_012
-reshape_t2s_120
-reshape_t2v_012
-reshape_v2m_col
-reshape_v2m_row
-reshape_v2t_middle
 reverse_0d
 reverse_1d_0
 reverse_1d_nochange
@@ -328,12 +303,14 @@ reverse_3d_nochange
 reverse_sequence_n2c3h4w2
 reverse_sequence_n4c3h2w2
 reverse_sequence_n4d2c3h2w2
-scalar_constant_float32
 scalar_constant_int64
 select
 select_and_scatter_3d_without_overlap
 select_and_scatter_without_overlap
 select_and_scatter_with_overlap
+sigmoid_bprop_n1c1h4
+sigmoid_n1c1h2w2
+sigmoid_n1c1h4
 sign
 sin
 sinh
@@ -369,11 +346,8 @@ sum_trivial_5d
 sum_vector_zero
 tan
 tanh
-tensor_constant
-tensor_constant_float32
 tensor_constant_int64
 tensor_constant_with_op
-tensorview_custom_mem
 validate_call_input_type
 validate_call_output_type
 zero_sized_abs