Commit d4349db8 authored by shssf, committed by Robert Kimball

IntelGPU backend: broadcast operation (#1252)

* IntelGPUBackend: Broadcast operation

* IntelGPUBackend: more tests for Broadcast operation

* Move macro to static C function in Broadcast tests
parent 8c1aad8f
......@@ -19,6 +19,7 @@ set(SRC
intelgpu_tensor_view.cpp
intelgpu_layout.cpp
intelgpu_op_batchnorm.cpp
intelgpu_op_broadcast.cpp
)
if (NGRAPH_INTELGPU_ENABLE)
......
......@@ -16,7 +16,6 @@
#include <CPP/activation.hpp>
#include <CPP/batch_norm.hpp>
#include <CPP/concatenation.hpp>
#include <CPP/convolution.hpp>
#include <CPP/data.hpp>
#include <CPP/eltwise.hpp>
......@@ -25,13 +24,13 @@
#include <CPP/permute.hpp>
#include <CPP/pooling.hpp>
#include <CPP/reorder.hpp>
#include <CPP/reshape.hpp>
#include <CPP/scale.hpp>
#include <CPP/topology.hpp>
#include "ngraph/runtime/intelgpu/intelgpu_backend.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_batchnorm.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_broadcast.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_tensor_view.hpp"
#include "ngraph/node.hpp"
......@@ -42,6 +41,7 @@
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/max_pool.hpp"
#include "ngraph/op/reshape.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
......@@ -91,6 +91,18 @@ static void do_unary_operation(cldnn::topology& topology,
topology.add(cldnn_unary);
}
// This function is needed only to rename the data in the topology;
// no real data copy is required
static void do_equal_propagation(cldnn::topology& topology,
const string& input_name,
const string& output_name)
{
const vector<cldnn::primitive_id> input_names(1, input_name);
const cldnn::concatenation op_concat(output_name, input_names, cldnn::concatenation::along_x);
topology.add(op_concat);
}
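As a hedged illustration (the primitive ids below are invented for the example), the rename costs one trivial primitive: a one-element concatenation simply re-emits the same buffer under a new primitive_id.
// Hypothetical usage; "in" and "out" are illustrative primitive ids.
do_equal_propagation(topology, "in", "out");
// Downstream primitives can now reference "out" as if it were "in".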
extern "C" const char* get_ngraph_version_string()
{
return NGRAPH_VERSION;
......@@ -152,15 +164,21 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
{
arguments_check(op, 1, 1);
const descriptor::Tensor& input_tensor = op->get_inputs().begin()->get_tensor();
const descriptor::Tensor& output_tensor = op->get_outputs().begin()->get_tensor();
const string& input_name = input_tensor.get_name();
const string& output_name = output_tensor.get_name();
const cldnn::layout input_layout = IntelGPULayout::create_cldnn_layout(
input_tensor.get_element_type(), op->get_inputs().begin()->get_shape());
const string& input_name = op->get_inputs().begin()->get_tensor().get_name();
const string& output_name = op->get_outputs().begin()->get_tensor().get_name();
const cldnn::reorder op_reorder(output_name, input_name, input_layout);
topology.add(op_reorder);
do_equal_propagation(topology, input_name, output_name);
}
else if ("GetOutputElement" == op->description())
{
arguments_check(op, 3, 1);
const shared_ptr<op::GetOutputElement> elem =
static_pointer_cast<op::GetOutputElement>(op);
const string& input_name = op->get_inputs().at(elem->get_n()).get_tensor().get_name();
const string& output_name = op->get_outputs().begin()->get_tensor().get_name();
do_equal_propagation(topology, input_name, output_name);
}
else if ("Add" == op->description())
{
......@@ -213,7 +231,7 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
vector<cldnn::tensor::value_type> offset({0, 0, 0, 0}); // No action by default
size_t ridx = 4;
for (auto i = pad.rbegin(); i != pad.rend() && ridx > 0; ++i, --ridx)
for (auto i = pad.crbegin(); i != pad.crend() && ridx > 0; ++i, --ridx)
{
offset.at(ridx - 1) = -(*i);
}
......@@ -234,6 +252,29 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
output_size);
topology.add(cldd_pooling);
}
else if ("Broadcast" == op->description())
{
arguments_check(op, 1, 1);
const string& input_name = op->get_inputs().begin()->get_tensor().get_name();
const Shape& input_shape = op->get_inputs().begin()->get_shape();
const string& output_name = op->get_outputs().begin()->get_tensor().get_name();
const Shape& output_shape = op->get_outputs().begin()->get_shape();
const shared_ptr<op::Broadcast> broadcast = static_pointer_cast<op::Broadcast>(op);
const AxisSet& axis = broadcast->get_broadcast_axes();
if (axis.empty())
{
do_equal_propagation(topology, input_name, output_name);
}
else
{
do_broadcast_operation(
topology, input_name, input_shape, output_name, output_shape, axis);
}
}
else if ("Reshape" == op->description())
{
arguments_check(op, 1, 1);
......@@ -252,7 +293,7 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
// For example, in a 2D array the indexes are 0,1, but in a 4D array they should be 2,3
// because cldnn::tensor is always 4D, assuming the cldnn::bfyx model
size_t rindex = max_dim;
for (auto i = broadcast_axes.rbegin(); i != broadcast_axes.rend() && rindex > 0;
for (auto i = broadcast_axes.crbegin(); i != broadcast_axes.crend() && rindex > 0;
++i, --rindex)
{
permute_order.at(rindex - 1) = *i + scale;
......
......@@ -29,7 +29,7 @@ runtime::intelgpu::IntelGPULayout::IntelGPULayout(const descriptor::TensorView&
{
}
size_t runtime::intelgpu::IntelGPULayout::get_index_offset(const std::vector<size_t>& indices)
size_t runtime::intelgpu::IntelGPULayout::get_index_offset(const vector<size_t>& indices)
{
if (indices.size() != strides.size())
{
......@@ -74,16 +74,16 @@ cldnn::data_types
{
ostringstream os;
os << "IntelGPULayout::get_cldnn_type: Unknown type " << element_type;
throw std::invalid_argument(os.str());
throw invalid_argument(os.str());
}
}
cldnn::tensor runtime::intelgpu::IntelGPULayout::create_cldnn_tensor(const Shape& element_shape)
{
std::vector<size_t> idx(4, 1);
vector<size_t> idx(4, 1);
size_t index = 0;
for (auto i = element_shape.rbegin(); i != element_shape.rend() && index < 3; ++i, ++index)
for (auto i = element_shape.crbegin(); i != element_shape.crend() && index < 3; ++i, ++index)
{
idx.at(index) = *i;
}
......@@ -109,3 +109,21 @@ cldnn::layout runtime::intelgpu::IntelGPULayout::create_cldnn_layout(
return cldnn::layout(data_type, format, tensor);
}
cldnn::concatenation::concatenation_axis
runtime::intelgpu::IntelGPULayout::get_cldnn_axis(size_t tensor_channel)
{
switch (tensor_channel)
{
case 0: return cldnn::concatenation::along_b;
case 1: return cldnn::concatenation::along_f;
case 2: return cldnn::concatenation::along_y;
case 3: return cldnn::concatenation::along_x;
default:
{
ostringstream os;
os << "IntelGPULayout::get_cldnn_axis: wrong tensor channel " << tensor_channel;
throw invalid_argument(os.str());
}
}
}
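A short hedged sketch of the resulting mapping, assuming cldnn's bfyx layout as used throughout this file; do_batch_norm_operation below relies on it to pick the feature axis of a 4D input (cldnn_channel = 4 - 4 + 1 = 1):
// Dimension index of a 4D (bfyx) tensor -> concatenation axis:
//   get_cldnn_axis(0) == cldnn::concatenation::along_b   // batch
//   get_cldnn_axis(1) == cldnn::concatenation::along_f   // feature
//   get_cldnn_axis(2) == cldnn::concatenation::along_y
//   get_cldnn_axis(3) == cldnn::concatenation::along_x
//   get_cldnn_axis(4)  -> throws invalid_argument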
......@@ -16,6 +16,7 @@
#pragma once
#include <CPP/concatenation.hpp>
#include <CPP/layout.hpp>
#include <CPP/tensor.hpp>
......@@ -51,6 +52,9 @@ public:
const Shape& element_shape);
static cldnn::tensor create_cldnn_tensor(const Shape& element_shape);
// This function converts a Shape dimension index into a cldnn::concatenation axis id
static cldnn::concatenation::concatenation_axis get_cldnn_axis(size_t tensor_channel);
private:
Strides strides;
cldnn::layout cldnn_layout;
......
......@@ -19,6 +19,7 @@
#include <CPP/scale.hpp>
#include <CPP/split.hpp>
#include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_batchnorm.hpp"
#include "ngraph/op/batch_norm.hpp"
......@@ -26,19 +27,6 @@
using namespace std;
using namespace ngraph;
// This function converts Shape dimension id into cldnn::concatenation id
static cldnn::concatenation::concatenation_axis get_cldnn_axis(size_t tensor_channel)
{
switch (tensor_channel)
{
case 0: return cldnn::concatenation::along_b;
case 1: return cldnn::concatenation::along_f;
case 2: return cldnn::concatenation::along_y;
case 3: return cldnn::concatenation::along_x;
default: throw invalid_argument("intelgpu::get_cldnn_axis() wrong input tensor channel.");
}
}
static string do_matrix_split(cldnn::topology& topology,
const string& name,
const vector<pair<cldnn::primitive_id, cldnn::tensor>>& offsets)
......@@ -88,6 +76,8 @@ void runtime::intelgpu::do_batch_norm_operation(cldnn::topology& topology,
// Also, the input data must be at least a 2D array
const size_t shape_channel = 1;
const size_t cldnn_channel = 4 - input_shape.size() + shape_channel;
const cldnn::concatenation::concatenation_axis direction =
runtime::intelgpu::IntelGPULayout::get_cldnn_axis(cldnn_channel);
const size_t split_arr_count = input_shape.at(shape_channel);
for (size_t i = 0; i < split_arr_count; ++i)
......@@ -99,7 +89,6 @@ void runtime::intelgpu::do_batch_norm_operation(cldnn::topology& topology,
vector<cldnn::tensor::value_type> offset({0, 0, 0, 0}); // No action by default
offset.at(cldnn_channel) = i;
cout << "Splitted to " << i << " with " << vector_to_string(offset) << "\n";
const cldnn::tensor input_offset(offset.at(0), offset.at(1), offset.at(3), offset.at(2));
split_offsets.push_back(pair<cldnn::primitive_id, cldnn::tensor>(str_i, input_offset));
}
......@@ -141,6 +130,6 @@ void runtime::intelgpu::do_batch_norm_operation(cldnn::topology& topology,
dim_set.push_back(output_name + suf);
}
const cldnn::concatenation op_concat(output_name, dim_set, get_cldnn_axis(cldnn_channel));
const cldnn::concatenation op_concat(output_name, dim_set, direction);
topology.add(op_concat);
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <CPP/concatenation.hpp>
#include <CPP/reshape.hpp>
#include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_broadcast.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
static const string reshape_suf("_reshape");
static Shape propagate_backward(const Shape& input)
{
Shape result({0, 0, 0, 0});
size_t idx = result.size() - 1;
for (auto i = input.crbegin(); i != input.crend(); ++i, --idx)
{
result.at(idx) = *i;
}
return result;
}
static Shape propagate_forward(const Shape& input)
{
Shape result({0, 0, 0, 0});
size_t idx = 0;
for (auto i = input.cbegin(); i != input.cend(); ++i, ++idx)
{
result.at(idx) = *i;
}
return result;
}
static Shape apply_axis(const Shape& input, const AxisSet& axis)
{
Shape result = input;
for (auto const& i : axis)
{
result.at(i) = 0;
}
return result;
}
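A minimal standalone trace of these three helpers (hypothetical test main; in nGraph, Shape and AxisSet behave as std::vector<size_t> and std::set<size_t>, so the comparisons below are element-wise):
#include <cassert>

int main()
{
    // propagate_forward left-aligns into a 4-slot shape, zero-padding the tail:
    assert(propagate_forward({2, 3}) == (Shape{2, 3, 0, 0}));
    // propagate_backward right-aligns, zero-padding the head:
    assert(propagate_backward({2, 3}) == (Shape{0, 0, 2, 3}));
    // apply_axis zeroes the broadcast axes so they compare equal to that padding:
    assert(apply_axis({2, 3, 4, 5}, {1, 3}) == (Shape{2, 0, 4, 0}));
    return 0;
}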
// This function broadcasts the input data across all other dimensions of the output.
// It operates in two modes only (controlled by the is_forward flag):
// [forward]: propagate data from left to right, in Shape array terms
//            in[2], out[2,3,4,5], axis[1,2,3]
// [backward]: propagate data from right to left, in Shape array terms
//            in[5], out[2,3,4,5], axis[0,1,2]
// Input and output shapes can have up to 4 dimensions.
// Other variants, like in[4] out[2,3,4,5] axis[0,1,3], are not supported yet.
static void do_propagation(cldnn::topology& topology,
const string& input_name,
const Shape& input_shape,
const string& output_name,
const Shape& output_shape,
const AxisSet& axis,
bool is_forward)
{
// Default value, used in "forward" mode
cldnn::concatenation::concatenation_axis direction =
runtime::intelgpu::IntelGPULayout::get_cldnn_axis(3);
string input_name_it = input_name;
string output_name_it = output_name;
Shape input_shape_it = input_shape;
for (auto axis_id = axis.crbegin(); axis_id != axis.crend();)
{
const size_t input_count = output_shape.at(*axis_id);
if (is_forward)
{
input_shape_it.push_back(1);
const cldnn::tensor my_tensor =
runtime::intelgpu::IntelGPULayout::create_cldnn_tensor(input_shape_it);
const cldnn::reshape op_reshape(input_name_it + reshape_suf, input_name_it, my_tensor);
topology.add(op_reshape);
input_shape_it.back() = input_count;
input_name_it += reshape_suf;
}
else
{
direction = runtime::intelgpu::IntelGPULayout::get_cldnn_axis(*axis_id);
}
const vector<cldnn::primitive_id> input_names(input_count, input_name_it);
++axis_id;
if (axis_id == axis.crend())
{
output_name_it = output_name;
}
else
{
output_name_it += ":_";
input_name_it = output_name_it;
}
const cldnn::concatenation op_concat(output_name_it, input_names, direction);
topology.add(op_concat);
}
}
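A hedged trace of the forward mode for in[2] -> out[2,3] with axis[1] ("input" and "output" are illustrative primitive ids):
// do_propagation(topology, "input", Shape{2}, "output", Shape{2, 3}, AxisSet{1}, true);
//
// Single pass over axis {1}: input_count = output_shape.at(1) = 3.
//   1. reshape "input" into "input_reshape" with the cldnn tensor for shape {2, 1};
//   2. this is the last axis, so 3 copies of "input_reshape" are concatenated
//      along_x (the forward-mode default direction) straight into "output".
// With more axes, each pass writes an intermediate named with the ":_" suffix
// and the next pass consumes it as its input.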
// Assumes the input is a scalar; every element of the output is populated with it.
// This function is extremely suboptimal from a performance perspective.
static void do_scalar_propagation(cldnn::topology& topology,
const string& input_name,
const string& output_name,
const Shape& output_shape)
{
const size_t input_count = shape_size<const Shape>(output_shape);
const vector<cldnn::primitive_id> input_names(input_count, input_name);
const cldnn::concatenation op_concat(output_name, input_names, cldnn::concatenation::along_x);
topology.add(op_concat);
}
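For instance (hedged, with illustrative names), broadcasting a scalar "s" into shape {2, 3} concatenates shape_size({2, 3}) = 6 copies of the same one-element buffer along_x; because every element is identical, the result is valid whatever the output's logical shape is.
// Hypothetical call: all 6 output elements become copies of the scalar.
do_scalar_propagation(topology, "s", "out", Shape{2, 3});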
void runtime::intelgpu::do_broadcast_operation(cldnn::topology& topology,
const string& input_name,
const Shape& input_shape,
const string& output_name,
const Shape& output_shape,
const AxisSet& axis)
{
if (input_shape.size() > 4 || output_shape.size() > 4)
{
throw invalid_argument("IntelGPU::Broadcast supports shapes of at most 4 dimensions.");
}
if (input_shape.empty())
{
do_scalar_propagation(topology, input_name, output_name, output_shape);
return;
}
const Shape output_shape_axis = apply_axis(output_shape, axis);
const Shape input_shape_forward = propagate_forward(input_shape);
const Shape output_shape_forward = propagate_forward(output_shape_axis);
const Shape input_shape_backward = propagate_backward(input_shape);
const Shape output_shape_backward = propagate_backward(output_shape_axis);
if (input_shape_forward == output_shape_forward)
{
do_propagation(topology, input_name, input_shape, output_name, output_shape, axis, true);
}
else if (input_shape_backward == output_shape_backward)
{
do_propagation(topology, input_name, input_shape, output_name, output_shape, axis, false);
}
else
{
ostringstream os;
os << "IntelGP::Broadcast unsupported mode. input" << vector_to_string(input_shape)
<< " output" << vector_to_string(output_shape) << " axis" << vector_to_string(axis);
throw invalid_argument(os.str());
}
}
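A hedged worked example of the mode selection above, using the shapes from the do_propagation comment:
// in[5], out[2,3,4,5], axis[0,1,2] (the backward case):
//   output_shape_axis = apply_axis({2,3,4,5}, {0,1,2})    = {0, 0, 0, 5}
//   forward : propagate_forward({5})  = {5,0,0,0} vs {0,0,0,5} -> no match
//   backward: propagate_backward({5}) = {0,0,0,5} vs {0,0,0,5} -> backward mode
//
// in[4], out[2,3,4,5], axis[0,1,3] (the unsupported variant):
//   output_shape_axis = {0, 0, 4, 0}; neither {4,0,0,0} nor {0,0,0,4}
//   matches it, so invalid_argument is thrown.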
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include <CPP/topology.hpp>
#include "ngraph/axis_set.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
{
namespace runtime
{
namespace intelgpu
{
// This implements the nGraph Broadcast operation
void do_broadcast_operation(cldnn::topology& topology,
const std::string& input_name,
const Shape& input_shape,
const std::string& output_name,
const Shape& output_shape,
const AxisSet& axis);
}
}
}
abc_int64
abs
acos
aliased_output
asin
......@@ -19,7 +18,6 @@ avg_pool_2d_2channel_2image_padded_only_below
avg_pool_3d
backwards_abs
backwards_acos
backwards_add
backwards_asin
backwards_atan
backwards_avgpool_n1_c1_hw2x2
......@@ -36,7 +34,6 @@ backwards_concat_axis_1
backwards_concat_vector
backwards_cos
backwards_cosh
backwards_divide
backwards_dot_scalar_scalar
backwards_dot_scalar_tensor
backwards_dot_tensor2_tensor2
......@@ -53,8 +50,6 @@ backwards_maxpool_n2_c1_hw5_3x3_str2_max
backwards_maxpool_n4c1h4w4_kh2kw2_sh1sw1
backwards_maxpool_n4_c1_hw4_2x2_max
backwards_minimum
backwards_negative
backwards_parameter
backwards_power
backwards_relu
backwards_replace_slice
......@@ -72,29 +67,23 @@ backwards_softmax_3d
backwards_softmax_all
backwards_softmax_axis
backwards_softmax_underflow
backwards_subtract
backwards_sum_m2s
backwards_sum_m2v_0
backwards_sum_m2v_1
backwards_sum_v2s
backwards_tan
backwards_tanh
batchnorm_bprop_n4c3h2w2
batchnorm_fprop_b1c2h2w2
batchnorm_fprop_b2c2h2w1
batchnorm_fprop_globalstats_b2c2w2h1
batchnorm_fprop_inference_b2c2h2w1
batch_norm_one_output
batch_norm_three_outputs
broadcast_matrix_0
broadcast_algo_3d_stride_1
broadcast_algo_3d_stride_2
broadcast_algo_matrix_stride_1
broadcast_algo_matrix_stride_2
broadcast_algo_matrix_stride_3
broadcast_algo_vector_middle
broadcast_matrix_1
broadcast_matrix_2
broadcast_scalar_matrix
broadcast_scalar_tensor
broadcast_scalar_vector
broadcast_trivial
broadcast_vector_colwise
broadcast_vector_rowwise
broadcast_vector_rowwise_int64
broadcast_vector_rowwise_reversed
ceiling
......@@ -148,7 +137,6 @@ convolution_4d_4items_strided_dilated_padded_same
convolution_outlining
cos
cosh
divide_adjoint_stability
divide_by_zero_int32
dot_0_0
dot1d
......@@ -190,10 +178,6 @@ max_matrix_columns
max_matrix_rows
max_matrix_rows_zero
max_matrix_to_scalar_zero_by_zero
max_pool_2d_1channel_1image_overpadded
max_pool_2d_1channel_1image_padded
max_pool_2d_1channel_1image_padded_negative_values
max_pool_2d_1channel_1image_strided
max_pool_3d
max_to_scalar
max_trivial
......@@ -214,10 +198,6 @@ min_to_scalar
min_trivial
min_trivial_5d
min_vector_zero
mkldnn_layouts
multiple_backends
multiple_result
negative
not
notequal
numeric_double_inf
......@@ -275,9 +255,7 @@ reduce_window_emulating_max_pool_1d_2channel_2image
reduce_window_emulating_max_pool_2d_1channel_1image_strided
reduce_window_emulating_max_pool_2d_2channel_2image
relu_2Dbackprop
relu_2Dfprop
relu_4Dbackprop
relu_4Dfprop
replace_slice_3d
replace_slice_3d_strided
replace_slice_3d_strided_different_strides
......@@ -327,8 +305,6 @@ softmax_axis_2
softmax_axis_3d
softmax_axis_3d_trivial
softmax_underflow
sqrt
subtract
sum_3d_eliminate_zero_dim
sum_3d_to_matrix_least_sig
sum_3d_to_matrix_most_sig
......@@ -345,9 +321,7 @@ sum_trivial
sum_trivial_5d
sum_vector_zero
tan
tanh
tensor_constant_int64
tensor_constant_with_op
validate_call_input_type
validate_call_output_type
zero_sized_abs
......
......@@ -1908,6 +1908,152 @@ NGRAPH_TEST(${BACKEND_NAME}, broadcast_vector_rowwise_int64)
EXPECT_EQ((vector<int64_t>{1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4}), read_vector<int64_t>(result));
}
static void broadcast_test_helper(const Shape& shape_a, const Shape& shape_r, const AxisSet& axis)
{
auto A = make_shared<op::Parameter>(element::f32, shape_a);
vector<float> inp_data(shape_size<const Shape>(shape_a));
iota(inp_data.begin(), inp_data.end(), 1);
auto f =
make_shared<Function>(make_shared<op::Broadcast>(A, shape_r, axis), op::ParameterVector{A});
auto ref_backend = runtime::Backend::create("INTERPRETER");
auto wrk_backend = runtime::Backend::create("${BACKEND_NAME}");
auto wrk_a = wrk_backend->create_tensor(element::f32, shape_a);
copy_data(wrk_a, inp_data);
auto ref_a = ref_backend->create_tensor(element::f32, shape_a);
copy_data(ref_a, inp_data);
auto wrk_result = wrk_backend->create_tensor(element::f32, shape_r);
auto ref_result = ref_backend->create_tensor(element::f32, shape_r);
wrk_backend->call(f, {wrk_result}, {wrk_a});
ref_backend->call(f, {ref_result}, {ref_a});
EXPECT_EQ(read_vector<float>(ref_result), read_vector<float>(wrk_result));
}
NGRAPH_TEST(${BACKEND_NAME}, broadcast_algo_vector_middle)
{
Shape shape_a{2};
Shape shape_r{3, 2, 4};
AxisSet axis{0, 2};
broadcast_test_helper(shape_a, shape_r, axis);
}
NGRAPH_TEST(${BACKEND_NAME}, broadcast_algo_vector_forward_2)
{
Shape shape_a{2};
Shape shape_r{3, 2};
AxisSet axis{0};
broadcast_test_helper(shape_a, shape_r, axis);
}
NGRAPH_TEST(${BACKEND_NAME}, broadcast_algo_vector_forward_3)
{
Shape shape_a{2};
Shape shape_r{4, 3, 2};
AxisSet axis{0, 1};
broadcast_test_helper(shape_a, shape_r, axis);
}
NGRAPH_TEST(${BACKEND_NAME}, broadcast_algo_vector_forward_4)
{
Shape shape_a{2};
Shape shape_r{5, 4, 3, 2};
AxisSet axis{0, 1, 2};
broadcast_test_helper(shape_a, shape_r, axis);
}
NGRAPH_TEST(${BACKEND_NAME}, broadcast_algo_scalar)
{
Shape shape_a{};
Shape shape_r{5, 4, 3, 2};
AxisSet axis{0, 1, 2, 3};
broadcast_test_helper(shape_a, shape_r, axis);
}
NGRAPH_TEST(${BACKEND_NAME}, broadcast_algo_vector_backward_2)
{
Shape shape_a{2};
Shape shape_r{2, 3};
AxisSet axis{1};
broadcast_test_helper(shape_a, shape_r, axis);
}
NGRAPH_TEST(${BACKEND_NAME}, broadcast_algo_vector_backward_3)
{
Shape shape_a{2};
Shape shape_r{2, 3, 4};
AxisSet axis{1, 2};
broadcast_test_helper(shape_a, shape_r, axis);
}
NGRAPH_TEST(${BACKEND_NAME}, broadcast_algo_vector_backward_4)
{
Shape shape_a{2};
Shape shape_r{2, 3, 4, 5};
AxisSet axis{1, 2, 3};
broadcast_test_helper(shape_a, shape_r, axis);
}
NGRAPH_TEST(${BACKEND_NAME}, broadcast_algo_matrix_backward_4)
{
Shape shape_a{4, 5};
Shape shape_r{2, 3, 4, 5};
AxisSet axis{0, 1};
broadcast_test_helper(shape_a, shape_r, axis);
}
NGRAPH_TEST(${BACKEND_NAME}, broadcast_algo_matrix_stride_1)
{
Shape shape_a{3, 5};
Shape shape_r{2, 3, 4, 5};
AxisSet axis{0, 2};
broadcast_test_helper(shape_a, shape_r, axis);
}
NGRAPH_TEST(${BACKEND_NAME}, broadcast_algo_matrix_stride_2)
{
Shape shape_a{3, 4};
Shape shape_r{2, 3, 4, 5};
AxisSet axis{0, 3};
broadcast_test_helper(shape_a, shape_r, axis);
}
NGRAPH_TEST(${BACKEND_NAME}, broadcast_algo_matrix_stride_3)
{
Shape shape_a{2, 4};
Shape shape_r{2, 3, 4, 5};
AxisSet axis{1, 3};
broadcast_test_helper(shape_a, shape_r, axis);
}
NGRAPH_TEST(${BACKEND_NAME}, broadcast_algo_3d_backward)
{
Shape shape_a{2, 3, 4};
Shape shape_r{5, 2, 3, 4};
AxisSet axis{0};
broadcast_test_helper(shape_a, shape_r, axis);
}
NGRAPH_TEST(${BACKEND_NAME}, broadcast_algo_3d_stride_1)
{
Shape shape_a{2, 3, 4};
Shape shape_r{2, 5, 3, 4};
AxisSet axis{1};
broadcast_test_helper(shape_a, shape_r, axis);
}
NGRAPH_TEST(${BACKEND_NAME}, broadcast_algo_3d_stride_2)
{
Shape shape_a{2, 3, 4};
Shape shape_r{2, 3, 5, 4};
AxisSet axis{2};
broadcast_test_helper(shape_a, shape_r, axis);
}
NGRAPH_TEST(${BACKEND_NAME}, broadcast_matrix_0)
{
Shape shape_a{2, 2};
......