Commit ea64f5bf authored by Amy Zhuang, committed by Scott Cyphers

Use Eigen kernel for Gather when axis is 0. (#3014)

* Use Eigen kernel for Gather when axis is 0.

* Fix style error.

* Rename variables.

Fix bugs.

Use helper function.

* Add one unit test.

* Check if openmp is defined.

* Change unit tests.

* Use Eigen kernel for uint8_t type.

Add one uint8_t unit test.

Address PR feedback.

* Update cpu emitter.
parent 002aef50
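
For orientation, this is the operation being accelerated, as exercised by the unit tests below (a minimal sketch using the nGraph v0 test API; the Gather axis defaults to 0, which is exactly the case the new Eigen kernel covers):

// Gather rows of a 3x2 params tensor along axis 0 with a 2x2 index tensor
// (shapes taken from the tests below).
Shape params_shape{3, 2};
Shape indices_shape{2, 2};
auto P = make_shared<op::Parameter>(element::f32, params_shape);
auto I = make_shared<op::Parameter>(element::i32, indices_shape);
auto G = make_shared<op::Gather>(P, I); // axis omitted -> 0 -> Eigen fast path
auto f = make_shared<Function>(make_shared<op::GetOutputElement>(G, 0),
                               ParameterVector{P, I});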
@@ -18,6 +18,7 @@
#include "ngraph/op/gather.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/kernel/gather.hpp"
#include "ngraph/runtime/reference/gather.hpp"
using namespace std;
@@ -52,15 +53,49 @@ namespace ngraph
if (is_int64)
{
-   return
-       [&,
-        params_shape,
-        indices_shape,
-        out_shape,
-        axis,
-        params_buffer_index,
-        indices_buffer_index,
-        out_buffer_index](CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
    if ((args[0].get_element_type() == element::f32 ||
         args[0].get_element_type() == element::f64 ||
         args[0].get_element_type() == element::u8) &&
        axis == 0)
    {
        std::function<decltype(runtime::cpu::kernel::gather_i64<float, 2, 2>)> kernel;

        SELECT_KERNEL_BY_2RANKS(kernel,
                                args[0].get_element_type(),
                                params_shape.size(),
                                out_shape.size(),
                                runtime::cpu::kernel::gather_i64);

        return [&,
                kernel,
                params_shape,
                indices_shape,
                out_shape,
                params_buffer_index,
                indices_buffer_index,
                out_buffer_index](CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
            kernel(ctx->buffer_data[params_buffer_index],
                   ctx->buffer_data[indices_buffer_index],
                   ctx->buffer_data[out_buffer_index],
                   params_shape,
                   indices_shape,
                   out_shape,
                   ectx->arena);
        };
    }
    else
    {
        return [&,
                params_shape,
                indices_shape,
                out_shape,
                axis,
                params_buffer_index,
                indices_buffer_index,
                out_buffer_index](CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
            ngraph::runtime::reference::gather<T, int64_t>(
                static_cast<T*>(ctx->buffer_data[params_buffer_index]),
                static_cast<int64_t*>(ctx->buffer_data[indices_buffer_index]),
@@ -70,18 +105,54 @@ namespace ngraph
                out_shape,
                axis);
            };
        }
    }
else
{
-   return
-       [&,
-        params_shape,
-        indices_shape,
-        out_shape,
-        axis,
-        params_buffer_index,
-        indices_buffer_index,
-        out_buffer_index](CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
    if ((args[0].get_element_type() == element::f32 ||
         args[0].get_element_type() == element::f64 ||
         args[0].get_element_type() == element::u8) &&
        axis == 0)
    {
        std::function<decltype(runtime::cpu::kernel::gather_i32<float, 2, 2>)> kernel;

        SELECT_KERNEL_BY_2RANKS(kernel,
                                args[0].get_element_type(),
                                params_shape.size(),
                                out_shape.size(),
                                runtime::cpu::kernel::gather_i32);

        return [&,
                kernel,
                params_shape,
                indices_shape,
                out_shape,
                params_buffer_index,
                indices_buffer_index,
                out_buffer_index](CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
            kernel(ctx->buffer_data[params_buffer_index],
                   ctx->buffer_data[indices_buffer_index],
                   ctx->buffer_data[out_buffer_index],
                   params_shape,
                   indices_shape,
                   out_shape,
                   ectx->arena);
        };
    }
    else
    {
        return [&,
                params_shape,
                indices_shape,
                out_shape,
                axis,
                params_buffer_index,
                indices_buffer_index,
                out_buffer_index](CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
            ngraph::runtime::reference::gather<T, int32_t>(
                static_cast<T*>(ctx->buffer_data[params_buffer_index]),
                static_cast<int32_t*>(ctx->buffer_data[indices_buffer_index]),
@@ -91,6 +162,7 @@ namespace ngraph
                out_shape,
                axis);
            };
        }
    }
}
} // namespace
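
A note on the structure: all of the type and rank dispatch above happens once, when the builder runs, and the selected kernel is captured by value in the returned closure; execution then pays only for the indirect call. A minimal sketch of that build-once/run-many pattern (illustrative names, not from the codebase):

#include <functional>

// kernel is resolved before this function returns; the closure reuses it on
// every invocation without re-dispatching on element type or rank.
template <typename KernelFn>
std::function<void()> make_executor(KernelFn kernel, void* in, void* out)
{
    return [kernel, in, out]() { kernel(in, out); };
}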
@@ -266,6 +266,10 @@
    { \
        SELECT_2RANKS(KV, double, R1, R2, K); \
    } \
    else if (ET == element::u8) \
    { \
        SELECT_2RANKS(KV, uint8_t, R1, R2, K); \
    } \
    else \
    { \
        throw ngraph_error("Unsupported element type " + ET.c_type_string() + " for kernel " #K); \
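
For readers unfamiliar with these dispatch macros: unrolled by hand for a single (Rank1, Rank2) pair, the selection amounts to the chain below (a sketch only; `element_type` is assumed to be an `element::Type`, and the real macro also branches on both ranks). The `std::function<decltype(...)>` trick seen in the builder gives the variable the kernel's exact signature.

std::function<decltype(ngraph::runtime::cpu::kernel::gather_i32<float, 2, 2>)> kernel;
if (element_type == ngraph::element::f32)
{
    kernel = ngraph::runtime::cpu::kernel::gather_i32<float, 2, 2>;
}
else if (element_type == ngraph::element::f64)
{
    kernel = ngraph::runtime::cpu::kernel::gather_i32<double, 2, 2>;
}
else if (element_type == ngraph::element::u8)
{
    kernel = ngraph::runtime::cpu::kernel::gather_i32<uint8_t, 2, 2>;
}
else
{
    throw ngraph::ngraph_error("Unsupported element type for kernel gather_i32");
}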
@@ -1815,15 +1815,34 @@ namespace ngraph
}
writer.block_begin();
-   writer << "reference::gather<" << args[0].get_type() << ", "
-          << args[1].get_element_type().c_type_string() << ">(" << args[0].get_name()
-          << ",\n";
-   writer << " " << args[1].get_name() << ",\n";
-   writer << " " << out[0].get_name() << ",\n";
-   writer << " {" << join(args[0].get_shape()) << "},\n";
-   writer << " {" << join(args[1].get_shape()) << "},\n";
-   writer << " {" << join(out[0].get_shape()) << "},\n";
-   writer << " " << gather->get_axis() << ");\n";
if ((args[0].get_element_type() == element::f64 ||
     args[0].get_element_type() == element::f32 ||
     args[0].get_element_type() == element::u8) &&
    gather->get_axis() == 0)
{
    writer << "cpu::kernel::gather<" << args[0].get_type() << ", "
           << args[1].get_element_type().c_type_string() << ", "
           << args[0].get_shape().size() << ", " << out[0].get_shape().size()
           << ">(" << args[0].get_name() << ",\n";
    writer << " " << args[1].get_name() << ",\n";
    writer << " " << out[0].get_name() << ",\n";
    writer << " {" << join(args[0].get_shape()) << "},\n";
    writer << " {" << join(args[1].get_shape()) << "},\n";
    writer << " {" << join(out[0].get_shape()) << "},\n";
    writer << " 0);\n";
}
else
{
    writer << "reference::gather<" << args[0].get_type() << ", "
           << args[1].get_element_type().c_type_string() << ">("
           << args[0].get_name() << ",\n";
    writer << " " << args[1].get_name() << ",\n";
    writer << " " << out[0].get_name() << ",\n";
    writer << " {" << join(args[0].get_shape()) << "},\n";
    writer << " {" << join(args[1].get_shape()) << "},\n";
    writer << " {" << join(out[0].get_shape()) << "},\n";
    writer << " " << gather->get_axis() << ");\n";
}
writer.block_end();
}
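
Concretely, for f32 params of shape {3, 2}, i32 indices of shape {2, 2}, and axis 0 (the shapes of the last test below, giving output shape {2, 2, 2}), the fast path above would emit a call along these lines; `arg0`/`arg1`/`out0` stand in for whatever buffer names the emitter uses, and the trailing 0 is the arena argument:

cpu::kernel::gather<float, int32_t, 2, 3>(arg0,
                                          arg1,
                                          out0,
                                          {3, 2},
                                          {2, 2},
                                          {2, 2, 2},
                                          0);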
@@ -225,6 +225,18 @@ namespace ngraph
template <typename ElementType>
void reference_erf(void* arg, void* out, size_t count);
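
// Eigen-based Gather kernel (defined in ngraph/runtime/cpu/kernel/gather.hpp);
// Rank1 is the rank of the params (inputs) tensor, Rank2 the rank of the output.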
template <typename ElementType,
          typename IndicesType,
          unsigned int Rank1,
          unsigned int Rank2>
void gather(void* inputs,
            void* indices,
            void* output,
            const Shape& inputs_shape,
            const Shape& indices_shape,
            const Shape& output_shape,
            int arena);

template <typename ElementType,
          typename IndicesType,
          unsigned int Rank1,
New file (included above as "ngraph/runtime/cpu/kernel/gather.hpp"):
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/coordinate.hpp"
#include "ngraph/runtime/cpu/cpu_executor.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            namespace kernel
            {
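                // Decompose a flat row-major index into per-dimension coordinates of
                // `shape`: e.g. for shape {2, 3}, index 4 yields partial_sum {3, 1}
                // and indices {1, 1}.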
                static void
                    get_leading_indices(const Shape& shape, int index, std::vector<int>& indices)
                {
                    auto rank = shape.size();
                    std::vector<int> partial_sum(rank);

                    partial_sum[rank - 1] = 1;
                    for (int j = rank - 2; j >= 0; j--)
                    {
                        partial_sum[j] = partial_sum[j + 1] * shape[j + 1];
                    }
                    for (int j = 0; j < rank; j++)
                    {
                        indices[j] = index / partial_sum[j];
                        index = index % partial_sum[j];
                    }
                }
                template <typename ElementType,
                          typename IndicesType,
                          unsigned int Rank1,
                          unsigned int Rank2>
                void gather(void* inputs,
                            void* indices,
                            void* output,
                            const Shape& inputs_shape,
                            const Shape& indices_shape,
                            const Shape& output_shape,
                            int arena)
                {
                    Eigen::array<Eigen::Index, Rank1> in_dims;
                    Eigen::array<Eigen::Index, Rank2> out_dims;

                    for (int i = 0; i < Rank1; i++)
                    {
                        in_dims[i] = inputs_shape[i];
                    }
                    for (int i = 0; i < Rank2; i++)
                    {
                        out_dims[i] = output_shape[i];
                    }

                    Eigen::TensorMap<Eigen::Tensor<ElementType, Rank2, Eigen::RowMajor>> out(
                        static_cast<ElementType*>(output), out_dims);
                    Eigen::TensorMap<Eigen::Tensor<ElementType, Rank1, Eigen::RowMajor>> in(
                        static_cast<ElementType*>(inputs), in_dims);

                    auto indices_ptr = static_cast<IndicesType*>(indices);
                    auto indices_rank = indices_shape.size();
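                    // Rank-0 (scalar) index: the output is the single params slice at
                    // row indices_ptr[0] along axis 0, reshaped to the output shape.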
                    if (indices_rank == 0)
                    {
                        Eigen::array<Eigen::Index, Rank1> in_extents, in_offsets;
                        Eigen::array<Eigen::Index, Rank2> out_extents, out_offsets;

                        for (int i = 0; i < Rank1; i++)
                        {
                            in_extents[i] = inputs_shape[i];
                            in_offsets[i] = 0;
                        }
                        in_extents[0] = 1;
                        in_offsets[0] = indices_ptr[0];
                        for (int i = 0; i < Rank2; i++)
                        {
                            out_extents[i] = output_shape[i];
                            out_offsets[i] = 0;
                        }

                        out.slice(out_offsets, out_extents)
                            .device(ngraph::runtime::cpu::executor::GetCPUExecutor().get_device(
                                arena)) = in.slice(in_offsets, in_extents).reshape(out_extents);
                    }
                    else
                    {
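                        // One params row is copied per flat index position i; the omp
                        // loop below parallelizes across index entries, and
                        // get_leading_indices maps i to the output coordinates.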
#ifdef _OPENMP
#pragma omp parallel for
#endif
                        for (int i = 0; i < shape_size(indices_shape); i++)
                        {
                            // Declare these inside the loop for omp parallel
                            Eigen::array<Eigen::Index, Rank1> in_extents, in_offsets;
                            Eigen::array<Eigen::Index, Rank2> out_extents, out_offsets;
                            std::vector<int> leading_indices(indices_rank);

                            for (int r = 0; r < Rank1; r++)
                            {
                                in_extents[r] = inputs_shape[r];
                                in_offsets[r] = 0;
                            }
                            in_extents[0] = 1;
                            in_offsets[0] = indices_ptr[i];
                            for (int r = 0; r < Rank2; r++)
                            {
                                out_extents[r] = output_shape[r];
                                out_offsets[r] = 0;
                            }
                            get_leading_indices(indices_shape, i, leading_indices);
                            for (int j = 0; j < indices_rank; j++)
                            {
                                out_extents[j] = 1;
                                out_offsets[j] = leading_indices[j];
                            }

                            out.slice(out_offsets, out_extents)
                                .device(ngraph::runtime::cpu::executor::GetCPUExecutor().get_device(
                                    arena)) = in.slice(in_offsets, in_extents).reshape(out_extents);
                        }
                    }
                }
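
                // Index-type-bound wrappers: SELECT_KERNEL_BY_2RANKS varies only the
                // element type and ranks, so each index type gets its own entry point.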
                template <typename ElementType, unsigned int Rank1, unsigned int Rank2>
                void gather_i64(void* inputs,
                                void* indices,
                                void* output,
                                const Shape& inputs_shape,
                                const Shape& indices_shape,
                                const Shape& output_shape,
                                int arena)
                {
                    gather<ElementType, int64_t, Rank1, Rank2>(
                        inputs, indices, output, inputs_shape, indices_shape, output_shape, arena);
                }

                template <typename ElementType, unsigned int Rank1, unsigned int Rank2>
                void gather_i32(void* inputs,
                                void* indices,
                                void* output,
                                const Shape& inputs_shape,
                                const Shape& indices_shape,
                                const Shape& output_shape,
                                int arena)
                {
                    gather<ElementType, int32_t, Rank1, Rank2>(
                        inputs, indices, output, inputs_shape, indices_shape, output_shape, arena);
                }
            }
        }
    }
}
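
As a usage sketch (hypothetical standalone code, not part of the commit), the rank-2-params / rank-2-output instantiation can be driven directly; arena 0 matches what the code emitter above passes:

#include "ngraph/runtime/cpu/kernel/gather.hpp"

void example()
{
    // Gather rows {0, 2, 1, 0} from a 3x2 float matrix along axis 0.
    float params[3][2] = {{1.0f, 1.1f}, {2.0f, 2.1f}, {3.0f, 3.1f}};
    int32_t indices[4] = {0, 2, 1, 0};
    float output[4][2]; // one params row per index entry

    ngraph::runtime::cpu::kernel::gather_i32<float, 2, 2>(params,
                                                          indices,
                                                          output,
                                                          ngraph::Shape{3, 2},
                                                          ngraph::Shape{4},
                                                          ngraph::Shape{4, 2},
                                                          0);
    // output is now {{1.0, 1.1}, {3.0, 3.1}, {2.0, 2.1}, {1.0, 1.1}}.
}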
@@ -142,7 +142,6 @@ erf
zero_sized_erf
model_erf
model_erf_int32
-gather_no_axis
gather
gather_nd_scalar_from_2d
gather_nd_1d_from_2d
......@@ -154,9 +153,13 @@ gather_nd_batch_1d_from_2d
gather_nd_batch_scalar_from_3d
gather_nd_batch_1d_from_3d
gather_nd_batch_2d_from_3d
-gather_scalar_indices_no_axis
-gather_scalar_indices
gather_nd_single_indices
gather_scalar_indices
gather_scalar_indices_no_axis
gather_2d_indices_no_axis
gather_3d_indices_no_axis
gather_4d_indices_no_axis
gather_4d_indices_no_axis_uint8
gemm
gemm_broadcast_input_C
model_hardmax
@@ -43,7 +43,6 @@ pad_reflect_2d_with_neg
batch_mat_mul_forward
backwards_batchmatmul_tensor2_tensor2
erf
-gather_no_axis
gather
gather_nd_scalar_from_2d
gather_nd_1d_from_2d
......@@ -55,9 +54,13 @@ gather_nd_batch_1d_from_2d
gather_nd_batch_scalar_from_3d
gather_nd_batch_1d_from_3d
gather_nd_batch_2d_from_3d
-gather_scalar_indices_no_axis
-gather_scalar_indices
gather_nd_single_indices
gather_scalar_indices
gather_scalar_indices_no_axis
gather_2d_indices_no_axis
gather_3d_indices_no_axis
gather_4d_indices_no_axis
gather_4d_indices_no_axis_uint8
gemm
gemm_broadcast_input_C
normalize_across_chw_scalar_scale_4d
@@ -102,7 +102,6 @@ embedding_lookup_10x1_arbitrary
embedding_lookup_10x1_arbitrary_index_type_int
embedding_lookup_10x1_arbitrary_index_type_int64
floor_int32
-gather_no_axis
gather
gather_nd_scalar_from_2d
gather_nd_1d_from_2d
......@@ -114,9 +113,13 @@ gather_nd_batch_1d_from_2d
gather_nd_batch_scalar_from_3d
gather_nd_batch_1d_from_3d
gather_nd_batch_2d_from_3d
-gather_scalar_indices_no_axis
-gather_scalar_indices
gather_nd_single_indices
gather_scalar_indices
gather_scalar_indices_no_axis
gather_2d_indices_no_axis
gather_3d_indices_no_axis
gather_4d_indices_no_axis
gather_4d_indices_no_axis_uint8
scatter_add_4d_indices
scatter_add_3d_indices
scatter_add_2d_indices
@@ -36,7 +36,107 @@ using namespace ngraph;
static string s_manifest = "${MANIFEST}";
-NGRAPH_TEST(${BACKEND_NAME}, gather_no_axis)
NGRAPH_TEST(${BACKEND_NAME}, gather_4d_indices_no_axis_uint8)
{
Shape params_shape{3, 2};
Shape indices_shape{2, 2, 3, 4};
Shape out_shape{2, 2, 3, 4, 2};
auto P = make_shared<op::Parameter>(element::u8, params_shape);
auto I = make_shared<op::Parameter>(element::i32, indices_shape);
auto G = make_shared<op::Gather>(P, I);
auto f = make_shared<Function>(make_shared<op::GetOutputElement>(G, 0), ParameterVector{P, I});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto p = backend->create_tensor(element::u8, params_shape);
copy_data(p, vector<uint8_t>{10, 11, 20, 21, 30, 31});
auto i = backend->create_tensor(element::i32, indices_shape);
copy_data(i, vector<int32_t>{0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2,
0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2,
0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2});
auto result = backend->create_tensor(element::u8, out_shape);
auto c = backend->compile(f);
c->call_with_validate({result}, {p, i});
EXPECT_TRUE(test::all_close(
(vector<uint8_t>{10, 11, 20, 21, 20, 21, 30, 31, 10, 11, 20, 21, 20, 21, 30, 31,
10, 11, 20, 21, 20, 21, 30, 31, 10, 11, 20, 21, 20, 21, 30, 31,
10, 11, 20, 21, 20, 21, 30, 31, 10, 11, 20, 21, 20, 21, 30, 31,
10, 11, 20, 21, 20, 21, 30, 31, 10, 11, 20, 21, 20, 21, 30, 31,
10, 11, 20, 21, 20, 21, 30, 31, 10, 11, 20, 21, 20, 21, 30, 31,
10, 11, 20, 21, 20, 21, 30, 31, 10, 11, 20, 21, 20, 21, 30, 31}),
read_vector<uint8_t>(result)));
}
NGRAPH_TEST(${BACKEND_NAME}, gather_4d_indices_no_axis)
{
Shape params_shape{3, 2};
Shape indices_shape{2, 2, 3, 4};
Shape out_shape{2, 2, 3, 4, 2};
auto P = make_shared<op::Parameter>(element::f32, params_shape);
auto I = make_shared<op::Parameter>(element::i32, indices_shape);
auto G = make_shared<op::Gather>(P, I);
auto f = make_shared<Function>(make_shared<op::GetOutputElement>(G, 0), ParameterVector{P, I});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto p = backend->create_tensor(element::f32, params_shape);
copy_data(p, vector<float>{1.0f, 1.1f, 2.0f, 2.1f, 3.0f, 3.1f});
auto i = backend->create_tensor(element::i32, indices_shape);
copy_data(i, vector<int32_t>{0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2,
0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2,
0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2});
auto result = backend->create_tensor(element::f32, out_shape);
auto c = backend->compile(f);
c->call_with_validate({result}, {p, i});
EXPECT_TRUE(test::all_close_f(
(vector<float>{1.0f, 1.1f, 2.0f, 2.1f, 2.0f, 2.1f, 3.0f, 3.1f, 1.0f, 1.1f, 2.0f, 2.1f,
2.0f, 2.1f, 3.0f, 3.1f, 1.0f, 1.1f, 2.0f, 2.1f, 2.0f, 2.1f, 3.0f, 3.1f,
1.0f, 1.1f, 2.0f, 2.1f, 2.0f, 2.1f, 3.0f, 3.1f, 1.0f, 1.1f, 2.0f, 2.1f,
2.0f, 2.1f, 3.0f, 3.1f, 1.0f, 1.1f, 2.0f, 2.1f, 2.0f, 2.1f, 3.0f, 3.1f,
1.0f, 1.1f, 2.0f, 2.1f, 2.0f, 2.1f, 3.0f, 3.1f, 1.0f, 1.1f, 2.0f, 2.1f,
2.0f, 2.1f, 3.0f, 3.1f, 1.0f, 1.1f, 2.0f, 2.1f, 2.0f, 2.1f, 3.0f, 3.1f,
1.0f, 1.1f, 2.0f, 2.1f, 2.0f, 2.1f, 3.0f, 3.1f, 1.0f, 1.1f, 2.0f, 2.1f,
2.0f, 2.1f, 3.0f, 3.1f, 1.0f, 1.1f, 2.0f, 2.1f, 2.0f, 2.1f, 3.0f, 3.1f}),
read_vector<float>(result),
MIN_FLOAT_TOLERANCE_BITS));
}
NGRAPH_TEST(${BACKEND_NAME}, gather_3d_indices_no_axis)
{
Shape params_shape{3, 2};
Shape indices_shape{2, 3, 4};
Shape out_shape{2, 3, 4, 2};
auto P = make_shared<op::Parameter>(element::f32, params_shape);
auto I = make_shared<op::Parameter>(element::i32, indices_shape);
auto G = make_shared<op::Gather>(P, I);
auto f = make_shared<Function>(make_shared<op::GetOutputElement>(G, 0), ParameterVector{P, I});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto p = backend->create_tensor(element::f32, params_shape);
copy_data(p, vector<float>{1.0f, 1.1f, 2.0f, 2.1f, 3.0f, 3.1f});
auto i = backend->create_tensor(element::i32, indices_shape);
copy_data(
i, vector<int32_t>{0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2});
auto result = backend->create_tensor(element::f32, out_shape);
auto c = backend->compile(f);
c->call_with_validate({result}, {p, i});
EXPECT_TRUE(test::all_close_f(
(vector<float>{1.0f, 1.1f, 2.0f, 2.1f, 2.0f, 2.1f, 3.0f, 3.1f, 1.0f, 1.1f, 2.0f, 2.1f,
2.0f, 2.1f, 3.0f, 3.1f, 1.0f, 1.1f, 2.0f, 2.1f, 2.0f, 2.1f, 3.0f, 3.1f,
1.0f, 1.1f, 2.0f, 2.1f, 2.0f, 2.1f, 3.0f, 3.1f, 1.0f, 1.1f, 2.0f, 2.1f,
2.0f, 2.1f, 3.0f, 3.1f, 1.0f, 1.1f, 2.0f, 2.1f, 2.0f, 2.1f, 3.0f, 3.1f}),
read_vector<float>(result),
MIN_FLOAT_TOLERANCE_BITS));
}
NGRAPH_TEST(${BACKEND_NAME}, gather_2d_indices_no_axis)
{
Shape params_shape{3, 2};
Shape indices_shape{2, 2};
...