Commit eb6bca0e authored by Amy Zhuang, committed by Scott Cyphers

Add CPU backend support for Tile op using Eigen kernel. (#2977)

* Add CPU backend support for Tile op using Eigen kernel.

* Emit Tile code for CODEGEN.

* Fix a bug.

* Add one comment and fix error message typo.
parent 78d6a880
@@ -51,7 +51,7 @@ void op::Tile::validate_and_infer_types()
// Repeats shapes should be of form {arg_rank} or dynamic
NODE_VALIDATION_CHECK(this,
repeats_shape.compatible(PartialShape{arg_rank}),
"Arg and padding below ranks mismatch");
"Arg and repeats ranks mismatch");
output_rank = arg_rank;
}
......
@@ -82,6 +82,7 @@ set(SRC
builder/softmax.cpp
builder/get_output_element.cpp
builder/sum.cpp
builder/tile.cpp
builder/topk.cpp
builder/update_slice.cpp
kernel/pad.cpp
......
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/runtime/cpu/kernel/tile.hpp"
#include "ngraph/op/experimental/tile.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
template <>
void Builder::BUILDER_DECL(ngraph::op::Tile)
{
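// Direct-execution (DEX) path: build a functor that calls the tile kernel at run time.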
auto arg_shape = args[0].get_shape();
auto arg_rank = arg_shape.size();
auto& functors = external_function->get_functors();
auto arg_buffer_index = external_function->get_buffer_index(args[0].get_name());
auto out_buffer_index = external_function->get_buffer_index(out[0].get_name());
auto out_shape = out[0].get_shape();
// Keep this here in case we want to support scalar input in the future.
if (arg_rank == 0)
{
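// For a rank-0 (scalar) input, every output element is a copy of that scalar,
// so the repeat count is just the output volume.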
size_t repeats = shape_size(out_shape);
std::function<decltype(runtime::cpu::kernel::tile_rank_0<float>)> kernel;
SELECT_KERNEL(
kernel, out[0].get_element_type(), runtime::cpu::kernel::tile_rank_0);
auto functor = [&, kernel, repeats, arg_buffer_index, out_buffer_index](
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
kernel(ctx->buffer_data[arg_buffer_index],
ctx->buffer_data[out_buffer_index],
repeats);
};
functors.emplace_back(functor);
}
else
{
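// Rank >= 1: dispatch to the Eigen-based kernel specialized on element type and input rank.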
std::function<decltype(runtime::cpu::kernel::tile<float, 2>)> kernel;
SELECT_KERNEL_BY_RANK(
kernel, out[0].get_element_type(), arg_rank, runtime::cpu::kernel::tile);
auto functor =
[&, kernel, arg_shape, out_shape, arg_buffer_index, out_buffer_index](
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
kernel(ctx->buffer_data[arg_buffer_index],
ctx->buffer_data[out_buffer_index],
arg_shape,
out_shape,
ectx->arena);
};
functors.emplace_back(functor);
}
}
REGISTER_OP_BUILDER(Tile);
}
}
}
@@ -61,6 +61,7 @@
#include "ngraph/op/experimental/quantized_dot.hpp"
#include "ngraph/op/experimental/quantized_dot_bias.hpp"
#include "ngraph/op/experimental/quantized_max_pool.hpp"
#include "ngraph/op/experimental/tile.hpp"
#include "ngraph/op/floor.hpp"
#include "ngraph/op/fused/conv_fused.hpp"
#include "ngraph/op/fused/group_conv.hpp"
@@ -4062,6 +4063,37 @@ namespace ngraph
}
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::Tile)
{
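// CODEGEN path: emit a direct call to the same cpu::kernel::tile helpers used by the builder.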
auto arg_shape = args[0].get_shape();
auto arg_rank = arg_shape.size();
auto out_shape = out[0].get_shape();
const element::Type& et = args[0].get_element_type();
if (arg_rank == 0)
{
size_t repeats = shape_size(out_shape);
writer.block_begin();
writer << "cpu::kernel::tile_rank_0<" << et.c_type_string() << ">("
<< args[0].get_name() << ", " << out[0].get_name() << ", "
<< std::to_string(repeats) << ");\n";
writer.block_end();
}
else
{
writer.block_begin();
writer << "cpu::kernel::tile<" << et.c_type_string() << ", "
<< std::to_string(arg_rank) << ">(" << args[0].get_name() << ", "
<< out[0].get_name() << ", {" << join(arg_shape) << "}, {"
<< join(out_shape) << "}, 0);\n";
writer.block_end();
}
}
#undef TI
} // namespace cpu
} // namespace runtime
......
@@ -78,6 +78,7 @@
#include "ngraph/op/experimental/quantized_dot.hpp"
#include "ngraph/op/experimental/quantized_dot_bias.hpp"
#include "ngraph/op/experimental/quantized_max_pool.hpp"
#include "ngraph/op/experimental/tile.hpp"
#include "ngraph/op/floor.hpp"
#include "ngraph/op/fused/conv_fused.hpp"
#include "ngraph/op/fused/group_conv.hpp"
@@ -437,6 +438,7 @@ static const runtime::cpu::OpMap dispatcher{
{TI(ngraph::op::DeconvolutionBias),
&runtime::cpu::CPU_Emitter::emit<ngraph::op::DeconvolutionBias>},
{TI(ngraph::op::QuantizedConcat), &runtime::cpu::CPU_Emitter::emit<op::QuantizedConcat>},
{TI(ngraph::op::Tile), &runtime::cpu::CPU_Emitter::emit<op::Tile>},
};
static void
......
@@ -225,6 +225,16 @@ namespace ngraph
template <typename ElementType>
void reference_erf(void* arg, void* out, size_t count);
template <typename ElementType>
void tile_rank_0(void* input, void* output, size_t repeats);
template <typename ElementType, unsigned int Rank>
void tile(void* input,
void* output,
const Shape& input_shape,
const Shape& output_shape,
int arena);
template <typename ElementType,
typename IndicesType,
unsigned int Rank1,
......
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/axis_set.hpp"
#include "ngraph/runtime/cpu/cpu_executor.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType>
void tile_rank_0(void* input, void* output, size_t repeats)
{
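// Replicate the single input element across the entire output buffer.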
auto data = (static_cast<ElementType*>(input))[0];
auto output_ptr = static_cast<ElementType*>(output);
for (size_t i = 0; i < repeats; i++)
{
output_ptr[i] = data;
}
}
template <typename ElementType, unsigned int Rank>
void tile(void* input,
void* output,
const Shape& input_shape,
const Shape& output_shape,
int arena)
{
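// Express tiling as an Eigen broadcast: map both buffers as row-major
// tensors of static rank and replicate the input along each axis.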
Eigen::array<Eigen::Index, Rank> out_dims;
Eigen::array<Eigen::Index, Rank> in_dims;
for (int i = 0; i < Rank; i++)
{
out_dims[i] = output_shape[i];
in_dims[i] = input_shape[i];
}
Eigen::TensorMap<Eigen::Tensor<ElementType, Rank, Eigen::RowMajor>> out(
static_cast<ElementType*>(output), out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, Rank, Eigen::RowMajor>> in(
static_cast<ElementType*>(input), in_dims);
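// Tile semantics guarantee output_shape[i] is an exact multiple of
// input_shape[i]; the quotient is the per-axis broadcast factor.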
Eigen::array<ptrdiff_t, Rank> factors;
for (int i = 0; i < Rank; i++)
{
factors[i] = output_shape[i] / input_shape[i];
}
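// Eigen's broadcast() repeats the tensor factors[i] times along axis i;
// evaluate on the executor's device (Eigen thread pool) selected by the arena index.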
out.device(ngraph::runtime::cpu::executor::GetCPUExecutor().get_device(arena)) =
in.broadcast(factors);
}
}
}
}
}
@@ -30,6 +30,7 @@
#include "ngraph/ngraph.hpp"
#include "ngraph/op/batch_norm.hpp"
#include "ngraph/op/erf.hpp"
#include "ngraph/op/experimental/tile.hpp"
#include "ngraph/op/fused/conv_fused.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/parameter.hpp"
@@ -1731,6 +1732,168 @@ TEST(cpu_test, avg_pool_bprop_2d_2channel_2image)
MIN_FLOAT_TOLERANCE_BITS));
}
TEST(cpu_test, tile_1d_with_zero_repeats)
{
Shape shape_a{2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_re{1};
auto repeats = make_shared<op::Constant>(element::i64, shape_re, vector<int>{0});
Shape shape_r{0};
auto tile = make_shared<op::Tile>(A, repeats);
auto f = make_shared<Function>(tile, ParameterVector{A});
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape_a);
copy_data(a, vector<float>{1, 2});
auto result = backend->create_tensor(element::f32, shape_r);
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a});
EXPECT_TRUE(
test::all_close_f(vector<float>{}, read_vector<float>(result), MIN_FLOAT_TOLERANCE_BITS));
}
TEST(cpu_test, tile_1d)
{
Shape shape_a{2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_re{1};
auto repeats = make_shared<op::Constant>(element::i64, shape_re, vector<int>{2});
Shape shape_r{4};
auto tile = make_shared<op::Tile>(A, repeats);
auto f = make_shared<Function>(tile, ParameterVector{A});
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape_a);
copy_data(a, vector<float>{1, 2});
auto result = backend->create_tensor(element::f32, shape_r);
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a});
EXPECT_TRUE(test::all_close_f(
vector<float>{1, 2, 1, 2}, read_vector<float>(result), MIN_FLOAT_TOLERANCE_BITS));
}
TEST(cpu_test, tile_2d_with_zero_repeats)
{
Shape shape_a{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_re{2};
auto repeats = make_shared<op::Constant>(element::i64, shape_re, vector<int>{2, 0});
Shape shape_r{4, 0};
auto tile = make_shared<op::Tile>(A, repeats);
auto f = make_shared<Function>(tile, ParameterVector{A});
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape_a);
copy_data(a, vector<float>{1, 2, 3, 4});
auto result = backend->create_tensor(element::f32, shape_r);
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a});
EXPECT_TRUE(
test::all_close_f(vector<float>{}, read_vector<float>(result), MIN_FLOAT_TOLERANCE_BITS));
}
TEST(cpu_test, tile_2d_1axis)
{
Shape shape_a{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_re{2};
auto repeats = make_shared<op::Constant>(element::i64, shape_re, vector<int>{3, 1});
Shape shape_r{6, 2};
auto tile = make_shared<op::Tile>(A, repeats);
auto f = make_shared<Function>(tile, ParameterVector{A});
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape_a);
copy_data(a, vector<float>{1, 2, 3, 4});
auto result = backend->create_tensor(element::f32, shape_r);
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a});
EXPECT_TRUE(test::all_close_f(vector<float>{1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4},
read_vector<float>(result),
MIN_FLOAT_TOLERANCE_BITS));
}
TEST(cpu_test, tile_2d_2axes)
{
Shape shape_a{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_re{2};
auto repeats = make_shared<op::Constant>(element::i64, shape_re, vector<int>{3, 3});
Shape shape_r{6, 6};
auto tile = make_shared<op::Tile>(A, repeats);
auto f = make_shared<Function>(tile, ParameterVector{A});
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape_a);
copy_data(a, vector<float>{1, 2, 3, 4});
auto result = backend->create_tensor(element::f32, shape_r);
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a});
EXPECT_TRUE(
test::all_close_f(vector<float>{1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 1, 2, 1, 2, 1, 2,
3, 4, 3, 4, 3, 4, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4},
read_vector<float>(result),
MIN_FLOAT_TOLERANCE_BITS));
}
TEST(cpu_test, tile_3d)
{
Shape shape_a{2, 1, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_re{3};
auto repeats = make_shared<op::Constant>(element::i64, shape_re, vector<int>{2, 2, 1});
Shape shape_r{4, 2, 3};
auto tile = make_shared<op::Tile>(A, repeats);
auto f = make_shared<Function>(tile, ParameterVector{A});
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape_a);
copy_data(a, vector<float>{1, 2, 3, 4, 5, 6});
auto result = backend->create_tensor(element::f32, shape_r);
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a});
EXPECT_TRUE(test::all_close_f(
vector<float>{1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6, 1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6},
read_vector<float>(result),
MIN_FLOAT_TOLERANCE_BITS));
}
TEST(cpu_test, scatter_add_1d_indices_in_place)
{
Shape ref_shape{2, 3, 3};
......