Commit e14c0565 authored by Jai Menon, committed by Scott Cyphers

Optimized Pad (#658)

* CPU: Eigen-based Pad kernel

* CPU: Create a global Eigen thread pool and use it for padding

* Formatting fixes
parent 41a4d55f
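
Editor's summary: when a Pad op is 4-D float32 with no interior padding, the CPU backend now emits a call to an Eigen-based kernel that evaluates the pad as a tensor expression on a shared thread pool, instead of the generic scalar reference kernel. A minimal standalone sketch of the underlying Eigen technique follows; the thread count, shapes, and values are illustrative, not taken from this commit:

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>
#include <vector>

int main()
{
    // Shared pool + device, as the commit sets up globally.
    Eigen::ThreadPool pool(4); // thread count is illustrative
    Eigen::ThreadPoolDevice device(&pool, 4);

    // 1x2x2x2 zero input padded to 1x2x4x4, mirroring the new unit test.
    std::vector<float> in_buf(1 * 2 * 2 * 2, 0.0f);
    std::vector<float> out_buf(1 * 2 * 4 * 4);
    Eigen::TensorMap<Eigen::Tensor<float, 4, Eigen::RowMajor>> in(in_buf.data(), 1, 2, 2, 2);
    Eigen::TensorMap<Eigen::Tensor<float, 4, Eigen::RowMajor>> out(out_buf.data(), 1, 2, 4, 4);

    // One (below, above) pair per dimension; pad the last two dims by 1.
    Eigen::array<Eigen::IndexPair<Eigen::Index>, 4> padding;
    padding[0] = {0, 0};
    padding[1] = {0, 0};
    padding[2] = {1, 1};
    padding[3] = {1, 1};

    // Evaluate the pad expression in parallel on the pool.
    out.device(device) = in.pad(padding, 42.0f);

    std::cout << out_buf[0] << "\n"; // prints 42 (a border element)
}
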
@@ -188,6 +188,8 @@ if (NGRAPH_CPU_ENABLE AND LLVM_INCLUDE_DIR AND
runtime/cpu/mkldnn_emitter.cpp
runtime/cpu/mkldnn_invoke.cpp
runtime/cpu/mkldnn_utils.cpp
runtime/cpu/kernels/eigen_thread_pool.cpp
runtime/cpu/kernels/pad.cpp
runtime/cpu/ops/conv_bias.cpp
runtime/cpu/ops/convert_layout.cpp
runtime/cpu/ops/sigmoid.cpp
......
@@ -2766,15 +2766,31 @@ namespace ngraph
auto arg0_shape = args[0].get_shape();
auto result_shape = out[0].get_shape();
writer << "kernel::pad<" << out[0].get_type() << ">(" << args[0].get_name()
<< ",\n";
writer << " " << args[1].get_name() << ",\n";
writer << " " << out[0].get_name() << ",\n";
writer << " {" << join(arg0_shape) << "},\n";
writer << " {" << join(result_shape) << "},\n";
writer << " {" << join(pad->get_padding_below()) << "},\n";
writer << " {" << join(pad->get_padding_above()) << "},\n";
writer << " {" << join(pad->get_padding_interior()) << "});\n";
if (arg0_shape.size() == 4 && args[0].get_element_type() == element::f32 &&
pad->get_padding_interior() == Shape(arg0_shape.size()))
{
writer << "cpu::kernel::pad_4d_float32(" << args[0].get_name() << ",\n"
<< " " << out[0].get_name() << ",\n"
<< " *(" << args[1].get_name() << "),\n"
<< " {" << join(arg0_shape) << "},\n"
<< " {" << join(result_shape) << "},\n"
<< " {" << join(pad->get_padding_below())
<< "},\n"
<< " {" << join(pad->get_padding_above())
<< "});\n";
}
else
{
writer << "kernel::pad<" << out[0].get_type() << ">(" << args[0].get_name()
<< ",\n";
writer << " " << args[1].get_name() << ",\n";
writer << " " << out[0].get_name() << ",\n";
writer << " {" << join(arg0_shape) << "},\n";
writer << " {" << join(result_shape) << "},\n";
writer << " {" << join(pad->get_padding_below()) << "},\n";
writer << " {" << join(pad->get_padding_above()) << "},\n";
writer << " {" << join(pad->get_padding_interior()) << "});\n";
}
}
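
To illustrate, for the 1x2x2x2 case exercised by the new test below, the fast path emits roughly the following into the generated source; arg0, arg1, and out0 are stand-ins for the generated tensor names:

cpu::kernel::pad_4d_float32(arg0,
                            out0,
                            *(arg1),
                            {1, 2, 2, 2},
                            {1, 2, 4, 4},
                            {0, 0, 1, 1},
                            {0, 0, 1, 1});
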
template <>
......
@@ -104,3 +104,25 @@ namespace mkl
size_t ldb);
}
}
namespace ngraph
{
class Shape;
namespace runtime
{
namespace cpu
{
namespace kernel
{
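// Fast-path entry point emitted by the CPU backend for rank-4
// float32 pads with no interior padding.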
void pad_4d_float32(float* input,
float* output,
float pad_value,
const Shape& input_shape,
const Shape& output_shape,
const Shape& padding_below,
const Shape& padding_above);
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "eigen_thread_pool.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace eigen
{
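// Process-wide pool sized by Eigen's default thread count;
// Eigen-backed CPU kernels share it via global_thread_pool_device
// rather than constructing a pool per kernel invocation.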
Eigen::ThreadPool global_thread_pool(Eigen::nbThreads());
Eigen::ThreadPoolDevice global_thread_pool_device(&global_thread_pool,
Eigen::nbThreads());
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace eigen
{
extern Eigen::ThreadPool global_thread_pool;
extern Eigen::ThreadPoolDevice global_thread_pool_device;
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "pad.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
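// Non-templated wrapper around pad<float, 4>, presumably so callers
// (notably the generated code) can link against a plain symbol
// without pulling Eigen's tensor headers into their translation unit.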
void pad_4d_float32(float* input,
float* output,
float pad_value,
const Shape& input_shape,
const Shape& output_shape,
const Shape& padding_below,
const Shape& padding_above)
{
pad<float, 4>(input,
output,
pad_value,
input_shape,
output_shape,
padding_below,
padding_above);
}
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernels/eigen_thread_pool.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
template <typename ElementType, unsigned int Rank>
void pad(ElementType* input,
ElementType* output,
ElementType pad_value,
const Shape& input_shape,
const Shape& output_shape,
const Shape& padding_below,
const Shape& padding_above)
{
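// Build per-dimension sizes and (below, above) pad pairs, wrap the
// raw buffers in TensorMaps, and let Eigen evaluate the pad()
// expression in parallel on the shared thread pool device.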
Eigen::array<Eigen::Index, Rank> out_dims, in_dims;
Eigen::array<Eigen::IndexPair<size_t>, Rank> padding;
for (int i = 0; i < Rank; i++)
{
out_dims[i] = output_shape[i];
in_dims[i] = input_shape[i];
padding[i] = {padding_below[i], padding_above[i]};
}
Eigen::TensorMap<Eigen::Tensor<ElementType, Rank, Eigen::RowMajor>> out(
output, out_dims);
Eigen::TensorMap<Eigen::Tensor<ElementType, Rank, Eigen::RowMajor>> in(input,
in_dims);
out.device(eigen::global_thread_pool_device) = in.pad(padding, pad_value);
}
}
}
}
}
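
For reference, a direct call to the templated kernel mirroring the new unit test (a minimal sketch; shapes must satisfy output_shape[i] == padding_below[i] + input_shape[i] + padding_above[i], since this path assumes no interior padding):

#include <vector>
#include "ngraph/runtime/cpu/kernels/pad.hpp"

int main()
{
    using ngraph::Shape;
    std::vector<float> in(1 * 2 * 2 * 2, 0.0f); // zero-filled input
    std::vector<float> out(1 * 2 * 4 * 4);      // receives the padded result
    ngraph::runtime::cpu::kernel::pad<float, 4>(in.data(),
                                                out.data(),
                                                42.0f,
                                                Shape{1, 2, 2, 2},
                                                Shape{1, 2, 4, 4},
                                                Shape{0, 0, 1, 1},
                                                Shape{0, 0, 1, 1});
}
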
@@ -7118,6 +7118,74 @@ TEST(${BACKEND_NAME}, pad_exterior_2d_3x0)
read_vector<float>(result));
}
TEST(${BACKEND_NAME}, pad_exterior_4d_1x2x2x2)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
SKIP_TEST_FOR("ARGON", "${BACKEND_NAME}");
Shape shape_a{1, 2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_b{};
auto B = make_shared<op::Parameter>(element::f32, shape_b);
Shape shape_r{1, 2, 4, 4};
Shape padding_below{0, 0, 1, 1};
Shape padding_above{0, 0, 1, 1};
Shape padding_interior{0, 0, 0, 0};
auto f = make_shared<Function>(
make_shared<op::Pad>(A, B, padding_below, padding_above, padding_interior),
op::ParameterVector{A, B});
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
// Create some tensors for input/output
auto a = backend->make_primary_tensor_view(element::f32, shape_a);
// clang-format off
copy_data(a, test::NDArray<float, 4>(
{
{
{
{0.0f, 0.0f},
{0.0f, 0.0f}
},
{
{0.0f, 0.0f},
{0.0f, 0.0f}
}
}
}).get_vector());
// clang-format on
auto b = backend->make_primary_tensor_view(element::f32, shape_b);
copy_data(b, vector<float>{42});
auto result = backend->make_primary_tensor_view(element::f32, shape_r);
cf->call({a, b}, {result});
// clang-format off
EXPECT_EQ((test::NDArray<float, 4>(
{
{
{
{42.0f, 42.0f, 42.0f, 42.0f},
{42.0f, 0.0f, 0.0f, 42.0f},
{42.0f, 0.0f, 0.0f, 42.0f},
{42.0f, 42.0f, 42.0f, 42.0f}
},
{
{42.0f, 42.0f, 42.0f, 42.0f},
{42.0f, 0.0f, 0.0f, 42.0f},
{42.0f, 0.0f, 0.0f, 42.0f},
{42.0f, 42.0f, 42.0f, 42.0f}
}
}
}).get_vector()),
read_vector<float>(result));
// clang-format on
}
// This is a regression test for one of TF's unit tests, which was failing.
// The problem was inappropriate handling of the shape computation for a
// zero-length axis with interior padding. Rather than subtract 1 from the
......