Unverified commit 4135f59d authored by Jayaram Bobba, committed by GitHub

Merge pull request #1108 from NervanaSystems/jmenon/dex2

DEX Part 2

Parents: 291d927c, b3c8b5ea
@@ -90,10 +90,13 @@
 #include "ngraph/op/sum.hpp"
 #include "ngraph/op/tan.hpp"
 #include "ngraph/op/tanh.hpp"
+#include "ngraph/runtime/cpu/cpu_kernels.hpp"
 #include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
 #include "ngraph/runtime/cpu/kernel/abs.hpp"
 #include "ngraph/runtime/cpu/kernel/add.hpp"
+#include "ngraph/runtime/cpu/kernel/ceil.hpp"
 #include "ngraph/runtime/cpu/kernel/multiply.hpp"
+#include "ngraph/runtime/cpu/kernel/relu.hpp"
 #include "ngraph/runtime/cpu/kernel/result.hpp"
 #include "ngraph/runtime/cpu/mkldnn_utils.hpp"
 #include "ngraph/runtime/cpu/op/batch_norm_relu.hpp"
@@ -163,6 +166,39 @@ using namespace ngraph;
         KV = K<uint64_t>;                                                         \
     }
 
+#define BUILD_UNARY_ELEMWISE_FUNCTOR(OP)                                          \
+    auto& functors = external_function->get_functors();                           \
+    auto& tensor_data = external_function->get_tensor_data();                     \
+    std::function<void(void*, void*, size_t)> kernel;                             \
+                                                                                  \
+    SELECT_KERNEL(kernel, out[0].get_element_type(), OP);                         \
+                                                                                  \
+    auto element_count = out[0].get_size();                                       \
+    auto& arg0_tensor = tensor_data[args[0].get_name()];                          \
+    auto& out0_tensor = tensor_data[out[0].get_name()];                           \
+                                                                                  \
+    auto functor = [&, kernel, element_count](CPURuntimeContext* ctx) {           \
+        kernel(arg0_tensor, out0_tensor, element_count);                          \
+    };                                                                            \
+    functors.emplace_back(functor);
+
+#define BUILD_BINARY_ELEMWISE_FUNCTOR(OP)                                         \
+    auto& functors = external_function->get_functors();                           \
+    auto& tensor_data = external_function->get_tensor_data();                     \
+    std::function<void(void*, void*, void*, size_t)> kernel;                      \
+                                                                                  \
+    SELECT_KERNEL(kernel, out[0].get_element_type(), OP);                         \
+                                                                                  \
+    auto element_count = out[0].get_size();                                       \
+    auto& arg0_tensor = tensor_data[args[0].get_name()];                          \
+    auto& arg1_tensor = tensor_data[args[1].get_name()];                          \
+    auto& out0_tensor = tensor_data[out[0].get_name()];                           \
+                                                                                  \
+    auto functor = [&, kernel, element_count](CPURuntimeContext* ctx) {           \
+        kernel(arg0_tensor, arg1_tensor, out0_tensor, element_count);             \
+    };                                                                            \
+    functors.emplace_back(functor);
+
 namespace ngraph
 {
     namespace runtime
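For readers new to DEX: the two macros above capture the whole builder pattern — pick the kernel instantiation matching the output element type once, at build time, bind the argument and output tensor pointers, and stash the result as a functor to run later. A minimal self-contained sketch of that pattern (illustrative names only, not ngraph code):

```cpp
#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

// Stand-ins for ngraph's element types and CPURuntimeContext.
enum class ElementType { f32, f64 };
struct RuntimeContext {};

// A templated elementwise kernel, analogous to runtime::cpu::kernel::abs.
template <typename T>
void negate_kernel(void* in, void* out, size_t count)
{
    const T* src = static_cast<const T*>(in);
    T* dst = static_cast<T*>(out);
    for (size_t i = 0; i < count; i++)
    {
        dst[i] = -src[i];
    }
}

// The SELECT_KERNEL idea: resolve the element type once, at build time,
// and store the specialized instantiation in a std::function.
std::function<void(void*, void*, size_t)> select_kernel(ElementType et)
{
    if (et == ElementType::f32)
    {
        return negate_kernel<float>;
    }
    return negate_kernel<double>;
}

int main()
{
    std::vector<float> in{1.0f, -2.0f, 3.0f};
    std::vector<float> out(3);

    // "Builder" phase: bind kernel + tensor pointers into a functor.
    auto kernel = select_kernel(ElementType::f32);
    void* arg0 = in.data();
    void* out0 = out.data();
    size_t n = in.size();
    std::vector<std::function<void(RuntimeContext*)>> functors;
    functors.emplace_back([kernel, arg0, out0, n](RuntimeContext*) { kernel(arg0, out0, n); });

    // "Execution" phase: run the pre-bound functors in order.
    RuntimeContext ctx;
    for (auto& f : functors)
    {
        f(&ctx);
    }
    std::cout << out[0] << " " << out[1] << " " << out[2] << "\n"; // -1 2 -3
}
```

The point of the pattern is that all type dispatch happens while building; executing the graph is then just a loop over pre-bound std::functions.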
@@ -172,77 +208,180 @@ namespace ngraph
             template <>
             void Builder::BUILDER_DECL(ngraph::op::Add)
             {
-                auto& functors = external_function->get_functors();
-                auto& tensor_data = external_function->get_tensor_data();
-                std::function<void(void*, void*, void*, size_t)> kernel;
-
-                SELECT_KERNEL(kernel, out[0].get_element_type(), runtime::cpu::kernel::add);
-
-                auto element_count = out[0].get_size();
-                auto& arg0_tensor = tensor_data[args[0].get_name()];
-                auto& arg1_tensor = tensor_data[args[1].get_name()];
-                auto& out0_tensor = tensor_data[out[0].get_name()];
-
-                auto functor = [&, kernel, element_count](CPURuntimeContext* ctx) {
-                    kernel(arg0_tensor, arg1_tensor, out0_tensor, element_count);
-                };
-                functors.emplace_back(functor);
+                BUILD_BINARY_ELEMWISE_FUNCTOR(runtime::cpu::kernel::add);
             }
 
             template <>
             void Builder::BUILDER_DECL(ngraph::op::Multiply)
             {
-                auto& functors = external_function->get_functors();
-                auto& tensor_data = external_function->get_tensor_data();
-                std::function<void(void*, void*, void*, size_t)> kernel;
-
-                SELECT_KERNEL(kernel, out[0].get_element_type(), runtime::cpu::kernel::multiply);
-
-                auto element_count = out[0].get_size();
-                auto& arg0_tensor = tensor_data[args[0].get_name()];
-                auto& arg1_tensor = tensor_data[args[1].get_name()];
-                auto& out0_tensor = tensor_data[out[0].get_name()];
-
-                auto functor = [&, kernel, element_count](CPURuntimeContext* ctx) {
-                    kernel(arg0_tensor, arg1_tensor, out0_tensor, element_count);
-                };
-                functors.emplace_back(functor);
+                BUILD_BINARY_ELEMWISE_FUNCTOR(runtime::cpu::kernel::multiply);
             }
 
             template <>
             void Builder::BUILDER_DECL(ngraph::op::Abs)
             {
-                auto& functors = external_function->get_functors();
-                auto& tensor_data = external_function->get_tensor_data();
-                std::function<void(void*, void*, size_t)> kernel;
-
-                SELECT_KERNEL(kernel, out[0].get_element_type(), runtime::cpu::kernel::abs);
-
-                auto element_count = out[0].get_size();
-                auto& arg0_tensor = tensor_data[args[0].get_name()];
-                auto& out0_tensor = tensor_data[out[0].get_name()];
-
-                auto functor = [&, kernel, element_count](CPURuntimeContext* ctx) {
-                    kernel(arg0_tensor, out0_tensor, element_count);
-                };
-                functors.emplace_back(functor);
+                BUILD_UNARY_ELEMWISE_FUNCTOR(runtime::cpu::kernel::abs);
+            }
+
+            template <>
+            void Builder::BUILDER_DECL(ngraph::op::Ceiling)
+            {
+                BUILD_UNARY_ELEMWISE_FUNCTOR(runtime::cpu::kernel::ceil);
+            }
+
+            template <>
+            void Builder::BUILDER_DECL(ngraph::op::Relu)
+            {
+                BUILD_UNARY_ELEMWISE_FUNCTOR(runtime::cpu::kernel::relu);
             }
 
             template <>
             void Builder::BUILDER_DECL(ngraph::op::Result)
             {
-                auto& functors = external_function->get_functors();
-                auto& tensor_data = external_function->get_tensor_data();
-                std::function<void(void*, void*, size_t)> kernel;
-
-                SELECT_KERNEL(kernel, out[0].get_element_type(), runtime::cpu::kernel::result);
-
-                auto& arg0_tensor = tensor_data[args[0].get_name()];
-                auto& out0_tensor = tensor_data[out[0].get_name()];
-                auto size = shape_size(node->get_shape());
-                auto functor = [&, kernel, size](CPURuntimeContext* ctx) {
-                    kernel(arg0_tensor, out0_tensor, size);
+                BUILD_UNARY_ELEMWISE_FUNCTOR(runtime::cpu::kernel::result);
+            }
+
+            template <>
+            void Builder::BUILDER_DECL(ngraph::op::MatmulBias)
+            {
+                auto& functors = external_function->get_functors();
+                auto& tensor_data = external_function->get_tensor_data();
+
+                auto& arg0_tensor = tensor_data[args[0].get_name()];
+                auto& arg1_tensor = tensor_data[args[1].get_name()];
+                auto& out0_tensor = tensor_data[out[0].get_name()];
+
+                const ngraph::op::MatmulBias* mm = static_cast<const ngraph::op::MatmulBias*>(node);
+
+                const auto& arg0_shape = mm->get_arg0_shape();
+                const auto& arg1_shape = mm->get_arg1_shape();
+                const auto& arg2_shape = node->get_shape();
+
+                auto m = arg0_shape[0];
+                auto n = arg1_shape[1];
+                auto k = arg0_shape[1];
+
+                bool transpose_A = false, transpose_B = false;
+                auto lda = arg0_shape[1];
+                auto ldb = arg1_shape[1];
+
+                if (mm->get_is_arg0_transposed())
+                {
+                    transpose_A = true;
+                    m = arg0_shape[1];
+                    k = arg0_shape[0];
+                }
+
+                if (mm->get_is_arg1_transposed())
+                {
+                    transpose_B = true;
+                    n = arg1_shape[0];
+                }
+
+                const float beta = 0.0f;
+
+                auto mm_functor =
+                    [&, transpose_A, transpose_B, m, n, k, lda, ldb, beta, arg2_shape](
+                        CPURuntimeContext* ctx) {
+                        cblas::cblas_sgemm(
+                            cblas::Layout::RowMajor,
+                            transpose_A ? cblas::Transpose::Transpose : cblas::Transpose::None,
+                            transpose_B ? cblas::Transpose::Transpose : cblas::Transpose::None,
+                            m,
+                            n,
+                            k,
+                            1.0f,
+                            static_cast<float*>(arg0_tensor),
+                            max(1UL, lda),
+                            static_cast<float*>(arg1_tensor),
+                            max(1UL, ldb),
+                            beta,
+                            static_cast<float*>(out0_tensor),
+                            max(1UL, arg2_shape[1]));
+                    };
+
+                function<void(CPURuntimeContext*)> bias_functor = [](CPURuntimeContext* ctx) {};
+
+                if (args.size() > 2)
+                {
+                    auto& arg2_tensor = tensor_data[args[2].get_name()];
+
+                    auto axes = mm->get_broadcast_axes();
+                    if (axes.size() == 1)
+                    {
+                        if (*(axes.begin()) == 0)
+                        {
+                            vector<float> ones_row(arg2_shape[0], 1.0f);
+                            bias_functor = [&, ones_row, arg2_shape](CPURuntimeContext* ctx) {
+                                cblas::cblas_sgemm(cblas::Layout::RowMajor,
+                                                   cblas::Transpose::None,
+                                                   cblas::Transpose::None,
+                                                   arg2_shape[0],
+                                                   arg2_shape[1],
+                                                   1,
+                                                   1.0f,
+                                                   ones_row.data(),
+                                                   1UL,
+                                                   static_cast<float*>(arg2_tensor),
+                                                   max(1UL, arg2_shape[1]),
+                                                   1.0f,
+                                                   static_cast<float*>(out0_tensor),
+                                                   max(1UL, arg2_shape[1]));
+                            };
+                        }
+                        else
+                        {
+                            vector<float> ones_col(arg2_shape[1], 1.0f);
+                            bias_functor = [&, ones_col, arg2_shape](CPURuntimeContext* ctx) {
+                                cblas::cblas_sgemm(cblas::Layout::RowMajor,
+                                                   cblas::Transpose::None,
+                                                   cblas::Transpose::None,
+                                                   arg2_shape[0],
+                                                   arg2_shape[1],
+                                                   1,
+                                                   1.0f,
+                                                   static_cast<float*>(arg2_tensor),
+                                                   1UL,
+                                                   ones_col.data(),
+                                                   max(1UL, arg2_shape[1]),
+                                                   1.0f,
+                                                   static_cast<float*>(out0_tensor),
+                                                   max(1UL, arg2_shape[1]));
+                            };
+                        }
+                    }
+                    else
+                    {
+                        if (axes.size() != 2)
+                        {
+                            throw ngraph_error("unexpected broadcast rank");
+                        }
+
+                        vector<float> ones_scalar(arg2_shape[0], 1.0f);
+
+                        bias_functor = [&, ones_scalar, arg2_shape](CPURuntimeContext* ctx) {
+                            vector<float> bias(arg2_shape[1], *static_cast<float*>(arg2_tensor));
+                            cblas::cblas_sgemm(cblas::Layout::RowMajor,
+                                               cblas::Transpose::None,
+                                               cblas::Transpose::None,
+                                               arg2_shape[0],
+                                               arg2_shape[1],
+                                               1,
+                                               1.0f,
+                                               ones_scalar.data(),
+                                               1UL,
+                                               bias.data(),
+                                               max(1UL, arg2_shape[1]),
+                                               1.0f,
+                                               static_cast<float*>(out0_tensor),
+                                               max(1UL, arg2_shape[1]));
+                        };
+                    }
+                }
+
+                auto functor = [&, mm_functor, bias_functor](CPURuntimeContext* ctx) {
+                    mm_functor(ctx);
+                    bias_functor(ctx);
                 };
                 functors.emplace_back(functor);
             }
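The bias handling above relies on a small BLAS identity: broadcasting a bias vector across the GEMM output is itself a rank-1 sgemm with beta = 1 (out += ones * bias). A standalone sketch of the axis-0 case, using plain CBLAS from cblas.h rather than ngraph's cblas:: wrapper:

```cpp
#include <cblas.h>
#include <cstdio>
#include <vector>

int main()
{
    const int m = 2, n = 3, k = 2;
    std::vector<float> A{1, 2, 3, 4};       // 2x2, row-major
    std::vector<float> B{1, 0, 1, 0, 1, 1}; // 2x3, row-major
    std::vector<float> C(m * n, 0.0f);
    std::vector<float> bias{10, 20, 30};    // row vector, broadcast along axis 0
    std::vector<float> ones(m, 1.0f);

    // C = 1.0 * A * B + 0.0 * C   (the mm_functor step, beta = 0)
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k,
                1.0f, A.data(), k, B.data(), n, 0.0f, C.data(), n);

    // C += ones * bias  (m x 1 times 1 x n): adds the bias row to every
    // row of C, which is the bias_functor step with beta = 1.
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, 1,
                1.0f, ones.data(), 1, bias.data(), n, 1.0f, C.data(), n);

    for (int i = 0; i < m; i++)
    {
        for (int j = 0; j < n; j++)
        {
            printf("%g ", C[i * n + j]);
        }
        printf("\n");
    }
}
```

Here C starts as A*B = [[1,2,3],[3,4,7]] and ends as [[11,22,33],[13,24,37]] — the bias row added to every row, exactly what the ones_row functor computes.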
@@ -279,7 +418,10 @@ namespace ngraph
             {TI(ngraph::op::Multiply), &runtime::cpu::Builder::build<ngraph::op::Multiply>},
             {TI(ngraph::op::Parameter), &runtime::cpu::Builder::nop},
             {TI(ngraph::op::Abs), &runtime::cpu::Builder::build<ngraph::op::Abs>},
+            {TI(ngraph::op::Ceiling), &runtime::cpu::Builder::build<ngraph::op::Ceiling>},
+            {TI(ngraph::op::Relu), &runtime::cpu::Builder::build<ngraph::op::Relu>},
             {TI(ngraph::op::Result), &runtime::cpu::Builder::build<ngraph::op::Result>},
+            {TI(ngraph::op::MatmulBias), &runtime::cpu::Builder::build<ngraph::op::MatmulBias>},
             {TI(ngraph::op::Constant), &runtime::cpu::Builder::build<ngraph::op::Constant>}};
     }
 }
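The registration table maps each op's std::type_index to its builder, so adding an op to DEX is one map entry plus a BUILDER_DECL specialization. A minimal sketch of this registry pattern (illustrative types; ngraph's actual map and TI macro live in the CPU builder sources):

```cpp
#include <functional>
#include <iostream>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>

// Illustrative node hierarchy standing in for ngraph::Node and its ops.
struct Node { virtual ~Node() = default; };
struct Add : Node {};
struct Relu : Node {};

using BuildFn = std::function<void(const Node*)>;

// ngraph's TI(x) boils down to a type_index over typeid(x).
#define TI(x) std::type_index(typeid(x))

int main()
{
    std::unordered_map<std::type_index, BuildFn> build_dispatcher{
        {TI(Add), [](const Node*) { std::cout << "building Add functor\n"; }},
        {TI(Relu), [](const Node*) { std::cout << "building Relu functor\n"; }},
    };

    Relu r;
    // Dispatch on the dynamic type of the node, as the CPU builder does.
    auto it = build_dispatcher.find(std::type_index(typeid(r)));
    if (it != build_dispatcher.end())
    {
        it->second(&r);
    }
}
```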
ngraph/runtime/cpu/kernel/ceil.hpp (new file)
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            namespace kernel
            {
                // Elementwise ceil over a flat buffer of `count` elements,
                // evaluated on the shared Eigen thread-pool device.
                template <typename ElementType>
                void ceil(void* input0, void* output, size_t count)
                {
                    Eigen::array<Eigen::Index, 1> out_dims, in_dims;
                    out_dims[0] = in_dims[0] = count;

                    Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> out(
                        static_cast<ElementType*>(output), out_dims);
                    Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
                        static_cast<ElementType*>(input0), in_dims);

                    out.device(eigen::global_thread_pool_device) = in0.ceil();
                }
            }
        }
    }
}
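The kernel above can be exercised outside ngraph. A sketch that substitutes a local Eigen::ThreadPoolDevice for the shared eigen::global_thread_pool_device (assumes a reasonably recent Eigen with the unsupported Tensor module):

```cpp
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

// Standalone version of the ceil kernel: same TensorMap-over-raw-buffer
// structure, but with an explicitly constructed thread-pool device.
template <typename ElementType>
void ceil_kernel(void* input0, void* output, size_t count, Eigen::ThreadPoolDevice& device)
{
    Eigen::array<Eigen::Index, 1> dims{{static_cast<Eigen::Index>(count)}};
    Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> out(
        static_cast<ElementType*>(output), dims);
    Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
        static_cast<ElementType*>(input0), dims);
    out.device(device) = in0.ceil();
}

int main()
{
    Eigen::ThreadPool pool(2);
    Eigen::ThreadPoolDevice device(&pool, 2);

    float in[4] = {0.2f, 1.5f, -0.5f, 3.0f};
    float out[4];
    ceil_kernel<float>(in, out, 4, device);

    // Prints the elementwise ceil of the input buffer.
    std::cout << out[0] << " " << out[1] << " " << out[2] << " " << out[3] << "\n";
}
```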
ngraph/runtime/cpu/kernel/relu.hpp (new file)

/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            namespace kernel
            {
                // Elementwise ReLU, out[i] = max(in[i], 0), evaluated on the
                // shared Eigen thread-pool device.
                template <typename ElementType>
                void relu(void* input0, void* output, size_t count)
                {
                    Eigen::array<Eigen::Index, 1> out_dims, in_dims;
                    out_dims[0] = in_dims[0] = count;

                    Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> out(
                        static_cast<ElementType*>(output), out_dims);
                    Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
                        static_cast<ElementType*>(input0), in_dims);

                    out.device(eigen::global_thread_pool_device) = in0.cwiseMax(ElementType(0));
                }
            }
        }
    }
}
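A scalar reference implementation makes a convenient unit-test oracle for this kernel; a minimal sketch (not part of the PR):

```cpp
#include <cassert>
#include <cstddef>

// Scalar reference for the relu kernel above: out[i] = max(in[i], 0).
template <typename T>
void relu_reference(const T* in, T* out, size_t count)
{
    for (size_t i = 0; i < count; i++)
    {
        out[i] = in[i] > T(0) ? in[i] : T(0);
    }
}

int main()
{
    float in[5] = {-2.0f, -0.5f, 0.0f, 0.5f, 2.0f};
    float out[5];
    relu_reference(in, out, 5);
    assert(out[0] == 0.0f && out[1] == 0.0f && out[3] == 0.5f && out[4] == 2.0f);
}
```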