Unverified commit 4135f59d authored by Jayaram Bobba, committed by GitHub

Merge pull request #1108 from NervanaSystems/jmenon/dex2

DEX Part 2

Parents: 291d927c, b3c8b5ea
@@ -90,10 +90,13 @@
 #include "ngraph/op/sum.hpp"
 #include "ngraph/op/tan.hpp"
 #include "ngraph/op/tanh.hpp"
+#include "ngraph/runtime/cpu/cpu_kernels.hpp"
 #include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
 #include "ngraph/runtime/cpu/kernel/abs.hpp"
 #include "ngraph/runtime/cpu/kernel/add.hpp"
+#include "ngraph/runtime/cpu/kernel/ceil.hpp"
 #include "ngraph/runtime/cpu/kernel/multiply.hpp"
+#include "ngraph/runtime/cpu/kernel/relu.hpp"
 #include "ngraph/runtime/cpu/kernel/result.hpp"
 #include "ngraph/runtime/cpu/mkldnn_utils.hpp"
 #include "ngraph/runtime/cpu/op/batch_norm_relu.hpp"
@@ -163,6 +166,39 @@ using namespace ngraph;
         KV = K<uint64_t>;                                                         \
     }
 
+#define BUILD_UNARY_ELEMWISE_FUNCTOR(OP)                                          \
+    auto& functors = external_function->get_functors();                           \
+    auto& tensor_data = external_function->get_tensor_data();                     \
+    std::function<void(void*, void*, size_t)> kernel;                             \
+                                                                                  \
+    SELECT_KERNEL(kernel, out[0].get_element_type(), OP);                         \
+                                                                                  \
+    auto element_count = out[0].get_size();                                       \
+    auto& arg0_tensor = tensor_data[args[0].get_name()];                          \
+    auto& out0_tensor = tensor_data[out[0].get_name()];                           \
+                                                                                  \
+    auto functor = [&, kernel, element_count](CPURuntimeContext* ctx) {           \
+        kernel(arg0_tensor, out0_tensor, element_count);                          \
+    };                                                                            \
+    functors.emplace_back(functor);
+
+#define BUILD_BINARY_ELEMWISE_FUNCTOR(OP)                                         \
+    auto& functors = external_function->get_functors();                           \
+    auto& tensor_data = external_function->get_tensor_data();                     \
+    std::function<void(void*, void*, void*, size_t)> kernel;                      \
+                                                                                  \
+    SELECT_KERNEL(kernel, out[0].get_element_type(), OP);                         \
+                                                                                  \
+    auto element_count = out[0].get_size();                                       \
+    auto& arg0_tensor = tensor_data[args[0].get_name()];                          \
+    auto& arg1_tensor = tensor_data[args[1].get_name()];                          \
+    auto& out0_tensor = tensor_data[out[0].get_name()];                           \
+                                                                                  \
+    auto functor = [&, kernel, element_count](CPURuntimeContext* ctx) {           \
+        kernel(arg0_tensor, arg1_tensor, out0_tensor, element_count);             \
+    };                                                                            \
+    functors.emplace_back(functor);
+
 namespace ngraph
 {
     namespace runtime
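For readers new to DEX: the two macros above capture the whole builder pattern — pick the kernel instantiation matching the output element type once, at build time, bind the argument and output tensor pointers, and stash the result as a functor to run later. A minimal self-contained sketch of that pattern (illustrative names only, not ngraph code):

```cpp
#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

// Stand-ins for ngraph's element types and CPURuntimeContext.
enum class ElementType { f32, f64 };
struct RuntimeContext {};

// A templated elementwise kernel, analogous to runtime::cpu::kernel::abs.
template <typename T>
void negate_kernel(void* in, void* out, size_t count)
{
    const T* src = static_cast<const T*>(in);
    T* dst = static_cast<T*>(out);
    for (size_t i = 0; i < count; i++)
    {
        dst[i] = -src[i];
    }
}

// The SELECT_KERNEL idea: resolve the element type once, at build time,
// and store the specialized instantiation in a std::function.
std::function<void(void*, void*, size_t)> select_kernel(ElementType et)
{
    if (et == ElementType::f32)
    {
        return negate_kernel<float>;
    }
    return negate_kernel<double>;
}

int main()
{
    std::vector<float> in{1.0f, -2.0f, 3.0f};
    std::vector<float> out(3);

    // "Builder" phase: bind kernel + tensor pointers into a functor.
    auto kernel = select_kernel(ElementType::f32);
    void* arg0 = in.data();
    void* out0 = out.data();
    size_t n = in.size();
    std::vector<std::function<void(RuntimeContext*)>> functors;
    functors.emplace_back([kernel, arg0, out0, n](RuntimeContext*) { kernel(arg0, out0, n); });

    // "Execution" phase: run the pre-bound functors in order.
    RuntimeContext ctx;
    for (auto& f : functors)
    {
        f(&ctx);
    }
    std::cout << out[0] << " " << out[1] << " " << out[2] << "\n"; // -1 2 -3
}
```

The point of the pattern is that all type dispatch happens while building; executing the graph is then just a loop over pre-bound std::functions.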
@@ -172,77 +208,180 @@ namespace ngraph
             template <>
             void Builder::BUILDER_DECL(ngraph::op::Add)
             {
-                auto& functors = external_function->get_functors();
-                auto& tensor_data = external_function->get_tensor_data();
-                std::function<void(void*, void*, void*, size_t)> kernel;
-
-                SELECT_KERNEL(kernel, out[0].get_element_type(), runtime::cpu::kernel::add);
-
-                auto element_count = out[0].get_size();
-                auto& arg0_tensor = tensor_data[args[0].get_name()];
-                auto& arg1_tensor = tensor_data[args[1].get_name()];
-                auto& out0_tensor = tensor_data[out[0].get_name()];
-
-                auto functor = [&, kernel, element_count](CPURuntimeContext* ctx) {
-                    kernel(arg0_tensor, arg1_tensor, out0_tensor, element_count);
-                };
-                functors.emplace_back(functor);
+                BUILD_BINARY_ELEMWISE_FUNCTOR(runtime::cpu::kernel::add);
             }
 
             template <>
             void Builder::BUILDER_DECL(ngraph::op::Multiply)
             {
-                auto& functors = external_function->get_functors();
-                auto& tensor_data = external_function->get_tensor_data();
-                std::function<void(void*, void*, void*, size_t)> kernel;
-
-                SELECT_KERNEL(kernel, out[0].get_element_type(), runtime::cpu::kernel::multiply);
-
-                auto element_count = out[0].get_size();
-                auto& arg0_tensor = tensor_data[args[0].get_name()];
-                auto& arg1_tensor = tensor_data[args[1].get_name()];
-                auto& out0_tensor = tensor_data[out[0].get_name()];
-
-                auto functor = [&, kernel, element_count](CPURuntimeContext* ctx) {
-                    kernel(arg0_tensor, arg1_tensor, out0_tensor, element_count);
-                };
-                functors.emplace_back(functor);
+                BUILD_BINARY_ELEMWISE_FUNCTOR(runtime::cpu::kernel::multiply);
             }
 
             template <>
             void Builder::BUILDER_DECL(ngraph::op::Abs)
             {
-                auto& functors = external_function->get_functors();
-                auto& tensor_data = external_function->get_tensor_data();
-                std::function<void(void*, void*, size_t)> kernel;
-
-                SELECT_KERNEL(kernel, out[0].get_element_type(), runtime::cpu::kernel::abs);
-
-                auto element_count = out[0].get_size();
-                auto& arg0_tensor = tensor_data[args[0].get_name()];
-                auto& out0_tensor = tensor_data[out[0].get_name()];
-
-                auto functor = [&, kernel, element_count](CPURuntimeContext* ctx) {
-                    kernel(arg0_tensor, out0_tensor, element_count);
-                };
-                functors.emplace_back(functor);
+                BUILD_UNARY_ELEMWISE_FUNCTOR(runtime::cpu::kernel::abs);
+            }
+
+            template <>
+            void Builder::BUILDER_DECL(ngraph::op::Ceiling)
+            {
+                BUILD_UNARY_ELEMWISE_FUNCTOR(runtime::cpu::kernel::ceil);
+            }
+
+            template <>
+            void Builder::BUILDER_DECL(ngraph::op::Relu)
+            {
+                BUILD_UNARY_ELEMWISE_FUNCTOR(runtime::cpu::kernel::relu);
             }
 
             template <>
             void Builder::BUILDER_DECL(ngraph::op::Result)
             {
-                auto& functors = external_function->get_functors();
-                auto& tensor_data = external_function->get_tensor_data();
-                std::function<void(void*, void*, size_t)> kernel;
-
-                SELECT_KERNEL(kernel, out[0].get_element_type(), runtime::cpu::kernel::result);
-
-                auto& arg0_tensor = tensor_data[args[0].get_name()];
-                auto& out0_tensor = tensor_data[out[0].get_name()];
-                auto size = shape_size(node->get_shape());
-                auto functor = [&, kernel, size](CPURuntimeContext* ctx) {
-                    kernel(arg0_tensor, out0_tensor, size);
+                BUILD_UNARY_ELEMWISE_FUNCTOR(runtime::cpu::kernel::result);
+            }
+
+            template <>
+            void Builder::BUILDER_DECL(ngraph::op::MatmulBias)
+            {
+                auto& functors = external_function->get_functors();
+                auto& tensor_data = external_function->get_tensor_data();
+
+                auto& arg0_tensor = tensor_data[args[0].get_name()];
+                auto& arg1_tensor = tensor_data[args[1].get_name()];
+                auto& out0_tensor = tensor_data[out[0].get_name()];
+
+                const ngraph::op::MatmulBias* mm = static_cast<const ngraph::op::MatmulBias*>(node);
+
+                const auto& arg0_shape = mm->get_arg0_shape();
+                const auto& arg1_shape = mm->get_arg1_shape();
+                const auto& arg2_shape = node->get_shape();
+
+                auto m = arg0_shape[0];
+                auto n = arg1_shape[1];
+                auto k = arg0_shape[1];
+
+                bool transpose_A = false, transpose_B = false;
+                auto lda = arg0_shape[1];
+                auto ldb = arg1_shape[1];
+
+                if (mm->get_is_arg0_transposed())
+                {
+                    transpose_A = true;
+                    m = arg0_shape[1];
+                    k = arg0_shape[0];
+                }
+
+                if (mm->get_is_arg1_transposed())
+                {
+                    transpose_B = true;
+                    n = arg1_shape[0];
+                }
+
+                const float beta = 0.0f;
+
+                auto mm_functor =
+                    [&, transpose_A, transpose_B, m, n, k, lda, ldb, beta, arg2_shape](
+                        CPURuntimeContext* ctx) {
+                        cblas::cblas_sgemm(
+                            cblas::Layout::RowMajor,
+                            transpose_A ? cblas::Transpose::Transpose : cblas::Transpose::None,
+                            transpose_B ? cblas::Transpose::Transpose : cblas::Transpose::None,
+                            m,
+                            n,
+                            k,
+                            1.0f,
+                            static_cast<float*>(arg0_tensor),
+                            max(1UL, lda),
+                            static_cast<float*>(arg1_tensor),
+                            max(1UL, ldb),
+                            beta,
+                            static_cast<float*>(out0_tensor),
+                            max(1UL, arg2_shape[1]));
+                    };
+
+                function<void(CPURuntimeContext*)> bias_functor = [](CPURuntimeContext* ctx) {};
+
+                if (args.size() > 2)
+                {
+                    auto& arg2_tensor = tensor_data[args[2].get_name()];
+
+                    auto axes = mm->get_broadcast_axes();
+                    if (axes.size() == 1)
+                    {
+                        if (*(axes.begin()) == 0)
+                        {
+                            vector<float> ones_row(arg2_shape[0], 1.0f);
+                            bias_functor = [&, ones_row, arg2_shape](CPURuntimeContext* ctx) {
+                                cblas::cblas_sgemm(cblas::Layout::RowMajor,
+                                                   cblas::Transpose::None,
+                                                   cblas::Transpose::None,
+                                                   arg2_shape[0],
+                                                   arg2_shape[1],
+                                                   1,
+                                                   1.0f,
+                                                   ones_row.data(),
+                                                   1UL,
+                                                   static_cast<float*>(arg2_tensor),
+                                                   max(1UL, arg2_shape[1]),
+                                                   1.0f,
+                                                   static_cast<float*>(out0_tensor),
+                                                   max(1UL, arg2_shape[1]));
+                            };
+                        }
+                        else
+                        {
+                            vector<float> ones_col(arg2_shape[1], 1.0f);
+                            bias_functor = [&, ones_col, arg2_shape](CPURuntimeContext* ctx) {
+                                cblas::cblas_sgemm(cblas::Layout::RowMajor,
+                                                   cblas::Transpose::None,
+                                                   cblas::Transpose::None,
+                                                   arg2_shape[0],
+                                                   arg2_shape[1],
+                                                   1,
+                                                   1.0f,
+                                                   static_cast<float*>(arg2_tensor),
+                                                   1UL,
+                                                   ones_col.data(),
+                                                   max(1UL, arg2_shape[1]),
+                                                   1.0f,
+                                                   static_cast<float*>(out0_tensor),
+                                                   max(1UL, arg2_shape[1]));
+                            };
+                        }
+                    }
+                    else
+                    {
+                        if (axes.size() != 2)
+                        {
+                            throw ngraph_error("unexpected broadcast rank");
+                        }
+
+                        vector<float> ones_scalar(arg2_shape[0], 1.0f);
+
+                        bias_functor = [&, ones_scalar, arg2_shape](CPURuntimeContext* ctx) {
+                            vector<float> bias(arg2_shape[1], *static_cast<float*>(arg2_tensor));
+                            cblas::cblas_sgemm(cblas::Layout::RowMajor,
+                                               cblas::Transpose::None,
+                                               cblas::Transpose::None,
+                                               arg2_shape[0],
+                                               arg2_shape[1],
+                                               1,
+                                               1.0f,
+                                               ones_scalar.data(),
+                                               1UL,
+                                               bias.data(),
+                                               max(1UL, arg2_shape[1]),
+                                               1.0f,
+                                               static_cast<float*>(out0_tensor),
+                                               max(1UL, arg2_shape[1]));
+                        };
+                    }
+                }
+
+                auto functor = [&, mm_functor, bias_functor](CPURuntimeContext* ctx) {
+                    mm_functor(ctx);
+                    bias_functor(ctx);
                 };
                 functors.emplace_back(functor);
             }
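The bias handling above relies on a small BLAS identity: broadcasting a bias vector across the GEMM output is itself a rank-1 sgemm with beta = 1 (out += ones * bias). A standalone sketch of the axis-0 case, using plain CBLAS from cblas.h rather than ngraph's cblas:: wrapper:

```cpp
#include <cblas.h>
#include <cstdio>
#include <vector>

int main()
{
    const int m = 2, n = 3, k = 2;
    std::vector<float> A{1, 2, 3, 4};       // 2x2, row-major
    std::vector<float> B{1, 0, 1, 0, 1, 1}; // 2x3, row-major
    std::vector<float> C(m * n, 0.0f);
    std::vector<float> bias{10, 20, 30};    // row vector, broadcast along axis 0
    std::vector<float> ones(m, 1.0f);

    // C = 1.0 * A * B + 0.0 * C   (the mm_functor step, beta = 0)
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k,
                1.0f, A.data(), k, B.data(), n, 0.0f, C.data(), n);

    // C += ones * bias  (m x 1 times 1 x n): adds the bias row to every
    // row of C, which is the bias_functor step with beta = 1.
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, 1,
                1.0f, ones.data(), 1, bias.data(), n, 1.0f, C.data(), n);

    for (int i = 0; i < m; i++)
    {
        for (int j = 0; j < n; j++)
        {
            printf("%g ", C[i * n + j]);
        }
        printf("\n");
    }
}
```

Here C starts as A*B = [[1,2,3],[3,4,7]] and ends as [[11,22,33],[13,24,37]] — the bias row added to every row, exactly what the ones_row functor computes.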
@@ -279,7 +418,10 @@ namespace ngraph
             {TI(ngraph::op::Multiply), &runtime::cpu::Builder::build<ngraph::op::Multiply>},
             {TI(ngraph::op::Parameter), &runtime::cpu::Builder::nop},
             {TI(ngraph::op::Abs), &runtime::cpu::Builder::build<ngraph::op::Abs>},
+            {TI(ngraph::op::Ceiling), &runtime::cpu::Builder::build<ngraph::op::Ceiling>},
+            {TI(ngraph::op::Relu), &runtime::cpu::Builder::build<ngraph::op::Relu>},
             {TI(ngraph::op::Result), &runtime::cpu::Builder::build<ngraph::op::Result>},
+            {TI(ngraph::op::MatmulBias), &runtime::cpu::Builder::build<ngraph::op::MatmulBias>},
             {TI(ngraph::op::Constant), &runtime::cpu::Builder::build<ngraph::op::Constant>}};
     }
 }
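The registration table maps each op's std::type_index to its builder, so adding an op to DEX is one map entry plus a BUILDER_DECL specialization. A minimal sketch of this registry pattern (illustrative types; ngraph's actual map and TI macro live in the CPU builder sources):

```cpp
#include <functional>
#include <iostream>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>

// Illustrative node hierarchy standing in for ngraph::Node and its ops.
struct Node { virtual ~Node() = default; };
struct Add : Node {};
struct Relu : Node {};

using BuildFn = std::function<void(const Node*)>;

// ngraph's TI(x) boils down to a type_index over typeid(x).
#define TI(x) std::type_index(typeid(x))

int main()
{
    std::unordered_map<std::type_index, BuildFn> build_dispatcher{
        {TI(Add), [](const Node*) { std::cout << "building Add functor\n"; }},
        {TI(Relu), [](const Node*) { std::cout << "building Relu functor\n"; }},
    };

    Relu r;
    // Dispatch on the dynamic type of the node, as the CPU builder does.
    auto it = build_dispatcher.find(std::type_index(typeid(r)));
    if (it != build_dispatcher.end())
    {
        it->second(&r);
    }
}
```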
ngraph/runtime/cpu/kernel/ceil.hpp (new file)
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            namespace kernel
            {
                // Elementwise ceil over a flat buffer of `count` elements,
                // evaluated on the shared Eigen thread-pool device.
                template <typename ElementType>
                void ceil(void* input0, void* output, size_t count)
                {
                    Eigen::array<Eigen::Index, 1> out_dims, in_dims;
                    out_dims[0] = in_dims[0] = count;

                    Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> out(
                        static_cast<ElementType*>(output), out_dims);
                    Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
                        static_cast<ElementType*>(input0), in_dims);

                    out.device(eigen::global_thread_pool_device) = in0.ceil();
                }
            }
        }
    }
}
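The kernel above can be exercised outside ngraph. A sketch that substitutes a local Eigen::ThreadPoolDevice for the shared eigen::global_thread_pool_device (assumes a reasonably recent Eigen with the unsupported Tensor module):

```cpp
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

// Standalone version of the ceil kernel: same TensorMap-over-raw-buffer
// structure, but with an explicitly constructed thread-pool device.
template <typename ElementType>
void ceil_kernel(void* input0, void* output, size_t count, Eigen::ThreadPoolDevice& device)
{
    Eigen::array<Eigen::Index, 1> dims{{static_cast<Eigen::Index>(count)}};
    Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> out(
        static_cast<ElementType*>(output), dims);
    Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
        static_cast<ElementType*>(input0), dims);
    out.device(device) = in0.ceil();
}

int main()
{
    Eigen::ThreadPool pool(2);
    Eigen::ThreadPoolDevice device(&pool, 2);

    float in[4] = {0.2f, 1.5f, -0.5f, 3.0f};
    float out[4];
    ceil_kernel<float>(in, out, 4, device);

    // Prints the elementwise ceil of the input buffer.
    std::cout << out[0] << " " << out[1] << " " << out[2] << " " << out[3] << "\n";
}
```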
ngraph/runtime/cpu/kernel/relu.hpp (new file)

/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include "ngraph/runtime/cpu/kernel/eigen_thread_pool.hpp"
namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            namespace kernel
            {
                // Elementwise ReLU, out[i] = max(in[i], 0), evaluated on the
                // shared Eigen thread-pool device.
                template <typename ElementType>
                void relu(void* input0, void* output, size_t count)
                {
                    Eigen::array<Eigen::Index, 1> out_dims, in_dims;
                    out_dims[0] = in_dims[0] = count;

                    Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> out(
                        static_cast<ElementType*>(output), out_dims);
                    Eigen::TensorMap<Eigen::Tensor<ElementType, 1, Eigen::RowMajor>> in0(
                        static_cast<ElementType*>(input0), in_dims);

                    out.device(eigen::global_thread_pool_device) = in0.cwiseMax(ElementType(0));
                }
            }
        }
    }
}
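A scalar reference implementation makes a convenient unit-test oracle for this kernel; a minimal sketch (not part of the PR):

```cpp
#include <cassert>
#include <cstddef>

// Scalar reference for the relu kernel above: out[i] = max(in[i], 0).
template <typename T>
void relu_reference(const T* in, T* out, size_t count)
{
    for (size_t i = 0; i < count; i++)
    {
        out[i] = in[i] > T(0) ? in[i] : T(0);
    }
}

int main()
{
    float in[5] = {-2.0f, -0.5f, 0.0f, 0.5f, 2.0f};
    float out[5];
    relu_reference(in, out, 5);
    assert(out[0] == 0.0f && out[1] == 0.0f && out[3] == 0.5f && out[4] == 2.0f);
}
```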