Commit d37fa712 authored by Jaikrishnan Menon's avatar Jaikrishnan Menon Committed by Scott Cyphers

DEX Part 3 (#1184)

* CPU Direct Execution: Implement ConvertLayout and refactor

* CPU Direct Execution: Implement Convolution
parent 4cd2c602
......@@ -27,6 +27,8 @@ set(SRC
cpu_tensor_view_wrapper.cpp
cpu_tensor_view.cpp
cpu_tracing.cpp
builder/convert_layout.cpp
builder/convolution.cpp
kernel/eigen_thread_pool.cpp
kernel/pad.cpp
kernel/reduce_max.cpp
......
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
// Builds the executor functor for the CPU ConvertLayout op: converts a tensor
// from its input MKLDNN memory format to the requested output format by
// emitting a single mkldnn reorder primitive at build time and invoking it at
// run time.
template <>
void Builder::BUILDER_DECL(ngraph::runtime::cpu::op::ConvertLayout)
{
auto& functors = external_function->get_functors();
auto& tensor_data = external_function->get_tensor_data();
// References into the tensor-data map; the functor below captures these by
// reference, so the map entries must stay stable for the functor's lifetime.
auto& arg_tensor = tensor_data[args[0].get_name()];
auto& out_tensor = tensor_data[out[0].get_name()];
auto input_tvl =
node->get_inputs()[0].get_output().get_tensor_view()->get_tensor_view_layout();
// NOTE(review): dynamic_pointer_cast returns null when the layout is not a
// cpu::LayoutDescriptor, and the next use would dereference null. Presumably
// this builder only ever sees CPU-laid-out tensors — confirm.
auto input_cpu_tvl =
dynamic_pointer_cast<runtime::cpu::LayoutDescriptor>(input_tvl);
auto input_format = input_cpu_tvl->get_mkldnn_format();
// Reorder input shape if needed
// (permute args[0]'s shape into the layout's axis order so the memory
// descriptor built below matches the physical element order).
auto input_axis_order = input_cpu_tvl->get_axis_order();
Shape input_shape(input_axis_order.size());
for (size_t idx = 0; idx < input_axis_order.size(); idx++)
{
input_shape[idx] = args[0].get_shape()[input_axis_order[idx]];
}
auto output_tvl = node->get_output_tensor_view(0)->get_tensor_view_layout();
auto output_format =
dynamic_cast<runtime::cpu::LayoutDescriptor&>(*output_tvl).get_mkldnn_format();
// MKLDNN relies on format names for selecting optimized kernel implementations
// Hacky way to deal with this until they move to using canonicalized layouts
// (when the opposite side is a filter format, relabel nchw as oihw — they
// appear to describe the same 4-d element order here; confirm against MKLDNN).
if (input_format == mkldnn::memory::format::nchw &&
runtime::cpu::mkldnn_utils::is_mkldnn_filter_format(output_format))
{
input_format = mkldnn::memory::format::oihw;
}
if (output_format == mkldnn::memory::format::nchw &&
runtime::cpu::mkldnn_utils::is_mkldnn_filter_format(input_format))
{
output_format = mkldnn::memory::format::oihw;
}
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_desc = mkldnn_emitter->build_memory_descriptor(
input_shape, args[0].get_element_type(), input_format);
auto result_desc = mkldnn_emitter->build_memory_descriptor(out[0], output_format);
// The reorder primitive is built once; the functor only rebinds the two
// memory pointers (deps[0] = src, deps[1] = dst) and fires the primitive.
size_t reorder_index = mkldnn_emitter->build_reorder(input_desc, result_desc);
auto& deps = mkldnn_emitter->get_primitive_deps(reorder_index);
auto functor = [&, reorder_index](CPURuntimeContext* ctx) {
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, reorder_index);
};
functors.emplace_back(functor);
}
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/op/convolution.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/kernel/convolution.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
// Builds the executor functor for ngraph::op::Convolution. If MKLDNN can
// handle this node, a convolution_forward primitive is emitted and the
// functor merely rebinds memory pointers; otherwise the generic reference
// convolution kernel (selected by element type) is invoked directly.
template <>
void Builder::BUILDER_DECL(ngraph::op::Convolution)
{
    auto convolution = static_cast<const ngraph::op::Convolution*>(node);

    auto& functors = external_function->get_functors();
    auto& tensor_data = external_function->get_tensor_data();

    auto arg0_shape = args[0].get_shape();
    auto arg1_shape = args[1].get_shape();
    auto result_shape = out[0].get_shape();

    // References into the tensor-data map; captured by reference in the
    // functors below, so the map entries must outlive the functors.
    auto& arg0_tensor = tensor_data[args[0].get_name()];
    auto& arg1_tensor = tensor_data[args[1].get_name()];
    auto& out_tensor = tensor_data[out[0].get_name()];

    if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
    {
        // For dilation, MKLDNN wants to know how many elements to insert
        // between, not how far apart to space the elements like nGraph.
        // So we have to subtract 1 from each stride.
        Strides window_dilation_strides_adjusted;
        for (size_t s : convolution->get_window_dilation_strides())
        {
            window_dilation_strides_adjusted.push_back(s - 1);
        }

        auto input_format = runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 0);
        auto weights_format = runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 1);
        // HACK to help MKLDNN pick the right implementation: relabel nchw
        // weights as oihw (filter semantics) before building the descriptor.
        if (weights_format == mkldnn::memory::format::nchw)
        {
            weights_format = mkldnn::memory::format::oihw;
        }
        auto output_format = runtime::cpu::mkldnn_utils::get_output_mkldnn_format(node, 0);

        auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
        auto input_data_desc = mkldnn_emitter->build_memory_descriptor(args[0], input_format);
        auto weights_desc = mkldnn_emitter->build_memory_descriptor(args[1], weights_format);
        auto result_desc = mkldnn_emitter->build_memory_descriptor(out[0], output_format);

        // Initialize directly (the original default-initialized to 0 and then
        // immediately reassigned).
        size_t conv_index = mkldnn_emitter->build_convolution_forward(
            input_data_desc,
            weights_desc,
            result_desc,
            convolution->get_window_movement_strides(),
            window_dilation_strides_adjusted,
            convolution->get_padding_below(),
            convolution->get_padding_above());

        auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
        auto functor = [&, conv_index](CPURuntimeContext* ctx) {
            cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
            cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
            cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], out_tensor);
            cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, conv_index);
        };
        functors.emplace_back(functor);
    }
    else
    {
        // Fallback: generic reference convolution, instantiated for the
        // output element type.
        std::function<decltype(runtime::cpu::kernel::convolution<float>)> kernel;
        SELECT_KERNEL(kernel, out[0].get_element_type(), runtime::cpu::kernel::convolution);

        auto window_movement_strides = convolution->get_window_movement_strides();
        auto window_dilation_strides = convolution->get_window_dilation_strides();
        auto padding_below = convolution->get_padding_below();
        auto padding_above = convolution->get_padding_above();
        auto data_dilation_strides = convolution->get_data_dilation_strides();

        auto functor = [&,
                        kernel,
                        arg0_shape,
                        arg1_shape,
                        result_shape,
                        window_movement_strides,
                        window_dilation_strides,
                        padding_below,
                        padding_above,
                        data_dilation_strides](CPURuntimeContext* ctx) {
            // Trailing constants are the axis roles for the forward pass:
            // batch_axis_data=0, input_channel_axis_data=1,
            // input_channel_axis_filters=1, output_channel_axis_filters=0,
            // batch_axis_result=0, output_channel_axis_result=1,
            // rotate_filter=false.
            kernel(arg0_tensor,
                   arg1_tensor,
                   out_tensor,
                   arg0_shape,
                   arg1_shape,
                   result_shape,
                   window_movement_strides,
                   window_dilation_strides,
                   padding_below,
                   padding_above,
                   data_dilation_strides,
                   0,
                   1,
                   1,
                   0,
                   0,
                   1,
                   false);
        };
        functors.emplace_back(functor);
    }
}
// Builds the executor functor for ConvolutionBackpropData (gradient with
// respect to the input data). args[0] holds the filters and args[1] the
// output delta (see the MKLDNN descriptor construction below). The MKLDNN
// path emits a convolution_backward_data primitive; the fallback path reuses
// the generic reference convolution with the two inputs SWAPPED and
// rotate_filter = true.
template <>
void Builder::BUILDER_DECL(ngraph::op::ConvolutionBackpropData)
{
auto convolution = static_cast<const ngraph::op::ConvolutionBackpropData*>(node);
auto& functors = external_function->get_functors();
auto& tensor_data = external_function->get_tensor_data();
auto arg0_shape = args[0].get_shape();
auto arg1_shape = args[1].get_shape();
auto result_shape = out[0].get_shape();
// References into the tensor-data map; captured by reference in the
// functors below, so the map entries must outlive the functors.
auto& arg0_tensor = tensor_data[args[0].get_name()];
auto& arg1_tensor = tensor_data[args[1].get_name()];
auto& out_tensor = tensor_data[out[0].get_name()];
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
// MKLDNN counts elements inserted between, nGraph stores spacing:
// subtract 1 from each forward dilation stride.
Strides window_dilation_strides_adjusted;
for (size_t s : convolution->get_window_dilation_strides_forward())
{
window_dilation_strides_adjusted.push_back(s - 1);
}
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
// HACK to help MKLDNN pick the right implementation
// (relabel nchw-formatted weights as the filter format oihw).
auto weights_format =
runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 0);
if (weights_format == mkldnn::memory::format::nchw)
{
weights_format = mkldnn::memory::format::oihw;
}
auto weights_desc =
mkldnn_emitter->build_memory_descriptor(args[0], weights_format);
auto delta_desc = mkldnn_emitter->build_memory_descriptor(
args[1], runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 1));
auto result_desc = mkldnn_emitter->build_memory_descriptor(
out[0], runtime::cpu::mkldnn_utils::get_output_mkldnn_format(node, 0));
size_t conv_bwd_data_index = mkldnn_emitter->build_convolution_backward_data(
weights_desc,
delta_desc,
result_desc,
convolution->get_window_movement_strides_forward(),
window_dilation_strides_adjusted,
convolution->get_padding_below_forward(),
convolution->get_padding_above_forward());
auto& deps = mkldnn_emitter->get_primitive_deps(conv_bwd_data_index);
auto functor = [&, conv_bwd_data_index](CPURuntimeContext* ctx) {
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, conv_bwd_data_index);
};
functors.emplace_back(functor);
}
else
{
std::function<decltype(runtime::cpu::kernel::convolution<float>)> kernel;
SELECT_KERNEL(
kernel, out[0].get_element_type(), runtime::cpu::kernel::convolution);
auto window_movement_strides =
convolution->get_window_movement_strides_backward();
auto window_dilation_strides =
convolution->get_window_dilation_strides_backward();
auto padding_below = convolution->get_padding_below_backward();
auto padding_above = convolution->get_padding_above_backward();
auto data_dilation_strides = convolution->get_data_dilation_strides_backward();
auto functor = [&,
kernel,
arg0_shape,
arg1_shape,
result_shape,
window_movement_strides,
window_dilation_strides,
padding_below,
padding_above,
data_dilation_strides](CPURuntimeContext* ctx) {
// NOTE: arg1 (delta) is passed as the "data" input and arg0
// (filters) second — the backward-data pass is a forward
// convolution of the delta with rotated filters
// (rotate_filter = true) and swapped channel-axis roles.
kernel(arg1_tensor,
arg0_tensor,
out_tensor,
arg1_shape,
arg0_shape,
result_shape,
window_movement_strides,
window_dilation_strides,
padding_below,
padding_above,
data_dilation_strides,
0,
1,
0,
1,
0,
1,
true);
};
functors.emplace_back(functor);
}
}
// Builds the executor functor for ConvolutionBackpropFilters (gradient with
// respect to the filters). An MKLDNN convolution_backward_weights primitive
// is emitted when MKLDNN can handle the node; otherwise the generic reference
// convolution kernel is invoked with axis roles chosen for the filter
// gradient.
template <>
void Builder::BUILDER_DECL(ngraph::op::ConvolutionBackpropFilters)
{
    auto bprop_op = static_cast<const ngraph::op::ConvolutionBackpropFilters*>(node);

    auto& functor_list = external_function->get_functors();
    auto& tdata = external_function->get_tensor_data();

    // args[0]: forward-pass input, args[1]: output delta, out[0]: the
    // resulting filter gradient.
    auto data_shape = args[0].get_shape();
    auto delta_shape = args[1].get_shape();
    auto filters_shape = out[0].get_shape();

    // Tensor-data slots; captured by reference in the functors below, so the
    // map entries must outlive the functors.
    auto& data_tensor = tdata[args[0].get_name()];
    auto& delta_tensor = tdata[args[1].get_name()];
    auto& filters_tensor = tdata[out[0].get_name()];

    if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
    {
        // MKLDNN expects the number of elements inserted between samples,
        // while nGraph stores the spacing: subtract one from every forward
        // dilation stride.
        Strides dilation_minus_one = bprop_op->get_window_dilation_strides_forward();
        for (auto& d : dilation_minus_one)
        {
            d -= 1;
        }

        auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
        auto input_desc = mkldnn_emitter->build_memory_descriptor(
            args[0], runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 0));
        auto delta_desc = mkldnn_emitter->build_memory_descriptor(
            args[1], runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node, 1));
        auto result_desc = mkldnn_emitter->build_memory_descriptor(
            out[0], runtime::cpu::mkldnn_utils::get_output_mkldnn_format(node, 0));

        size_t bwd_weights_index = mkldnn_emitter->build_convolution_backward_weights(
            input_desc,
            delta_desc,
            result_desc,
            bprop_op->get_window_movement_strides_forward(),
            dilation_minus_one,
            bprop_op->get_padding_below_forward(),
            bprop_op->get_padding_above_forward());

        auto& prim_deps = mkldnn_emitter->get_primitive_deps(bwd_weights_index);
        auto mkldnn_functor = [&, bwd_weights_index](CPURuntimeContext* ctx) {
            cpu::mkldnn_utils::set_memory_ptr(ctx, prim_deps[0], data_tensor);
            cpu::mkldnn_utils::set_memory_ptr(ctx, prim_deps[1], delta_tensor);
            cpu::mkldnn_utils::set_memory_ptr(ctx, prim_deps[2], filters_tensor);
            cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, bwd_weights_index);
        };
        functor_list.emplace_back(mkldnn_functor);
    }
    else
    {
        // Fallback: reference convolution instantiated for the output
        // element type.
        std::function<decltype(runtime::cpu::kernel::convolution<float>)> kernel;
        SELECT_KERNEL(kernel, out[0].get_element_type(), runtime::cpu::kernel::convolution);

        auto movement = bprop_op->get_window_movement_strides_backward();
        auto dilation = bprop_op->get_window_dilation_strides_backward();
        auto below = bprop_op->get_padding_below_backward();
        auto above = bprop_op->get_padding_above_backward();
        auto data_dilation = bprop_op->get_data_dilation_strides_backward();

        auto ref_functor = [&,
                            kernel,
                            data_shape,
                            delta_shape,
                            filters_shape,
                            movement,
                            dilation,
                            below,
                            above,
                            data_dilation](CPURuntimeContext* ctx) {
            // Trailing constants are the axis roles (batch_axis_data=1,
            // input_channel_axis_data=0, input_channel_axis_filters=0,
            // output_channel_axis_filters=1, batch_axis_result=1,
            // output_channel_axis_result=0, rotate_filter=false).
            kernel(data_tensor,
                   delta_tensor,
                   filters_tensor,
                   data_shape,
                   delta_shape,
                   filters_shape,
                   movement,
                   dilation,
                   below,
                   above,
                   data_dilation,
                   1,
                   0,
                   0,
                   1,
                   1,
                   0,
                   false);
        };
        functor_list.emplace_back(ref_functor);
    }
}
}
}
}
......@@ -98,7 +98,6 @@
#include "ngraph/runtime/cpu/kernel/multiply.hpp"
#include "ngraph/runtime/cpu/kernel/relu.hpp"
#include "ngraph/runtime/cpu/kernel/result.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/batch_norm_relu.hpp"
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
......@@ -119,53 +118,6 @@
using namespace std;
using namespace ngraph;
// Per-type kernel macro
// Expands to an if/else-if chain that instantiates the kernel template K for
// the runtime element type ET and assigns the instantiation to KV.
// element::boolean maps to char (one byte per element in this codebase).
// If ET matches none of the listed types, KV is left unassigned — callers
// must pass one of the supported element types.
#define SELECT_KERNEL(KV, ET, K) \
if (ET == element::boolean) \
{ \
KV = K<char>; \
} \
else if (ET == element::f32) \
{ \
KV = K<float>; \
} \
else if (ET == element::f64) \
{ \
KV = K<double>; \
} \
else if (ET == element::i8) \
{ \
KV = K<int8_t>; \
} \
else if (ET == element::i16) \
{ \
KV = K<int16_t>; \
} \
else if (ET == element::i32) \
{ \
KV = K<int32_t>; \
} \
else if (ET == element::i64) \
{ \
KV = K<int64_t>; \
} \
else if (ET == element::u8) \
{ \
KV = K<uint8_t>; \
} \
else if (ET == element::u16) \
{ \
KV = K<uint16_t>; \
} \
else if (ET == element::u32) \
{ \
KV = K<uint32_t>; \
} \
else if (ET == element::u64) \
{ \
KV = K<uint64_t>; \
}
#define BUILD_UNARY_ELEMWISE_FUNCTOR(OP) \
auto& functors = external_function->get_functors(); \
auto& tensor_data = external_function->get_tensor_data(); \
......@@ -419,6 +371,14 @@ namespace ngraph
{TI(ngraph::op::Parameter), &runtime::cpu::Builder::nop},
{TI(ngraph::op::Abs), &runtime::cpu::Builder::build<ngraph::op::Abs>},
{TI(ngraph::op::Ceiling), &runtime::cpu::Builder::build<ngraph::op::Ceiling>},
{TI(ngraph::runtime::cpu::op::ConvertLayout),
&runtime::cpu::Builder::build<ngraph::runtime::cpu::op::ConvertLayout>},
{TI(ngraph::op::Convolution),
&runtime::cpu::Builder::build<ngraph::op::Convolution>},
{TI(ngraph::op::ConvolutionBackpropData),
&runtime::cpu::Builder::build<ngraph::op::ConvolutionBackpropData>},
{TI(ngraph::op::ConvolutionBackpropFilters),
&runtime::cpu::Builder::build<ngraph::op::ConvolutionBackpropFilters>},
{TI(ngraph::op::Relu), &runtime::cpu::Builder::build<ngraph::op::Relu>},
{TI(ngraph::op::Result), &runtime::cpu::Builder::build<ngraph::op::Result>},
{TI(ngraph::op::MatmulBias), &runtime::cpu::Builder::build<ngraph::op::MatmulBias>},
......
......@@ -29,6 +29,134 @@
const std::vector<TensorViewWrapper>& args, \
const std::vector<TensorViewWrapper>& out)
// Per-type kernel macro
// Expands to an if/else-if chain that instantiates the kernel template K for
// the runtime element type ET and assigns the instantiation to KV.
// element::boolean maps to char (one byte per element in this codebase).
// If ET matches none of the listed types, KV is left unassigned — callers
// must pass one of the supported element types.
#define SELECT_KERNEL(KV, ET, K) \
if (ET == element::boolean) \
{ \
KV = K<char>; \
} \
else if (ET == element::f32) \
{ \
KV = K<float>; \
} \
else if (ET == element::f64) \
{ \
KV = K<double>; \
} \
else if (ET == element::i8) \
{ \
KV = K<int8_t>; \
} \
else if (ET == element::i16) \
{ \
KV = K<int16_t>; \
} \
else if (ET == element::i32) \
{ \
KV = K<int32_t>; \
} \
else if (ET == element::i64) \
{ \
KV = K<int64_t>; \
} \
else if (ET == element::u8) \
{ \
KV = K<uint8_t>; \
} \
else if (ET == element::u16) \
{ \
KV = K<uint16_t>; \
} \
else if (ET == element::u32) \
{ \
KV = K<uint32_t>; \
} \
else if (ET == element::u64) \
{ \
KV = K<uint64_t>; \
}
// Expands to an if/else-if chain over the tensor rank R (1..16), assigning the
// kernel template K instantiated with <ET, rank> to KV. For R outside 1..16
// KV is left unassigned — callers must guarantee a supported rank.
#define SELECT_RANK(KV, ET, R, K) \
if (R == 1) \
KV = K<ET, 1>; \
else if (R == 2) \
KV = K<ET, 2>; \
else if (R == 3) \
KV = K<ET, 3>; \
else if (R == 4) \
KV = K<ET, 4>; \
else if (R == 5) \
KV = K<ET, 5>; \
else if (R == 6) \
KV = K<ET, 6>; \
else if (R == 7) \
KV = K<ET, 7>; \
else if (R == 8) \
KV = K<ET, 8>; \
else if (R == 9) \
KV = K<ET, 9>; \
else if (R == 10) \
KV = K<ET, 10>; \
else if (R == 11) \
KV = K<ET, 11>; \
else if (R == 12) \
KV = K<ET, 12>; \
else if (R == 13) \
KV = K<ET, 13>; \
else if (R == 14) \
KV = K<ET, 14>; \
else if (R == 15) \
KV = K<ET, 15>; \
else if (R == 16) \
KV = K<ET, 16>;
// Per-type and rank kernel macro
// Two-level dispatch: first branches on the runtime element type ET, then
// delegates to SELECT_RANK to branch on rank R, assigning K<type, rank> to KV.
// As with SELECT_KERNEL, an unmatched ET (or rank) leaves KV unassigned.
#define SELECT_KERNEL_BY_RANK(KV, ET, R, K) \
if (ET == element::boolean) \
{ \
SELECT_RANK(KV, char, R, K); \
} \
else if (ET == element::f32) \
{ \
SELECT_RANK(KV, float, R, K); \
} \
else if (ET == element::f64) \
{ \
SELECT_RANK(KV, double, R, K); \
} \
else if (ET == element::i8) \
{ \
SELECT_RANK(KV, int8_t, R, K); \
} \
else if (ET == element::i16) \
{ \
SELECT_RANK(KV, int16_t, R, K); \
} \
else if (ET == element::i32) \
{ \
SELECT_RANK(KV, int32_t, R, K); \
} \
else if (ET == element::i64) \
{ \
SELECT_RANK(KV, int64_t, R, K); \
} \
else if (ET == element::u8) \
{ \
SELECT_RANK(KV, uint8_t, R, K); \
} \
else if (ET == element::u16) \
{ \
SELECT_RANK(KV, uint16_t, R, K); \
} \
else if (ET == element::u32) \
{ \
SELECT_RANK(KV, uint32_t, R, K); \
} \
else if (ET == element::u64) \
{ \
SELECT_RANK(KV, uint64_t, R, K); \
}
namespace ngraph
{
namespace runtime
......
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/runtime/reference/convolution.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace kernel
{
// Thin type-erased adapter over reference::convolution: casts the raw buffers
// to ElementType pointers and forwards every geometry/axis parameter
// unchanged. The buffers are only reinterpreted here — never resized, copied
// or validated.
template <typename ElementType>
void convolution(void* input0,
                 void* input1,
                 void* output,
                 const Shape& arg0_shape,
                 const Shape& arg1_shape,
                 const Shape& result_shape,
                 const Strides& window_movement_strides,
                 const Strides& window_dilation_strides,
                 const CoordinateDiff& padding_below,
                 const CoordinateDiff& padding_above,
                 const Strides& data_dilation_strides,
                 size_t batch_axis_data,
                 size_t input_channel_axis_data,
                 size_t input_channel_axis_filters,
                 size_t output_channel_axis_filters,
                 size_t batch_axis_result,
                 size_t output_channel_axis_result,
                 bool rotate_filter)
{
    const ElementType* in0 = static_cast<const ElementType*>(input0);
    const ElementType* in1 = static_cast<const ElementType*>(input1);
    ElementType* out_buf = static_cast<ElementType*>(output);

    reference::convolution<ElementType>(in0,
                                        in1,
                                        out_buf,
                                        arg0_shape,
                                        arg1_shape,
                                        result_shape,
                                        window_movement_strides,
                                        window_dilation_strides,
                                        padding_below,
                                        padding_above,
                                        data_dilation_strides,
                                        batch_axis_data,
                                        input_channel_axis_data,
                                        input_channel_axis_filters,
                                        output_channel_axis_filters,
                                        batch_axis_result,
                                        output_channel_axis_result,
                                        rotate_filter);
}
}
}
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment