Commit a5457c97 authored by fenglei.tian

Merge with master, resolve conflicts

parents 8f5b3e2e 89da71d3
......@@ -186,6 +186,7 @@ if (NGRAPH_CPU_ENABLE AND LLVM_INCLUDE_DIR AND
runtime/cpu/mkldnn_emitter.cpp
runtime/cpu/mkldnn_invoke.cpp
runtime/cpu/mkldnn_utils.cpp
runtime/cpu/ops/conv_bias.cpp
runtime/cpu/ops/convert_layout.cpp
runtime/cpu/ops/sigmoid.cpp
runtime/cpu/ops/matmul_bias.cpp
......
......@@ -90,6 +90,7 @@
#include "ngraph/runtime/cpu/cpu_kernel_emitters.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/ops/conv_bias.hpp"
#include "ngraph/runtime/cpu/ops/convert_layout.hpp"
#include "ngraph/runtime/cpu/ops/matmul_bias.hpp"
#include "ngraph/runtime/cpu/ops/sigmoid.hpp"
......@@ -2487,6 +2488,134 @@ namespace ngraph
}
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::ConvolutionBias)
{
auto convolution = static_cast<const ngraph::op::ConvolutionBias*>(node);
const TensorViewWrapper& data = args[0];
const TensorViewWrapper& weights = args[1];
const TensorViewWrapper& bias = args[2];
const TensorViewWrapper& result = out[0];
using namespace runtime::cpu::mkldnn_utils;
if (mkldnn_utils::use_mkldnn_kernel(node))
{
auto data_format = mkldnn_utils::get_input_mkldnn_format(node, 0);
auto weights_format = mkldnn_utils::get_input_mkldnn_format(node, 1);
auto bias_format = mkldnn_utils::get_input_mkldnn_format(node, 2);
auto result_format = mkldnn_utils::get_output_mkldnn_format(node, 0);
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto data_desc = mkldnn_emitter->build_memory_descriptor(data, data_format);
auto weights_desc =
mkldnn_emitter->build_memory_descriptor(weights, weights_format);
auto bias_desc = mkldnn_emitter->build_memory_descriptor(bias, bias_format);
auto result_desc =
mkldnn_emitter->build_memory_descriptor(result, result_format);
// MKLDNN expresses dilation as the number of elements inserted between window elements,
// not as the spacing between them the way nGraph does, so subtract 1 from each axis' value.
Strides window_dilation_strides_adjusted;
for (size_t s : convolution->get_window_dilation_strides())
{
window_dilation_strides_adjusted.push_back(s - 1);
}
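// For example, an nGraph window dilation stride of 1 (dense window) maps to an MKLDNN
// dilation of 0, and an nGraph stride of 2 maps to an MKLDNN dilation of 1.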
size_t conv_index = mkldnn_emitter->build_convolution_forward(
data_desc,
weights_desc,
bias_desc,
result_desc,
convolution->get_window_movement_strides(),
window_dilation_strides_adjusted,
convolution->get_padding_below(),
convolution->get_padding_above());
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0])
<< ", " << data.get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[1])
<< ", " << weights.get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[2])
<< ", " << bias.get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[3])
<< ", " << result.get_name() << ");\n";
writer << "cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, "
<< to_string(conv_index) << ");\n";
}
else
{
throw ngraph_error("ConvolutionBias is only supported with MKLDNN kernel.");
}
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::ConvolutionBiasBackpropFiltersBias)
{
auto convolution =
static_cast<const ngraph::op::ConvolutionBiasBackpropFiltersBias*>(node);
const TensorViewWrapper& data = args[0];
const TensorViewWrapper& delta = args[1];
const TensorViewWrapper& weights_delta = out[0];
const TensorViewWrapper& bias_delta = out[1];
using namespace runtime::cpu::mkldnn_utils;
if (mkldnn_utils::use_mkldnn_kernel(node))
{
Strides window_dilation_strides_adjusted;
for (size_t s : convolution->get_window_dilation_strides_forward())
{
window_dilation_strides_adjusted.push_back(s - 1);
}
auto data_format = mkldnn_utils::get_input_mkldnn_format(node, 0);
auto delta_format = mkldnn_utils::get_input_mkldnn_format(node, 1);
auto weights_delta_format = mkldnn_utils::get_output_mkldnn_format(node, 0);
auto bias_delta_format = mkldnn_utils::get_output_mkldnn_format(node, 1);
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto data_desc = mkldnn_emitter->build_memory_descriptor(data, data_format);
auto delta_desc = mkldnn_emitter->build_memory_descriptor(delta, delta_format);
auto weights_delta_desc = mkldnn_emitter->build_memory_descriptor(
weights_delta, weights_delta_format);
auto bias_delta_desc =
mkldnn_emitter->build_memory_descriptor(bias_delta, bias_delta_format);
size_t conv_index = mkldnn_emitter->build_convolution_backward_weights_bias(
data_desc,
delta_desc,
weights_delta_desc,
bias_delta_desc,
convolution->get_window_movement_strides_forward(),
window_dilation_strides_adjusted,
convolution->get_padding_below_forward(),
convolution->get_padding_above_forward());
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0])
<< ", " << data.get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[1])
<< ", " << delta.get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[2])
<< ", " << weights_delta.get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[3])
<< ", " << bias_delta.get_name() << ");\n";
writer << "cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, "
<< to_string(conv_index) << ");\n";
}
else
{
throw ngraph_error(
"ConvolutionBiasBackpropFiltersBias is only supported with MKLDNN kernel.");
}
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::Not)
{
......
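For reference, a minimal sketch of the runtime code the emitter above writes for a ConvolutionBias node (the primitive indices and tensor names below are illustrative, not taken from a real compilation). The dependency vector returned by get_primitive_deps is ordered data, weights, bias, result, so the generated body binds the four tensor pointers and then invokes the convolution primitive:
cpu::mkldnn_utils::set_memory_ptr(ctx, 11, arg0_data);      // data
cpu::mkldnn_utils::set_memory_ptr(ctx, 12, arg1_weights);   // weights
cpu::mkldnn_utils::set_memory_ptr(ctx, 13, arg2_bias);      // bias
cpu::mkldnn_utils::set_memory_ptr(ctx, 14, out0_result);    // result
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, 15);        // conv_index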
......@@ -109,6 +109,7 @@
#include "ngraph/runtime/cpu/cpu_tensor_view.hpp"
#include "ngraph/runtime/cpu/cpu_tracing.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/ops/conv_bias.hpp"
#include "ngraph/runtime/cpu/ops/convert_layout.hpp"
#include "ngraph/runtime/cpu/ops/matmul_bias.hpp"
#include "ngraph/runtime/cpu/ops/sigmoid.hpp"
......@@ -226,6 +227,10 @@ static const runtime::cpu::OpMap dispatcher{
&runtime::cpu::CPU_Emitter::emit<op::ConvolutionBackpropFilters>},
{TI(ngraph::op::ConvolutionBackpropData),
&runtime::cpu::CPU_Emitter::emit<op::ConvolutionBackpropData>},
{TI(ngraph::op::ConvolutionBias), &runtime::cpu::CPU_Emitter::emit<op::ConvolutionBias>},
// conv+bias backprop for data shares the same implementation as ConvolutionBackpropData
{TI(ngraph::op::ConvolutionBiasBackpropFiltersBias),
&runtime::cpu::CPU_Emitter::emit<op::ConvolutionBiasBackpropFiltersBias>},
{TI(ngraph::runtime::cpu::op::ConvertLayout),
&runtime::cpu::CPU_Emitter::emit<runtime::cpu::op::ConvertLayout>},
{TI(ngraph::op::Not), &runtime::cpu::CPU_Emitter::emit<op::Not>},
......
......@@ -25,6 +25,12 @@
using namespace ngraph::runtime::cpu;
MKLDNNEmitter::~MKLDNNEmitter()
{
for (auto p : m_mkldnn_primitives)
delete p;
}
const std::vector<mkldnn::primitive*>& MKLDNNEmitter::get_mkldnn_primitives() const
{
return m_mkldnn_primitives;
......@@ -77,6 +83,7 @@ size_t MKLDNNEmitter::build_convolution_forward(const mkldnn::memory::desc& inpu
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& result_desc,
const ngraph::Strides& strides,
const ngraph::Strides& dilation_strides,
const ngraph::CoordinateDiff& padding_below,
const ngraph::CoordinateDiff& padding_above)
{
......@@ -91,6 +98,7 @@ size_t MKLDNNEmitter::build_convolution_forward(const mkldnn::memory::desc& inpu
weights_desc,
result_desc,
mkldnn::memory::dims(strides.begin(), strides.end()),
mkldnn::memory::dims(dilation_strides.begin(), dilation_strides.end()),
mkldnn::memory::dims(padding_below.begin(), padding_below.end()),
mkldnn::memory::dims(padding_above.begin(), padding_above.end()),
mkldnn::padding_kind::zero},
......@@ -105,21 +113,24 @@ size_t MKLDNNEmitter::build_convolution_forward(const mkldnn::memory::desc& inpu
size_t MKLDNNEmitter::build_convolution_forward(const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& bias_desc,
const mkldnn::memory::desc& result_desc,
const ngraph::Strides& strides,
const ngraph::Strides& dilation_strides,
const ngraph::CoordinateDiff& padding_below,
const ngraph::CoordinateDiff& padding_above)
{
size_t input_data_index = build_memory_primitive(input_data_desc);
size_t weights_index = build_memory_primitive(weights_desc);
size_t result_index = build_memory_primitive(result_desc);
const size_t input_data_index = build_memory_primitive(input_data_desc);
const size_t weights_index = build_memory_primitive(weights_desc);
const size_t bias_index = build_memory_primitive(bias_desc);
const size_t result_index = build_memory_primitive(result_desc);
size_t conv_index = insert_primitive(new mkldnn::convolution_forward(
const size_t conv_index = insert_primitive(new mkldnn::convolution_forward(
{{mkldnn::prop_kind::forward,
mkldnn::algorithm::convolution_direct,
input_data_desc,
weights_desc,
bias_desc,
result_desc,
mkldnn::memory::dims(strides.begin(), strides.end()),
mkldnn::memory::dims(dilation_strides.begin(), dilation_strides.end()),
......@@ -129,9 +140,68 @@ size_t MKLDNNEmitter::build_convolution_forward(const mkldnn::memory::desc& inpu
mkldnn_utils::global_cpu_engine},
*m_mkldnn_primitives[input_data_index],
*m_mkldnn_primitives[weights_index],
*m_mkldnn_primitives[bias_index],
*m_mkldnn_primitives[result_index]));
m_primitive_deps[conv_index] = {input_data_index, weights_index, result_index};
m_primitive_deps[conv_index] = {input_data_index, weights_index, bias_index, result_index};
return conv_index;
}
size_t MKLDNNEmitter::build_convolution_backward_weights_bias(
const mkldnn::memory::desc& in_data_desc,
const mkldnn::memory::desc& in_delta_desc,
const mkldnn::memory::desc& out_weights_delta_desc,
const mkldnn::memory::desc& out_bias_delta_desc,
const ngraph::Strides& ng_strides,
const ngraph::Strides& ng_dilation_strides,
const ngraph::CoordinateDiff& ng_padding_below,
const ngraph::CoordinateDiff& ng_padding_above)
{
const size_t in_data_index = build_memory_primitive(in_data_desc);
const size_t in_delta_index = build_memory_primitive(in_delta_desc);
const size_t out_weights_delta_index = build_memory_primitive(out_weights_delta_desc);
const size_t out_bias_delta_index = build_memory_primitive(out_bias_delta_desc);
mkldnn::memory::dims strides(ng_strides.begin(), ng_strides.end());
mkldnn::memory::dims dilation(ng_dilation_strides.begin(), ng_dilation_strides.end());
mkldnn::memory::dims padding_l(ng_padding_below.begin(), ng_padding_below.end());
mkldnn::memory::dims padding_r(ng_padding_above.begin(), ng_padding_above.end());
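// MKLDNN needs a forward convolution primitive descriptor with matching shapes as a hint
// when constructing the backward-weights primitive descriptor, so build one first.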
mkldnn::convolution_forward::primitive_desc fwd_pd{{mkldnn::prop_kind::forward,
mkldnn::algorithm::convolution_direct,
in_data_desc,
out_weights_delta_desc,
out_bias_delta_desc,
in_delta_desc,
strides,
dilation,
padding_l,
padding_r,
mkldnn::padding_kind::zero},
mkldnn_utils::global_cpu_engine};
mkldnn::convolution_backward_weights::primitive_desc bwd_pd{
{mkldnn::algorithm::convolution_direct,
in_data_desc,
out_weights_delta_desc,
out_bias_delta_desc,
in_delta_desc,
strides,
dilation,
padding_l,
padding_r,
mkldnn::padding_kind::zero},
mkldnn_utils::global_cpu_engine,
fwd_pd};
const size_t conv_index = insert_primitive(
new mkldnn::convolution_backward_weights(bwd_pd,
*m_mkldnn_primitives[in_data_index],
*m_mkldnn_primitives[in_delta_index],
*m_mkldnn_primitives[out_weights_delta_index],
*m_mkldnn_primitives[out_bias_delta_index]));
m_primitive_deps[conv_index] = {
in_data_index, in_delta_index, out_weights_delta_index, out_bias_delta_index};
return conv_index;
}
......
......@@ -39,6 +39,8 @@ namespace ngraph
{
public:
MKLDNNEmitter() {}
~MKLDNNEmitter();
const std::vector<mkldnn::primitive*>& get_mkldnn_primitives() const;
size_t insert_primitive(mkldnn::primitive* primitive);
......@@ -55,11 +57,16 @@ namespace ngraph
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& result_desc,
const ngraph::Strides& strides,
const ngraph::Strides& dilation_strides,
const ngraph::CoordinateDiff& padding_below,
const ngraph::CoordinateDiff& padding_above);
/**
* Convolution + bias forward
*/
size_t build_convolution_forward(const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& bias_desc,
const mkldnn::memory::desc& result_desc,
const ngraph::Strides& strides,
const ngraph::Strides& dilation_strides,
......@@ -82,7 +89,18 @@ namespace ngraph
const ngraph::Strides& dilation_strides,
const ngraph::CoordinateDiff& padding_below,
const ngraph::CoordinateDiff& padding_above);
/**
* Convolution + bias backprop for weights and bias
*/
size_t build_convolution_backward_weights_bias(
const mkldnn::memory::desc& in_data_desc,
const mkldnn::memory::desc& in_delta_desc,
const mkldnn::memory::desc& out_weights_delta_desc,
const mkldnn::memory::desc& out_bias_delta_desc,
const ngraph::Strides& ng_strides,
const ngraph::Strides& ng_dilation_strides,
const ngraph::CoordinateDiff& ng_padding_below,
const ngraph::CoordinateDiff& ng_padding_above);
size_t build_pooling_forward(mkldnn::algorithm pooling_algorithm,
const mkldnn::memory::desc& input_desc,
const mkldnn::memory::desc& result_desc,
......
......@@ -28,6 +28,7 @@
#include "ngraph/ops/relu.hpp"
#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
#include "ngraph/runtime/cpu/ops/conv_bias.hpp"
#include "ngraph/types/element_type.hpp"
#include "mkldnn_utils.hpp"
......@@ -47,6 +48,8 @@ static const std::unordered_set<std::type_index> s_op_registry{
TI(ngraph::op::Convolution),
TI(ngraph::op::ConvolutionBackpropData),
TI(ngraph::op::ConvolutionBackpropFilters),
TI(ngraph::op::ConvolutionBias),
TI(ngraph::op::ConvolutionBiasBackpropFiltersBias),
TI(ngraph::op::MaxPool),
TI(ngraph::op::MaxPoolBackprop),
TI(ngraph::op::Relu),
......
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <numeric>
#include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/get_output_element.hpp"
#include "ngraph/runtime/cpu/ops/conv_bias.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
op::ConvolutionBias::ConvolutionBias(const std::shared_ptr<op::Convolution>& conv,
const std::shared_ptr<Node>& bias)
: RequiresTensorViewArgs("ConvolutionBias",
{conv->get_input_op(0), conv->get_input_op(1), bias})
, m_window_movement_strides(conv->get_window_movement_strides())
, m_window_dilation_strides(conv->get_window_dilation_strides())
, m_padding_below(conv->get_padding_below())
, m_padding_above(conv->get_padding_above())
, m_data_dilation_strides(conv->get_data_dilation_strides())
{
if (conv->get_element_type() != bias->get_element_type())
{
throw ngraph_error("Convolution's element type isn't equal to bias!");
}
set_value_type_checked(conv->get_element_type(), conv->get_shape());
}
op::ConvolutionBias::ConvolutionBias(const std::shared_ptr<Node>& data_batch,
const std::shared_ptr<Node>& filters,
const std::shared_ptr<Node>& bias,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides)
: RequiresTensorViewArgs("ConvolutionBias", {data_batch, filters, bias})
, m_window_movement_strides(window_movement_strides)
, m_window_dilation_strides(window_dilation_strides)
, m_padding_below(padding_below)
, m_padding_above(padding_above)
, m_data_dilation_strides(data_dilation_strides)
{
}
std::shared_ptr<Node> op::ConvolutionBias::copy_with_new_args(const NodeVector& new_args) const
{
if (new_args.size() != 3)
{
throw ngraph_error("Incorrect number of new arguments");
}
return std::shared_ptr<Node>(new ConvolutionBias(new_args.at(0),
new_args.at(1),
new_args.at(2),
get_window_movement_strides(),
get_window_dilation_strides(),
get_padding_below(),
get_padding_above(),
get_data_dilation_strides()));
}
void op::ConvolutionBias::generate_adjoints(autodiff::Adjoints& adjoints,
const std::shared_ptr<Node>& delta)
{
auto data = get_input_op(0);
const auto data_shape = data->get_shape();
auto filter = get_input_op(1);
const auto filter_shape = filter->get_shape();
auto bias = get_input_op(2);
const auto bias_shape = bias->get_shape();
// using regular convolution backprop for data
adjoints.add_delta(data,
std::make_shared<op::ConvolutionBackpropData>(data_shape,
filter,
delta,
m_window_movement_strides,
m_window_dilation_strides,
m_padding_below,
m_padding_above,
m_data_dilation_strides));
auto filter_bias_backprop =
std::make_shared<op::ConvolutionBiasBackpropFiltersBias>(data,
filter_shape,
bias_shape,
delta,
m_window_movement_strides,
m_window_dilation_strides,
m_padding_below,
m_padding_above,
m_data_dilation_strides);
auto filter_delta = std::make_shared<op::GetOutputElement>(filter_bias_backprop, 0);
auto bias_delta = std::make_shared<op::GetOutputElement>(filter_bias_backprop, 1);
adjoints.add_delta(filter, filter_delta);
adjoints.add_delta(bias, bias_delta);
}
op::ConvolutionBiasBackpropFiltersBias::ConvolutionBiasBackpropFiltersBias(
const std::shared_ptr<Node>& data_batch,
const Shape& filters_shape,
const Shape& bias_shape,
const std::shared_ptr<Node>& output_delta,
const Strides& window_movement_strides_forward,
const Strides& window_dilation_strides_forward,
const CoordinateDiff& padding_below_forward,
const CoordinateDiff& padding_above_forward,
const Strides& data_dilation_strides_forward)
: RequiresTensorViewArgs("ConvolutionBiasBackpropFiltersBias", {data_batch, output_delta})
, m_filters_shape(filters_shape)
, m_bias_shape(bias_shape)
, m_window_movement_strides_forward(window_movement_strides_forward)
, m_window_dilation_strides_forward(window_dilation_strides_forward)
, m_padding_below_forward(padding_below_forward)
, m_padding_above_forward(padding_above_forward)
, m_data_dilation_strides_forward(data_dilation_strides_forward)
{
auto& data_batch_shape = get_input_shape(0);
auto& data_batch_et = get_input_element_type(0);
auto& output_delta_et = get_input_element_type(1);
//
// Make sure data batch and output delta element types match.
//
if (data_batch_et != output_delta_et)
{
throw ngraph_error(
"ConvolutionBiasBackpropFilterBias data batch and output delta element types do not "
"match");
}
//                              Forward   Backward
// Window movement strides      q         p_f
// Window dilation strides      p_f       q
// Padding below                a_x       a_x
// Padding above                b_x       b_x - (a_x + (S_x - 1)p_x + b_x - (S_f - 1)p_f) % q
// Data dilation strides        p_x       p_x
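// Worked example (assumed values): data size 6, filter size 3, forward stride 2, unit
// dilations, zero padding => backward movement stride 1, backward dilation 2, padding
// below 0, and padding above 0 - (0 + 5 + 0 - 2) % 2 = -1, i.e. the backward padding can
// go negative, as the get_padding_above_backward doc in the header notes.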
for (size_t i = 0; i < filters_shape.size() - 2; i++)
{
m_window_movement_strides_backward.push_back(window_dilation_strides_forward[i]);
m_window_dilation_strides_backward.push_back(window_movement_strides_forward[i]);
m_padding_below_backward.push_back(padding_below_forward[i]);
m_padding_above_backward.push_back(
padding_above_forward[i] -
(padding_below_forward[i] +
(data_batch_shape[i + 2] - 1) * data_dilation_strides_forward[i] +
padding_above_forward[i] -
(filters_shape[i + 2] - 1) * window_dilation_strides_forward[i]) %
window_movement_strides_forward[i]);
m_data_dilation_strides_backward.push_back(data_dilation_strides_forward[i]);
}
add_output(data_batch_et, filters_shape);
add_output(data_batch_et, bias_shape);
}
std::shared_ptr<Node>
op::ConvolutionBiasBackpropFiltersBias::copy_with_new_args(const NodeVector& new_args) const
{
if (new_args.size() != 2)
{
throw ngraph_error("Incorrect number of new arguments");
}
return std::make_shared<ConvolutionBiasBackpropFiltersBias>(new_args.at(0),
m_filters_shape,
m_bias_shape,
new_args.at(1),
m_window_movement_strides_forward,
m_window_dilation_strides_forward,
m_padding_below_forward,
m_padding_above_forward,
m_data_dilation_strides_forward);
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/util/requires_tensor_view_args.hpp"
namespace ngraph
{
namespace op
{
/// \brief Convolution + bias forward prop for batched convolution operation.
class ConvolutionBias : public util::RequiresTensorViewArgs
{
public:
ConvolutionBias(const std::shared_ptr<op::Convolution>& conv,
const std::shared_ptr<Node>& bias);
const Strides& get_window_movement_strides() const { return m_window_movement_strides; }
const Strides& get_window_dilation_strides() const { return m_window_dilation_strides; }
const CoordinateDiff& get_padding_below() const { return m_padding_below; }
const CoordinateDiff& get_padding_above() const { return m_padding_above; }
const Strides& get_data_dilation_strides() const { return m_data_dilation_strides; }
std::shared_ptr<Node> get_bias() { return get_input_op(2); }
std::shared_ptr<Node> get_filters() { return get_input_op(1); }
std::shared_ptr<Node> get_data_batch() { return get_input_op(0); }
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
void generate_adjoints(autodiff::Adjoints& adjoints,
const std::shared_ptr<Node>& delta) override;
protected:
Strides m_window_movement_strides;
Strides m_window_dilation_strides;
CoordinateDiff m_padding_below;
CoordinateDiff m_padding_above;
Strides m_data_dilation_strides;
private:
ConvolutionBias(const std::shared_ptr<Node>& data_batch,
const std::shared_ptr<Node>& filters,
const std::shared_ptr<Node>& bias,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides);
};
/// \brief Filters and bias backprop for batched convolution operation. Data backprop is
/// the same as regular convolution backprop for data.
class ConvolutionBiasBackpropFiltersBias : public util::RequiresTensorViewArgs
{
public:
ConvolutionBiasBackpropFiltersBias(const std::shared_ptr<Node>& data_batch,
const Shape& filters_shape,
const Shape& bias_shape,
const std::shared_ptr<Node>& output_delta,
const Strides& window_movement_strides_forward,
const Strides& window_dilation_strides_forward,
const CoordinateDiff& padding_below_forward,
const CoordinateDiff& padding_above_forward,
const Strides& data_dilation_strides_forward);
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
/// \return The filters tensor shape.
const Shape& get_filters_shape() const { return m_filters_shape; }
/// \return The bias tensor shape.
const Shape& get_bias_shape() const { return m_bias_shape; }
/// \return The window movement strides from the forward prop.
const Strides& get_window_movement_strides_forward() const
{
return m_window_movement_strides_forward;
}
/// \return The window dilation strides from the forward prop.
const Strides& get_window_dilation_strides_forward() const
{
return m_window_dilation_strides_forward;
}
/// \return The padding-below sizes (possibly negative) from the forward prop.
const CoordinateDiff& get_padding_below_forward() const
{
return m_padding_below_forward;
}
/// \return The padding-above sizes (possibly negative) from the forward prop.
const CoordinateDiff& get_padding_above_forward() const
{
return m_padding_above_forward;
}
/// \return The data dilation strides from the forward prop.
const Strides& get_data_dilation_strides_forward() const
{
return m_data_dilation_strides_forward;
}
/// \return The window movement strides for the backward prop.
const Strides& get_window_movement_strides_backward() const
{
return m_window_movement_strides_backward;
}
/// \return The window dilation strides for the backward prop.
const Strides& get_window_dilation_strides_backward() const
{
return m_window_dilation_strides_backward;
}
/// \return The padding-below sizes (possibly negative) for the backward prop.
const CoordinateDiff& get_padding_below_backward() const
{
return m_padding_below_backward;
}
/// \return The padding-above sizes (possibly negative) for the backward prop.
const CoordinateDiff& get_padding_above_backward() const
{
return m_padding_above_backward;
}
/// \return The data dilation strides for the backward prop.
const Strides& get_data_dilation_strides_backward() const
{
return m_data_dilation_strides_backward;
}
protected:
Shape m_filters_shape;
Shape m_bias_shape;
Strides m_window_movement_strides_forward;
Strides m_window_dilation_strides_forward;
CoordinateDiff m_padding_below_forward;
CoordinateDiff m_padding_above_forward;
Strides m_data_dilation_strides_forward;
Strides m_window_movement_strides_backward;
Strides m_window_dilation_strides_backward;
CoordinateDiff m_padding_below_backward;
CoordinateDiff m_padding_above_backward;
Strides m_data_dilation_strides_backward;
};
}
}
......@@ -32,6 +32,7 @@
#include "ngraph/ops/relu.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/ops/conv_bias.hpp"
#include "ngraph/runtime/cpu/ops/sigmoid.hpp"
using namespace std;
......@@ -153,6 +154,59 @@ namespace ngraph
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::ConvolutionBias)
{
auto convolution = static_cast<op::ConvolutionBias*>(node);
auto data_shape = node->get_input_shape(0);
auto weights_shape = node->get_input_shape(1);
auto result_shape = node->get_output_shape(0);
auto data_rank = data_shape.size();
auto weights_rank = weights_shape.size();
bool data_dilated = false;
for (size_t s : convolution->get_data_dilation_strides())
{
data_dilated = data_dilated || (s != 1);
}
if (!data_dilated && data_rank == 4 && weights_rank == 4 &&
node->get_input_element_type(0) == element::f32)
{
auto op_annotations =
std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->set_mkldnn_op(true);
convolution->set_op_annotations(op_annotations);
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::ConvolutionBiasBackpropFiltersBias)
{
auto convolution = static_cast<op::ConvolutionBiasBackpropFiltersBias*>(node);
auto data_shape = node->get_input_shape(0);
auto delta_shape = node->get_input_shape(1);
auto data_rank = data_shape.size();
auto delta_rank = delta_shape.size();
bool data_dilated = false;
for (size_t s : convolution->get_data_dilation_strides_forward())
{
data_dilated = data_dilated || (s != 1);
}
if (!data_dilated && data_rank == 4 && delta_rank == 4 &&
node->get_input_element_type(0) == element::f32)
{
auto op_annotations =
std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->set_mkldnn_op(true);
convolution->set_op_annotations(op_annotations);
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::AvgPool)
{
......@@ -266,6 +320,10 @@ static const runtime::cpu::pass::AssignOpMap s_dispatcher{
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionBackpropData>},
{TI(ngraph::op::ConvolutionBackpropFilters),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionBackpropFilters>},
{TI(ngraph::op::ConvolutionBias),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionBias>},
{TI(ngraph::op::ConvolutionBiasBackpropFiltersBias),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionBiasBackpropFiltersBias>},
{TI(ngraph::op::AvgPool), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::AvgPool>},
{TI(ngraph::op::AvgPoolBackprop),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::AvgPoolBackprop>},
......
......@@ -43,6 +43,7 @@
#include "ngraph/pattern/matcher.hpp"
#include "ngraph/pattern/op/any.hpp"
#include "ngraph/pattern/op/label.hpp"
#include "ngraph/runtime/cpu/ops/conv_bias.hpp"
#include "ngraph/runtime/cpu/ops/matmul_bias.hpp"
#include "ngraph/runtime/cpu/ops/sigmoid.hpp"
......@@ -566,3 +567,37 @@ void ngraph::runtime::cpu::pass::CPUFusion::construct_sigmoid()
auto m = std::make_shared<ngraph::pattern::Matcher>(divide_1_over_exp, callback);
this->add_matcher(m);
}
void ngraph::runtime::cpu::pass::CPUFusion::construct_conv_bias()
{
Shape shape{2, 2, 1, 1};
auto data_batch = std::make_shared<pattern::op::Label>(element::f32, shape);
auto filters = std::make_shared<pattern::op::Label>(element::f32, shape);
auto pbias = std::make_shared<pattern::op::Label>(element::f32, Shape{});
auto pbroadcast = std::make_shared<op::Broadcast>(pbias, shape, AxisSet{0, 1, 2, 3});
auto pconv1 = std::make_shared<op::Convolution>(data_batch,
filters,
Strides{1, 1},
Strides{1, 1},
CoordinateDiff{0, 0},
CoordinateDiff{0, 0},
Strides{1, 1});
auto p_conv_bias = pbroadcast + pconv1;
ngraph::pattern::gr_callback_fn callback = [](pattern::Matcher& m) {
NGRAPH_DEBUG << "In callback for construct_conv_bias against node = "
<< m.match_root()->get_name();
auto pattern_map = m.get_pattern_map();
auto conv = std::dynamic_pointer_cast<op::Convolution>(m.match_root()->get_input_op(0));
auto bias = m.match_root()->get_input_op(1)->get_input_op(0);
auto conv_bias = std::shared_ptr<Node>(new op::ConvolutionBias(conv, bias));
return conv_bias;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(p_conv_bias, callback);
this->add_matcher(m);
}
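As a usage sketch (variable names hypothetical, assuming a per-channel bias of shape {filters} broadcast over the batch and spatial axes), the kind of subgraph this matcher rewrites is an element-wise Add of a Convolution output and a Broadcast of the bias; after the callback fires, that Add is replaced by a single ConvolutionBias node:
auto conv = std::make_shared<op::Convolution>(data, filters,
                                              Strides{1, 1},
                                              Strides{1, 1},
                                              CoordinateDiff{0, 0},
                                              CoordinateDiff{0, 0},
                                              Strides{1, 1});
auto broadcast_bias = std::make_shared<op::Broadcast>(bias, conv->get_shape(), AxisSet{0, 2, 3});
auto add = conv + broadcast_bias;
// After CPUFusion runs, users of `add` are rewired to op::ConvolutionBias(conv, bias).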
......@@ -44,11 +44,13 @@ public:
construct_zero_padded_reshaped_conv();
construct_zero_padded_conv();
construct_sigmoid();
construct_conv_bias();
}
private:
void construct_matmul_pattern();
void construct_matmulbias_pattern();
void construct_conv_bias();
void construct_fprop_bn();
void construct_sigmoid();
void construct_zero_padded_reshaped_conv();
......
This diff is collapsed.
......@@ -14,11 +14,18 @@
* limitations under the License.
*******************************************************************************/
#include <cctype>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_pool.hpp"
static const std::string s_output_dir = "gpu_codegen";
namespace ngraph
{
namespace runtime
......@@ -31,12 +38,20 @@ namespace ngraph
return pool;
}
void CudaFunctionPool::set(std::string& name, std::shared_ptr<CUfunction> function)
void CudaFunctionPool::set(const std::string& name, const std::string& kernel)
{
m_function_map.insert({name, function});
const char* opts[] = {"--gpu-architecture=compute_35",
"--relocatable-device-code=true"};
std::string filename =
file_util::path_join(s_output_dir, "cuda_kernel_" + name + "_codegen.cu");
std::ofstream out(filename);
out << kernel;
out.close();
m_function_map.insert(
{name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts)});
}
std::shared_ptr<CUfunction> CudaFunctionPool::get(std::string& name)
std::shared_ptr<CUfunction> CudaFunctionPool::get(const std::string& name)
{
auto it = m_function_map.find(name);
if (it != m_function_map.end())
......
......@@ -36,8 +36,8 @@ namespace ngraph
CudaFunctionPool& operator=(CudaFunctionPool const&) = delete;
CudaFunctionPool& operator=(CudaFunctionPool&&) = delete;
void set(std::string& name, std::shared_ptr<CUfunction> function);
std::shared_ptr<CUfunction> get(std::string& name);
void set(const std::string& name, const std::string& kernel);
std::shared_ptr<CUfunction> get(const std::string& name);
protected:
CudaFunctionPool() {}
......
......@@ -14,6 +14,7 @@
* limitations under the License.
*******************************************************************************/
#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
#include "ngraph/codegen/code_writer.hpp"
namespace ngraph
{
......@@ -21,51 +22,66 @@ namespace ngraph
{
namespace gpu
{
void CudaKernelBuilder::get_unary_elementwise_op(const std::string& name,
const std::string& data_type,
const std::string& op,
std::string& kernel)
void CudaKernelBuilder::get_elementwise_op(codegen::CodeWriter& writer,
const std::string& name,
const std::string& data_type,
const std::string& op,
const size_t& num_inputs)
{
kernel = R"(
extern "C" __global__
void cuda_)" + name + "(" +
data_type + "* in, " + data_type + "* out, size_t n)\n" + R"({
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid < n)
{
out[tid] =)" + op + "(in[tid]);\n" +
R"(}
})";
return;
}
writer << "extern \"C\" __global__ void cuda_" << name << "(";
for (size_t i = 0; i < num_inputs; i++)
{
writer << data_type << "* in" << i << ", ";
}
writer << data_type << "* out,"
<< "size_t n)\n";
writer << "{\n";
writer.indent++;
{
writer << "size_t tid = blockIdx.x * blockDim.x + threadIdx.x; \n";
writer << "if (tid < n)\n";
writer << "{\n";
writer.indent++;
{
writer << "out[tid] = " << op << "(";
for (size_t i = 0; i < num_inputs - 1; i++)
{
writer << "in" << i << "[tid], ";
}
writer << "in" << num_inputs - 1 << "[tid]);\n";
}
writer.indent--;
writer << "}\n";
}
writer.indent--;
writer << "}\n";
void CudaKernelBuilder::get_binary_elementwise_op(const std::string& name,
const std::string& data_type,
const std::string& op,
std::string& kernel)
{
kernel = R"(
extern "C" __global__
void )" + name + "(" + data_type +
"* in1, " + data_type + "* in2, " + data_type + "* out, size_t n)\n" +
R"({
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid < n)
{
out[tid] = in1[tid] )" + op +
"in2[tid]\n" +
R"(}
})";
return;
}
void
CudaKernelBuilder::get_arbitrary_elementwise_op(const std::string& name,
const std::string& data_type,
const std::vector<std::string>& ops,
std::string& kernel)
void CudaKernelBuilder::get_device_helper(codegen::CodeWriter& writer,
const std::string& name,
const std::string& data_type,
const std::string& math_kernel,
const size_t& num_inputs)
{
kernel = "";
if (math_kernel.size())
{
writer << "__device__ " << data_type << " " << name << "(";
for (size_t i = 0; i < num_inputs - 1; i++)
{
writer << data_type << " x" << i << ", ";
}
writer << data_type << " x" << num_inputs - 1;
writer << ")\n";
writer << "{\n";
writer.indent++;
{
writer << "return " + math_kernel << ";\n";
}
writer.indent--;
writer << "}\n";
}
return;
}
}
......
......@@ -21,6 +21,10 @@
namespace ngraph
{
namespace codegen
{
class CodeWriter;
}
namespace runtime
{
namespace gpu
......@@ -28,20 +32,17 @@ namespace ngraph
class CudaKernelBuilder
{
public:
static void get_unary_elementwise_op(const std::string& name,
const std::string& data_type,
const std::string& op,
std::string& kernel);
static void get_binary_elementwise_op(const std::string& name,
const std::string& data_type,
const std::string& op,
std::string& kernel);
static void get_elementwise_op(codegen::CodeWriter& writer,
const std::string& name,
const std::string& data_type,
const std::string& op,
const size_t& num_inputs);
static void get_arbitrary_elementwise_op(const std::string& name,
const std::string& data_type,
const std::vector<std::string>& ops,
std::string& kernel);
static void get_device_helper(codegen::CodeWriter& writer,
const std::string& name,
const std::string& data_type,
const std::string& math_kernel,
const size_t& num_inputs);
};
}
}
......
......@@ -33,8 +33,6 @@ namespace ngraph
// Create an instance of nvrtcProgram with the code string.
if (CudaFunctionPool::instance().get(name) == nullptr)
{
const char* opts[] = {"--gpu-architecture=compute_35",
"--relocatable-device-code=true"};
std::string kernel;
std::string data_type("float");
......@@ -50,9 +48,7 @@ void cuda_)" + name + "(" + data_type +
out[tid] = in[idx];
}
})";
CudaFunctionPool::instance().set(
name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts));
CudaFunctionPool::instance().set(name, kernel);
}
//convert runtime ptr to driver api ptr
......
......@@ -18,7 +18,6 @@
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/coordinate.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_pool.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
#include "ngraph/strides.hpp"
......@@ -35,29 +34,34 @@ namespace ngraph
void emit_broadcast(
void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count);
void emit_sign(void* in, void* out, size_t count);
template <typename T>
void emit_unary_elementwise_op(void* in, void* out, size_t count, std::string name)
template <typename T, typename... Inputs>
void emit_elementwise_op(std::string name,
size_t count,
CUdeviceptr out,
Inputs&&... inputs)
{
// Create an instance of nvrtcProgram with the code string.
if (CudaFunctionPool::instance().get(name) == nullptr)
{
const char* opts[] = {"--gpu-architecture=compute_35",
"--relocatable-device-code=true"};
std::string kernel;
CudaKernelBuilder::get_unary_elementwise_op(
name, "float", CudaOpMap<T>::op, kernel);
CudaFunctionPool::instance().set(
name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts));
codegen::CodeWriter writer;
if (CudaOpMap<T>::math_kernel)
{
CudaKernelBuilder::get_device_helper(writer,
CudaOpMap<T>::op,
CudaOpMap<T>::type,
CudaOpMap<T>::math_kernel,
sizeof...(inputs));
}
CudaKernelBuilder::get_elementwise_op(
writer, name, CudaOpMap<T>::type, CudaOpMap<T>::op, sizeof...(inputs));
std::string kernel = writer.get_code();
CudaFunctionPool::instance().set(name, kernel);
}
//convert runtime ptr to driver api ptr
CUdeviceptr d_ptr_in, d_ptr_out;
d_ptr_in = (CUdeviceptr)in;
d_ptr_out = (CUdeviceptr)out;
void* args_list[] = {&d_ptr_in, &d_ptr_out, &count};
void* args_list[] = {&inputs..., &out, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(),
count,
1,
......
......@@ -34,13 +34,25 @@ namespace ngraph
class Sinh;
class Tan;
class Tanh;
class Power;
class Subtract;
class Divide;
class Sign;
// requires different input and output types
class Convert;
class Equal;
class NotEqual;
class Greater;
class GreaterEq;
class Less;
class LessEq;
// Unimplemented or unused in favor of cuDNN impl.
class Max;
class Min;
class Negative;
class Not;
class Sign;
class Sqrt;
}
namespace runtime
......@@ -51,102 +63,168 @@ namespace ngraph
struct CudaOpMap<ngraph::op::Abs>
{
static constexpr const char* op = "fabsf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
template <>
struct CudaOpMap<ngraph::op::Acos>
{
static constexpr const char* op = "acosf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
template <>
struct CudaOpMap<ngraph::op::Asin>
{
static constexpr const char* op = "asinf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
template <>
struct CudaOpMap<ngraph::op::Atan>
{
static constexpr const char* op = "atanf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
template <>
struct CudaOpMap<ngraph::op::Ceiling>
{
static constexpr const char* op = "ceilf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
template <>
struct CudaOpMap<ngraph::op::Cos>
{
static constexpr const char* op = "cosf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
template <>
struct CudaOpMap<ngraph::op::Cosh>
{
static constexpr const char* op = "coshf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
template <>
struct CudaOpMap<ngraph::op::Exp>
{
static constexpr const char* op = "expf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
template <>
struct CudaOpMap<ngraph::op::Floor>
{
static constexpr const char* op = "floorf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
template <>
struct CudaOpMap<ngraph::op::Log>
{
static constexpr const char* op = "logf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
template <>
struct CudaOpMap<ngraph::op::Max>
{
static constexpr const char* op = "fmaxf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
template <>
struct CudaOpMap<ngraph::op::Min>
{
static constexpr const char* op = "fminf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
template <>
struct CudaOpMap<ngraph::op::Sin>
{
static constexpr const char* op = "sinf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
template <>
struct CudaOpMap<ngraph::op::Sinh>
{
static constexpr const char* op = "sinhf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
template <>
struct CudaOpMap<ngraph::op::Sqrt>
{
static constexpr const char* op = "sqrtf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
template <>
struct CudaOpMap<ngraph::op::Tan>
{
static constexpr const char* op = "tanf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
template <>
struct CudaOpMap<ngraph::op::Tanh>
{
static constexpr const char* op = "tanhf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
template <>
struct CudaOpMap<ngraph::op::Power>
{
static constexpr const char* op = "powf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
template <>
struct CudaOpMap<ngraph::op::Subtract>
{
static constexpr const char* op = "subtractf";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = "x0-x1";
};
template <>
struct CudaOpMap<ngraph::op::Divide>
{
static constexpr const char* op = "fdividef";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = nullptr;
};
template <>
struct CudaOpMap<ngraph::op::Sign>
{
static constexpr const char* op = "sign";
static constexpr const char* type = "float";
static constexpr const char* math_kernel = "(x0 > 0) - (x0 < 0)";
};
}
}
......
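Putting the GPU pieces together: for an op such as Subtract, where op is "subtractf" and math_kernel is "x0-x1" with two inputs, the writer-based builders above produce roughly the following CUDA source (a sketch; indentation simplified):
__device__ float subtractf(float x0, float x1)
{
    return x0-x1;
}
extern "C" __global__ void cuda_Subtract(float* in0, float* in1, float* out, size_t n)
{
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)
    {
        out[tid] = subtractf(in0[tid], in1[tid]);
    }
}
CudaFunctionPool::set then writes this source under gpu_codegen/ and compiles it with CudaFunctionBuilder before the emitted emit_elementwise_op call launches it.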
......@@ -105,38 +105,30 @@ namespace ngraph
{
namespace gpu
{
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Abs)
void runtime::gpu::GPU_Emitter::EmitElementwise(
GPU_ExternalFunction* external_function,
codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
if (out[0].get_size() == 0)
{
return;
}
writer << "{ // " << node->get_name() << "\n";
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n";
writer << "ngraph::runtime::gpu::emit_abs((void*) " << args[0].get_name()
<< ", (void*) " << out[0].get_name() << ", count);\n";
writer.indent--;
writer << "}\n";
}
void GPU_Emitter::EmitUnaryElementwise(GPU_ExternalFunction* external_function,
codegen::CodeWriter& writer,
const ngraph::Node* node,
const std::vector<GPU_TensorViewWrapper>& args,
const std::vector<GPU_TensorViewWrapper>& out)
{
if (out[0].get_size() == 0)
writer << "if(count == 0) return;\n";
writer << "ngraph::runtime::gpu::emit_elementwise_op<ngraph::op::"
<< n->description() << ">(\"" << n->description() << "\""
<< ", count"
<< ", (CUdeviceptr) " << out[0].get_name();
for (size_t i = 0; i < args.size(); i++)
{
return;
writer << ", (CUdeviceptr) " << args[i].get_name();
}
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n";
writer << "ngraph::runtime::gpu::emit_unary_elementwise_op<ngraph::op::"
<< node->description() << ">((void*) " << args[0].get_name() << ", (void*) "
<< out[0].get_name() << ", count, \"" << node->description() << "\");\n";
writer << ");\n";
writer.indent--;
writer << "}\n";
}
......
......@@ -58,11 +58,11 @@ namespace ngraph
{
}
static void EmitUnaryElementwise(GPU_ExternalFunction* external_function,
codegen::CodeWriter& writer,
const ngraph::Node* node,
const std::vector<GPU_TensorViewWrapper>& args,
const std::vector<GPU_TensorViewWrapper>& out);
static void EmitElementwise(GPU_ExternalFunction* external_function,
codegen::CodeWriter& writer,
const ngraph::Node* node,
const std::vector<GPU_TensorViewWrapper>& args,
const std::vector<GPU_TensorViewWrapper>& out);
};
}
}
......
......@@ -170,9 +170,9 @@ namespace ngraph
{TI(ngraph::op::Dot), &GPU_Emitter::emit<ngraph::op::Dot>},
{TI(ngraph::op::Multiply), &GPU_Emitter::emit<ngraph::op::Multiply>},
{TI(ngraph::op::Parameter), &GPU_Emitter::nop},
{TI(ngraph::op::Abs), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Abs), &GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Concat), &GPU_Emitter::emit<ngraph::op::Concat>},
{TI(ngraph::op::Divide), &GPU_Emitter::emit<ngraph::op::Divide>},
{TI(ngraph::op::Divide), &GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Equal), &GPU_Emitter::emit<ngraph::op::Equal>},
{TI(ngraph::op::GetOutputElement),
&GPU_Emitter::emit<ngraph::op::GetOutputElement>},
......@@ -180,44 +180,44 @@ namespace ngraph
{TI(ngraph::op::GreaterEq), &GPU_Emitter::emit<ngraph::op::GreaterEq>},
{TI(ngraph::op::Less), &GPU_Emitter::emit<ngraph::op::Less>},
{TI(ngraph::op::LessEq), &GPU_Emitter::emit<ngraph::op::LessEq>},
{TI(ngraph::op::Log), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Log), &GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Maximum), &GPU_Emitter::emit<ngraph::op::Maximum>},
{TI(ngraph::op::Minimum), &GPU_Emitter::emit<ngraph::op::Minimum>},
{TI(ngraph::op::Negative), &GPU_Emitter::emit<ngraph::op::Negative>},
{TI(ngraph::op::NotEqual), &GPU_Emitter::emit<ngraph::op::NotEqual>},
{TI(ngraph::op::Power), &GPU_Emitter::emit<ngraph::op::Power>},
{TI(ngraph::op::Power), &GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Select), &GPU_Emitter::emit<ngraph::op::Select>},
{TI(ngraph::op::Subtract), &GPU_Emitter::emit<ngraph::op::Subtract>},
{TI(ngraph::op::Subtract), &GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Broadcast), &GPU_Emitter::emit<ngraph::op::Broadcast>},
{TI(ngraph::op::Convert), &GPU_Emitter::emit<ngraph::op::Convert>},
{TI(ngraph::op::Constant), &GPU_Emitter::emit<ngraph::op::Constant>},
{TI(ngraph::op::Reshape), &GPU_Emitter::emit<ngraph::op::Reshape>},
{TI(ngraph::op::FunctionCall), &GPU_Emitter::emit<ngraph::op::FunctionCall>},
{TI(ngraph::op::Reduce), &GPU_Emitter::emit<ngraph::op::Reduce>},
{TI(ngraph::op::Sign), &GPU_Emitter::emit<ngraph::op::Sign>},
{TI(ngraph::op::Sign), &GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Slice), &GPU_Emitter::emit<ngraph::op::Slice>},
{TI(ngraph::op::Sum), &GPU_Emitter::emit<ngraph::op::Sum>},
{TI(ngraph::op::Exp), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Sin), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Sinh), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Cos), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Cosh), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Tan), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Tanh), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Asin), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Acos), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Atan), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Exp), &GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Sin), &GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Sinh), &GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Cos), &GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Cosh), &GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Tan), &GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Tanh), &GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Asin), &GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Acos), &GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Atan), &GPU_Emitter::EmitElementwise},
{TI(ngraph::op::ReplaceSlice), &GPU_Emitter::emit<ngraph::op::ReplaceSlice>},
{TI(ngraph::op::OneHot), &GPU_Emitter::emit<ngraph::op::OneHot>},
{TI(ngraph::op::Floor), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Ceiling), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Floor), &GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Ceiling), &GPU_Emitter::EmitElementwise},
{TI(ngraph::op::Sqrt), &GPU_Emitter::emit<ngraph::op::Sqrt>},
{TI(ngraph::op::Convolution), &GPU_Emitter::emit<ngraph::op::Convolution>},
{TI(ngraph::op::ConvolutionBackpropFilters),
&GPU_Emitter::emit<ngraph::op::ConvolutionBackpropFilters>},
{TI(ngraph::op::ConvolutionBackpropData),
&GPU_Emitter::emit<ngraph::op::ConvolutionBackpropData>},
{TI(ngraph::op::Not), &GPU_Emitter::EmitUnaryElementwise},
{TI(ngraph::op::Not), &GPU_Emitter::EmitElementwise},
{TI(ngraph::op::MaxPool), &GPU_Emitter::emit<ngraph::op::MaxPool>},
{TI(ngraph::op::Reverse), &GPU_Emitter::emit<ngraph::op::Reverse>},
{TI(ngraph::op::Result), &GPU_Emitter::emit<ngraph::op::Result>},
......
......@@ -608,7 +608,6 @@ TEST(${BACKEND_NAME}, concat_5d)
TEST(${BACKEND_NAME}, divide)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......@@ -1513,7 +1512,6 @@ TEST(${BACKEND_NAME}, select)
TEST(${BACKEND_NAME}, subtract)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
......@@ -3605,7 +3603,6 @@ TEST(${BACKEND_NAME}, sum_3d_to_vector_stable)
TEST(${BACKEND_NAME}, sign)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 3};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Sign>(A), op::ParameterVector{A});
......@@ -3626,7 +3623,6 @@ TEST(${BACKEND_NAME}, sign)
TEST(${BACKEND_NAME}, power)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
......@@ -3645,7 +3641,7 @@ TEST(${BACKEND_NAME}, power)
auto result = backend->make_primary_tensor_view(element::f32, shape);
cf->call({a, b}, {result});
EXPECT_EQ((vector<float>{1, 1, 729, 125}), read_vector<float>(result));
EXPECT_TRUE(test::all_close(vector<float>{1, 1, 729, 125}, read_vector<float>(result)));
}
TEST(${BACKEND_NAME}, constant_equality_bool)
......
......@@ -21,14 +21,18 @@
#include <memory>
#include "gtest/gtest.h"
#include "ngraph/file_util.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/log.hpp"
#include "ngraph/ngraph.hpp"
#include "ngraph/ops/batch_norm.hpp"
#include "ngraph/ops/get_output_element.hpp"
#include "ngraph/ops/parameter.hpp"
#include "ngraph/ops/sum.hpp"
#include "ngraph/pass/graph_rewrite.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/reshape_elimination.hpp"
#include "ngraph/pass/visualize_tree.hpp"
#include "ngraph/pattern/matcher.hpp"
#include "ngraph/pattern/op/any.hpp"
#include "ngraph/pattern/op/label.hpp"
......@@ -36,6 +40,7 @@
#include "ngraph/file_util.hpp"
#include "ngraph/pass/reshape_elimination.hpp"
#include "ngraph/pass/visualize_tree.hpp"
#include "ngraph/runtime/cpu/ops/conv_bias.hpp"
#include "ngraph/runtime/cpu/ops/matmul_bias.hpp"
#include "ngraph/runtime/cpu/ops/sigmoid.hpp"
#include "ngraph/runtime/cpu/pass/cpu_fusion.hpp"
......@@ -672,6 +677,190 @@ TEST(cpu_fusion, non_zero_padded_conv)
ASSERT_EQ(count_ops_of_type<op::Pad>(func), 1);
}
TEST(cpu_fusion, fuse_conv_bias)
{
pass::Manager pass_manager;
pass_manager.register_pass<ngraph::pass::ReshapeElimination>();
pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
const string json_path = file_util::path_join(SERIALIZED_ZOO, "conv_bias.json");
const string json_string = file_util::read_file_to_string(json_path);
stringstream ss(json_string);
shared_ptr<Function> func = ngraph::deserialize(ss);
pass_manager.run_passes(func);
size_t cb = count_ops_of_type<op::ConvolutionBias>(func);
ASSERT_GT(cb, 0);
}
struct ConvolutionBiasTestData
{
size_t n{0};
size_t c{0};
size_t filter{0};
size_t kernel_size{0};
size_t w{0};
size_t h{0};
shared_ptr<runtime::TensorView> data_val;
shared_ptr<runtime::TensorView> weights_val;
shared_ptr<runtime::TensorView> bias_val;
shared_ptr<runtime::TensorView> result_val;
shared_ptr<runtime::TensorView> delta_val;
shared_ptr<runtime::TensorView> d_data_val;
shared_ptr<runtime::TensorView> d_weights_val;
shared_ptr<runtime::TensorView> d_bias_val;
vector<float> expected_result_val;
vector<float> expected_d_data_val;
vector<float> expected_d_weights_val;
vector<float> expected_d_bias_val;
Shape data_shape;
Shape weights_shape;
Shape bias_shape;
Shape result_shape;
shared_ptr<op::Parameter> data;
shared_ptr<op::Parameter> weights;
shared_ptr<op::Parameter> bias;
shared_ptr<op::Parameter> delta;
void n1c1h3w3(shared_ptr<runtime::Backend> backend)
{
n = 1;
c = 1;
filter = 1;
kernel_size = 3;
w = 3;
h = w;
data_shape = Shape{n, c, h, w};
data = make_shared<op::Parameter>(element::f32, data_shape);
weights_shape = Shape{filter, c, kernel_size, kernel_size};
weights = make_shared<op::Parameter>(element::f32, weights_shape);
bias_shape = Shape{filter};
bias = make_shared<op::Parameter>(element::f32, bias_shape);
result_shape = Shape{n, filter, 1, 1};
data_val = backend->make_primary_tensor_view(element::f32, data_shape);
copy_data(data_val,
vector<float>{-0.67765152f,
0.10073948f,
0.57595438f,
-0.3469252f,
-0.22134334f,
-1.80471897f,
-0.80642909f,
1.22033095f,
2.23235631f});
weights_val = backend->make_primary_tensor_view(element::f32, weights_shape);
copy_data(weights_val,
vector<float>{0.20070229f,
-0.54968649f,
-0.19819015f,
-0.38577855f,
1.37109005f,
-0.23789984f,
0.14867957f,
-0.49851316f,
-0.84815776f});
bias_val = backend->make_primary_tensor_view(element::f32, bias_shape);
copy_data(bias_val, vector<float>{0.07811152f});
result_val = backend->make_primary_tensor_view(element::f32, result_shape);
copy_data(result_val, vector<float>{0});
delta = make_shared<op::Parameter>(element::f32, result_shape);
delta_val = backend->make_primary_tensor_view(element::f32, result_shape);
copy_data(delta_val, vector<float>{-2.58936238f});
d_data_val = backend->make_primary_tensor_view(element::f32, data_shape);
copy_data(d_data_val, vector<float>{0, 0, 0, 0, 0, 0, 0, 0, 0});
d_weights_val = backend->make_primary_tensor_view(element::f32, weights_shape);
copy_data(d_weights_val, vector<float>{0, 0, 0, 0, 0, 0, 0, 0, 0});
d_bias_val = backend->make_primary_tensor_view(element::f32, bias_shape);
copy_data(d_bias_val, vector<float>{0});
expected_result_val = vector<float>{-2.58936238f};
expected_d_data_val = vector<float>{-0.51969099f,
1.42333758f,
0.5131861f,
0.99892044f,
-3.5502491f,
0.61600888f,
-0.3849853f,
1.29083121f,
2.19618773f};
expected_d_weights_val = vector<float>{1.7546854f,
-0.26085103f,
-1.49135458f,
0.89831507f,
0.57313812f,
4.67307138f,
2.08813715f,
-3.15987897f,
-5.7803793f};
expected_d_bias_val = vector<float>{-2.58936238f};
}
};
TEST(cpu_fusion, conv_bias_fprop_n1c1h3w3)
{
auto manager = runtime::Manager::get("CPU");
auto backend = manager->allocate_backend();
ConvolutionBiasTestData conv_test;
conv_test.n1c1h3w3(backend);
auto convolution = make_shared<op::Convolution>(conv_test.data, conv_test.weights);
auto convolution_bias = make_shared<op::ConvolutionBias>(convolution, conv_test.bias);
auto f = make_shared<Function>(
convolution_bias, op::ParameterVector{conv_test.data, conv_test.weights, conv_test.bias});
auto external = manager->compile(f);
auto cf = backend->make_call_frame(external);
cf->call({conv_test.data_val, conv_test.weights_val, conv_test.bias_val},
{conv_test.result_val});
auto result_vec = read_vector<float>(conv_test.result_val);
EXPECT_TRUE(
test::all_close(conv_test.expected_result_val, read_vector<float>(conv_test.result_val)));
}
TEST(cpu_fusion, conv_bias_bprop_n1c1h3w3)
{
auto manager = runtime::Manager::get("CPU");
auto backend = manager->allocate_backend();
ConvolutionBiasTestData conv_test;
conv_test.n1c1h3w3(backend);
auto convolution = make_shared<op::Convolution>(conv_test.data, conv_test.weights);
auto convolution_bias = make_shared<op::ConvolutionBias>(convolution, conv_test.bias);
auto f = make_shared<Function>(
convolution_bias, op::ParameterVector{conv_test.data, conv_test.weights, conv_test.bias});
auto d_data = convolution_bias->backprop_node(conv_test.data, conv_test.delta);
auto d_weights = convolution_bias->backprop_node(conv_test.weights, conv_test.delta);
auto d_bias = convolution_bias->backprop_node(conv_test.bias, conv_test.delta);
auto df = make_shared<Function>(
NodeVector{d_data, d_weights, d_bias},
op::ParameterVector{conv_test.data, conv_test.weights, conv_test.bias, conv_test.delta});
auto external = manager->compile(df);
auto cf = backend->make_call_frame(external);
cf->call({conv_test.data_val, conv_test.weights_val, conv_test.bias_val, conv_test.delta_val},
{conv_test.d_data_val, conv_test.d_weights_val, conv_test.d_bias_val});
EXPECT_TRUE(
test::all_close(conv_test.expected_d_data_val, read_vector<float>(conv_test.d_data_val)));
EXPECT_TRUE(test::all_close(conv_test.expected_d_weights_val,
read_vector<float>(conv_test.d_weights_val)));
EXPECT_TRUE(
test::all_close(conv_test.expected_d_bias_val, read_vector<float>(conv_test.d_bias_val)));
}
TEST(cpu_fusion, sigmoid_fprop_fusion)
{
......
This diff is collapsed.