Commit 73fff9f4 authored by gaurides, committed by Scott Cyphers

CPU implementation of Gelu op (#3787)

* Initial implementation

* Fixed Gelu

* Gelu backprop initial implementation

* Add GeluBackprop fusion

* Gelu and gelu backprop fusion test cases

* Prevent decompose_op() for Gelu/GeluBackpropFactor for certain types

* Fixes and cleanup

* Enabled backprop fusion

* Fixed some issues

* Mostly cleanup

* Some more cleanup

* File permissions

* Remove unused variable

* Style check

* Address PR feedback

* Address PR feedback

* Incorporate changes related to latest master

* Style check

* Some more PR feedback related changes

* Remove comment

* Check for relative error

* Retrigger CI

* Corrected syntax
parent 7e78232a
......@@ -174,7 +174,7 @@ def test_gelu_operator_with_parameters():
result = computation(data_value)
expected = np.array([[-1.4901161e-06, 8.4134471e-01], [-4.5500278e-02, 2.9959502]],
dtype=np.float32)
assert np.allclose(result, expected)
assert np.allclose(result, expected, .007, .007)
def test_gelu_operator_with_array():
......@@ -189,7 +189,7 @@ def test_gelu_operator_with_array():
expected = np.array([[-1.4901161e-06, 8.4134471e-01], [-4.5500278e-02, 2.9959502]],
dtype=np.float32)
assert np.allclose(result, expected)
assert np.allclose(result, expected, .007, .007)
def test_clamp_operator():
......
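For reference, the expected values in these two tests follow the exact, erf-based GELU definition, GELU(x) = 0.5 * x * (1 + erf(x / sqrt(2))). np.allclose(result, expected, rtol, atol) passes when |result - expected| <= atol + rtol * |expected|, so the added arguments loosen the comparison to a 0.007 relative and absolute tolerance.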
......@@ -58,6 +58,7 @@ set(SRC
builder/erf.cpp
builder/gather.cpp
builder/gather_nd.cpp
builder/gelu.cpp
builder/leaky_relu.cpp
builder/lstm.cpp
builder/lrn.cpp
......@@ -107,6 +108,7 @@ set(SRC
op/convert_layout.cpp
op/deconv.cpp
op/dropout.cpp
op/gelu_backprop.cpp
op/group_conv_bias.cpp
op/halide_op.cpp
op/leaky_relu.cpp
......
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/op/fused/gelu.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/mkldnn_emitter.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/gelu_backprop.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
template <>
void Builder::BUILDER_DECL(ngraph::op::Gelu)
{
auto& functors = external_function->get_functors();
auto input_buffer_index = external_function->get_buffer_index(args[0].get_name());
auto out_buffer_index = external_function->get_buffer_index(out[0].get_name());
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto gelu_desc = mkldnn_emitter->get_gelu_forward_desc(node);
size_t scratchpad_size = QUERY_SCRATCHPAD(eltwise_forward, gelu_desc);
// Gelu needs 3 primitives: input, result, and eltwise_forward
auto gelu_index = mkldnn_emitter->reserve_primitive_space(3);
auto& deps = mkldnn_emitter->get_primitive_deps(gelu_index);
auto functor = [&,
gelu_desc,
gelu_index,
scratchpad_size,
input_buffer_index,
out_buffer_index](CPURuntimeContext* ctx,
CPUExecutionContext* /* ectx */) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_gelu(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
gelu_desc,
deps,
gelu_index);
}
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[0], ctx->buffer_data[input_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[1], ctx->buffer_data[out_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx,
gelu_index,
deps,
cpu::mkldnn_utils::OpType::GELU,
scratchpad_size);
};
functors.emplace_back(functor);
}
else
{
throw ngraph_error("Gelu is supported with MKLDNN kernel only for f32.");
}
}
template <>
void Builder::BUILDER_DECL(ngraph::op::GeluBackprop)
{
auto& functors = external_function->get_functors();
auto arg_fwd_buffer_index = external_function->get_buffer_index(args[0].get_name());
auto delta_buffer_index = external_function->get_buffer_index(args[1].get_name());
auto out_buffer_index = external_function->get_buffer_index(out[0].get_name());
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto bwd_desc = mkldnn_emitter->get_gelu_backward_desc(node);
auto fwd_desc = mkldnn_emitter->get_gelu_forward_desc(node);
size_t scratchpad_size =
QUERY_SCRATCHPAD_2ARGS(eltwise_backward, fwd_desc, bwd_desc);
// geluBackprop needs 4 primitives: input, delta, result, and eltwise_backward.
size_t gelu_b_index = mkldnn_emitter->reserve_primitive_space(4);
auto& deps = mkldnn_emitter->get_primitive_deps(gelu_b_index);
auto functor = [&,
bwd_desc,
fwd_desc,
gelu_b_index,
scratchpad_size,
arg_fwd_buffer_index,
delta_buffer_index,
out_buffer_index](CPURuntimeContext* ctx,
CPUExecutionContext* /* ectx */) {
if (ctx->first_iteration)
{
mkldnn_emitter->build_gelu_backward(ctx->mkldnn_memories,
ctx->mkldnn_primitives,
ctx->mkldnn_scratchpad_mds,
bwd_desc,
fwd_desc,
deps,
gelu_b_index);
}
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[0], ctx->buffer_data[arg_fwd_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[1], ctx->buffer_data[delta_buffer_index]);
cpu::mkldnn_utils::set_memory_ptr(
ctx, deps[2], ctx->buffer_data[out_buffer_index]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(
ctx,
gelu_b_index,
deps,
cpu::mkldnn_utils::OpType::GELUBACKPROP,
scratchpad_size);
};
functors.emplace_back(functor);
}
else
{
throw ngraph_error("GeluBackprop is supported only for f32 with mkldnn.");
}
}
void register_builders_gelu_cpp()
{
REGISTER_OP_BUILDER(Gelu);
REGISTER_OP_BUILDER(GeluBackprop);
}
}
}
}
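For a concrete picture of what the MKLDNN eltwise_gelu path above is expected to compute for f32 tensors, here is a minimal scalar sketch. It is illustrative only: the function name is not part of this commit, and the expression is the same erf-based reference used by the unit tests.

#include <cmath>
#include <cstddef>
#include <vector>

// Illustrative reference, not part of the commit: erf-based GELU applied elementwise.
std::vector<float> gelu_reference(const std::vector<float>& x)
{
    std::vector<float> y(x.size());
    for (std::size_t i = 0; i < x.size(); ++i)
    {
        y[i] = 0.5f * x[i] * (1.0f + std::erf(x[i] / std::sqrt(2.0f)));
    }
    return y;
}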
......@@ -42,6 +42,7 @@ namespace ngraph
register_builders_erf_cpp();
register_builders_gather_cpp();
register_builders_gather_nd_cpp();
register_builders_gelu_cpp();
register_builders_get_output_element_cpp();
register_builders_leaky_relu_cpp();
register_builders_lrn_cpp();
......
......@@ -41,6 +41,7 @@ namespace ngraph
void register_builders_erf_cpp();
void register_builders_gather_cpp();
void register_builders_gather_nd_cpp();
void register_builders_gelu_cpp();
void register_builders_get_output_element_cpp();
void register_builders_leaky_relu_cpp();
void register_builders_lrn_cpp();
......
......@@ -62,6 +62,7 @@
#include "ngraph/op/experimental/tile.hpp"
#include "ngraph/op/floor.hpp"
#include "ngraph/op/fused/conv_fused.hpp"
#include "ngraph/op/fused/gelu.hpp"
#include "ngraph/op/fused/group_conv.hpp"
#include "ngraph/op/gather.hpp"
#include "ngraph/op/gather_nd.hpp"
......@@ -123,6 +124,7 @@
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/runtime/cpu/op/deconv.hpp"
#include "ngraph/runtime/cpu/op/dropout.hpp"
#include "ngraph/runtime/cpu/op/gelu_backprop.hpp"
#include "ngraph/runtime/cpu/op/group_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/leaky_relu.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
......@@ -3657,6 +3659,61 @@ namespace ngraph
}
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::Gelu)
{
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
size_t gelu_index;
std::vector<std::size_t> deps;
size_t scratchpad_size;
emit_build_primitives(
external_function, node, writer, gelu_index, deps, scratchpad_size);
writer << "cg_ctx->set_memory_ptr(" << to_string(deps[0]) << ", "
<< args[0].get_name() << ");\n";
writer << "cg_ctx->set_memory_ptr(" << to_string(deps[1]) << ", "
<< out[0].get_name() << ");\n";
writer << "std::vector<size_t> deps{" << join(deps) << "};\n";
writer << "cg_ctx->mkldnn_invoke_primitive(" << to_string(gelu_index)
<< ", deps, OpType::GELU, " << to_string(scratchpad_size) << ");\n";
}
else
{
throw ngraph_error("Gelu is only supported with MKLDNN kernel for f32.");
}
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::GeluBackprop)
{
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
size_t gelu_bprop_index;
std::vector<std::size_t> deps;
size_t scratchpad_size;
emit_build_primitives(
external_function, node, writer, gelu_bprop_index, deps, scratchpad_size);
writer << "cg_ctx->set_memory_ptr(" << to_string(deps[0]) << ", "
<< args[0].get_name() << ");\n";
writer << "cg_ctx->set_memory_ptr(" << to_string(deps[1]) << ", "
<< args[1].get_name() << ");\n";
writer << "cg_ctx->set_memory_ptr(" << to_string(deps[2]) << ", "
<< out[0].get_name() << ");\n";
writer << "std::vector<size_t> deps{" << join(deps) << "};\n";
writer << "cg_ctx->mkldnn_invoke_primitive(" << to_string(gelu_bprop_index)
<< ", deps, OpType::GELUBACKPROP, " << to_string(scratchpad_size)
<< ");\n";
}
else
{
throw ngraph_error("GeluBackprop is only supported with MKLDNN for f32.");
}
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::Sigmoid)
{
......
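For illustration, the Gelu emitter above assembles generated code along the following lines. The primitive index, dependency indices, tensor names, and scratchpad size are placeholders, not values from an actual build:

// Hypothetical output of CPU_Emitter::emit<op::Gelu> for one node:
cg_ctx->set_memory_ptr(12, arg0_tensor);
cg_ctx->set_memory_ptr(13, out0_tensor);
std::vector<size_t> deps{12, 13};
cg_ctx->mkldnn_invoke_primitive(14, deps, OpType::GELU, 0);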
......@@ -37,6 +37,7 @@
#include "ngraph/op/topk.hpp"
#include "ngraph/runtime/cpu/cpu_external_function.hpp"
#include "ngraph/runtime/cpu/cpu_tensor_view_wrapper.hpp"
#include "ngraph/runtime/cpu/op/gelu_backprop.hpp"
#define EMITTER_DECL(op_name) \
emit<op_name>(CPU_ExternalFunction * external_function, \
......@@ -150,7 +151,9 @@ namespace ngraph
class Quantize;
class QuantizedConcat;
class Tile;
class Gelu;
class RandomUniform;
class GeluBackprop;
}
namespace runtime
{
......@@ -445,7 +448,11 @@ namespace ngraph
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::Tile);
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::Gelu);
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::RandomUniform);
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::GeluBackprop);
}
}
}
......@@ -84,6 +84,7 @@
#include "ngraph/op/experimental/tile.hpp"
#include "ngraph/op/floor.hpp"
#include "ngraph/op/fused/conv_fused.hpp"
#include "ngraph/op/fused/gelu.hpp"
#include "ngraph/op/fused/group_conv.hpp"
#include "ngraph/op/fused/lstm_cell.hpp"
#include "ngraph/op/fused/softmax_crossentropy.hpp"
......@@ -180,6 +181,7 @@
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/runtime/cpu/op/deconv.hpp"
#include "ngraph/runtime/cpu/op/dropout.hpp"
#include "ngraph/runtime/cpu/op/gelu_backprop.hpp"
#include "ngraph/runtime/cpu/op/group_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/leaky_relu.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
......@@ -449,6 +451,8 @@ static const runtime::cpu::OpMap dispatcher{
&runtime::cpu::CPU_Emitter::emit<ngraph::op::DeconvolutionBias>},
{TI(ngraph::op::Dropout), &runtime::cpu::CPU_Emitter::emit<op::Dropout>},
{TI(ngraph::op::Tile), &runtime::cpu::CPU_Emitter::emit<op::Tile>},
{TI(ngraph::op::Gelu), &runtime::cpu::CPU_Emitter::emit<op::Gelu>},
{TI(ngraph::op::GeluBackprop), &runtime::cpu::CPU_Emitter::emit<op::GeluBackprop>},
};
static void
......@@ -1201,6 +1205,25 @@ void runtime::cpu::CPU_ExternalFunction::register_common_passes(
return false;
}
}
else if (typeid(ngraph::op::GeluBackpropFactor) == typeid(node))
{
#if MKLDNN_VERSION_MAJOR < 1
return node.input(0).get_element_type() == element::f32;
#else
// TODO: will be supported in mkldnn v1.1
return false;
#endif
}
else if (typeid(ngraph::op::Gelu) == typeid(node))
{
#if MKLDNN_VERSION_MAJOR < 1
return node.input(0).get_element_type() == element::f32;
#else
// TODO: will be supported in mkldnn v1.1
return false;
#endif
}
if (dex)
{
auto handler = GetGlobalBuildDispatcher().find(type_index(typeid(node)));
......@@ -1411,6 +1434,11 @@ void runtime::cpu::CPU_ExternalFunction::build(ngraph::pass::PassConfig& pass_co
static StaticInitializers s_static_initializers(s_debug_dir);
m_mkldnn_emitter.reset(new MKLDNNEmitter());
ngraph::pass::Manager pass_manager;
if (std::getenv("NGRAPH_ENABLE_VISUALIZE_TRACING"))
{
// Visualize tracing disables per-pass validation; re-enable it here if needed for debugging
pass_manager.set_per_pass_validation(false);
}
register_common_passes(pass_manager, pass_config);
pass_manager.run_passes(m_function, false);
......
......@@ -45,6 +45,7 @@
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/runtime/cpu/op/deconv.hpp"
#include "ngraph/runtime/cpu/op/gelu_backprop.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
#include "ngraph/runtime/cpu/op/max_pool_with_indices.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
......@@ -386,6 +387,27 @@ mkldnn::eltwise_forward::desc MKLDNNEmitter::get_bounded_relu_desc(const ngraph:
0.0f);
}
mkldnn::eltwise_forward::desc MKLDNNEmitter::get_gelu_forward_desc(const ngraph::Node* node)
{
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
return mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_training,
mkldnn::algorithm::eltwise_gelu,
input_desc,
1.0f,
0.0f);
}
mkldnn::eltwise_backward::desc MKLDNNEmitter::get_gelu_backward_desc(const ngraph::Node* node)
{
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
const float negative_slope = 0.0f;
return mkldnn::eltwise_backward::desc(
mkldnn::algorithm::eltwise_gelu, result_desc, input_desc, negative_slope);
}
size_t MKLDNNEmitter::convolution_forward_init(bool with_bias)
{
size_t size = m_mkldnn_primitives.size();
......@@ -1362,6 +1384,56 @@ void MKLDNNEmitter::build_bounded_relu(std::vector<mkldnn::memory*>& mkldnn_memo
mkldnn_primitives[bounded_relu_index] = new mkldnn::eltwise_forward(bounded_relu_pd);
}
void MKLDNNEmitter::build_gelu(std::vector<mkldnn::memory*>& mkldnn_memories,
std::vector<mkldnn::primitive*>& mkldnn_primitives,
std::vector<mkldnn::memory::desc*>& mkldnn_scratchpad_mds,
const mkldnn::eltwise_forward::desc& gelu_desc,
const std::vector<size_t>& deps,
size_t gelu_index)
{
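// deps layout for this primitive (matching the three slots reserved by the builder):
// deps[0] = src memory, deps[1] = dst memory, and gelu_index holds the
// eltwise_forward primitive itself.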
mkldnn::primitive_attr attr;
attr.set_scratchpad_mode(mkldnn::scratchpad_mode::user);
auto gelu_pd =
mkldnn::eltwise_forward::primitive_desc(gelu_desc, attr, executor::global_cpu_engine);
mkldnn_scratchpad_mds[gelu_index] = new mkldnn::memory::desc(gelu_pd.scratchpad_desc());
size_t input_index = deps[0];
build_memory(mkldnn_memories, gelu_pd.src_desc(), input_index);
size_t result_index = deps[1];
build_memory(mkldnn_memories, gelu_pd.dst_desc(), result_index);
mkldnn_primitives[gelu_index] = new mkldnn::eltwise_forward(gelu_pd);
}
void MKLDNNEmitter::build_gelu_backward(std::vector<mkldnn::memory*>& mkldnn_memories,
std::vector<mkldnn::primitive*>& mkldnn_primitives,
std::vector<mkldnn::memory::desc*>& mkldnn_scratchpad_mds,
const mkldnn::eltwise_backward::desc& bwd_desc,
const mkldnn::eltwise_forward::desc& fwd_desc,
const std::vector<size_t>& deps,
size_t gelu_bprop_index)
{
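// deps layout: deps[0] = forward input (src), deps[1] = delta (diff_dst),
// deps[2] = result (input gradient), and gelu_bprop_index holds the
// eltwise_backward primitive itself.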
// gelu forward primitive desc
auto gelu_fwd_pd =
mkldnn::eltwise_forward::primitive_desc(fwd_desc, executor::global_cpu_engine);
mkldnn::primitive_attr attr;
attr.set_scratchpad_mode(mkldnn::scratchpad_mode::user);
auto gelu_bwd_pd = mkldnn::eltwise_backward::primitive_desc(
bwd_desc, attr, executor::global_cpu_engine, gelu_fwd_pd);
mkldnn_scratchpad_mds[gelu_bprop_index] =
new mkldnn::memory::desc(gelu_bwd_pd.scratchpad_desc());
size_t input_index = deps[0];
build_memory(mkldnn_memories, gelu_bwd_pd.src_desc(), input_index);
size_t delta_index = deps[1];
build_memory(mkldnn_memories, gelu_bwd_pd.diff_dst_desc(), delta_index);
size_t result_index = deps[2];
build_memory(mkldnn_memories, gelu_bwd_pd.diff_dst_desc(), result_index);
mkldnn_primitives[gelu_bprop_index] = new mkldnn::eltwise_backward(gelu_bwd_pd);
}
size_t MKLDNNEmitter::query_scratchpad_sum(const mkldnn::sum::primitive_desc pd)
{
mkldnn::memory::desc scratchpad_md = pd.scratchpad_desc();
......@@ -2333,4 +2405,51 @@ void MKLDNNEmitter::build_bounded_relu(
*mkldnn_primitives[input_index],
*mkldnn_primitives[result_index]);
}
void MKLDNNEmitter::build_gelu(std::vector<mkldnn::memory*>& /* mkldnn_memories */,
std::vector<mkldnn::primitive*>& mkldnn_primitives,
std::vector<mkldnn::memory::desc*>& /* mkldnn_scratchpad_mds */,
const mkldnn::eltwise_forward::desc& gelu_desc,
const std::vector<size_t>& deps,
size_t gelu_index)
{
size_t input_index = deps[0];
build_memory_primitive(mkldnn_primitives, gelu_desc.data.data_desc, input_index);
size_t result_index = deps[1];
build_memory_primitive(mkldnn_primitives, gelu_desc.data.data_desc, result_index);
mkldnn_primitives[gelu_index] =
new mkldnn::eltwise_forward({gelu_desc, executor::global_cpu_engine},
*mkldnn_primitives[input_index],
*mkldnn_primitives[result_index]);
}
void MKLDNNEmitter::build_gelu_backward(
std::vector<mkldnn::memory*>& /* mkldnn_memories */,
std::vector<mkldnn::primitive*>& mkldnn_primitives,
std::vector<mkldnn::memory::desc*>& /* mkldnn_scratchpad_mds */,
const mkldnn::eltwise_backward::desc& bwd_desc,
const mkldnn::eltwise_forward::desc& fwd_desc,
const std::vector<size_t>& deps,
size_t gelu_index)
{
size_t input_index = deps[0];
build_memory_primitive(mkldnn_primitives, bwd_desc.data.data_desc, input_index);
size_t delta_index = deps[1];
build_memory_primitive(mkldnn_primitives, bwd_desc.data.diff_data_desc, delta_index);
size_t result_index = deps[2];
build_memory_primitive(mkldnn_primitives, bwd_desc.data.data_desc, result_index);
// create forward gelu primitive descriptor
auto gelu_pd = mkldnn::eltwise_forward::primitive_desc(fwd_desc, executor::global_cpu_engine);
// create backward gelu primitive_descriptor
auto gelu_bwd_pd =
mkldnn::eltwise_backward::primitive_desc(bwd_desc, executor::global_cpu_engine, gelu_pd);
mkldnn_primitives[gelu_index] = new mkldnn::eltwise_backward(gelu_bwd_pd,
*mkldnn_primitives[input_index],
*mkldnn_primitives[delta_index],
*mkldnn_primitives[result_index]);
}
#endif
......@@ -38,6 +38,7 @@
#include "ngraph/op/experimental/quantized_conv_relu.hpp"
#include "ngraph/op/experimental/quantized_dot_bias.hpp"
#include "ngraph/op/fused/conv_fused.hpp"
#include "ngraph/op/fused/gelu.hpp"
#include "ngraph/op/fused/group_conv.hpp"
#include "ngraph/op/lrn.hpp"
#include "ngraph/op/max_pool.hpp"
......@@ -500,6 +501,10 @@ namespace ngraph
mkldnn::eltwise_forward::desc get_bounded_relu_desc(const ngraph::Node* node);
mkldnn::eltwise_forward::desc get_gelu_forward_desc(const ngraph::Node* node);
mkldnn::eltwise_backward::desc get_gelu_backward_desc(const ngraph::Node* node);
size_t build_dequantization(const ngraph::Node* node,
const mkldnn::memory::desc& input_desc,
const mkldnn::memory::desc& result_desc);
......@@ -1172,6 +1177,21 @@ namespace ngraph
const std::vector<size_t>& deps,
size_t bounded_relu_index);
void build_gelu(std::vector<mkldnn::memory*>& mkldnn_memories,
std::vector<mkldnn::primitive*>& mkldnn_primitives,
std::vector<mkldnn::memory::desc*>& mkldnn_scratchpad_mds,
const mkldnn::eltwise_forward::desc& gelu_desc,
const std::vector<size_t>& deps,
size_t gelu_index);
void build_gelu_backward(std::vector<mkldnn::memory*>& mkldnn_memories,
std::vector<mkldnn::primitive*>& mkldnn_primitives,
std::vector<mkldnn::memory::desc*>& mkldnn_scratchpad_mds,
const mkldnn::eltwise_backward::desc& bwd_desc,
const mkldnn::eltwise_forward::desc& fwd_desc,
const std::vector<size_t>& deps,
size_t gelu_index);
#if MKLDNN_VERSION_MAJOR >= 1
// TODO(jmenon): Get rid of TensorViewWrappers at some point
mkldnn::memory::desc
......
......@@ -79,6 +79,7 @@ extern "C" void
case OpType::AVGPOOL:
case OpType::BOUNDEDRELU:
case OpType::CONVERTLAYOUT:
case OpType::GELU:
case OpType::LEAKYRELU:
case OpType::LRN:
case OpType::MAXPOOL:
......@@ -200,6 +201,7 @@ extern "C" void
{MKLDNN_ARG_DIFF_SRC, *ctx->mkldnn_memories[deps[2]]}};
break;
case OpType::RELUBACKPROP:
case OpType::GELUBACKPROP:
case OpType::SIGMOIDBACKPROP:
exec_args = {{MKLDNN_ARG_SRC, *ctx->mkldnn_memories[deps[0]]},
{MKLDNN_ARG_DIFF_DST, *ctx->mkldnn_memories[deps[1]]},
......
......@@ -47,6 +47,8 @@ namespace ngraph
CONVOLUTIONBACKPROPDATA,
CONVOLUTIONBACKPROPWEIGHTS,
CONVOLUTIONBACKPROPWEIGHTSBIAS,
GELU,
GELUBACKPROP,
GROUPCONVOLUTION,
GROUPCONVOLUTIONBIAS,
DECONVOLUTIONBIAS,
......
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/runtime/cpu/op/gelu_backprop.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
constexpr NodeTypeInfo op::GeluBackprop::type_info;
op::GeluBackprop::GeluBackprop(const Output<ngraph::Node>& arg, const Output<ngraph::Node>& delta)
: BinaryElementwiseArithmetic(arg, delta)
{
constructor_validate_and_infer_types();
set_output_size(1);
set_output_type(0, get_input_element_type(0), arg.get_shape());
}
shared_ptr<Node> op::GeluBackprop::copy_with_new_args(const NodeVector& new_args) const
{
check_new_args_count(this, new_args);
return make_shared<GeluBackprop>(new_args.at(0), new_args.at(1));
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "ngraph/node.hpp"
#include "ngraph/op/op.hpp"
#include "ngraph/op/util/binary_elementwise_arithmetic.hpp"
#include "ngraph/runtime/cpu/cpu_backend_visibility.h"
namespace ngraph
{
namespace op
{
/// \brief Elementwise GeluBackprop operation.
///
class GeluBackprop : public ngraph::op::util::BinaryElementwiseArithmetic
{
public:
CPU_BACKEND_API
static constexpr NodeTypeInfo type_info{"GeluBackprop", 0};
const NodeTypeInfo& get_type_info() const override { return type_info; }
/// \brief Constructs a GeluBackprop operation.
///
/// \param arg Node that produces the gelu forward input tensor.
/// \param delta Node that produces the delta (adjoint) tensor.
GeluBackprop(const Output<ngraph::Node>& arg, const Output<ngraph::Node>& delta);
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
};
}
}
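A minimal usage sketch for the op declared above. This is illustrative only: in practice the CPU backend creates GeluBackprop through the CPUFusion pass from a GeluBackpropFactor * delta subgraph rather than by hand, and the include list here is abbreviated.

#include "ngraph/function.hpp"
#include "ngraph/op/parameter.hpp"
#include "ngraph/runtime/cpu/op/gelu_backprop.hpp"

using namespace ngraph;

// Builds a tiny graph whose single result is the fused backprop node.
std::shared_ptr<Function> make_gelu_backprop_graph()
{
    Shape shape{2, 3};
    auto x = std::make_shared<op::Parameter>(element::f32, shape);
    auto delta = std::make_shared<op::Parameter>(element::f32, shape);
    auto gbp = std::make_shared<op::GeluBackprop>(x, delta);
    return std::make_shared<Function>(NodeVector{gbp}, ParameterVector{x, delta});
}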
......@@ -36,6 +36,7 @@
#include "ngraph/op/experimental/quantized_conv_relu.hpp"
#include "ngraph/op/experimental/quantized_dot_bias.hpp"
#include "ngraph/op/fused/conv_fused.hpp"
#include "ngraph/op/fused/gelu.hpp"
#include "ngraph/op/fused/group_conv.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/lrn.hpp"
......@@ -55,6 +56,7 @@
#include "ngraph/runtime/cpu/op/conv_add.hpp"
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/deconv.hpp"
#include "ngraph/runtime/cpu/op/gelu_backprop.hpp"
#include "ngraph/runtime/cpu/op/group_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/leaky_relu.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
......@@ -739,6 +741,48 @@ namespace ngraph
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::Gelu)
{
(void)external_function;
auto gelu = static_cast<ngraph::op::Gelu*>(node);
if (node->get_input_element_type(0) == element::f32)
{
auto op_annotations =
std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->set_mkldnn_op(true);
runtime::cpu::mkldnn_utils::assign_mkldnn_kernel(node);
if (get_user_count(node->get_argument(0).get()) == 1)
{
// Safe to overwrite input
op_annotations->add_in_place_oi_pair({0, 0, true});
}
gelu->set_op_annotations(op_annotations);
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::GeluBackprop)
{
(void)external_function;
auto gelu = static_cast<ngraph::op::GeluBackprop*>(node);
if (node->get_input_element_type(0) == element::f32)
{
auto op_annotations =
std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->set_mkldnn_op(true);
runtime::cpu::mkldnn_utils::assign_mkldnn_kernel(node);
if (get_user_count(node->get_argument(0).get()) == 1)
{
// Safe to overwrite input
op_annotations->add_in_place_oi_pair({0, 0, true});
}
gelu->set_op_annotations(op_annotations);
}
}
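// Note (illustrative): in the two assignments above, add_in_place_oi_pair({0, 0, true})
// marks output 0 as allowed to reuse, and destroy, input 0's buffer, which is why the
// pair is only requested when the forward input has a single user.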
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::CPULeakyRelu)
{
......@@ -1055,6 +1099,9 @@ static const runtime::cpu::pass::AssignOpMap s_dispatcher{
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::DeconvolutionBias>},
{TI(ngraph::op::ScatterAdd),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ScatterAdd>},
{TI(ngraph::op::Gelu), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Gelu>},
{TI(ngraph::op::GeluBackprop),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::GeluBackprop>},
};
bool runtime::cpu::pass::CPUAssignment::run_on_call_graph(
......
......@@ -44,6 +44,7 @@
#include "ngraph/op/experimental/quantized_conv_bias.hpp"
#include "ngraph/op/experimental/quantized_conv_relu.hpp"
#include "ngraph/op/fused/conv_fused.hpp"
#include "ngraph/op/fused/gelu.hpp"
#include "ngraph/op/fused/group_conv.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/max_pool.hpp"
......@@ -75,6 +76,7 @@
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/deconv.hpp"
#include "ngraph/runtime/cpu/op/dropout.hpp"
#include "ngraph/runtime/cpu/op/gelu_backprop.hpp"
#include "ngraph/runtime/cpu/op/group_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/leaky_relu.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
......@@ -1183,6 +1185,76 @@ void ngraph::runtime::cpu::pass::CPUFusion::construct_dropout()
this->add_matcher(m, callback);
}
#if MKLDNN_VERSION_MAJOR < 1
void ngraph::runtime::cpu::pass::CPUFusion::construct_gelubackprop()
{
Shape shape{2, 2, 1, 1};
auto input = std::make_shared<pattern::op::Label>(element::f32, shape);
auto gbpfactor = std::make_shared<ngraph::op::GeluBackpropFactor>(input);
auto gbpfactor_label =
std::make_shared<pattern::op::Label>(gbpfactor, nullptr, NodeVector{gbpfactor});
auto delta = std::make_shared<pattern::op::Label>(element::f32, shape);
auto mult = std::make_shared<ngraph::op::Multiply>(gbpfactor, delta);
auto mult_label = std::make_shared<pattern::op::Label>(mult, nullptr, NodeVector{mult});
auto callback = [input, delta, gbpfactor_label, mult_label](pattern::Matcher& m) {
NGRAPH_DEBUG << "In callback for construct_gelubackprop against "
<< m.get_match_root()->get_name();
auto pattern_map = m.get_pattern_map();
if (m.get_match_root()->get_element_type() != element::f32)
{
NGRAPH_DEBUG << "mpattern = " << m.get_match_root()->get_name()
<< " type is not float!";
return false;
}
auto m_mult = std::static_pointer_cast<ngraph::op::Multiply>(m.get_match_root());
auto m_gbpfactor = std::static_pointer_cast<ngraph::op::GeluBackpropFactor>(
m.get_match_root()->get_argument(0));
if (m_gbpfactor->get_users().size() > 1)
{
NGRAPH_DEBUG << "GeluBackpropFactor has more than one user";
return false;
}
const PartialShape& mult1_shape = m_mult->get_input_partial_shape(0);
const PartialShape& mult2_shape = m_mult->get_input_partial_shape(1);
if (mult1_shape.rank().is_dynamic() || mult2_shape.rank().is_dynamic())
{
NGRAPH_DEBUG << "In construct_gelubackprop: some shapes are dynamic.";
return false;
}
if (pattern_map[input]->get_element_type() != pattern_map[delta]->get_element_type())
{
NGRAPH_DEBUG << "In construct_gelubackprop: types mismatch\n";
return false;
}
if (m_mult->get_argument(0)->get_shape() != m_mult->get_argument(1)->get_shape())
{
NGRAPH_DEBUG << "Input shapes for mult are different. shape1: "
<< m_mult->get_argument(0)->get_shape()
<< ", shape2: " << m_mult->get_argument(1)->get_shape() << "\n";
return false;
}
// No further checks needed.
auto gbp_n =
std::make_shared<ngraph::op::GeluBackprop>(pattern_map[input], pattern_map[delta]);
ngraph::replace_node(m.get_match_root(), gbp_n);
return true;
};
auto m = std::make_shared<pattern::Matcher>(mult, "CPUFusion.GeluBackprop");
this->add_matcher(m, callback);
}
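// Illustrative summary of the rewrite performed by this matcher once the
// f32 and static-shape checks above pass:
//
//     GeluBackpropFactor(input) --\
//                                  Multiply   ==>   GeluBackprop(input, delta)
//     delta ----------------------/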
#endif
void ngraph::runtime::cpu::pass::CPUFusion::construct_conv_bias_add_relu()
{
Shape shape{2, 2, 1, 1};
......
......@@ -96,6 +96,9 @@ public:
}
construct_dropout();
construct_batch_norm_infer_relu_with_multiply_add();
#if MKLDNN_VERSION_MAJOR < 1
construct_gelubackprop();
#endif
}
}
......@@ -128,6 +131,9 @@ private:
void construct_deconvolution_affine_folding();
void construct_deconvolution_affine_folding_relu();
void construct_dropout();
#if MKLDNN_VERSION_MAJOR < 1
void construct_gelubackprop();
#endif
};
class CPU_BACKEND_API ngraph::runtime::cpu::pass::CPUQuantFusion : public ngraph::pass::GraphRewrite
......
......@@ -30,6 +30,7 @@
#include "ngraph/op/experimental/quantized_conv_bias.hpp"
#include "ngraph/op/experimental/quantized_conv_relu.hpp"
#include "ngraph/op/experimental/quantized_dot_bias.hpp"
#include "ngraph/op/fused/gelu.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/lrn.hpp"
#include "ngraph/op/max_pool.hpp"
......@@ -47,6 +48,7 @@
#include "ngraph/runtime/cpu/mkldnn_emitter.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/runtime/cpu/op/gelu_backprop.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
#include "ngraph/runtime/cpu/op/max_pool_with_indices.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
......@@ -2229,6 +2231,111 @@ namespace ngraph
construct_string = writer.get_code();
}
template <>
void MKLDNNPrimitiveBuildPass::CONSTRUCT_PRIMITIVE_BUILD_STRING_DECL(Gelu)
{
auto gelu_node = static_cast<const ngraph::op::Gelu*>(node);
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
// query scratchpad size
auto gelu_desc = mkldnn_emitter.get_gelu_forward_desc(node);
scratchpad_size = mkldnn_emitter.query_scratchpad_eltwise_forward(gelu_desc);
// Gelu needs 3 primitives: input, result, and eltwise_forward.
index = mkldnn_emitter.reserve_primitive_space(3);
deps = mkldnn_emitter.get_primitive_deps(index);
CodeWriter writer;
// Write memory descriptors to file
std::vector<mkldnn::memory::desc> descs = {input_desc, result_desc};
auto desc_index = mkldnn_emitter.get_mkldnn_descriptors_size();
mkldnn_emitter.reserve_descriptor_space(descs.size());
serialize_memory_descs(desc_file, descs, deps[0]);
writer << "auto gelu_desc = "
"mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward, "
"mkldnn::algorithm::eltwise_gelu, "
"*cg_ctx->mkldnn_descriptors["
<< desc_index << "], 1.0f, 0.0f);\n";
writer << "mkldnn::primitive_attr attr;\n";
writer << "attr.set_scratchpad_mode(mkldnn::scratchpad_mode::user);\n";
writer << "\n// create gelu primitive_descriptor\n";
writer << "auto gelu_pd = "
"mkldnn::eltwise_forward::primitive_desc(gelu_desc, attr, "
"cg_ctx->global_cpu_engine);\n";
writer << "\n// build primitive\n";
writer << "cg_ctx->mkldnn_primitives[" << std::to_string(index)
<< "] = new mkldnn::eltwise_forward(gelu_pd);\n";
writer << "cg_ctx->mkldnn_scratchpad_mds[" << std::to_string(index)
<< "] = new mkldnn::memory::desc(gelu_pd.scratchpad_desc());\n";
construct_string = writer.get_code();
}
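// For illustration, the writer above assembles roughly the following generated
// code for one Gelu node (the descriptor index 7 and primitive index 9 are
// placeholders):
//
//   auto gelu_desc = mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward,
//       mkldnn::algorithm::eltwise_gelu, *cg_ctx->mkldnn_descriptors[7], 1.0f, 0.0f);
//   mkldnn::primitive_attr attr;
//   attr.set_scratchpad_mode(mkldnn::scratchpad_mode::user);
//   auto gelu_pd = mkldnn::eltwise_forward::primitive_desc(gelu_desc, attr,
//       cg_ctx->global_cpu_engine);
//   cg_ctx->mkldnn_primitives[9] = new mkldnn::eltwise_forward(gelu_pd);
//   cg_ctx->mkldnn_scratchpad_mds[9] = new mkldnn::memory::desc(gelu_pd.scratchpad_desc());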
template <>
void MKLDNNPrimitiveBuildPass::CONSTRUCT_PRIMITIVE_BUILD_STRING_DECL(GeluBackprop)
{
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto delta_desc = mkldnn_utils::get_input_mkldnn_md(node, 1);
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
// query scratchpad size
auto fwd_desc = mkldnn_emitter.get_gelu_forward_desc(node);
auto bwd_desc = mkldnn_emitter.get_gelu_backward_desc(node);
scratchpad_size =
mkldnn_emitter.query_scratchpad_eltwise_backward(fwd_desc, bwd_desc);
// GeluBackprop needs 4 primitives: input, delta, result, and
// eltwise_backward.
index = mkldnn_emitter.reserve_primitive_space(4);
deps = mkldnn_emitter.get_primitive_deps(index);
CodeWriter writer;
// Write memory descriptors to file
std::vector<mkldnn::memory::desc> descs = {input_desc, delta_desc, result_desc};
auto desc_index = mkldnn_emitter.get_mkldnn_descriptors_size();
mkldnn_emitter.reserve_descriptor_space(descs.size());
serialize_memory_descs(desc_file, descs, deps[0]);
writer << "auto fwd_desc = "
"mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward, "
"mkldnn::algorithm::eltwise_gelu, "
"*cg_ctx->mkldnn_descriptors["
<< desc_index << "], 0, 0);\n";
writer << "auto bwd_desc = "
"mkldnn::eltwise_backward::desc(mkldnn::algorithm::eltwise_gelu, "
"*cg_ctx->mkldnn_descriptors["
<< desc_index + 1 << "], "
"*cg_ctx->mkldnn_descriptors["
<< desc_index << "], 0, 0);\n";
writer << "mkldnn::primitive_attr attr;\n";
writer << "attr.set_scratchpad_mode(mkldnn::scratchpad_mode::user);\n";
writer << "\n// create forward gelu primitive descriptor\n";
writer << "auto gelu_fwd_pd = "
"mkldnn::eltwise_forward::primitive_desc(fwd_desc, "
"cg_ctx->global_cpu_engine);\n";
writer << "\n// create backward gelu primitive_descriptor\n";
writer << "auto gelu_bwd_pd = "
"mkldnn::eltwise_backward::primitive_desc(bwd_desc, attr, "
"cg_ctx->global_cpu_engine, gelu_fwd_pd);\n";
writer << "\n// build primitive\n";
writer << "cg_ctx->mkldnn_primitives[" << std::to_string(index)
<< "] = new mkldnn::eltwise_backward(gelu_bwd_pd);\n";
writer << "cg_ctx->mkldnn_scratchpad_mds[" << std::to_string(index)
<< "] = new mkldnn::memory::desc(gelu_bwd_pd.scratchpad_desc());\n";
construct_string = writer.get_code();
}
template <>
void MKLDNNPrimitiveBuildPass::CONSTRUCT_PRIMITIVE_BUILD_STRING_DECL(Sigmoid)
{
......
......@@ -22,5 +22,9 @@ lrn_across_nw
lrn_across_empty
lrn_6D_across_2_axes
# GeluBackpropFactor tests are not supported in the CPU backend; it uses the fused MKLDNN GeluBackprop (not the factor op)
gelu_backprop_factor_f32
backwards_gelu_f32
# ONNX TopK with dynamic K
top_k_opset_10
......@@ -20,6 +20,7 @@
#include <cstdlib>
#include <random>
#include <string>
#include "util/random.hpp"
// clang-format off
#ifdef ${BACKEND_NAME}_FLOAT_TOLERANCE_BITS
......@@ -47,25 +48,34 @@ static string s_manifest = "${MANIFEST}";
NGRAPH_TEST(${BACKEND_NAME}, gelu_f32)
{
Shape shape{8};
Shape shape{100000};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Gelu>(A), ParameterVector{A});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
test::Uniform<float> rng(-100.0f, 100.0f);
vector<vector<float>> args;
for (shared_ptr<op::Parameter> param : f->get_parameters())
{
auto name = param->get_name();
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape);
vector<float> input{-4.0f, -3.0f, -2.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f};
copy_data(a, input);
copy_data(a, args[0]);
auto result = backend->create_tensor(element::f32, shape);
std::transform(input.begin(), input.end(), input.begin(), [](float x) -> float {
std::transform(args[0].begin(), args[0].end(), args[0].begin(), [](float x) -> float {
return 0.5f * x * (1.0f + erf(x / sqrt(2.0f)));
});
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a});
EXPECT_TRUE(test::all_close_f(input, read_vector<float>(result)));
EXPECT_TRUE(test::all_close(args[0], read_vector<float>(result), .007f, .007f));
}
NGRAPH_TEST(${BACKEND_NAME}, gelu_f64)
......
......@@ -33,6 +33,7 @@
#include "ngraph/op/experimental/generate_mask.hpp"
#include "ngraph/op/experimental/quantized_conv_bias.hpp"
#include "ngraph/op/fused/conv_fused.hpp"
#include "ngraph/op/fused/gelu.hpp"
#include "ngraph/op/fused/group_conv.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/max_pool.hpp"
......@@ -66,6 +67,7 @@
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/runtime/cpu/op/deconv.hpp"
#include "ngraph/runtime/cpu/op/dropout.hpp"
#include "ngraph/runtime/cpu/op/gelu_backprop.hpp"
#include "ngraph/runtime/cpu/op/group_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/leaky_relu.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
......@@ -1081,6 +1083,73 @@ TEST(cpu_fusion, conv_add)
EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0)));
}
#if MKLDNN_VERSION_MAJOR < 1
static double gelu_backprop_factor(double x)
{
auto pi = 4.0 * std::atan(1.0);
return 0.5 * (1.0 + erf(x * sqrt(1.0 / 2.0))) + (x * exp(-x * x / 2.0)) / sqrt(2.0 * pi);
}
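// Note: gelu_backprop_factor(x) above is the analytical derivative of the
// erf-based GELU, d/dx [0.5 * x * (1 + erf(x / sqrt(2)))]
//   = 0.5 * (1 + erf(x / sqrt(2))) + x * exp(-x^2 / 2) / sqrt(2 * pi),
// which the fused GeluBackprop kernel is expected to multiply by delta.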
TEST(cpu_fusion, fuse_gelu_backprop_f32)
{
Shape shape_a{2, 1, 60, 60};
auto make_function = [shape_a]() {
auto A = std::make_shared<op::Parameter>(element::f32, shape_a);
auto gbpfactor = std::make_shared<op::GeluBackpropFactor>(A);
auto delta = std::make_shared<op::Parameter>(element::f32, shape_a);
auto gbp = gbpfactor * delta;
auto f = make_shared<Function>(NodeVector{gbp}, ParameterVector{A, delta});
return f;
};
auto fuse_func = make_function();
// Test fusion
{
pass::Manager pass_manager;
pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
pass_manager.run_passes(fuse_func);
ASSERT_EQ(count_ops_of_type<op::GeluBackprop>(fuse_func), 1);
}
// Test values
{
test::Uniform<float> rng(1.0f, 100.0f);
vector<vector<float>> args;
for (shared_ptr<op::Parameter> param : fuse_func->get_parameters())
{
auto name = param->get_name();
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape_a);
auto delta = backend->create_tensor(element::f32, shape_a);
copy_data(a, args[0]);
copy_data(delta, args[1]);
auto result = backend->create_tensor(element::f32, shape_a);
std::transform(args[0].begin(), args[0].end(), args[0].begin(), [](float x) -> float {
return static_cast<float>(gelu_backprop_factor(static_cast<double>(x)));
});
std::transform(args[0].begin(),
args[0].end(),
args[1].begin(),
args[0].begin(),
[](float x, float delta) -> float { return static_cast<float>(x * delta); });
auto handle = backend->compile(fuse_func);
handle->call_with_validate({result}, {a, delta});
EXPECT_TRUE(test::all_close(args[0], read_vector<float>(result), 0.007f, 0.007f));
}
}
#endif
shared_ptr<Function> gen_deconv(const bool add_goe)
{
Shape conv_out_shape{100, 64, 1, 1};
......