Commit 6cd77ff2 authored by Jayaram Bobba's avatar Jayaram Bobba Committed by Scott Cyphers

Inplace convert for bitcasting ops (#2446)

* Inplace convert for bitcasting ops

* Don't pass through blocked layouts

* More checks for types not handled by mkldnn
parent 39fe4f24
......@@ -103,7 +103,10 @@ namespace ngraph
auto functor = [&, kernel, element_count](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
kernel(arg_tensor, out_tensor, element_count, ectx->arena);
if (arg_tensor != out_tensor)
{
kernel(arg_tensor, out_tensor, element_count, ectx->arena);
}
};
functors.emplace_back(functor);
}
......
......@@ -1386,6 +1386,8 @@ namespace ngraph
{
auto& result_element_type = out[0].get_element_type();
writer << "if ((void*)" << out[0].get_name() << " != (void*)" << args[0].get_name()
<< ") \n";
writer.block_begin();
writer << "#pragma omp parallel for\n";
writer << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
......
......@@ -289,6 +289,17 @@ mkldnn::memory::desc runtime::cpu::mkldnn_utils::create_default_mkldnn_md(
return memory::desc(memory::dims(shape.begin(), shape.end()), et, format);
}
// Returns true iff the element type maps to a defined mkldnn data type,
// i.e. an mkldnn memory descriptor can be created for it.
bool runtime::cpu::mkldnn_utils::can_create_mkldnn_md(const ngraph::element::Type type)
{
    const auto& type_map = get_mkldnn_data_type_map();
    const auto entry = type_map.find(type);
    return entry != type_map.end() &&
           entry->second != mkldnn::memory::data_type::data_undef;
}
bool runtime::cpu::mkldnn_utils::can_create_mkldnn_md(const Shape& dims,
const Strides& strides,
const ngraph::element::Type type)
......
......@@ -51,6 +51,7 @@ namespace ngraph
bool is_output,
mkldnn::memory::format format);
bool is_perm_sorted(const Strides& a, const AxisVector& perm);
bool can_create_mkldnn_md(const ngraph::element::Type type);
bool can_create_mkldnn_md(const Shape& dims,
const Strides& strides,
const ngraph::element::Type type);
......
......@@ -30,6 +30,7 @@
#include "ngraph/op/batch_norm.hpp"
#include "ngraph/op/concat.hpp"
#include "ngraph/op/constant.hpp"
#include "ngraph/op/convert.hpp"
#include "ngraph/op/convolution.hpp"
#include "ngraph/op/dequantize.hpp"
#include "ngraph/op/experimental/quantized_avg_pool.hpp"
......@@ -670,39 +671,26 @@ namespace ngraph
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::QuantizedConvolutionRelu)
{
    // Assign the mkldnn kernel unconditionally. The old u8/i8 input-type
    // guard was removed; keeping both the guarded call and the unguarded
    // call (as the text above did) would assign the kernel twice whenever
    // the guard held.
    runtime::cpu::mkldnn_utils::assign_mkldnn_kernel(node);
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::QuantizedConvolutionBias)
{
    // Assign the mkldnn kernel unconditionally. The old u8/i8 input-type
    // guard was removed; retaining both the guarded and the unguarded call
    // (as the text above did) would assign the kernel twice whenever the
    // guard held.
    runtime::cpu::mkldnn_utils::assign_mkldnn_kernel(node);
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::QuantizedConvolutionBiasAdd)
{
    auto quantized_conv_bias = static_cast<op::QuantizedConvolutionBiasAdd*>(node);
    // Build and attach the op annotations exactly once. The previous
    // type-guarded copy of this code duplicated the unconditional version
    // below, constructing and assigning the annotations twice when the
    // guard held.
    auto op_annotations =
        std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
    op_annotations->set_mkldnn_op(true);
    const int ADD_INPUT = 3;
    // Accumulates conv into the second input of the unfused add
    op_annotations->add_in_place_oi_pair({0, ADD_INPUT, true});
    quantized_conv_bias->set_op_annotations(op_annotations);
}
template <>
......@@ -710,18 +698,13 @@ namespace ngraph
{
auto quantized_conv_bias =
static_cast<op::QuantizedConvolutionBiasSignedAdd*>(node);
if (node->get_input_element_type(0) == element::u8 &&
node->get_input_element_type(1) == element::i8 &&
node->get_input_element_type(3) == element::i8)
{
auto op_annotations =
std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->set_mkldnn_op(true);
const int ADD_INPUT = 3;
// Accumulates conv into the second input of the unfused add
op_annotations->add_in_place_oi_pair({0, ADD_INPUT, true});
quantized_conv_bias->set_op_annotations(op_annotations);
}
auto op_annotations =
std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->set_mkldnn_op(true);
const int ADD_INPUT = 3;
// Accumulates conv into the second input of the unfused add
op_annotations->add_in_place_oi_pair({0, ADD_INPUT, true});
quantized_conv_bias->set_op_annotations(op_annotations);
}
template <>
......@@ -810,6 +793,22 @@ namespace ngraph
}
runtime::cpu::mkldnn_utils::assign_mkldnn_kernel(node);
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::Convert)
{
    auto convert = static_cast<op::Convert*>(node);
    const auto& in_type = node->get_input_element_type(0);
    const auto& out_type = node->get_output_element_type(0);
    // An i8 <-> u8 conversion is a pure bitcast, so the output is allowed to
    // alias the input buffer (in-place pair {0, 0}, non-destructive).
    const bool is_bitcast = (in_type == element::i8 && out_type == element::u8) ||
                            (in_type == element::u8 && out_type == element::i8);
    if (is_bitcast)
    {
        auto op_annotations =
            std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
        op_annotations->add_in_place_oi_pair({0, 0, false});
        convert->set_op_annotations(op_annotations);
    }
}
}
}
}
......@@ -820,6 +819,7 @@ namespace ngraph
static const runtime::cpu::pass::AssignOpMap s_dispatcher{
{TI(ngraph::op::Add), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Add>},
{TI(ngraph::op::Concat), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Concat>},
{TI(ngraph::op::Convert), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Convert>},
{TI(ngraph::op::AvgPool), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::AvgPool>},
{TI(ngraph::op::AvgPoolBackprop),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::AvgPoolBackprop>},
......
......@@ -31,6 +31,7 @@
#include "ngraph/op/avg_pool.hpp"
#include "ngraph/op/batch_norm.hpp"
#include "ngraph/op/concat.hpp"
#include "ngraph/op/convert.hpp"
#include "ngraph/op/convolution.hpp"
#include "ngraph/op/experimental/quantized_avg_pool.hpp"
#include "ngraph/op/experimental/quantized_conv.hpp"
......@@ -1936,6 +1937,31 @@ namespace ngraph
set_native_layouts(external_function, node);
}
}
template <>
void CPULayout::LAYOUT_DECL(ngraph::op::Convert)
{
    auto input_md = mkldnn_utils::get_input_mkldnn_md(node.get(), 0);
    auto tv = node->get_output_tensor_ptr(0);
    // The input layout can only be propagated when it is a concrete,
    // non-blocked mkldnn format and the output element type is one mkldnn
    // can describe.
    const bool can_pass_through =
        input_md.data.format != mkldnn_blocked &&
        input_md.data.format != mkldnn_format_undef &&
        mkldnn_utils::can_create_mkldnn_md(tv->get_element_type());
    if (can_pass_through)
    {
        // Reuse the input's format for the output descriptor.
        vector<memory::desc> o_mds;
        o_mds.push_back(mkldnn_utils::create_default_mkldnn_md(
            node.get(), 0, true, static_cast<memory::format>(input_md.data.format)));
        set_output_layouts(node, o_mds);
    }
    else
    {
        // Cannot pass through layout information for blocked layouts at the moment
        set_native_layouts(external_function, node);
    }
}
}
}
}
......@@ -1945,6 +1971,7 @@ namespace ngraph
static const runtime::cpu::pass::LayoutOpMap s_dispatcher{
{TI(ngraph::op::Concat), &runtime::cpu::pass::CPULayout::layout<ngraph::op::Concat>},
{TI(ngraph::op::Convert), &runtime::cpu::pass::CPULayout::layout<ngraph::op::Convert>},
{TI(ngraph::op::AvgPool), &runtime::cpu::pass::CPULayout::layout<ngraph::op::AvgPool>},
{TI(ngraph::op::AvgPoolBackprop),
&runtime::cpu::pass::CPULayout::layout<ngraph::op::AvgPoolBackprop>},
......
......@@ -668,3 +668,24 @@ TEST(cpu_test, convolution_large_padding)
auto cpu_f = make_function();
compare_backends(int_f, cpu_f, "INTERPRETER", "CPU", 1e-4, 1e-4);
}
TEST(cpu_test, convert_inplace)
{
    // Exercise a u8 -> i8 Convert that the CPU backend may execute in place:
    // 254 + 1 wraps to 255 in u8, bitcasts to -1 in i8, and -1 - 1 == -2.
    Shape shape{2, 2};
    auto input = make_shared<op::Parameter>(element::u8, shape);
    auto ones_u8 = op::Constant::create(element::u8, shape, {1, 1, 1, 1});
    auto ones_i8 = op::Constant::create(element::i8, shape, {1, 1, 1, 1});
    auto f = make_shared<Function>(
        make_shared<op::Convert>(input + ones_u8, element::i8) - ones_i8,
        ParameterVector{input});

    auto backend = runtime::Backend::create("CPU");

    // Create some tensors for input/output
    auto a = backend->create_tensor(element::u8, shape);
    copy_data(a, vector<uint8_t>{1, 2, 3, 254});
    auto result = backend->create_tensor(element::i8, shape);

    auto handle = backend->compile(f);
    backend->call_with_validate(handle, {result}, {a});
    EXPECT_EQ((vector<int8_t>{1, 2, 3, -2}), read_vector<int8_t>(result));
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment