Commit 4ec19b95 authored by nishant.b.patel

Add support for int32 output in reference kernel

parent b65e32e2
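
The hunks below add <uint8_t, uint8_t, int32_t, int32_t> and <uint8_t, int8_t, int32_t, int32_t> instantiations of the convolution kernel to the CPU builder and the reference-kernel dispatch, plus tests that request element::i32 output. As a minimal sketch of why the output type is widened to int32 (an illustrative helper only, not ngraph's actual kernel signature; the function name and zero-point parameters here are assumptions):

#include <cstddef>
#include <cstdint>

// Quantized ops multiply zero-point-adjusted 8-bit values; the products and
// their running sum easily exceed the 8-bit range, so accumulation happens in
// int32. An i32 output type stores that accumulator directly instead of
// requantizing it back down to u8/i8.
int32_t quantized_dot(const uint8_t* input,
                      const int8_t* filter,
                      std::size_t n,
                      int32_t input_zero_point,
                      int32_t filter_zero_point)
{
    int32_t acc = 0;
    for (std::size_t i = 0; i < n; ++i)
    {
        acc += (static_cast<int32_t>(input[i]) - input_zero_point) *
               (static_cast<int32_t>(filter[i]) - filter_zero_point);
    }
    return acc;
}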
@@ -112,7 +112,9 @@ namespace ngraph
};
functors.emplace_back(functor);
}
else
else if (args[0].get_element_type() == element::u8 &&
args[1].get_element_type() == element::u8 &&
out[0].get_element_type() == element::u8)
{
std::function<decltype(
runtime::cpu::kernel::convolution<uint8_t, uint8_t, uint8_t, int32_t>)>
@@ -132,6 +134,140 @@ namespace ngraph
auto padding_above = qconvolution->get_padding_above();
auto data_dilation_strides = qconvolution->get_data_dilation_strides();
auto functor = [&,
kernel,
arg0_shape,
arg1_shape,
arg0_buffer_index,
arg1_buffer_index,
arg2_buffer_index,
arg3_buffer_index,
arg4_buffer_index,
arg5_buffer_index,
arg6_buffer_index,
arg7_buffer_index,
out0_buffer_index,
result_shape,
window_movement_strides,
window_dilation_strides,
padding_below,
padding_above,
data_dilation_strides,
scales_size](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
vector<float> dyn_scales;
dyn_scales.assign(static_cast<float*>(ctx->buffer_data[arg2_buffer_index]),
static_cast<float*>(ctx->buffer_data[arg2_buffer_index]) +
scales_size);
kernel(ctx->buffer_data[arg0_buffer_index],
ctx->buffer_data[arg1_buffer_index],
ctx->buffer_data[out0_buffer_index],
arg0_shape,
arg1_shape,
result_shape,
window_movement_strides,
window_dilation_strides,
padding_below,
padding_above,
data_dilation_strides,
ctx->buffer_data[arg2_buffer_index],
ctx->buffer_data[arg3_buffer_index],
ctx->buffer_data[arg4_buffer_index],
ctx->buffer_data[arg5_buffer_index],
ctx->buffer_data[arg6_buffer_index],
ctx->buffer_data[arg7_buffer_index]);
};
functors.emplace_back(functor);
}
else if (args[0].get_element_type() == element::u8 &&
args[1].get_element_type() == element::u8 &&
out[0].get_element_type() == element::i32)
{
std::function<decltype(
runtime::cpu::kernel::convolution<uint8_t, uint8_t, int32_t, int32_t>)>
kernel;
kernel = runtime::cpu::kernel::convolution<uint8_t, uint8_t, int32_t, int32_t>;
auto arg3_buffer_index =
external_function->get_buffer_index(args[3].get_name()); // input scale
auto arg5_buffer_index =
external_function->get_buffer_index(args[5].get_name()); // filter scale
auto arg7_buffer_index =
external_function->get_buffer_index(args[7].get_name()); // output scale
auto window_movement_strides = qconvolution->get_window_movement_strides();
auto window_dilation_strides = qconvolution->get_window_dilation_strides();
auto padding_below = qconvolution->get_padding_below();
auto padding_above = qconvolution->get_padding_above();
auto data_dilation_strides = qconvolution->get_data_dilation_strides();
auto functor = [&,
kernel,
arg0_shape,
arg1_shape,
arg0_buffer_index,
arg1_buffer_index,
arg2_buffer_index,
arg3_buffer_index,
arg4_buffer_index,
arg5_buffer_index,
arg6_buffer_index,
arg7_buffer_index,
out0_buffer_index,
result_shape,
window_movement_strides,
window_dilation_strides,
padding_below,
padding_above,
data_dilation_strides,
scales_size](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
vector<float> dyn_scales;
dyn_scales.assign(static_cast<float*>(ctx->buffer_data[arg2_buffer_index]),
static_cast<float*>(ctx->buffer_data[arg2_buffer_index]) +
scales_size);
kernel(ctx->buffer_data[arg0_buffer_index],
ctx->buffer_data[arg1_buffer_index],
ctx->buffer_data[out0_buffer_index],
arg0_shape,
arg1_shape,
result_shape,
window_movement_strides,
window_dilation_strides,
padding_below,
padding_above,
data_dilation_strides,
ctx->buffer_data[arg2_buffer_index],
ctx->buffer_data[arg3_buffer_index],
ctx->buffer_data[arg4_buffer_index],
ctx->buffer_data[arg5_buffer_index],
ctx->buffer_data[arg6_buffer_index],
ctx->buffer_data[arg7_buffer_index]);
};
functors.emplace_back(functor);
}
else if (args[0].get_element_type() == element::u8 &&
args[1].get_element_type() == element::i8 &&
out[0].get_element_type() == element::i32)
{
std::function<decltype(
runtime::cpu::kernel::convolution<uint8_t, int8_t, int32_t, int32_t>)>
kernel;
kernel = runtime::cpu::kernel::convolution<uint8_t, int8_t, int32_t, int32_t>;
auto arg3_buffer_index =
external_function->get_buffer_index(args[3].get_name()); // input scale
auto arg5_buffer_index =
external_function->get_buffer_index(args[5].get_name()); // filter scale
auto arg7_buffer_index =
external_function->get_buffer_index(args[7].get_name()); // output scale
auto window_movement_strides = qconvolution->get_window_movement_strides();
auto window_dilation_strides = qconvolution->get_window_dilation_strides();
auto padding_below = qconvolution->get_padding_below();
auto padding_above = qconvolution->get_padding_above();
auto data_dilation_strides = qconvolution->get_data_dilation_strides();
auto functor = [&,
kernel,
arg0_shape,
......
@@ -1216,6 +1216,50 @@ private:
args[6]->get_data_ptr<const float>(),
args[7]->get_data_ptr<const uint8_t>());
}
else if (input_element_type == element::u8 && filter_element_type == element::i8 &&
output_element_type == element::i32)
{
reference::convolution<uint8_t, int8_t, int32_t, int32_t>(
args[0]->get_data_ptr<const uint8_t>(),
args[1]->get_data_ptr<const int8_t>(),
out[0]->get_data_ptr<int32_t>(),
node.get_input_shape(0),
node.get_input_shape(1),
node.get_output_shape(0),
qc->get_window_movement_strides(),
qc->get_window_dilation_strides(),
qc->get_padding_below(),
qc->get_padding_above(),
qc->get_data_dilation_strides(),
args[2]->get_data_ptr<const float>(),
args[3]->get_data_ptr<const uint8_t>(),
args[4]->get_data_ptr<const float>(),
args[5]->get_data_ptr<const int8_t>(),
args[6]->get_data_ptr<const float>(),
args[7]->get_data_ptr<const int32_t>());
}
else if (input_element_type == element::u8 && filter_element_type == element::u8 &&
output_element_type == element::i32)
{
reference::convolution<uint8_t, uint8_t, int32_t, int32_t>(
args[0]->get_data_ptr<const uint8_t>(),
args[1]->get_data_ptr<const uint8_t>(),
out[0]->get_data_ptr<int32_t>(),
node.get_input_shape(0),
node.get_input_shape(1),
node.get_output_shape(0),
qc->get_window_movement_strides(),
qc->get_window_dilation_strides(),
qc->get_padding_below(),
qc->get_padding_above(),
qc->get_data_dilation_strides(),
args[2]->get_data_ptr<const float>(),
args[3]->get_data_ptr<const uint8_t>(),
args[4]->get_data_ptr<const float>(),
args[5]->get_data_ptr<const uint8_t>(),
args[6]->get_data_ptr<const float>(),
args[7]->get_data_ptr<const int32_t>());
}
else
{
std::stringstream ss;
......
@@ -7712,3 +7712,56 @@ NGRAPH_TEST(${BACKEND_NAME}, quantized_conv_non_zero_zero_point)
<< "Vectors x and y differ at index " << i;
}
}
NGRAPH_TEST(${BACKEND_NAME}, quantized_conv_int32_output)
{
Shape shape_a{1, 1, 3, 4};
Shape shape_b{1, 1, 3, 3};
Shape shape_r{1, 1, 3, 4};
vector<uint8_t> a_data = {1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4};
vector<uint8_t> b_data = {1, 2, 3, 4, 5, 0, 0, 1, 2};
auto A = make_shared<op::Parameter>(element::u8, shape_a);
auto B = make_shared<op::Parameter>(element::u8, shape_b);
auto C = make_shared<op::Parameter>(element::f32, Shape{});
auto D = op::Constant::create(element::u8, Shape{}, {0});
auto E = make_shared<op::Parameter>(element::f32, Shape{});
auto F = op::Constant::create(element::u8, Shape{}, {0});
auto G = make_shared<op::Parameter>(element::f32, Shape{});
auto H = op::Constant::create(element::i32, Shape{}, {0});
auto CV = make_shared<op::QuantizedConvolution>(A,
B,
Strides{1, 1},
Strides{1, 1},
CoordinateDiff{1, 1},
CoordinateDiff{1, 1},
Strides{1, 1},
C,
D,
E,
F,
G,
H,
element::i32,
AxisSet{},
AxisSet{},
AxisSet{});
auto f = make_shared<Function>(NodeVector{CV}, ParameterVector{A, B, C, E, G});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto a = backend->create_tensor(element::u8, shape_a);
copy_data(a, a_data);
auto b = backend->create_tensor(element::u8, shape_b);
copy_data(b, b_data);
auto c = backend->create_tensor(element::f32, Shape{});
copy_data(c, vector<float>{1.0f});
auto d = backend->create_tensor(element::f32, Shape{});
copy_data(d, vector<float>{1.0f});
auto e = backend->create_tensor(element::f32, Shape{});
copy_data(e, vector<float>{1.0f});
auto result = backend->create_tensor(element::i32, shape_r);
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b, c, d, e});
EXPECT_EQ((vector<int32_t>{22, 34, 30, 32, 38, 72, 90, 43, 33, 52, 43, 39}),
read_vector<int32_t>(result));
}
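
As a sanity check on the expected values (all scales are 1 and all zero points are 0): with one row and column of zero padding, output element (0, 0) covers the padded window {0,0,0; 0,1,2; 0,5,6}; correlating it with the filter {1,2,3; 4,5,0; 0,1,2} gives 5*1 + 1*5 + 2*6 = 22, the first entry of the expected vector.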
@@ -2057,59 +2057,3 @@ TEST(cpu_test, tensor_copy_from_different_layout)
EXPECT_EQ((vector<uint8_t>{1, 4, 2, 5, 3, 6}), read_vector<uint8_t>(b));
}
// Adding this test case in cpu_test because the reference
// kernel doesn't support intermediate output types as of now
TEST(cpu_test, quantized_conv_int32_output)
{
Shape shape_a{1, 1, 3, 4};
Shape shape_b{1, 1, 3, 3};
Shape shape_r{1, 1, 3, 4};
vector<uint8_t> a_data = {1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4};
vector<int8_t> b_data = {1, 2, 3, 4, 5, 0, 0, 1, 2};
auto A = make_shared<op::Parameter>(element::u8, shape_a);
auto B = make_shared<op::Parameter>(element::i8, shape_b);
auto C = make_shared<op::Parameter>(element::f32, Shape{});
auto D = op::Constant::create(element::u8, Shape{}, {0});
auto E = make_shared<op::Parameter>(element::f32, Shape{});
auto F = op::Constant::create(element::i8, Shape{}, {0});
auto G = make_shared<op::Parameter>(element::f32, Shape{});
auto H = op::Constant::create(element::i32, Shape{}, {0});
auto CV = make_shared<op::QuantizedConvolution>(A,
B,
Strides{1, 1},
Strides{1, 1},
CoordinateDiff{1, 1},
CoordinateDiff{1, 1},
Strides{1, 1},
C,
D,
E,
F,
G,
H,
element::i32,
AxisSet{},
AxisSet{},
AxisSet{});
auto f = make_shared<Function>(NodeVector{CV}, ParameterVector{A, B, C, E, G});
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::u8, shape_a);
copy_data(a, a_data);
auto b = backend->create_tensor(element::i8, shape_b);
copy_data(b, b_data);
auto c = backend->create_tensor(element::f32, Shape{});
copy_data(c, vector<float>{1.0f});
auto d = backend->create_tensor(element::f32, Shape{});
copy_data(d, vector<float>{1.0f});
auto e = backend->create_tensor(element::f32, Shape{});
copy_data(e, vector<float>{1.0f});
auto result = backend->create_tensor(element::i32, shape_r);
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b, c, d, e});
EXPECT_EQ((vector<int32_t>{22, 34, 30, 32, 38, 72, 90, 43, 33, 52, 43, 39}),
read_vector<int32_t>(result));
}