Commit 37dc586c authored by Ayan Moitra's avatar Ayan Moitra Committed by Robert Kimball

TopK additional tests for nvGPU backend (#1946)

* added tests for malloc mode and graph transform

* Comment incorporation

* changed comparing backend to INTERPRETER

* Comments resolved + clang

* Addressed all comments

* IntelGPU does not support topk
parent d9f615b7
......@@ -129,12 +129,15 @@ namespace ngraph
NodeVector new_goes;
for (auto& goe : goes)
{
auto out_idx =
std::dynamic_pointer_cast<op::GetOutputElement>(goe)->get_n();
auto new_goe =
std::make_shared<op::GetOutputElement>(new_topk, out_idx);
ngraph::replace_node(goe, new_goe);
new_goes.push_back(new_goe);
auto goe_ptr = std::dynamic_pointer_cast<op::GetOutputElement>(goe);
if (goe_ptr)
{
auto out_idx = goe_ptr->get_n();
auto new_goe =
std::make_shared<op::GetOutputElement>(new_topk, out_idx);
ngraph::replace_node(goe, new_goe);
new_goes.push_back(new_goe);
}
}
Shape reordered_out_shape;
for (size_t j = 0; j < ndim; j++)
......
......@@ -81,6 +81,9 @@ topk_3d_min_one
topk_3d_min_partial
topk_5d_max_partial
topk_int64
topk_3d_large_input_max
topk_3d_large_input_min
topk_3d_single_output
zero_sized_abs
zero_sized_acos
zero_sized_add
......
......@@ -9,3 +9,5 @@ batchnorm_fprop_bprop_2step
computation_reuse
generate_mask
topk_int64
topk_3d_large_input_max
topk_3d_large_input_min
......@@ -571,3 +571,99 @@ NGRAPH_TEST(${BACKEND_NAME}, topk_2d_min_one)
backend->call_with_validate(f1, {result1}, {a});
EXPECT_EQ((vector<float>{3, 1, 4}), read_vector<float>(result1));
}
NGRAPH_TEST(${BACKEND_NAME}, topk_3d_large_input_max)
{
    // TopK (k = 10, descending) over a large axis (8192 entries): the
    // backend under test is checked against the INTERPRETER reference.
    Shape shape{4, 8192, 5};
    auto input = make_shared<op::Parameter>(element::f32, shape);
    auto topk = make_shared<op::TopK>(input, 1, element::i32, 10, true);
    // Output 0 carries the selected indices, output 1 the selected values.
    auto ref_indices_fn =
        make_shared<Function>(make_shared<op::GetOutputElement>(topk, 0), op::ParameterVector{input});
    auto ref_values_fn =
        make_shared<Function>(make_shared<op::GetOutputElement>(topk, 1), op::ParameterVector{input});
    auto test_indices_fn = ngraph::clone_function(*ref_indices_fn);
    auto test_values_fn = ngraph::clone_function(*ref_values_fn);

    // Fill every parameter with a distinct ascending sequence.
    vector<vector<float>> args;
    for (const shared_ptr<op::Parameter>& param : ref_indices_fn->get_parameters())
    {
        vector<float> data(shape_size(param->get_shape()));
        iota(data.begin(), data.end(), 0.0f);
        args.push_back(data);
    }

    // Indices must match exactly.
    auto ref_indices = execute<float, int32_t>(ref_indices_fn, args, "INTERPRETER");
    auto test_indices = execute<float, int32_t>(test_indices_fn, args, "${BACKEND_NAME}");
    for (size_t i = 0; i < test_indices.size(); i++)
    {
        EXPECT_EQ(test_indices.at(i), ref_indices.at(i));
    }

    // Values are floats; compare with a ULP tolerance.
    auto ref_values = execute(ref_values_fn, args, "INTERPRETER");
    auto test_values = execute(test_values_fn, args, "${BACKEND_NAME}");
    for (size_t i = 0; i < test_values.size(); i++)
    {
        EXPECT_TRUE(test::all_close_f(test_values.at(i), ref_values.at(i), 24, 0));
    }
}
NGRAPH_TEST(${BACKEND_NAME}, topk_3d_large_input_min)
{
    // TopK (k = 10, ascending) over a large axis (8192 entries): the
    // backend under test is checked against the INTERPRETER reference.
    Shape shape{4, 8192, 5};
    auto input = make_shared<op::Parameter>(element::f32, shape);
    auto topk = make_shared<op::TopK>(input, 1, element::i32, 10, false);
    // Output 0 carries the selected indices, output 1 the selected values.
    auto ref_indices_fn =
        make_shared<Function>(make_shared<op::GetOutputElement>(topk, 0), op::ParameterVector{input});
    auto ref_values_fn =
        make_shared<Function>(make_shared<op::GetOutputElement>(topk, 1), op::ParameterVector{input});
    auto test_indices_fn = ngraph::clone_function(*ref_indices_fn);
    auto test_values_fn = ngraph::clone_function(*ref_values_fn);

    // Fill every parameter with a distinct ascending sequence.
    vector<vector<float>> args;
    for (const shared_ptr<op::Parameter>& param : ref_indices_fn->get_parameters())
    {
        vector<float> data(shape_size(param->get_shape()));
        iota(data.begin(), data.end(), 0.0f);
        args.push_back(data);
    }

    // Indices must match exactly.
    auto ref_indices = execute<float, int32_t>(ref_indices_fn, args, "INTERPRETER");
    auto test_indices = execute<float, int32_t>(test_indices_fn, args, "${BACKEND_NAME}");
    for (size_t i = 0; i < test_indices.size(); i++)
    {
        EXPECT_EQ(test_indices.at(i), ref_indices.at(i));
    }

    // Values are floats; compare with a ULP tolerance.
    auto ref_values = execute(ref_values_fn, args, "INTERPRETER");
    auto test_values = execute(test_values_fn, args, "${BACKEND_NAME}");
    for (size_t i = 0; i < test_values.size(); i++)
    {
        EXPECT_TRUE(test::all_close_f(test_values.at(i), ref_values.at(i), 24, 0));
    }
}
NGRAPH_TEST(${BACKEND_NAME}, topk_3d_single_output)
{
    // TopK where only one of the two outputs (the indices, output 0) is
    // consumed by the function being compiled.
    Shape in_shape{2, 3, 2};
    Shape out_shape{2, 2, 2};
    auto input = make_shared<op::Parameter>(element::f32, in_shape);
    auto topk = make_shared<op::TopK>(input, 1, element::i32, 2, false);
    auto indices_fn =
        make_shared<Function>(make_shared<op::GetOutputElement>(topk, 0), op::ParameterVector{input});

    auto backend = runtime::Backend::create("${BACKEND_NAME}");

    // Create some tensors for input/output
    auto input_tensor = backend->create_tensor(element::f32, in_shape);
    copy_data(input_tensor, vector<float>{12, 2, 10, 9, 8, 4, 6, 1, 5, 3, 11, 7});
    auto indices_out = backend->create_tensor(element::i32, out_shape);

    backend->call_with_validate(indices_fn, {indices_out}, {input_tensor});
    EXPECT_EQ((vector<int32_t>{2, 0, 1, 2, 1, 0, 0, 1}), read_vector<int32_t>(indices_out));
}
......@@ -22,7 +22,11 @@
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
#include "ngraph/runtime/gpu/nvshape.hpp"
#include "ngraph/util.hpp"
#include "util/all_close.hpp"
#include "util/all_close_f.hpp"
using namespace std;
using namespace ngraph;
TEST(gpu_test, gpu_shape_from_64bit_shape)
......@@ -160,3 +164,56 @@ TEST(gpu_test, memory_manager_seperate_workspaces_allocsize)
emitter.allocate_primitive_memory();
EXPECT_EQ(emitter.sizeof_device_allocation(), total_size);
}
TEST(gpu_test, topk_fanout_graph_transform)
{
    // Both TopK outputs fan out to two consumers each; the GPU graph
    // transform must keep results correct and, per the current lowering,
    // leave exactly 10 Reshape nodes in the transformed function.
    Shape in_shape{2, 3, 2};
    Shape out_shape{2, 2, 2};
    auto data_param = make_shared<op::Parameter>(element::f32, in_shape);
    auto i32_param_0 = make_shared<op::Parameter>(element::i32, out_shape);
    auto i32_param_1 = make_shared<op::Parameter>(element::i32, out_shape);
    auto f32_param_0 = make_shared<op::Parameter>(element::f32, out_shape);
    auto f32_param_1 = make_shared<op::Parameter>(element::f32, out_shape);
    auto topk = make_shared<op::TopK>(data_param, 1, element::i32, 2, true);
    auto indices_goe = make_shared<op::GetOutputElement>(topk, 0);
    auto values_goe = make_shared<op::GetOutputElement>(topk, 1);
    // Two consumers per TopK output (adding zero leaves values unchanged).
    auto add_idx_0 = make_shared<op::Add>(i32_param_0, indices_goe);
    auto add_idx_1 = make_shared<op::Add>(i32_param_1, indices_goe);
    auto add_val_0 = make_shared<op::Add>(f32_param_0, values_goe);
    auto add_val_1 = make_shared<op::Add>(f32_param_1, values_goe);
    auto func = make_shared<Function>(
        NodeVector{add_idx_0, add_idx_1, add_val_0, add_val_1},
        op::ParameterVector{data_param, i32_param_0, i32_param_1, f32_param_0, f32_param_1});

    auto backend = runtime::Backend::create("GPU");

    auto data_tensor = backend->create_tensor(element::f32, in_shape);
    copy_data(
        data_tensor,
        vector<float>{1.0f, 2.0f, 3.0f, 4.0f, 4.0f, 3.0f, 2.0f, 1.0f, 3.0f, 3.0f, 1.0f, 4.0f});
    // The four addend tensors are all zero.
    auto zero_i32_0 = backend->create_tensor(element::i32, out_shape);
    copy_data(zero_i32_0, vector<int32_t>(8, 0));
    auto zero_i32_1 = backend->create_tensor(element::i32, out_shape);
    copy_data(zero_i32_1, vector<int32_t>(8, 0));
    auto zero_f32_0 = backend->create_tensor(element::f32, out_shape);
    copy_data(zero_f32_0, vector<float>(8, 0));
    auto zero_f32_1 = backend->create_tensor(element::f32, out_shape);
    copy_data(zero_f32_1, vector<float>(8, 0));

    auto result_idx_0 = backend->create_tensor(element::i32, out_shape);
    auto result_idx_1 = backend->create_tensor(element::i32, out_shape);
    auto result_val_0 = backend->create_tensor(element::f32, out_shape);
    auto result_val_1 = backend->create_tensor(element::f32, out_shape);

    backend->call_with_validate(
        func,
        {result_idx_0, result_idx_1, result_val_0, result_val_1},
        {data_tensor, zero_i32_0, zero_i32_1, zero_f32_0, zero_f32_1});

    // Both fan-out consumers of each output must see identical results.
    EXPECT_EQ((vector<int32_t>{2, 1, 1, 2, 1, 2, 0, 1}), read_vector<int32_t>(result_idx_0));
    EXPECT_EQ((vector<int32_t>{2, 1, 1, 2, 1, 2, 0, 1}), read_vector<int32_t>(result_idx_1));
    EXPECT_TRUE(
        test::all_close_f(vector<float>{4, 4, 3, 3, 3, 4, 2, 3}, read_vector<float>(result_val_0), 24, 0));
    EXPECT_TRUE(
        test::all_close_f(vector<float>{4, 4, 3, 3, 3, 4, 2, 3}, read_vector<float>(result_val_1), 24, 0));

    // The transform is expected to have inserted exactly 10 Reshape ops.
    auto reshape_count = count_ops_of_type<ngraph::op::Reshape>(func);
    EXPECT_EQ(reshape_count, 10);
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment