Commit d901446d authored by Ayan Moitra, committed by Robert Kimball

Support TopK for NvidiaGPU backend (#1908)

* fresh commit for the changes

* Working topk on ndims for nvGPU

* fix

* clang

* Added unit test, improved kernel hash, and addressed Bob's comment

* int64 test+clang

* Moved argReduce and topk tests to a separate file

* TopK unsupported for IntelGPU

* addressed Fenglei and Chris's comments

* addressed Fenglei and Chris's comments
parent 239322e0
......@@ -212,6 +212,138 @@ size_t runtime::gpu::CUDAEmitter::build_concat(const std::string& dtype,
return this->m_primitive_emitter->register_primitive(kernel_launch, hash.str());
}
size_t runtime::gpu::CUDAEmitter::build_topk(const std::vector<element::Type>& dtypes,
const NVShape& input_shape,
const size_t topk_axis,
size_t topk_k,
const element::Type index_elem_type,
bool compute_max)
{
NGRAPH_ASSERT(dtypes[1] == index_elem_type)
<< " The index element type does not match the type of out[0]";
uint32_t rank = static_cast<uint32_t>(input_shape.size());
NGRAPH_ASSERT(rank <= 2) << " The input tensor should be of either rank 1 or rank 2";
NGRAPH_ASSERT(topk_axis == rank - 1)
<< " The axis along which topk is computed should be the last axis";
size_t num_cols = input_shape[rank - 1];
size_t num_rows = ((rank == 2) ? input_shape[0] : 1);
std::vector<std::string> dtypes_string;
for (auto& dtype : dtypes)
{
dtypes_string.push_back(dtype.c_type_string());
}
/* The struct 'Entry' used in the kernel looks like this:
struct Entry
{
size_t index;
float value;
__device__ size_t get_index(){return index;}
__device__ void set_index(size_t id) {index = id;}
__device__ float get_value(){return value;}
__device__ void set_value(float val){value = val;}
};
Based on the supported datatypes, the struct occupies at most 16 bytes. Its actual size,
accounting for structure padding, is therefore given by 'shared_struct_bytes' as calculated below*/
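// e.g. {float value, int32_t index} packs into 8 bytes, while
// {float value, int64_t index} occupies 12 bytes and pads out to 16.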
size_t shared_struct_bytes = (((dtypes[0].size() + index_elem_type.size()) <= 8) ? 8 : 16);
size_t shared_data_bytes = num_cols * shared_struct_bytes;
// Use global memory when each row size exceeds shared mem allowed per block
int device_num = 0;
CUDA_RT_SAFE_CALL(cudaGetDevice(&device_num));
cudaDeviceProp prop;
CUDA_RT_SAFE_CALL(cudaGetDeviceProperties(&prop, device_num));
bool use_malloc = (shared_data_bytes > prop.sharedMemPerBlock);
std::stringstream kernel_name;
kernel_name << "topk_" << join(dtypes_string, "_") << "_cm_" << compute_max << "_use_malloc_"
<< use_malloc;
std::string hash = kernel_name.str() + "_i_" + join(input_shape, "_");
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
uint32_t block_size_x = 32;
uint32_t aligned_grid_size_x = num_rows;
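// One thread block per row: 32 threads cooperate on the load, and thread 0
// performs the serial heap selection.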
auto args = m_primitive_emitter->add_kernel_args();
args.add_placeholder(dtypes_string[0], "in")
.add_placeholder(dtypes_string[1], "out_id")
.add_placeholder(dtypes_string[2], "out_val");
if (use_malloc)
{
args.add_placeholder("Entry", "entry");
}
args.add("num_cols", num_cols).add("topk_k", topk_k);
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
runtime::gpu::CudaKernelBuilder::get_topk(
writer, kernel_name.str(), dtypes_string, compute_max, args, use_malloc);
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
}
if (use_malloc)
{
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
size_t heap_workspace_id = allocator.reserve_workspace(num_rows * shared_data_bytes);
std::unique_ptr<gpu::primitive> kernel_launch(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void* buffer = runtime::gpu::invoke_memory_primitive(m_ctx, heap_workspace_id);
void** args_list = args.resolve_placeholder(0, &inputs[0])
.resolve_placeholder(1, &outputs[0])
.resolve_placeholder(2, &outputs[1])
.resolve_placeholder(3, &buffer)
.get_argument_list();
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
aligned_grid_size_x,
1,
1,
block_size_x,
1,
1,
0,
NULL, // stream
args_list,
0)); // arguments
debug_sync();
}});
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
}
else
{
std::unique_ptr<gpu::primitive> kernel_launch(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void** args_list = args.resolve_placeholder(0, &inputs[0])
.resolve_placeholder(1, &outputs[0])
.resolve_placeholder(2, &outputs[1])
.get_argument_list();
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
aligned_grid_size_x,
1,
1,
block_size_x,
1,
1,
shared_data_bytes, // shared mem
NULL, //stream
args_list,
0)); // arguments
debug_sync();
}});
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
}
return primitive_index;
}
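/* As a side note on the sizing logic above, the following is a minimal host-side
sketch (hypothetical, not part of this commit) of how 'shared_struct_bytes' and the
shared-vs-global decision work out for a float-value / int64-index TopK: */
#include <cuda_runtime.h>
#include <cstdio>

int main()
{
    size_t value_bytes = 4; // float
    size_t index_bytes = 8; // int64
    size_t num_cols = 100000;
    // Struct padding: a combined payload of <= 8 bytes packs into 8, otherwise 16.
    size_t shared_struct_bytes = (value_bytes + index_bytes <= 8) ? 8 : 16;
    size_t shared_data_bytes = num_cols * shared_struct_bytes;
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    bool use_malloc = shared_data_bytes > prop.sharedMemPerBlock;
    printf("row bytes: %zu, shared mem per block: %zu, use global workspace: %d\n",
           shared_data_bytes, prop.sharedMemPerBlock, (int)use_malloc);
    return 0;
}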
size_t runtime::gpu::CUDAEmitter::build_onehot(const std::array<std::string, 2>& dtypes,
NVShape input_shape,
NVShape output_shape,
......@@ -2165,7 +2297,6 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_window(const OpName op_name,
args_list.data(),
0)); // arguments
debug_sync();
}});
return this->m_primitive_emitter->register_primitive(f, hash);
......@@ -2656,7 +2787,6 @@ size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string
std::unique_ptr<gpu::primitive> conv(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void** args_list = args.resolve_placeholder(0, &inputs[0])
.resolve_placeholder(1, &inputs[1])
.resolve_placeholder(2, &outputs[0])
......
......@@ -50,6 +50,13 @@ namespace ngraph
size_t build_primitive(const op::ReplaceSlice* node, bool in_place_op);
public:
size_t build_topk(const std::vector<element::Type>& dtypes,
const NVShape& input_shape,
const size_t topk_axis,
size_t topk_k,
const element::Type index_elem_type,
bool compute_max);
size_t build_pad(const std::vector<std::string>& dtypes,
NVShape input_shape,
NVShape output_shape,
......
......@@ -202,6 +202,130 @@ void runtime::gpu::CudaKernelBuilder::get_ew_collective_op(
return;
}
void runtime::gpu::CudaKernelBuilder::get_topk(codegen::CodeWriter& writer,
const std::string& name,
const std::vector<std::string>& dtypes,
bool compute_max,
runtime::gpu::GPUKernelArgs& args,
bool use_malloc)
{
writer << "struct Entry\n";
writer.block_begin();
{
writer << dtypes[0] << " value;\n";
writer << dtypes[1] << " index;\n";
writer << "__device__ " << dtypes[1] << " get_index() {return index;}\n";
writer << "__device__ "
<< "void set_index(" << dtypes[1] << " id) {index = id;}\n";
writer << "__device__ " << dtypes[0] << " get_value() {return value;}\n";
writer << "__device__ "
<< "void set_value(" << dtypes[0] << " val) {value = val;}\n";
}
writer.block_end();
writer << ";\n";
writer << "__device__ void swap(Entry& a, Entry& b)\n";
writer.block_begin();
{
writer << "Entry t = a;\n";
writer << "a = b;\n";
writer << "b = t;\n";
}
writer.block_end();
writer << "__device__ void heapify(Entry *heap, size_t heap_size, size_t idx)\n";
writer.block_begin();
{
writer << "size_t largest = idx;\n";
writer << "size_t left = (idx << 1) + 1;\n";
writer << "size_t right = (idx + 1) << 1;\n";
std::string g_op = ((compute_max) ? ">" : "<");
writer << "if (left < heap_size && heap[left].get_value() " << g_op
<< " heap[largest].get_value())\n";
writer.block_begin();
{
writer << "largest = left;\n";
}
writer.block_end();
writer << "if (right < heap_size && heap[right].get_value() " << g_op
<< " heap[largest].get_value())\n";
writer.block_begin();
{
writer << "largest = right;\n";
}
writer.block_end();
writer << "if (largest != idx)\n";
writer.block_begin();
{
writer << "swap(heap[largest], heap[idx]);\n";
writer << "heapify(heap, heap_size, largest);\n";
}
writer.block_end();
}
writer.block_end();
writer << "__device__ void create_and_build(Entry *entry, size_t size)\n";
writer.block_begin();
{
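// Bottom-up heap construction: heapify from the last internal node to the root.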
writer << "for (int i = (size-2) / 2; i >= 0; --i)\n";
writer.block_begin();
{
writer << "heapify(entry, size, i);\n";
}
writer.block_end();
}
writer.block_end();
writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
writer.block_begin();
{
writer << "in = in + blockIdx.x * num_cols;\n";
if (use_malloc)
{
writer << "entry = entry + blockIdx.x * num_cols;\n";
}
writer << "out_id = out_id + blockIdx.x * topk_k;\n";
writer << "out_val = out_val + blockIdx.x * topk_k;\n";
if (!use_malloc)
{
writer << "extern __shared__ Entry entry[];\n";
}
writer << "for (size_t i = threadIdx.x; i < num_cols; i += blockDim.x)\n";
writer.block_begin();
{
writer << "entry[i].set_value(in[i]);\n";
writer << "entry[i].set_index(i);\n";
}
writer.block_end();
writer << "__syncthreads();\n";
writer << "if (threadIdx.x == 0)\n";
writer.block_begin();
{
writer << "create_and_build(entry, num_cols);\n";
writer << "size_t changed_size_of_heap = num_cols;\n";
writer << "size_t k = 0;\n";
writer << "while (k++ < topk_k)\n";
writer.block_begin();
{
writer << "swap(*entry, entry[changed_size_of_heap - 1]);\n";
writer << "heapify(entry, --changed_size_of_heap, 0);\n";
}
writer.block_end();
writer << "for (size_t i = threadIdx.x; i < topk_k; i++)\n";
writer.block_begin();
{
writer << "out_val[i] = entry[num_cols - 1 - i].get_value();\n";
writer << "out_id[i] = entry[num_cols - 1 - i].get_index();\n";
}
writer.block_end();
}
writer.block_end();
}
writer.block_end();
}
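/* For concreteness, the source emitted by get_topk for dtypes {float, int32_t, float}
with compute_max = true and use_malloc = false comes out roughly as follows. This is
reconstructed by hand from the writer calls above; the kernel name and the exact
parameter types (supplied by GPUKernelArgs) are assumptions. Entry, swap, heapify, and
create_and_build are emitted ahead of the kernel exactly as written out above. */
extern "C" __global__ void cuda_topk_float_int32_t_float_cm_1_use_malloc_0(
    float* in, int32_t* out_id, float* out_val, size_t num_cols, size_t topk_k)
{
    // One block per row; offset the pointers to this block's row.
    in = in + blockIdx.x * num_cols;
    out_id = out_id + blockIdx.x * topk_k;
    out_val = out_val + blockIdx.x * topk_k;
    // Dynamic shared memory, sized at launch as num_cols * shared_struct_bytes.
    extern __shared__ Entry entry[];
    // All threads in the block cooperate on loading the row.
    for (size_t i = threadIdx.x; i < num_cols; i += blockDim.x)
    {
        entry[i].set_value(in[i]);
        entry[i].set_index(i);
    }
    __syncthreads();
    if (threadIdx.x == 0)
    {
        // Serial heap-select: build a max-heap, then pop the root topk_k times.
        create_and_build(entry, num_cols);
        size_t changed_size_of_heap = num_cols;
        size_t k = 0;
        while (k++ < topk_k)
        {
            swap(*entry, entry[changed_size_of_heap - 1]);
            heapify(entry, --changed_size_of_heap, 0);
        }
        // Popped entries sit at the tail in reverse order; copy them out.
        for (size_t i = 0; i < topk_k; i++)
        {
            out_val[i] = entry[num_cols - 1 - i].get_value();
            out_id[i] = entry[num_cols - 1 - i].get_index();
        }
    }
}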
// each thread computes the whole reduction of one output
void runtime::gpu::CudaKernelBuilder::get_reduce_to_nd_op(
codegen::CodeWriter& writer,
......
......@@ -85,6 +85,13 @@ namespace ngraph
size_t out_rank,
size_t reduce_rank);
static void get_topk(codegen::CodeWriter& writer,
const std::string& name,
const std::vector<std::string>& dtypes,
bool compute_max,
runtime::gpu::GPUKernelArgs& args,
bool use_malloc);
//using one block with at most 512 threads to reduce to scalar.
static void get_reduce_to_scalar_op(codegen::CodeWriter& writer,
const std::string& name,
......
......@@ -1583,7 +1583,32 @@ void runtime::gpu::GPU_Emitter::emit_Tanh(EMIT_ARGS)
void runtime::gpu::GPU_Emitter::emit_TopK(EMIT_ARGS)
{
if (out[0].get_size() == 0)
{
return;
}
auto topk = static_cast<const ngraph::op::TopK*>(node);
size_t topk_axis = topk->get_top_k_axis();
size_t topk_k = topk->get_k();
auto index_elem_type = topk->get_index_element_type();
bool compute_max = topk->get_compute_max();
std::vector<element::Type> dtypes{args[0].get_element_type()};
NGRAPH_ASSERT(out.size() == 2) << "TopK can only have 2 outputs";
for (size_t i = 0; i < out.size(); i++)
{
dtypes.push_back(out[i].get_element_type());
}
auto& input_shape = args[0].get_shape();
auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();
auto index = cuda_emitter->build_topk(
dtypes, input_shape, topk_axis, topk_k, index_elem_type, compute_max);
writer.block_begin();
{
writer << "void* input[] = {" << node_names(args) << "};\n";
writer << "void* output[] = {" << node_names(out) << "};\n";
writer << "gpu::invoke_primitive(ctx, " << index << ", input, output);\n";
}
writer.block_end();
}
string runtime::gpu::GPU_Emitter::node_names(const vector<GPUTensorWrapper>& args,
......
......@@ -572,8 +572,8 @@ void runtime::gpu::GPU_ExternalFunction::compile()
#endif
pass_manager.register_pass<runtime::gpu::pass::BatchNormCache>();
pass_manager.register_pass<ngraph::pass::LikeReplacement>();
pass_manager.register_pass<runtime::gpu::pass::GPULayout>(this);
pass_manager.register_pass<ngraph::pass::AssignLayout<descriptor::layout::DenseTensorLayout>>();
pass_manager.register_pass<ngraph::pass::Liveness>();
pass_manager.register_pass<ngraph::pass::MemoryLayout>(s_memory_pool_alignment);
pass_manager.register_pass<runtime::gpu::pass::TensorMemoryReservation>(
......
......@@ -81,7 +81,7 @@ namespace ngraph
// Retrieve the kernel parameter signature given the added kernel arguments.
//
std::string get_input_signature();
size_t get_size() { return m_argument_list.size(); }
private:
//
// Cache the host argument for persistence, add it to the argument list,
......
......@@ -21,8 +21,10 @@
#include <typeinfo>
#include "gpu_layout.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/replace_slice.hpp"
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/topk.hpp"
#include "ngraph/runtime/gpu/gpu_op_annotations.hpp"
using namespace std;
......@@ -79,6 +81,97 @@ namespace ngraph
reshape->set_op_annotations(op_annotations);
}
}
template <>
void GPULayout::LAYOUT_DECL(ngraph::op::TopK)
{
auto topk = std::dynamic_pointer_cast<ngraph::op::TopK>(node);
auto topk_axis = topk->get_top_k_axis();
auto topk_k = topk->get_k();
auto parent_node = topk->get_argument(0);
auto in_shape = topk->get_input_shape(0);
size_t ndim = in_shape.size();
if (in_shape.size() <= 2 && topk_axis == ndim - 1)
{
return;
}
else
{
auto out_shape = in_shape;
out_shape[topk_axis] = topk_k;
AxisVector reshape_axis_order = ngraph::get_default_order(ndim);
reshape_axis_order.erase(reshape_axis_order.begin() + topk_axis);
reshape_axis_order.push_back(topk_axis);
Shape pre_reshape_out;
for (size_t j = 0; j < ndim; j++)
{
pre_reshape_out.push_back(in_shape[reshape_axis_order[j]]);
}
Shape pre_2d_reshape_out(2);
pre_2d_reshape_out[1] = pre_reshape_out[ndim - 1];
pre_2d_reshape_out[0] =
ngraph::shape_size(pre_reshape_out) / pre_2d_reshape_out[1];
auto pre_reshape = make_shared<ngraph::op::Reshape>(
parent_node, reshape_axis_order, pre_reshape_out);
AxisVector axis_order = ngraph::get_default_order(ndim);
auto pre_2d_reshape = make_shared<ngraph::op::Reshape>(
pre_reshape, axis_order, pre_2d_reshape_out);
insert_new_node_between(parent_node, topk, pre_reshape);
insert_new_node_between(pre_reshape, topk, pre_2d_reshape);
NodeVector goes = op::get_output_elements(topk);
auto new_topk =
make_shared<ngraph::op::TopK>(pre_2d_reshape,
1,
topk->get_index_element_type(),
topk->get_k(),
topk->get_compute_max());
ngraph::replace_node(topk, new_topk);
// Replace old goe with new goe based on new topk
NodeVector new_goes;
for (auto& goe : goes)
{
auto out_idx =
std::dynamic_pointer_cast<op::GetOutputElement>(goe)->get_n();
auto new_goe =
std::make_shared<op::GetOutputElement>(new_topk, out_idx);
ngraph::replace_node(goe, new_goe);
new_goes.push_back(new_goe);
}
Shape reordered_out_shape;
for (size_t j = 0; j < ndim; j++)
{
reordered_out_shape.push_back(out_shape[reshape_axis_order[j]]);
}
NodeVector post_2d_reshapes = insert_new_reshape_after(
new_goes, AxisVector{0, 1}, reordered_out_shape);
axis_order.pop_back();
axis_order.insert(axis_order.begin() + topk_axis, 1, ndim - 1);
insert_new_reshape_after(post_2d_reshapes, axis_order, out_shape);
}
}
NodeVector insert_new_reshape_after(NodeVector& parents,
const AxisVector& axis_vector,
const Shape& out_shape)
{
NodeVector reshapes;
for (auto& parent : parents)
{
for (auto node : parent->get_users())
{
for (size_t i = 0; i < node->get_input_size(); i++)
{
if (node->get_argument(i) == parent)
{
auto new_reshape = make_shared<ngraph::op::Reshape>(
parent, axis_vector, out_shape);
node->get_inputs().at(i).replace_output(
new_reshape->get_outputs().at(0));
reshapes.push_back(new_reshape);
}
}
}
}
return reshapes;
}
}
}
}
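/* To trace the layout transform above with concrete numbers, here is a small
standalone sketch (illustration only, not part of the commit) for a rank-3 TopK
over axis 1 with k = 2: */
#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<size_t> in_shape{2, 3, 4};
    size_t topk_axis = 1, topk_k = 2, ndim = in_shape.size();
    // Move the topk axis to the innermost position: {0,1,2} -> {0,2,1}.
    std::vector<size_t> order{0, 1, 2};
    order.erase(order.begin() + topk_axis);
    order.push_back(topk_axis);
    // pre_reshape_out = {2,4,3}; collapsing the leading dims gives an {8,3} matrix.
    std::vector<size_t> pre_reshape_out;
    for (size_t j = 0; j < ndim; j++)
    {
        pre_reshape_out.push_back(in_shape[order[j]]);
    }
    size_t rows = pre_reshape_out[0] * pre_reshape_out[1];
    printf("2d topk input: {%zu, %zu}, output: {%zu, %zu}\n",
           rows, pre_reshape_out[2], rows, topk_k);
    // The {8,2} result is reshaped to {2,4,2} and finally transposed back with
    // axis order {0,2,1}, yielding the expected output shape {2,2,4}.
    return 0;
}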
......@@ -90,6 +183,7 @@ static const runtime::gpu::pass::LayoutOpMap s_dispatcher{
{TI(ngraph::op::ReplaceSlice),
&runtime::gpu::pass::GPULayout::layout<ngraph::op::ReplaceSlice>},
{TI(ngraph::op::Reshape), &runtime::gpu::pass::GPULayout::layout<ngraph::op::Reshape>},
{TI(ngraph::op::TopK), &runtime::gpu::pass::GPULayout::layout<ngraph::op::TopK>},
};
bool runtime::gpu::pass::GPULayout::run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes)
......
......@@ -54,6 +54,10 @@ namespace ngraph
private:
GPU_ExternalFunction* m_external_function;
};
NodeVector insert_new_reshape_after(NodeVector& parents,
const AxisVector& axis_vector,
const Shape& out_shape);
}
}
}
......
......@@ -29,24 +29,6 @@ backwards_maxpool_n2_c1_hw5_3x3_str2_max
backwards_avgpool_n1_c1_hw2x2
backwards_avgpool_n1_c1_hw4x4
backwards_avgpool_n2_c2_hw4x4
topk_1d_max_all
topk_1d_max_partial
topk_1d_max_one
topk_1d_min_all
topk_1d_min_partial
topk_1d_min_one
topk_2d_max_all
topk_2d_max_partial
topk_2d_max_one
topk_2d_min_all
topk_2d_min_partial
topk_2d_min_one
topk_3d_max_all
topk_3d_max_partial
topk_3d_max_one
topk_3d_min_all
topk_3d_min_partial
topk_3d_min_one
quantize
quantize_axes
quantize_int8
......
......@@ -78,6 +78,8 @@ topk_3d_max_partial
topk_3d_min_all
topk_3d_min_one
topk_3d_min_partial
topk_5d_max_partial
topk_int64
zero_sized_abs
zero_sized_acos
zero_sized_add
......
......@@ -7,3 +7,4 @@ batchnorm_fprop_inference_b2c2h2w1
batchnorm_fprop_bprop
batchnorm_fprop_bprop_2step
computation_reuse
topk_int64
......@@ -112,6 +112,8 @@ set(MULTI_TEST_SRC
backend_reduce.in.cpp
backend_reshape.in.cpp
backend_sum.in.cpp
backend_topk.in.cpp
backend_arg_reduce.in.cpp
backend_test.in.cpp
backend_unary_elementwise.in.cpp
convolution_test.in.cpp
......