Commit d901446d authored by Ayan Moitra, committed by Robert Kimball

Support TopK for NvidiaGPU backend (#1908)

* fresh commit for the changes

* Working topk on ndims for nvGPU

* fix

* clang

* Added unit test, improved kernel hash, and addressed Bob's comment

* int64 test+clang

* Moved argReduce and topk tests to a separate file

* TopK unsupported for IntelGPU

* addressed Fenglei and Chris's comments

* addressed Fenglei and Chris's comments
parent 239322e0
......@@ -212,6 +212,138 @@ size_t runtime::gpu::CUDAEmitter::build_concat(const std::string& dtype,
return this->m_primitive_emitter->register_primitive(kernel_launch, hash.str());
}
size_t runtime::gpu::CUDAEmitter::build_topk(const std::vector<element::Type>& dtypes,
const NVShape& input_shape,
const size_t topk_axis,
size_t topk_k,
const element::Type index_elem_type,
bool compute_max)
{
NGRAPH_ASSERT(dtypes[1] == index_elem_type)
<< " The index element type does not match the type of out[0]";
uint32_t rank = static_cast<uint32_t>(input_shape.size());
NGRAPH_ASSERT(rank <= 2) << " The input tensor should be of either rank 1 or rank 2";
NGRAPH_ASSERT(topk_axis == rank - 1)
<< " The axis along which topk is computed should be the last axis";
size_t num_cols = input_shape[rank - 1];
size_t num_rows = ((rank == 2) ? input_shape[0] : 1);
std::vector<std::string> dtypes_string;
for (auto& dtype : dtypes)
{
dtypes_string.push_back(dtype.c_type_string());
}
/* The struct 'Entry' used in the kernel looks like this:
struct Entry
{
size_t index;
float value;
__device__ size_t get_index(){return index;}
__device__ void set_index(size_t id) {index = id;}
__device__ float get_value(){return value;}
__device__ void set_value(float val){value = val;}
};
Based on the supported datatypes, the struct occupies at most 16 bytes. Its actual size,
accounting for structure padding, is therefore given by 'shared_struct_bytes' as calculated below*/
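// e.g. {float value, int32_t index} packs into 8 bytes, while
// {float value, int64_t index} occupies 12 bytes and pads out to 16.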
size_t shared_struct_bytes = (((dtypes[0].size() + index_elem_type.size()) <= 8) ? 8 : 16);
size_t shared_data_bytes = num_cols * shared_struct_bytes;
// Use global memory when each row size exceeds shared mem allowed per block
int device_num = 0;
CUDA_RT_SAFE_CALL(cudaGetDevice(&device_num));
cudaDeviceProp prop;
CUDA_RT_SAFE_CALL(cudaGetDeviceProperties(&prop, device_num));
bool use_malloc = (shared_data_bytes > prop.sharedMemPerBlock);
std::stringstream kernel_name;
kernel_name << "topk_" << join(dtypes_string, "_") << "_cm_" << compute_max << "_use_malloc_"
<< use_malloc;
std::string hash = kernel_name.str() + "_i_" + join(input_shape, "_");
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
uint32_t block_size_x = 32;
uint32_t aligned_grid_size_x = num_rows;
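// One thread block per row: 32 threads cooperate on the load, and thread 0
// performs the serial heap selection.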
auto args = m_primitive_emitter->add_kernel_args();
args.add_placeholder(dtypes_string[0], "in")
.add_placeholder(dtypes_string[1], "out_id")
.add_placeholder(dtypes_string[2], "out_val");
if (use_malloc)
{
args.add_placeholder("Entry", "entry");
}
args.add("num_cols", num_cols).add("topk_k", topk_k);
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
runtime::gpu::CudaKernelBuilder::get_topk(
writer, kernel_name.str(), dtypes_string, compute_max, args, use_malloc);
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
}
if (use_malloc)
{
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
size_t heap_workspace_id = allocator.reserve_workspace(num_rows * shared_data_bytes);
std::unique_ptr<gpu::primitive> kernel_launch(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void* buffer = runtime::gpu::invoke_memory_primitive(m_ctx, heap_workspace_id);
void** args_list = args.resolve_placeholder(0, &inputs[0])
.resolve_placeholder(1, &outputs[0])
.resolve_placeholder(2, &outputs[1])
.resolve_placeholder(3, &buffer)
.get_argument_list();
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
aligned_grid_size_x,
1,
1,
block_size_x,
1,
1,
0,
NULL, // stream
args_list,
0)); // arguments
debug_sync();
}});
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
}
else
{
std::unique_ptr<gpu::primitive> kernel_launch(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void** args_list = args.resolve_placeholder(0, &inputs[0])
.resolve_placeholder(1, &outputs[0])
.resolve_placeholder(2, &outputs[1])
.get_argument_list();
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
aligned_grid_size_x,
1,
1,
block_size_x,
1,
1,
shared_data_bytes, // shared mem
NULL, //stream
args_list,
0)); // arguments
debug_sync();
}});
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
}
return primitive_index;
}
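/* As a side note on the sizing logic above, the following is a minimal host-side
sketch (hypothetical, not part of this commit) of how 'shared_struct_bytes' and the
shared-vs-global decision work out for a float-value / int64-index TopK: */
#include <cuda_runtime.h>
#include <cstdio>

int main()
{
    size_t value_bytes = 4; // float
    size_t index_bytes = 8; // int64
    size_t num_cols = 100000;
    // Struct padding: a combined payload of <= 8 bytes packs into 8, otherwise 16.
    size_t shared_struct_bytes = (value_bytes + index_bytes <= 8) ? 8 : 16;
    size_t shared_data_bytes = num_cols * shared_struct_bytes;
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    bool use_malloc = shared_data_bytes > prop.sharedMemPerBlock;
    printf("row bytes: %zu, shared mem per block: %zu, use global workspace: %d\n",
           shared_data_bytes, prop.sharedMemPerBlock, (int)use_malloc);
    return 0;
}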
size_t runtime::gpu::CUDAEmitter::build_onehot(const std::array<std::string, 2>& dtypes,
NVShape input_shape,
NVShape output_shape,
......@@ -2165,7 +2297,6 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_window(const OpName op_name,
args_list.data(),
0)); // arguments
debug_sync();
}});
return this->m_primitive_emitter->register_primitive(f, hash);
......@@ -2656,7 +2787,6 @@ size_t runtime::gpu::CUDAEmitter::build_convolution(const std::array<std::string
std::unique_ptr<gpu::primitive> conv(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void** args_list = args.resolve_placeholder(0, &inputs[0])
.resolve_placeholder(1, &inputs[1])
.resolve_placeholder(2, &outputs[0])
......
......@@ -50,6 +50,13 @@ namespace ngraph
size_t build_primitive(const op::ReplaceSlice* node, bool in_place_op);
public:
size_t build_topk(const std::vector<element::Type>& dtypes,
const NVShape& input_shape,
const size_t topk_axis,
size_t topk_k,
const element::Type index_elem_type,
bool compute_max);
size_t build_pad(const std::vector<std::string>& dtypes,
NVShape input_shape,
NVShape output_shape,
......
......@@ -202,6 +202,130 @@ void runtime::gpu::CudaKernelBuilder::get_ew_collective_op(
return;
}
void runtime::gpu::CudaKernelBuilder::get_topk(codegen::CodeWriter& writer,
const std::string& name,
const std::vector<std::string>& dtypes,
bool compute_max,
runtime::gpu::GPUKernelArgs& args,
bool use_malloc)
{
writer << "struct Entry\n";
writer.block_begin();
{
writer << dtypes[0] << " value;\n";
writer << dtypes[1] << " index;\n";
writer << "__device__ " << dtypes[1] << " get_index() {return index;}\n";
writer << "__device__ "
<< "void set_index(" << dtypes[1] << " id) {index = id;}\n";
writer << "__device__ " << dtypes[0] << " get_value() {return value;}\n";
writer << "__device__ "
<< "void set_value(" << dtypes[0] << " val) {value = val;}\n";
}
writer.block_end();
writer << ";\n";
writer << "__device__ void swap(Entry& a, Entry& b)\n";
writer.block_begin();
{
writer << "Entry t = a;\n";
writer << "a = b;\n";
writer << "b = t;\n";
}
writer.block_end();
writer << "__device__ void heapify(Entry *heap, size_t heap_size, size_t idx)\n";
writer.block_begin();
{
writer << "size_t largest = idx;\n";
writer << "size_t left = (idx << 1) + 1;\n";
writer << "size_t right = (idx + 1) << 1;\n";
std::string g_op = ((compute_max) ? ">" : "<");
writer << "if (left < heap_size && heap[left].get_value() " << g_op
<< " heap[largest].get_value())\n";
writer.block_begin();
{
writer << "largest = left;\n";
}
writer.block_end();
writer << "if (right < heap_size && heap[right].get_value() " << g_op
<< " heap[largest].get_value())\n";
writer.block_begin();
{
writer << "largest = right;\n";
}
writer.block_end();
writer << "if (largest != idx)\n";
writer.block_begin();
{
writer << "swap(heap[largest], heap[idx]);\n";
writer << "heapify(heap, heap_size, largest);\n";
}
writer.block_end();
}
writer.block_end();
writer << "__device__ void create_and_build(Entry *entry, size_t size)\n";
writer.block_begin();
{
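// Bottom-up heap construction: heapify from the last internal node to the root.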
writer << "for (int i = (size-2) / 2; i >= 0; --i)\n";
writer.block_begin();
{
writer << "heapify(entry, size, i);\n";
}
writer.block_end();
}
writer.block_end();
writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
writer.block_begin();
{
writer << "in = in + blockIdx.x * num_cols;\n";
if (use_malloc)
{
writer << "entry = entry + blockIdx.x * num_cols;\n";
}
writer << "out_id = out_id + blockIdx.x * topk_k;\n";
writer << "out_val = out_val + blockIdx.x * topk_k;\n";
if (!use_malloc)
{
writer << "extern __shared__ Entry entry[];\n";
}
writer << "for (size_t i = threadIdx.x; i < num_cols; i += blockDim.x)\n";
writer.block_begin();
{
writer << "entry[i].set_value(in[i]);\n";
writer << "entry[i].set_index(i);\n";
}
writer.block_end();
writer << "__syncthreads();\n";
writer << "if (threadIdx.x == 0)\n";
writer.block_begin();
{
writer << "create_and_build(entry, num_cols);\n";
writer << "size_t changed_size_of_heap = num_cols;\n";
writer << "size_t k = 0;\n";
writer << "while (k++ < topk_k)\n";
writer.block_begin();
{
writer << "swap(*entry, entry[changed_size_of_heap - 1]);\n";
writer << "heapify(entry, --changed_size_of_heap, 0);\n";
}
writer.block_end();
writer << "for (size_t i = threadIdx.x; i < topk_k; i++)\n";
writer.block_begin();
{
writer << "out_val[i] = entry[num_cols - 1 - i].get_value();\n";
writer << "out_id[i] = entry[num_cols - 1 - i].get_index();\n";
}
writer.block_end();
}
writer.block_end();
}
writer.block_end();
}
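/* For concreteness, the source emitted by get_topk for dtypes {float, int32_t, float}
with compute_max = true and use_malloc = false comes out roughly as follows. This is
reconstructed by hand from the writer calls above; the kernel name and the exact
parameter types (supplied by GPUKernelArgs) are assumptions. Entry, swap, heapify, and
create_and_build are emitted ahead of the kernel exactly as written out above. */
extern "C" __global__ void cuda_topk_float_int32_t_float_cm_1_use_malloc_0(
    float* in, int32_t* out_id, float* out_val, size_t num_cols, size_t topk_k)
{
    // One block per row; offset the pointers to this block's row.
    in = in + blockIdx.x * num_cols;
    out_id = out_id + blockIdx.x * topk_k;
    out_val = out_val + blockIdx.x * topk_k;
    // Dynamic shared memory, sized at launch as num_cols * shared_struct_bytes.
    extern __shared__ Entry entry[];
    // All threads in the block cooperate on loading the row.
    for (size_t i = threadIdx.x; i < num_cols; i += blockDim.x)
    {
        entry[i].set_value(in[i]);
        entry[i].set_index(i);
    }
    __syncthreads();
    if (threadIdx.x == 0)
    {
        // Serial heap-select: build a max-heap, then pop the root topk_k times.
        create_and_build(entry, num_cols);
        size_t changed_size_of_heap = num_cols;
        size_t k = 0;
        while (k++ < topk_k)
        {
            swap(*entry, entry[changed_size_of_heap - 1]);
            heapify(entry, --changed_size_of_heap, 0);
        }
        // Popped entries sit at the tail in reverse order; copy them out.
        for (size_t i = 0; i < topk_k; i++)
        {
            out_val[i] = entry[num_cols - 1 - i].get_value();
            out_id[i] = entry[num_cols - 1 - i].get_index();
        }
    }
}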
// each thread computes the whole reduction of one output
void runtime::gpu::CudaKernelBuilder::get_reduce_to_nd_op(
codegen::CodeWriter& writer,
......
......@@ -85,6 +85,13 @@ namespace ngraph
size_t out_rank,
size_t reduce_rank);
static void get_topk(codegen::CodeWriter& writer,
const std::string& name,
const std::vector<std::string>& dtypes,
bool compute_max,
runtime::gpu::GPUKernelArgs& args,
bool use_malloc);
//using one block with at most 512 threads to reduce to scalar.
static void get_reduce_to_scalar_op(codegen::CodeWriter& writer,
const std::string& name,
......
......@@ -1583,7 +1583,32 @@ void runtime::gpu::GPU_Emitter::emit_Tanh(EMIT_ARGS)
void runtime::gpu::GPU_Emitter::emit_TopK(EMIT_ARGS)
{
if (out[0].get_size() == 0)
{
return;
}
auto topk = static_cast<const ngraph::op::TopK*>(node);
size_t topk_axis = topk->get_top_k_axis();
size_t topk_k = topk->get_k();
auto index_elem_type = topk->get_index_element_type();
bool compute_max = topk->get_compute_max();
std::vector<element::Type> dtypes{args[0].get_element_type()};
NGRAPH_ASSERT(out.size() == 2) << "TopK can only have 2 outputs";
for (size_t i = 0; i < out.size(); i++)
{
dtypes.push_back(out[i].get_element_type());
}
auto& input_shape = args[0].get_shape();
auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();
auto index = cuda_emitter->build_topk(
dtypes, input_shape, topk_axis, topk_k, index_elem_type, compute_max);
writer.block_begin();
{
writer << "void* input[] = {" << node_names(args) << "};\n";
writer << "void* output[] = {" << node_names(out) << "};\n";
writer << "gpu::invoke_primitive(ctx, " << index << ", input, output);\n";
}
writer.block_end();
}
string runtime::gpu::GPU_Emitter::node_names(const vector<GPUTensorWrapper>& args,
......
......@@ -572,8 +572,8 @@ void runtime::gpu::GPU_ExternalFunction::compile()
#endif
pass_manager.register_pass<runtime::gpu::pass::BatchNormCache>();
pass_manager.register_pass<ngraph::pass::LikeReplacement>();
pass_manager.register_pass<runtime::gpu::pass::GPULayout>(this);
pass_manager.register_pass<ngraph::pass::AssignLayout<descriptor::layout::DenseTensorLayout>>();
pass_manager.register_pass<ngraph::pass::Liveness>();
pass_manager.register_pass<ngraph::pass::MemoryLayout>(s_memory_pool_alignment);
pass_manager.register_pass<runtime::gpu::pass::TensorMemoryReservation>(
......
......@@ -81,7 +81,7 @@ namespace ngraph
// Retrieve the kernel parameter signature given the added kernel arguments.
//
std::string get_input_signature();
size_t get_size() { return m_argument_list.size(); }
private:
//
// Cache the host argument for persistence, add it to the argument list,
......
......@@ -21,8 +21,10 @@
#include <typeinfo>
#include "gpu_layout.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/replace_slice.hpp"
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/topk.hpp"
#include "ngraph/runtime/gpu/gpu_op_annotations.hpp"
using namespace std;
......@@ -79,6 +81,97 @@ namespace ngraph
reshape->set_op_annotations(op_annotations);
}
}
template <>
void GPULayout::LAYOUT_DECL(ngraph::op::TopK)
{
auto topk = std::dynamic_pointer_cast<ngraph::op::TopK>(node);
auto topk_axis = topk->get_top_k_axis();
auto topk_k = topk->get_k();
auto parent_node = topk->get_argument(0);
auto in_shape = topk->get_input_shape(0);
size_t ndim = in_shape.size();
if (in_shape.size() <= 2 && topk_axis == ndim - 1)
{
return;
}
else
{
auto out_shape = in_shape;
out_shape[topk_axis] = topk_k;
AxisVector reshape_axis_order = ngraph::get_default_order(ndim);
reshape_axis_order.erase(reshape_axis_order.begin() + topk_axis);
reshape_axis_order.push_back(topk_axis);
Shape pre_reshape_out;
for (size_t j = 0; j < ndim; j++)
{
pre_reshape_out.push_back(in_shape[reshape_axis_order[j]]);
}
Shape pre_2d_reshape_out(2);
pre_2d_reshape_out[1] = pre_reshape_out[ndim - 1];
pre_2d_reshape_out[0] =
ngraph::shape_size(pre_reshape_out) / pre_2d_reshape_out[1];
auto pre_reshape = make_shared<ngraph::op::Reshape>(
parent_node, reshape_axis_order, pre_reshape_out);
AxisVector axis_order = ngraph::get_default_order(ndim);
auto pre_2d_reshape = make_shared<ngraph::op::Reshape>(
pre_reshape, axis_order, pre_2d_reshape_out);
insert_new_node_between(parent_node, topk, pre_reshape);
insert_new_node_between(pre_reshape, topk, pre_2d_reshape);
NodeVector goes = op::get_output_elements(topk);
auto new_topk =
make_shared<ngraph::op::TopK>(pre_2d_reshape,
1,
topk->get_index_element_type(),
topk->get_k(),
topk->get_compute_max());
ngraph::replace_node(topk, new_topk);
// Replace old goe with new goe based on new topk
NodeVector new_goes;
for (auto& goe : goes)
{
auto out_idx =
std::dynamic_pointer_cast<op::GetOutputElement>(goe)->get_n();
auto new_goe =
std::make_shared<op::GetOutputElement>(new_topk, out_idx);
ngraph::replace_node(goe, new_goe);
new_goes.push_back(new_goe);
}
Shape reordered_out_shape;
for (size_t j = 0; j < ndim; j++)
{
reordered_out_shape.push_back(out_shape[reshape_axis_order[j]]);
}
NodeVector post_2d_reshapes = insert_new_reshape_after(
new_goes, AxisVector{0, 1}, reordered_out_shape);
axis_order.pop_back();
axis_order.insert(axis_order.begin() + topk_axis, 1, ndim - 1);
insert_new_reshape_after(post_2d_reshapes, axis_order, out_shape);
}
}
NodeVector insert_new_reshape_after(NodeVector& parents,
const AxisVector& axis_vector,
const Shape& out_shape)
{
NodeVector reshapes;
for (auto& parent : parents)
{
for (auto node : parent->get_users())
{
for (size_t i = 0; i < node->get_input_size(); i++)
{
if (node->get_argument(i) == parent)
{
auto new_reshape = make_shared<ngraph::op::Reshape>(
parent, axis_vector, out_shape);
node->get_inputs().at(i).replace_output(
new_reshape->get_outputs().at(0));
reshapes.push_back(new_reshape);
}
}
}
}
return reshapes;
}
}
}
}
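/* To trace the layout transform above with concrete numbers, here is a small
standalone sketch (illustration only, not part of the commit) for a rank-3 TopK
over axis 1 with k = 2: */
#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<size_t> in_shape{2, 3, 4};
    size_t topk_axis = 1, topk_k = 2, ndim = in_shape.size();
    // Move the topk axis to the innermost position: {0,1,2} -> {0,2,1}.
    std::vector<size_t> order{0, 1, 2};
    order.erase(order.begin() + topk_axis);
    order.push_back(topk_axis);
    // pre_reshape_out = {2,4,3}; collapsing the leading dims gives an {8,3} matrix.
    std::vector<size_t> pre_reshape_out;
    for (size_t j = 0; j < ndim; j++)
    {
        pre_reshape_out.push_back(in_shape[order[j]]);
    }
    size_t rows = pre_reshape_out[0] * pre_reshape_out[1];
    printf("2d topk input: {%zu, %zu}, output: {%zu, %zu}\n",
           rows, pre_reshape_out[2], rows, topk_k);
    // The {8,2} result is reshaped to {2,4,2} and finally transposed back with
    // axis order {0,2,1}, yielding the expected output shape {2,2,4}.
    return 0;
}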
......@@ -90,6 +183,7 @@ static const runtime::gpu::pass::LayoutOpMap s_dispatcher{
{TI(ngraph::op::ReplaceSlice),
&runtime::gpu::pass::GPULayout::layout<ngraph::op::ReplaceSlice>},
{TI(ngraph::op::Reshape), &runtime::gpu::pass::GPULayout::layout<ngraph::op::Reshape>},
{TI(ngraph::op::TopK), &runtime::gpu::pass::GPULayout::layout<ngraph::op::TopK>},
};
bool runtime::gpu::pass::GPULayout::run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes)
......
......@@ -54,6 +54,10 @@ namespace ngraph
private:
GPU_ExternalFunction* m_external_function;
};
NodeVector insert_new_reshape_after(NodeVector& parents,
const AxisVector& axis_vector,
const Shape& out_shape);
}
}
}
......
......@@ -29,24 +29,6 @@ backwards_maxpool_n2_c1_hw5_3x3_str2_max
backwards_avgpool_n1_c1_hw2x2
backwards_avgpool_n1_c1_hw4x4
backwards_avgpool_n2_c2_hw4x4
topk_1d_max_all
topk_1d_max_partial
topk_1d_max_one
topk_1d_min_all
topk_1d_min_partial
topk_1d_min_one
topk_2d_max_all
topk_2d_max_partial
topk_2d_max_one
topk_2d_min_all
topk_2d_min_partial
topk_2d_min_one
topk_3d_max_all
topk_3d_max_partial
topk_3d_max_one
topk_3d_min_all
topk_3d_min_partial
topk_3d_min_one
quantize
quantize_axes
quantize_int8
......
......@@ -78,6 +78,8 @@ topk_3d_max_partial
topk_3d_min_all
topk_3d_min_one
topk_3d_min_partial
topk_5d_max_partial
topk_int64
zero_sized_abs
zero_sized_acos
zero_sized_add
......
......@@ -7,3 +7,4 @@ batchnorm_fprop_inference_b2c2h2w1
batchnorm_fprop_bprop
batchnorm_fprop_bprop_2step
computation_reuse
topk_int64
......@@ -112,6 +112,8 @@ set(MULTI_TEST_SRC
backend_reduce.in.cpp
backend_reshape.in.cpp
backend_sum.in.cpp
backend_topk.in.cpp
backend_arg_reduce.in.cpp
backend_test.in.cpp
backend_unary_elementwise.in.cpp
convolution_test.in.cpp
......