Commit a86a9050 authored by Fenglei, committed by Robert Kimball

gpu add onehot op (#638)

* add onehot op

* refactor broadcast and onehot op
parent 9d89ffb9
@@ -56,6 +56,54 @@ void runtime::gpu::CudaKernelBuilder::get_elementwise_op(
return;
}
void runtime::gpu::CudaKernelBuilder::get_broadcast_op(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 2>& data_types)
{
writer << "extern \"C\" __global__ void cuda_" << name << "(" << data_types[0] << "* in, "
<< data_types[1] << "* out, size_t m, size_t k, size_t n)\n";
writer << "{\n";
writer.indent++;
{
writer << "size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
writer << "if (tid < n)\n";
writer << "{\n";
writer.indent++;
{
writer << "size_t idx = tid / (m * k) * m + tid % m;\n";
writer << "out[tid] = in[idx];\n";
}
writer.indent--;
writer << "}\n";
}
writer.indent--;
writer << "}\n";
}
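For orientation, get_broadcast_op only assembles a kernel string. For a {"float", "float"} instantiation registered under a mangled name such as Broadcast_float_float (name chosen for illustration), the emitted CUDA source would look roughly like this sketch, reconstructed from the writer calls above:
```cpp
// Sketch of the source get_broadcast_op writes for data_types = {"float", "float"}
// and an illustrative mangled name "Broadcast_float_float".
// m = repeat_size, k = repeat_times, n = number of output elements.
extern "C" __global__ void cuda_Broadcast_float_float(float* in, float* out, size_t m, size_t k, size_t n)
{
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)
    {
        // Map each output element back to the input element it repeats.
        size_t idx = tid / (m * k) * m + tid % m;
        out[tid] = in[idx];
    }
}
```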
void runtime::gpu::CudaKernelBuilder::get_onehot_op(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 2>& data_types)
{
writer << "extern \"C\" __global__ void cuda_" << name << "(" << data_types[0] << "* in, "
<< data_types[1] << "* out, size_t m, size_t k, size_t n)\n";
writer << "{\n";
writer.indent++;
{
writer << "size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
writer << "if (tid < n)\n";
writer << "{\n";
writer.indent++;
{
writer << "size_t idx = (tid / m) * m * k + (m * in[tid]) + tid % m;\n";
writer << "out[idx] = 1;\n";
}
writer.indent--;
writer << "}\n";
}
writer.indent--;
writer << "}\n";
}
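The one-hot builder follows the same pattern but scatters instead of gathers: each input element writes a 1 into the output slice selected by its value, assuming the output was zeroed beforehand. A rough sketch of the emitted source for {"float", "float"} (kernel name illustrative):
```cpp
// Sketch of the source get_onehot_op writes for data_types = {"float", "float"}.
// m = repeat_size (product of dims after the one-hot axis), k = repeat_times
// (length of the one-hot axis), n = number of input elements.
extern "C" __global__ void cuda_OneHot_float_float(float* in, float* out, size_t m, size_t k, size_t n)
{
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)
    {
        // (tid / m) * m * k locates the outer block, m * in[tid] selects the hot
        // slice along the new axis, tid % m keeps the inner offset.
        size_t idx = (tid / m) * m * k + (m * in[tid]) + tid % m;
        out[idx] = 1;
    }
}
```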
void runtime::gpu::CudaKernelBuilder::get_device_helper(
codegen::CodeWriter& writer,
const std::string& name,
......
@@ -39,6 +39,14 @@ namespace ngraph
const std::array<std::string, 2>& data_types,
const size_t& num_inputs);
static void get_broadcast_op(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 2>& data_types);
static void get_onehot_op(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 2>& data_types);
static void get_device_helper(codegen::CodeWriter& writer,
const std::string& name,
const std::string& math_kernel,
......
@@ -21,40 +21,65 @@
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
using namespace ngraph;
using namespace ngraph::runtime::gpu;
-void runtime::gpu::emit_broadcast(
-void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count)
+void runtime::gpu::emit_broadcast(std::string name,
+CUdeviceptr in,
+CUdeviceptr out,
+std::array<std::string, 2> data_types,
+size_t repeat_size,
+size_t repeat_times,
+size_t count)
{
-std::string name = "broadcast";
+std::string name_signature = name + "_" + data_types[0] + "_" + data_types[1];
+std::replace(name_signature.begin(), name_signature.end(), ' ', '_');
// Create an instance of nvrtcProgram with the code string.
-if (CudaFunctionPool::instance().get(name) == nullptr)
+if (CudaFunctionPool::instance().get(name_signature) == nullptr)
{
-std::string kernel;
-std::string data_type("float");
+codegen::CodeWriter writer;
+CudaKernelBuilder::add_pod_typedefs(writer);
+CudaKernelBuilder::get_broadcast_op(writer, name_signature, data_types);
+std::string kernel = writer.get_code();
+CudaFunctionPool::instance().set(name_signature, kernel);
+}
-kernel = R"(
-extern "C" __global__
-void cuda_)" + name +
-"(" + data_type + "* in, " + data_type + "* out, size_t m, size_t k, size_t n)\n" +
-R"(
+void* args_list[] = {&in, &out, &repeat_size, &repeat_times, &count};
+CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name_signature).get(),
+static_cast<unsigned int>(count),
+1,
+1, // grid dim
+1,
+1,
+1, // block dim
+0,
+NULL, // shared mem and stream
+args_list,
+0)); // arguments
+CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
+}
+void runtime::gpu::emit_onehot(std::string name,
+CUdeviceptr in,
+CUdeviceptr out,
+std::array<std::string, 2> data_types,
+size_t repeat_size,
+size_t repeat_times,
+size_t count)
{
-size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-if(tid < n)
+std::string name_signature = name + "_" + data_types[0] + "_" + data_types[1];
+std::replace(name_signature.begin(), name_signature.end(), ' ', '_');
+// Create an instance of nvrtcProgram with the code string.
+if (CudaFunctionPool::instance().get(name_signature) == nullptr)
{
-size_t idx = tid / (m * k) * m + tid % m;
-out[tid] = in[idx];
+codegen::CodeWriter writer;
+CudaKernelBuilder::add_pod_typedefs(writer);
+CudaKernelBuilder::get_onehot_op(writer, name_signature, data_types);
+std::string kernel = writer.get_code();
+CudaFunctionPool::instance().set(name_signature, kernel);
}
-})";
-CudaFunctionPool::instance().set(name, kernel);
-}
-//convert runtime ptr to driver api ptr
-CUdeviceptr d_ptr_in, d_ptr_out;
-d_ptr_in = CUdeviceptr(in);
-d_ptr_out = CUdeviceptr(out);
-void* args_list[] = {&d_ptr_in, &d_ptr_out, &repeat_size, &repeat_times, &count};
-CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(),
+void* args_list[] = {&in, &out, &repeat_size, &repeat_times, &count};
+CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name_signature).get(),
static_cast<unsigned int>(count),
1,
1, // grid dim
......
@@ -34,8 +34,21 @@ namespace ngraph
template <typename T>
struct CudaOpMap;
-void emit_broadcast(
-void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count);
+void emit_broadcast(std::string name,
+CUdeviceptr in,
+CUdeviceptr out,
+std::array<std::string, 2> data_types,
+size_t repeat_size,
+size_t repeat_times,
+size_t count);
+void emit_onehot(std::string name,
+CUdeviceptr in,
+CUdeviceptr out,
+std::array<std::string, 2> data_types,
+size_t repeat_size,
+size_t repeat_times,
+size_t count);
template <typename T, typename... Inputs>
void emit_elementwise_op(std::string name,
@@ -45,6 +58,7 @@ namespace ngraph
Inputs&&... inputs)
{
std::string type_signature = "_" + data_types[0] + "_" + data_types[1];
+std::replace(type_signature.begin(), type_signature.end(), ' ', '_');
if (CudaFunctionPool::instance().get(name + type_signature) == nullptr)
{
codegen::CodeWriter writer;
......
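The std::replace added to emit_elementwise_op above mirrors the name_signature mangling in emit_broadcast and emit_onehot earlier in this diff: it presumably keeps "cuda_" + name + type_signature a valid identifier if a type string ever contains a space. A minimal standalone sketch with a hypothetical type string:
```cpp
// Minimal sketch of the name mangling: spaces in type strings become '_' so the
// concatenated kernel name stays a valid CUDA identifier. Type strings are hypothetical.
#include <algorithm>
#include <cassert>
#include <string>

int main()
{
    std::string type_signature = std::string("_") + "unsigned char" + "_" + "float";
    std::replace(type_signature.begin(), type_signature.end(), ' ', '_');
    assert(type_signature == "_unsigned_char_float");
    return 0;
}
```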
@@ -123,10 +123,10 @@ namespace ngraph
<< n->description() << ">(\"" << n->description() << "\""
<< ", {\"" << args[0].get_type() << "\", \"" << out[0].get_type() << "\"}"
<< ", count"
-<< ", (CUdeviceptr) " << out[0].get_name();
+<< ", CUdeviceptr(" << out[0].get_name() << ")";
for (size_t i = 0; i < args.size(); i++)
{
-writer << ", (CUdeviceptr) " << args[i].get_name();
+writer << ", CUdeviceptr(" << args[i].get_name() << ")";
}
writer << ");\n";
writer.indent--;
@@ -489,9 +489,14 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
writer << "{ // " << node->get_name() << " \n";
writer.indent++;
-writer << "runtime::gpu::emit_broadcast(" << args[0].get_name() << ", "
-<< out[0].get_name() << ", " << repeat_size << ", " << repeat_times
-<< ", " << out[0].get_size() << ");\n";
+writer << "runtime::gpu::emit_broadcast(\"" << node->description()
+<< "\", CUdeviceptr(" << args[0].get_name() << "), CUdeviceptr("
+<< out[0].get_name() << ")"
+<< ", {\"" << args[0].get_type() << "\", \"" << out[0].get_type()
+<< "\"}"
+<< ", " << repeat_size << ", " << repeat_times << ", "
+<< out[0].get_size() << ");\n";
writer.indent--;
writer << "}\n";
}
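For reference, the writer statements above expand into a single call in the generated compiled-function source. With hypothetical tensor names arg0/out0 and sizes invented purely for this sketch, the emitted line would read roughly:
```cpp
// Illustrative excerpt of the code the Broadcast emitter generates
// (tensor names arg0/out0 and the sizes 4, 3, 12 are made up for this sketch).
runtime::gpu::emit_broadcast("Broadcast",
                             CUdeviceptr(arg0),
                             CUdeviceptr(out0),
                             {"float", "float"},
                             4,   // repeat_size
                             3,   // repeat_times
                             12); // out[0].get_size()
```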
@@ -619,6 +624,38 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
writer << "}\n";
}
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::OneHot)
{
if (out[0].get_size() == 0)
{
return;
}
auto onehot = static_cast<const ngraph::op::OneHot*>(node);
auto arg_shape = args[0].get_shape();
auto result_shape = out[0].get_shape();
size_t idx = onehot->get_one_hot_axis();
size_t repeat_times = result_shape[idx];
size_t repeat_size = 1;
for (size_t i = idx + 1; i < result_shape.size(); i++)
{
repeat_size *= result_shape[i];
}
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer << "runtime::gpu::cuda_memset(" << out[0].get_name() << ", 0, "
<< out[0].get_size() << " * " << out[0].get_element_type().size() << ");\n";
writer << "runtime::gpu::emit_onehot(\"" << node->description()
<< "\", CUdeviceptr(" << args[0].get_name() << "), CUdeviceptr("
<< out[0].get_name() << ")"
<< ", {\"" << args[0].get_type() << "\", \"" << out[0].get_type() << "\"}"
<< ", " << repeat_size << ", " << repeat_times << ", " << args[0].get_size()
<< ");\n";
writer.indent--;
writer << "}\n";
}
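To make the sizes concrete: for the one_hot_vector_1 case further down (input shape {8}, result shape {8, 3}, one-hot axis 1), this emitter would pass repeat_size = 1, repeat_times = 3, and count = 8, so the kernel writes out[tid * 3 + in[tid]] = 1 for each input element. A small self-contained sketch of that arithmetic (not part of the emitter):
```cpp
// Recomputes repeat_size/repeat_times the way the OneHot emitter above does,
// for the one_hot_vector_1 shapes: result {8, 3}, one-hot axis 1.
#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    std::vector<std::size_t> result_shape{8, 3};
    std::size_t axis = 1;
    std::size_t repeat_times = result_shape[axis]; // 3: length of the one-hot axis
    std::size_t repeat_size = 1;                   // product of dims after the axis
    for (std::size_t i = axis + 1; i < result_shape.size(); i++)
    {
        repeat_size *= result_shape[i]; // no dims after the axis, so this stays 1
    }
    assert(repeat_times == 3 && repeat_size == 1);
    // With m = 1 and k = 3, idx = (tid / m) * m * k + m * in[tid] + tid % m
    // reduces to tid * 3 + in[tid].
    return 0;
}
```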
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Sqrt)
{
......
@@ -909,7 +909,6 @@ TEST(${BACKEND_NAME}, backwards_log)
TEST(${BACKEND_NAME}, backwards_maximum)
{
-SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
@@ -930,7 +929,6 @@ TEST(${BACKEND_NAME}, backwards_maximum)
TEST(${BACKEND_NAME}, backwards_minimum)
{
-SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
......
@@ -3764,7 +3764,6 @@ TEST(${BACKEND_NAME}, replace_slice_vector)
TEST(${BACKEND_NAME}, one_hot_scalar_2_in_3)
{
-SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{};
auto A = make_shared<op::Parameter>(element::i32, shape_a);
Shape shape_r{3};
@@ -3787,7 +3786,6 @@ TEST(${BACKEND_NAME}, one_hot_scalar_2_in_3)
TEST(${BACKEND_NAME}, one_hot_scalar_1_in_3)
{
-SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{};
auto A = make_shared<op::Parameter>(element::i32, shape_a);
Shape shape_r{3};
@@ -3810,7 +3808,6 @@ TEST(${BACKEND_NAME}, one_hot_scalar_1_in_3)
TEST(${BACKEND_NAME}, one_hot_scalar_0_in_3)
{
-SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{};
auto A = make_shared<op::Parameter>(element::i32, shape_a);
Shape shape_r{3};
@@ -3899,7 +3896,6 @@ TEST(${BACKEND_NAME}, one_hot_scalar_oob_in_3)
TEST(${BACKEND_NAME}, one_hot_vector_0)
{
-SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{8};
auto A = make_shared<op::Parameter>(element::i32, shape_a);
Shape shape_r{3, 8};
@@ -3924,7 +3920,6 @@ TEST(${BACKEND_NAME}, one_hot_vector_0)
TEST(${BACKEND_NAME}, one_hot_vector_1)
{
-SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{8};
auto A = make_shared<op::Parameter>(element::i32, shape_a);
Shape shape_r{8, 3};
@@ -4015,7 +4010,6 @@ TEST(${BACKEND_NAME}, one_hot_vector_1_far_oob)
TEST(${BACKEND_NAME}, one_hot_matrix_0)
{
-SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{3, 3};
auto A = make_shared<op::Parameter>(element::i32, shape_a);
Shape shape_r{3, 3, 3};
@@ -4046,7 +4040,6 @@ TEST(${BACKEND_NAME}, one_hot_matrix_0)
TEST(${BACKEND_NAME}, one_hot_vector_1_fp)
{
-SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{8};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{8, 3};
......