Commit a86a9050 authored by Fenglei's avatar Fenglei Committed by Robert Kimball

gpu add onehot op (#638)

* add onehot op

* refactor broadcast and onehot op
parent 9d89ffb9
...@@ -56,6 +56,54 @@ void runtime::gpu::CudaKernelBuilder::get_elementwise_op( ...@@ -56,6 +56,54 @@ void runtime::gpu::CudaKernelBuilder::get_elementwise_op(
return; return;
} }
void runtime::gpu::CudaKernelBuilder::get_broadcast_op(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 2>& data_types)
{
writer << "extern \"C\" __global__ void cuda_" << name << "(" << data_types[0] << "* in, "
<< data_types[1] << "* out, size_t m, size_t k, size_t n)\n";
writer << "{\n";
writer.indent++;
{
writer << "size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
writer << "if (tid < n)\n";
writer << "{\n";
writer.indent++;
{
writer << "size_t idx = tid / (m * k) * m + tid % m;\n";
writer << "out[tid] = in[idx];\n";
}
writer.indent--;
writer << "}\n";
}
writer.indent--;
writer << "}\n";
}
void runtime::gpu::CudaKernelBuilder::get_onehot_op(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 2>& data_types)
{
writer << "extern \"C\" __global__ void cuda_" << name << "(" << data_types[0] << "* in, "
<< data_types[1] << "* out, size_t m, size_t k, size_t n)\n";
writer << "{\n";
writer.indent++;
{
writer << "size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
writer << "if (tid < n)\n";
writer << "{\n";
writer.indent++;
{
writer << "size_t idx = (tid / m) * m * k + (m * in[tid]) + tid % m;\n";
writer << "out[idx] = 1;\n";
}
writer.indent--;
writer << "}\n";
}
writer.indent--;
writer << "}\n";
}
void runtime::gpu::CudaKernelBuilder::get_device_helper( void runtime::gpu::CudaKernelBuilder::get_device_helper(
codegen::CodeWriter& writer, codegen::CodeWriter& writer,
const std::string& name, const std::string& name,
......
...@@ -39,6 +39,14 @@ namespace ngraph ...@@ -39,6 +39,14 @@ namespace ngraph
const std::array<std::string, 2>& data_types, const std::array<std::string, 2>& data_types,
const size_t& num_inputs); const size_t& num_inputs);
static void get_broadcast_op(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 2>& data_types);
static void get_onehot_op(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 2>& data_types);
static void get_device_helper(codegen::CodeWriter& writer, static void get_device_helper(codegen::CodeWriter& writer,
const std::string& name, const std::string& name,
const std::string& math_kernel, const std::string& math_kernel,
......
...@@ -21,40 +21,65 @@ ...@@ -21,40 +21,65 @@
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp" #include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
using namespace ngraph; using namespace ngraph;
using namespace ngraph::runtime::gpu;
void runtime::gpu::emit_broadcast( void runtime::gpu::emit_broadcast(std::string name,
void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count) CUdeviceptr in,
CUdeviceptr out,
std::array<std::string, 2> data_types,
size_t repeat_size,
size_t repeat_times,
size_t count)
{ {
std::string name = "broadcast"; std::string name_signature = name + "_" + data_types[0] + "_" + data_types[1];
std::replace(name_signature.begin(), name_signature.end(), ' ', '_');
// Create an instance of nvrtcProgram with the code string. // Create an instance of nvrtcProgram with the code string.
if (CudaFunctionPool::instance().get(name) == nullptr) if (CudaFunctionPool::instance().get(name_signature) == nullptr)
{ {
std::string kernel; codegen::CodeWriter writer;
std::string data_type("float"); CudaKernelBuilder::add_pod_typedefs(writer);
CudaKernelBuilder::get_broadcast_op(writer, name_signature, data_types);
std::string kernel = writer.get_code();
CudaFunctionPool::instance().set(name_signature, kernel);
}
kernel = R"( void* args_list[] = {&in, &out, &repeat_size, &repeat_times, &count};
extern "C" __global__ CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name_signature).get(),
void cuda_)" + name + static_cast<unsigned int>(count),
"(" + data_type + "* in, " + data_type + "* out, size_t m, size_t k, size_t n)\n" + 1,
R"( 1, // grid dim
1,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list,
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
void runtime::gpu::emit_onehot(std::string name,
CUdeviceptr in,
CUdeviceptr out,
std::array<std::string, 2> data_types,
size_t repeat_size,
size_t repeat_times,
size_t count)
{ {
size_t tid = blockIdx.x * blockDim.x + threadIdx.x; std::string name_signature = name + "_" + data_types[0] + "_" + data_types[1];
if(tid < n) std::replace(name_signature.begin(), name_signature.end(), ' ', '_');
// Create an instance of nvrtcProgram with the code string.
if (CudaFunctionPool::instance().get(name_signature) == nullptr)
{ {
size_t idx = tid / (m * k) * m + tid % m; codegen::CodeWriter writer;
out[tid] = in[idx]; CudaKernelBuilder::add_pod_typedefs(writer);
CudaKernelBuilder::get_onehot_op(writer, name_signature, data_types);
std::string kernel = writer.get_code();
CudaFunctionPool::instance().set(name_signature, kernel);
} }
})";
CudaFunctionPool::instance().set(name, kernel);
}
//convert runtime ptr to driver api ptr
CUdeviceptr d_ptr_in, d_ptr_out;
d_ptr_in = CUdeviceptr(in);
d_ptr_out = CUdeviceptr(out);
void* args_list[] = {&d_ptr_in, &d_ptr_out, &repeat_size, &repeat_times, &count}; void* args_list[] = {&in, &out, &repeat_size, &repeat_times, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(), CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name_signature).get(),
static_cast<unsigned int>(count), static_cast<unsigned int>(count),
1, 1,
1, // grid dim 1, // grid dim
......
...@@ -34,8 +34,21 @@ namespace ngraph ...@@ -34,8 +34,21 @@ namespace ngraph
template <typename T> template <typename T>
struct CudaOpMap; struct CudaOpMap;
void emit_broadcast( void emit_broadcast(std::string name,
void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count); CUdeviceptr in,
CUdeviceptr out,
std::array<std::string, 2> data_types,
size_t repeat_size,
size_t repeat_times,
size_t count);
void emit_onehot(std::string name,
CUdeviceptr in,
CUdeviceptr out,
std::array<std::string, 2> data_types,
size_t repeat_size,
size_t repeat_times,
size_t count);
template <typename T, typename... Inputs> template <typename T, typename... Inputs>
void emit_elementwise_op(std::string name, void emit_elementwise_op(std::string name,
...@@ -45,6 +58,7 @@ namespace ngraph ...@@ -45,6 +58,7 @@ namespace ngraph
Inputs&&... inputs) Inputs&&... inputs)
{ {
std::string type_signature = "_" + data_types[0] + "_" + data_types[1]; std::string type_signature = "_" + data_types[0] + "_" + data_types[1];
std::replace(type_signature.begin(), type_signature.end(), ' ', '_');
if (CudaFunctionPool::instance().get(name + type_signature) == nullptr) if (CudaFunctionPool::instance().get(name + type_signature) == nullptr)
{ {
codegen::CodeWriter writer; codegen::CodeWriter writer;
......
...@@ -123,10 +123,10 @@ namespace ngraph ...@@ -123,10 +123,10 @@ namespace ngraph
<< n->description() << ">(\"" << n->description() << "\"" << n->description() << ">(\"" << n->description() << "\""
<< ", {\"" << args[0].get_type() << "\", \"" << out[0].get_type() << "\"}" << ", {\"" << args[0].get_type() << "\", \"" << out[0].get_type() << "\"}"
<< ", count" << ", count"
<< ", (CUdeviceptr) " << out[0].get_name(); << ", CUdeviceptr(" << out[0].get_name() << ")";
for (size_t i = 0; i < args.size(); i++) for (size_t i = 0; i < args.size(); i++)
{ {
writer << ", (CUdeviceptr) " << args[i].get_name(); writer << ", CUdeviceptr(" << args[i].get_name() << ")";
} }
writer << ");\n"; writer << ");\n";
writer.indent--; writer.indent--;
...@@ -489,9 +489,14 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -489,9 +489,14 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
writer << "{ // " << node->get_name() << " \n"; writer << "{ // " << node->get_name() << " \n";
writer.indent++; writer.indent++;
writer << "runtime::gpu::emit_broadcast(" << args[0].get_name() << ", " writer << "runtime::gpu::emit_broadcast(\"" << node->description()
<< out[0].get_name() << ", " << repeat_size << ", " << repeat_times << "\", CUdeviceptr(" << args[0].get_name() << "), CUdeviceptr("
<< ", " << out[0].get_size() << ");\n"; << out[0].get_name() << ")"
<< ", {\"" << args[0].get_type() << "\", \"" << out[0].get_type()
<< "\"}"
<< ", " << repeat_size << ", " << repeat_times << ", "
<< out[0].get_size() << ");\n";
writer.indent--; writer.indent--;
writer << "}\n"; writer << "}\n";
} }
...@@ -619,6 +624,38 @@ cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -619,6 +624,38 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
writer << "}\n"; writer << "}\n";
} }
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::OneHot)
{
if (out[0].get_size() == 0)
{
return;
}
auto onehot = static_cast<const ngraph::op::OneHot*>(node);
auto arg_shape = args[0].get_shape();
auto result_shape = out[0].get_shape();
size_t idx = onehot->get_one_hot_axis();
size_t repeat_times = result_shape[idx];
size_t repeat_size = 1;
for (size_t i = idx + 1; i < result_shape.size(); i++)
{
repeat_size *= result_shape[i];
}
writer << "{ // " << node->get_name() << "\n";
writer.indent++;
writer << "runtime::gpu::cuda_memset(" << out[0].get_name() << ", 0, "
<< out[0].get_size() << " * " << out[0].get_element_type().size() << ");\n";
writer << "runtime::gpu::emit_onehot(\"" << node->description()
<< "\", CUdeviceptr(" << args[0].get_name() << "), CUdeviceptr("
<< out[0].get_name() << ")"
<< ", {\"" << args[0].get_type() << "\", \"" << out[0].get_type() << "\"}"
<< ", " << repeat_size << ", " << repeat_times << ", " << args[0].get_size()
<< ");\n";
writer.indent--;
writer << "}\n";
}
template <> template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Sqrt) void GPU_Emitter::EMITTER_DECL(ngraph::op::Sqrt)
{ {
......
...@@ -909,7 +909,6 @@ TEST(${BACKEND_NAME}, backwards_log) ...@@ -909,7 +909,6 @@ TEST(${BACKEND_NAME}, backwards_log)
TEST(${BACKEND_NAME}, backwards_maximum) TEST(${BACKEND_NAME}, backwards_maximum)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}"); auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend(); auto backend = manager->allocate_backend();
...@@ -930,7 +929,6 @@ TEST(${BACKEND_NAME}, backwards_maximum) ...@@ -930,7 +929,6 @@ TEST(${BACKEND_NAME}, backwards_maximum)
TEST(${BACKEND_NAME}, backwards_minimum) TEST(${BACKEND_NAME}, backwards_minimum)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}"); auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend(); auto backend = manager->allocate_backend();
......
...@@ -3764,7 +3764,6 @@ TEST(${BACKEND_NAME}, replace_slice_vector) ...@@ -3764,7 +3764,6 @@ TEST(${BACKEND_NAME}, replace_slice_vector)
TEST(${BACKEND_NAME}, one_hot_scalar_2_in_3) TEST(${BACKEND_NAME}, one_hot_scalar_2_in_3)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{}; Shape shape_a{};
auto A = make_shared<op::Parameter>(element::i32, shape_a); auto A = make_shared<op::Parameter>(element::i32, shape_a);
Shape shape_r{3}; Shape shape_r{3};
...@@ -3787,7 +3786,6 @@ TEST(${BACKEND_NAME}, one_hot_scalar_2_in_3) ...@@ -3787,7 +3786,6 @@ TEST(${BACKEND_NAME}, one_hot_scalar_2_in_3)
TEST(${BACKEND_NAME}, one_hot_scalar_1_in_3) TEST(${BACKEND_NAME}, one_hot_scalar_1_in_3)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{}; Shape shape_a{};
auto A = make_shared<op::Parameter>(element::i32, shape_a); auto A = make_shared<op::Parameter>(element::i32, shape_a);
Shape shape_r{3}; Shape shape_r{3};
...@@ -3810,7 +3808,6 @@ TEST(${BACKEND_NAME}, one_hot_scalar_1_in_3) ...@@ -3810,7 +3808,6 @@ TEST(${BACKEND_NAME}, one_hot_scalar_1_in_3)
TEST(${BACKEND_NAME}, one_hot_scalar_0_in_3) TEST(${BACKEND_NAME}, one_hot_scalar_0_in_3)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{}; Shape shape_a{};
auto A = make_shared<op::Parameter>(element::i32, shape_a); auto A = make_shared<op::Parameter>(element::i32, shape_a);
Shape shape_r{3}; Shape shape_r{3};
...@@ -3899,7 +3896,6 @@ TEST(${BACKEND_NAME}, one_hot_scalar_oob_in_3) ...@@ -3899,7 +3896,6 @@ TEST(${BACKEND_NAME}, one_hot_scalar_oob_in_3)
TEST(${BACKEND_NAME}, one_hot_vector_0) TEST(${BACKEND_NAME}, one_hot_vector_0)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{8}; Shape shape_a{8};
auto A = make_shared<op::Parameter>(element::i32, shape_a); auto A = make_shared<op::Parameter>(element::i32, shape_a);
Shape shape_r{3, 8}; Shape shape_r{3, 8};
...@@ -3924,7 +3920,6 @@ TEST(${BACKEND_NAME}, one_hot_vector_0) ...@@ -3924,7 +3920,6 @@ TEST(${BACKEND_NAME}, one_hot_vector_0)
TEST(${BACKEND_NAME}, one_hot_vector_1) TEST(${BACKEND_NAME}, one_hot_vector_1)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{8}; Shape shape_a{8};
auto A = make_shared<op::Parameter>(element::i32, shape_a); auto A = make_shared<op::Parameter>(element::i32, shape_a);
Shape shape_r{8, 3}; Shape shape_r{8, 3};
...@@ -4015,7 +4010,6 @@ TEST(${BACKEND_NAME}, one_hot_vector_1_far_oob) ...@@ -4015,7 +4010,6 @@ TEST(${BACKEND_NAME}, one_hot_vector_1_far_oob)
TEST(${BACKEND_NAME}, one_hot_matrix_0) TEST(${BACKEND_NAME}, one_hot_matrix_0)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{3, 3}; Shape shape_a{3, 3};
auto A = make_shared<op::Parameter>(element::i32, shape_a); auto A = make_shared<op::Parameter>(element::i32, shape_a);
Shape shape_r{3, 3, 3}; Shape shape_r{3, 3, 3};
...@@ -4046,7 +4040,6 @@ TEST(${BACKEND_NAME}, one_hot_matrix_0) ...@@ -4046,7 +4040,6 @@ TEST(${BACKEND_NAME}, one_hot_matrix_0)
TEST(${BACKEND_NAME}, one_hot_vector_1_fp) TEST(${BACKEND_NAME}, one_hot_vector_1_fp)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{8}; Shape shape_a{8};
auto A = make_shared<op::Parameter>(element::f32, shape_a); auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{8, 3}; Shape shape_r{8, 3};
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment