Commit 41268068 authored by Fenglei's avatar Fenglei Committed by Robert Kimball

gpu broadcast (#576)

* add gpu broadcast

* add broadcast kernel

* fix bug for cuda_memcpyDtD usage in gpu_external_function.cpp
parent ae50019e
......@@ -61,6 +61,55 @@ namespace ngraph
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
void emit_broadcast(
void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count)
{
std::string name = "broadcast";
// Create an instance of nvrtcProgram with the code string.
if (CudaFunctionPool::instance().get(name) == nullptr)
{
const char* opts[] = {"--gpu-architecture=compute_35",
"--relocatable-device-code=true"};
std::string kernel;
std::string data_type("float");
kernel = R"(
extern "C" __global__
void cuda_)" + name + "(" + data_type +
"* in, " + data_type + "* out, size_t m, size_t k, size_t n)\n" + R"(
{
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid < n)
{
size_t idx = tid / (m * k) * m + tid % m;
out[tid] = in[idx];
}
})";
CudaFunctionPool::instance().set(
name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts));
}
//convert runtime ptr to driver api ptr
CUdeviceptr d_ptr_in, d_ptr_out;
d_ptr_in = (CUdeviceptr)in;
d_ptr_out = (CUdeviceptr)out;
void* args_list[] = {&d_ptr_in, &d_ptr_out, &repeat_size, &repeat_times, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(),
count,
1,
1, // grid dim
1,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list,
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
}
}
}
......@@ -27,6 +27,8 @@ namespace ngraph
namespace gpu
{
void emit_abs(void* in, void* out, size_t count);
void emit_broadcast(
void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count);
}
}
}
......@@ -457,7 +457,65 @@ void runtime::gpu::GPU_Emitter::EmitBroadcast(
// Emits GPU code for the Broadcast op: copies args[0] into out[0],
// replicating it along the op's broadcast axes.
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
// NOTE(review): this throw looks like the *removed* side of the diff (the
// scrape lost the +/- markers); if kept, everything below is unreachable.
// Confirm it is deleted in the applied change.
throw std::runtime_error(n->get_name() + " is not implemented.");
auto broadcast = static_cast<const ngraph::op::Broadcast*>(n);
auto arg_shape = args[0].get_shape();
auto result_shape = out[0].get_shape();
auto& axes = broadcast->get_broadcast_axes();
// No broadcast axes: output is an element-for-element copy of the input,
// so emit a plain device-to-device memcpy.
if (axes.empty())
{
writer << "{ // " << n->get_name() << " \n";
writer.indent++;
writer << "runtime::gpu::cuda_memcpyDtD(" << out[0].get_name() << ", " << args[0].get_name()
<< ", " << out[0].get_size() << " * " << out[0].get_element_type().size() << ");\n";
writer.indent--;
writer << "}\n";
return;
}
// The kernel only handles a single broadcast axis, or a run of consecutive
// axes (e.g. {1}, {0,1}, {1,2,3}) that can be collapsed into one.
vector<int> axes_v;
std::copy(axes.begin(), axes.end(), std::back_inserter(axes_v));
std::sort(axes_v.begin(), axes_v.end());
bool is_one_axes = true;
if (axes.size() != 1)
{
// Consecutive iff each sorted axis is exactly its predecessor plus one.
// NOTE(review): `int i` vs `axes_v.size()` is a signed/unsigned compare;
// harmless here but worth switching to size_t.
for (int i = 1; i < axes_v.size(); i++)
{
if (axes_v[i] != axes_v[i - 1] + 1)
{
is_one_axes = false;
break;
}
}
}
if (is_one_axes)
{
// repeat_times: product of the result extents along the broadcast axes —
// how many copies of each input block are emitted.
// NOTE(review): `int` may overflow for very large shapes; size_t is safer.
int repeat_times = 1;
for (int i = 0; i < axes_v.size(); i++)
{
repeat_times *= result_shape[axes_v[i]];
}
// repeat_size: product of the result extents *after* the last broadcast
// axis — the contiguous block length that gets replicated.
int repeat_size = 1;
for (int i = *axes_v.rbegin() + 1; i < result_shape.size(); i++)
{
repeat_size *= result_shape[i];
}
writer << "{ // " << n->get_name() << " \n";
writer.indent++;
writer << "runtime::gpu::emit_broadcast(" << args[0].get_name() << ", " << out[0].get_name()
<< ", " << repeat_size << ", " << repeat_times << ", " << out[0].get_size()
<< ");\n";
writer.indent--;
writer << "}\n";
}
else
{
// Non-consecutive broadcast axes are not supported by the kernel yet.
throw std::runtime_error(n->get_name() + " is not implemented.");
}
}
void runtime::gpu::GPU_Emitter::EmitConvert(codegen::CodeWriter& writer,
......@@ -474,7 +532,6 @@ void runtime::gpu::GPU_Emitter::EmitConstant(
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
// NOTE(review): this throw appears to be the *removed* side of the diff —
// after the change EmitConstant becomes a deliberate no-op, presumably
// because constants are materialized elsewhere. Confirm against the
// applied patch.
throw std::runtime_error(n->get_name() + " is not implemented.");
}
void runtime::gpu::GPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
......
......@@ -46,7 +46,6 @@ TEST(benchmark, mxnet_mnist_mlp_forward)
TEST(benchmark, gpu_mxnet_mnist_mlp_forward)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/mnist_mlp_forward.json");
run_benchmark(json_path, "GPU", 1000);
}
......
......@@ -1701,7 +1701,6 @@ TEST(${BACKEND_NAME}, function_call)
TEST(${BACKEND_NAME}, broadcast_scalar_vector)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{4};
......@@ -1724,7 +1723,6 @@ TEST(${BACKEND_NAME}, broadcast_scalar_vector)
TEST(${BACKEND_NAME}, broadcast_scalar_matrix)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{2, 2};
......@@ -1747,7 +1745,6 @@ TEST(${BACKEND_NAME}, broadcast_scalar_matrix)
TEST(${BACKEND_NAME}, broadcast_scalar_tensor)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{2, 2, 2};
......@@ -1770,7 +1767,6 @@ TEST(${BACKEND_NAME}, broadcast_scalar_tensor)
TEST(${BACKEND_NAME}, broadcast_trivial)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape{2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Broadcast>(A, shape, AxisSet{}),
......@@ -1792,7 +1788,6 @@ TEST(${BACKEND_NAME}, broadcast_trivial)
TEST(${BACKEND_NAME}, broadcast_vector_colwise)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{3};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{3, 4};
......@@ -1815,7 +1810,6 @@ TEST(${BACKEND_NAME}, broadcast_vector_colwise)
TEST(${BACKEND_NAME}, broadcast_vector_rowwise)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{4};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{3, 4};
......@@ -1887,7 +1881,6 @@ TEST(${BACKEND_NAME}, broadcast_vector_rowwise_int64)
TEST(${BACKEND_NAME}, broadcast_matrix_0)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{2, 2, 2};
......@@ -1910,7 +1903,6 @@ TEST(${BACKEND_NAME}, broadcast_matrix_0)
TEST(${BACKEND_NAME}, broadcast_matrix_1)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{2, 2, 2};
......@@ -1933,7 +1925,6 @@ TEST(${BACKEND_NAME}, broadcast_matrix_1)
TEST(${BACKEND_NAME}, broadcast_matrix_2)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{2, 2, 2};
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.