Commit 4e78f25d authored by Fenglei, committed by Robert Kimball

gpu reshape n-dimension (n>2) (#716)

* add n-dimensional reshape

* intermediate version: compiles and runs without crashing, but still produces wrong results

* rename output_strides to trans_strides, which maps an input index to an output index (see the sketch below)

* use std::vector instead of C arrays

* remove manual delete calls

* pass strings and arrays by const reference

* update 'unimplemented' comments, remove extra indentation

* format code and cast size_t to int
parent b3896731
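
To make the stride terminology concrete, here is a minimal host-side sketch (not part of the commit) of the index arithmetic that the generated reshape kernel and the emitter below implement: input_strides decompose a flat input index into per-axis coordinates, and trans_strides scatter those coordinates into the flat index of the permuted output. The 2x3 transpose example and all variable values are illustrative assumptions only.

// Host-side sketch of the reshape index math; the inner loop mirrors the body
// of the CUDA kernel emitted by get_reshape_op, with tid playing the role of
// the per-thread id.
#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    std::vector<size_t> arg_shape{2, 3};   // input tensor shape (assumed example)
    std::vector<size_t> input_order{1, 0}; // axis permutation (here: a plain transpose)
    size_t rank = arg_shape.size();

    std::vector<size_t> input_strides(rank), output_strides(rank), trans_strides(rank);

    // Row-major strides of the input.
    size_t stride = 1;
    for (int i = static_cast<int>(rank) - 1; i >= 0; i--)
    {
        input_strides[i] = stride;
        stride *= arg_shape[i];
    }
    // Row-major strides of the output shape (input shape permuted by input_order).
    stride = 1;
    for (int i = static_cast<int>(rank) - 1; i >= 0; i--)
    {
        output_strides[i] = stride;
        stride *= arg_shape[input_order[i]];
    }
    // trans_strides[a] is the output stride of the position that input axis a
    // moves to, so a single pass over the input coordinates suffices.
    for (size_t i = 0; i < rank; i++)
    {
        trans_strides[input_order[i]] = output_strides[i];
    }

    std::vector<float> in{0, 1, 2, 3, 4, 5}; // 2x3, row-major
    std::vector<float> out(in.size());
    for (size_t tid = 0; tid < in.size(); tid++)
    {
        size_t idx_in = tid;
        size_t idx_out = 0;
        for (size_t i = 0; i < rank; i++)
        {
            idx_out += (idx_in / input_strides[i]) * trans_strides[i];
            idx_in %= input_strides[i];
        }
        out[idx_out] = in[tid];
    }

    // Expected 3x2 transpose of the 2x3 input.
    std::vector<float> expected{0, 3, 1, 4, 2, 5};
    assert(out == expected);
    std::cout << "transpose ok\n";
    return 0;
}
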
......@@ -29,4 +29,4 @@ namespace ngraph
size_t batch_axis,
Shape mask_shape);
}
}
\ No newline at end of file
}
......@@ -104,6 +104,36 @@ void runtime::gpu::CudaKernelBuilder::get_onehot_op(codegen::CodeWriter& writer,
writer << "}\n";
}
void runtime::gpu::CudaKernelBuilder::get_reshape_op(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 2>& data_types)
{
writer << "extern \"C\" __global__ void cuda_" << name << "(" << data_types[0] << "* in, "
<< data_types[1]
<< "* out, size_t* input_strides, size_t* trans_strides, size_t rank, size_t n)\n";
writer.block_begin();
{
writer << "size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
writer << "if (tid < n)\n";
writer.block_begin();
{
writer << "size_t idx_in = tid;\n";
writer << "size_t idx_out = 0;\n";
writer << "for(size_t i = 0; i < rank; i++)\n";
writer.block_begin();
{
writer << "idx_out += (idx_in / input_strides[i]) * trans_strides[i];\n";
writer << "idx_in %= input_strides[i];\n";
}
writer.block_end();
writer << "out[idx_out] = in[tid];\n";
}
writer.block_end();
}
writer.block_end();
}
void runtime::gpu::CudaKernelBuilder::get_device_helper(
codegen::CodeWriter& writer,
const std::string& name,
......
......@@ -47,6 +47,10 @@ namespace ngraph
const std::string& name,
const std::array<std::string, 2>& data_types);
static void get_reshape_op(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 2>& data_types);
static void get_device_helper(codegen::CodeWriter& writer,
const std::string& name,
const std::string& math_kernel,
......
......@@ -23,10 +23,10 @@
using namespace ngraph;
using namespace ngraph::runtime::gpu;
void runtime::gpu::emit_broadcast(std::string name,
void runtime::gpu::emit_broadcast(const std::string& name,
CUdeviceptr in,
CUdeviceptr out,
std::array<std::string, 2> data_types,
const std::array<std::string, 2>& data_types,
size_t repeat_size,
size_t repeat_times,
size_t count)
......@@ -58,10 +58,10 @@ void runtime::gpu::emit_broadcast(std::string name,
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
void runtime::gpu::emit_onehot(std::string name,
void runtime::gpu::emit_onehot(const std::string& name,
CUdeviceptr in,
CUdeviceptr out,
std::array<std::string, 2> data_types,
const std::array<std::string, 2>& data_types,
size_t repeat_size,
size_t repeat_times,
size_t count)
......@@ -92,3 +92,39 @@ void runtime::gpu::emit_onehot(std::string name,
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
void runtime::gpu::emit_reshape(const std::string& name,
CUdeviceptr in,
CUdeviceptr out,
const std::array<std::string, 2>& data_types,
CUdeviceptr input_strides,
CUdeviceptr trans_strides,
size_t rank,
size_t count)
{
std::string name_signature = name + "_" + data_types[0] + "_" + data_types[1];
std::replace(name_signature.begin(), name_signature.end(), ' ', '_');
// Create an instance of nvrtcProgram with the code string.
if (CudaFunctionPool::instance().get(name_signature) == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
CudaKernelBuilder::get_reshape_op(writer, name_signature, data_types);
std::string kernel = writer.get_code();
CudaFunctionPool::instance().set(name_signature, kernel);
}
void* args_list[] = {&in, &out, &input_strides, &trans_strides, &rank, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name_signature).get(),
static_cast<unsigned int>(count),
1,
1, // grid dim
1,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list,
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
......@@ -34,25 +34,34 @@ namespace ngraph
template <typename T>
struct CudaOpMap;
void emit_broadcast(std::string name,
void emit_broadcast(const std::string& name,
CUdeviceptr in,
CUdeviceptr out,
std::array<std::string, 2> data_types,
const std::array<std::string, 2>& data_types,
size_t repeat_size,
size_t repeat_times,
size_t count);
void emit_onehot(std::string name,
void emit_onehot(const std::string& name,
CUdeviceptr in,
CUdeviceptr out,
std::array<std::string, 2> data_types,
const std::array<std::string, 2>& data_types,
size_t repeat_size,
size_t repeat_times,
size_t count);
void emit_reshape(const std::string& name,
CUdeviceptr in,
CUdeviceptr out,
const std::array<std::string, 2>& data_types,
CUdeviceptr input_strides,
CUdeviceptr trans_strides,
size_t rank,
size_t count);
template <typename T, typename... Inputs>
void emit_elementwise_op(std::string name,
std::array<std::string, 2> data_types,
void emit_elementwise_op(const std::string& name,
const std::array<std::string, 2>& data_types,
size_t count,
CUdeviceptr out,
Inputs&&... inputs)
......
......@@ -553,7 +553,6 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
else if (arg_rank == 2)
{
// TODO Assert arg0_shape[0] == arg1_shape[0]?
writer.indent++;
writer << "const float alpha = 1.0;\n";
writer << "const float beta = 0;\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
......@@ -568,12 +567,63 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
<< "," << result_shape[1] << ");\n";
writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
}
// Other cases (reordering of axes for tensors with rank>2) are not handled yet.
// Other cases (reordering of axes for tensors with rank>2).
else
{
throw runtime_error(
"Axis permutation in reshape is not implemented yet for tensors with "
"rank>2");
std::vector<size_t> input_strides(arg_rank);
std::vector<size_t> output_strides(arg_rank);
std::vector<size_t> trans_strides(arg_rank);
size_t stride = 1;
for (int i = static_cast<int>(arg_rank) - 1; i >= 0; i--)
{
input_strides[i] = stride;
stride *= arg_shape[i];
}
stride = 1;
for (int i = static_cast<int>(arg_rank) - 1; i >= 0; i--)
{
output_strides[i] = stride;
stride *= arg_shape[input_order[i]];
}
for (int i = 0; i < arg_rank; i++)
{
trans_strides[input_order[i]] = output_strides[i];
}
writer << "size_t rank = " << arg_rank << ";\n";
writer << "std::vector<size_t> input_strides_h = {" << input_strides[0] << "UL";
for (int i = 1; i < arg_rank; i++)
{
writer << ", " << input_strides[i] << "UL";
}
writer << "};\n";
writer << "std::vector<size_t> trans_strides_h = {" << trans_strides[0] << "UL";
for (int i = 1; i < arg_rank; i++)
{
writer << ", " << trans_strides[i] << "UL";
}
writer << "};\n";
writer << "void* input_strides_d = "
"runtime::gpu::create_gpu_buffer(sizeof(size_t) * rank);\n";
writer << "void* trans_strides_d = "
"runtime::gpu::create_gpu_buffer(sizeof(size_t) * rank);\n";
writer
<< "runtime::gpu::cuda_memcpyHtD(input_strides_d, input_strides_h.data(), "
"sizeof(size_t) * rank);\n";
writer
<< "runtime::gpu::cuda_memcpyHtD(trans_strides_d, trans_strides_h.data(), "
"sizeof(size_t) * rank);\n";
writer << "runtime::gpu::emit_reshape(\"" << node->description()
<< "\", CUdeviceptr(" << args[0].get_name() << "), CUdeviceptr("
<< out[0].get_name() << ")"
<< ", {\"" << args[0].get_type() << "\", \"" << out[0].get_type()
<< "\"}"
<< ", "
<< "CUdeviceptr(input_strides_d), CUdeviceptr(trans_strides_d)"
<< ", " << arg_rank << ", " << args[0].get_size() << ");\n";
writer << "runtime::gpu::free_gpu_buffer(input_strides_d);\n";
writer << "runtime::gpu::free_gpu_buffer(trans_strides_d);\n";
}
writer.block_end();
}
......
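
For orientation, here is a hand-worked sketch of roughly what the host code emitted by the rank>2 branch above could look like, assuming a hypothetical rank-3 reshape with arg_shape {2, 3, 4} and input_order {2, 0, 1}; the tensor names (arg0, out0), the "float" element types, and the "Reshape" description string are placeholders rather than values taken from the commit, and only the helper calls that appear in the diff are used.

// Hypothetical emitted host code for arg_shape {2, 3, 4}, input_order {2, 0, 1}:
size_t rank = 3;
std::vector<size_t> input_strides_h = {12UL, 4UL, 1UL}; // row-major strides of {2, 3, 4}
std::vector<size_t> trans_strides_h = {3UL, 1UL, 6UL};  // output strides scattered by input_order
void* input_strides_d = runtime::gpu::create_gpu_buffer(sizeof(size_t) * rank);
void* trans_strides_d = runtime::gpu::create_gpu_buffer(sizeof(size_t) * rank);
runtime::gpu::cuda_memcpyHtD(input_strides_d, input_strides_h.data(), sizeof(size_t) * rank);
runtime::gpu::cuda_memcpyHtD(trans_strides_d, trans_strides_h.data(), sizeof(size_t) * rank);
runtime::gpu::emit_reshape("Reshape", CUdeviceptr(arg0), CUdeviceptr(out0),
                           {"float", "float"},
                           CUdeviceptr(input_strides_d), CUdeviceptr(trans_strides_d),
                           3, 24);
runtime::gpu::free_gpu_buffer(input_strides_d);
runtime::gpu::free_gpu_buffer(trans_strides_d);
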
......@@ -2340,7 +2340,6 @@ TEST(${BACKEND_NAME}, reduce_3d_to_vector)
TEST(${BACKEND_NAME}, reshape_t2v_012)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{2, 2, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{12};
......@@ -2363,7 +2362,6 @@ TEST(${BACKEND_NAME}, reshape_t2v_012)
TEST(${BACKEND_NAME}, reshape_t2s_012)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{1, 1, 1};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{};
......@@ -2386,7 +2384,6 @@ TEST(${BACKEND_NAME}, reshape_t2s_012)
TEST(${BACKEND_NAME}, reshape_t2s_120)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{1, 1, 1};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{};
......@@ -2409,7 +2406,6 @@ TEST(${BACKEND_NAME}, reshape_t2s_120)
TEST(${BACKEND_NAME}, reshape_s2t)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{1, 1, 1, 1, 1, 1};
......@@ -2476,7 +2472,6 @@ TEST(${BACKEND_NAME}, reshape_v2m_row)
TEST(${BACKEND_NAME}, reshape_v2t_middle)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
Shape shape_a{3};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_r{1, 3, 1};
......@@ -2606,7 +2601,6 @@ TEST(${BACKEND_NAME}, reshape_m2m_dim_change_transpose)
//
TEST(${BACKEND_NAME}, reshape_6d)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
vector<float> a_data(2 * 2 * 3 * 3 * 2 * 4);
for (int i = 0; i < 2 * 2 * 3 * 3 * 2 * 4; i++)
{
......
......@@ -147,9 +147,8 @@ TEST(builder, tensor_mask)
auto sequence_lengths = make_shared<op::Parameter>(element::u32, max_sequence_length);
Shape mask_shape{3, 5};
auto f =
make_shared<Function>(builder::tensor_mask(sequence_lengths, 1, 0, mask_shape),
op::ParameterVector{sequence_lengths});
auto f = make_shared<Function>(builder::tensor_mask(sequence_lengths, 1, 0, mask_shape),
op::ParameterVector{sequence_lengths});
auto manager = runtime::Manager::get("INTERPRETER");
auto external = manager->compile(f);
......