Unverified commit b5467550, authored by Chris Sullivan, committed by GitHub

Updated gpu cpp files with consistent use of namespaces (cosmetic) (#629)

* Updated namespace use in cpp files.
parent a32fdab5
...@@ -19,25 +19,18 @@ ...@@ -19,25 +19,18 @@
#include "ngraph/runtime/gpu/gpu_cuda_context_manager.hpp" #include "ngraph/runtime/gpu/gpu_cuda_context_manager.hpp"
namespace ngraph using namespace ngraph;
runtime::gpu::CudaContextManager& runtime::gpu::CudaContextManager::instance()
{ {
namespace runtime static CudaContextManager manager;
{ return manager;
namespace gpu }
{
CudaContextManager& CudaContextManager::instance()
{
static CudaContextManager manager;
return manager;
}
CudaContextManager::CudaContextManager() runtime::gpu::CudaContextManager::CudaContextManager()
{ {
CUDA_SAFE_CALL(cuInit(0)); CUDA_SAFE_CALL(cuInit(0));
CUDA_SAFE_CALL(cuDeviceGet(&m_device, 0)); CUDA_SAFE_CALL(cuDeviceGet(&m_device, 0));
CUDA_SAFE_CALL(cuCtxCreate(&m_context, 0, m_device)); CUDA_SAFE_CALL(cuCtxCreate(&m_context, 0, m_device));
m_context_ptr = std::make_shared<CUcontext>(m_context); m_context_ptr = std::make_shared<CUcontext>(m_context);
}
}
}
} }
...@@ -20,46 +20,39 @@ ...@@ -20,46 +20,39 @@
#include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp" #include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp" #include "ngraph/runtime/gpu/gpu_util.hpp"
namespace ngraph using namespace ngraph;
std::shared_ptr<CUfunction> runtime::gpu::CudaFunctionBuilder::get(const std::string& name,
const std::string& kernel,
int number_of_options,
const char** options)
{ {
namespace runtime nvrtcProgram prog;
{ NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog,
namespace gpu kernel.c_str(),
{ "op.cu",
std::shared_ptr<CUfunction> CudaFunctionBuilder::get(const std::string& name, 0, // numHeaders
const std::string& kernel, NULL, // headers
int number_of_options, NULL)); // includeNames
const char** options)
{
nvrtcProgram prog;
NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog,
kernel.c_str(),
"op.cu",
0, // numHeaders
NULL, // headers
NULL)); // includeNames
nvrtcResult compile_result = nvrtcCompileProgram(prog, number_of_options, options); nvrtcResult compile_result = nvrtcCompileProgram(prog, number_of_options, options);
if (compile_result != NVRTC_SUCCESS) if (compile_result != NVRTC_SUCCESS)
{ {
throw std::runtime_error("compile error: \n" + kernel + "\n options"); throw std::runtime_error("compile error: \n" + kernel + "\n options");
} }
size_t ptx_size; size_t ptx_size;
NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptx_size)); NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptx_size));
char* ptx = new char[ptx_size]; char* ptx = new char[ptx_size];
NVRTC_SAFE_CALL(nvrtcGetPTX( NVRTC_SAFE_CALL(
prog, nvrtcGetPTX(prog,
ptx)); // Load the generated PTX and get a handle to the parent kernel. ptx)); // Load the generated PTX and get a handle to the parent kernel.
NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog)); // Destroy the program. NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog)); // Destroy the program.
CUmodule module; CUmodule module;
CUfunction function; CUfunction function;
CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, 0, 0, 0)); CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, 0, 0, 0));
CUDA_SAFE_CALL(cuModuleGetFunction(&function, module, name.c_str())); CUDA_SAFE_CALL(cuModuleGetFunction(&function, module, name.c_str()));
return std::make_shared<CUfunction>(function); return std::make_shared<CUfunction>(function);
}
}
}
} }
...@@ -26,40 +26,31 @@ ...@@ -26,40 +26,31 @@
static const std::string s_output_dir = "gpu_codegen"; static const std::string s_output_dir = "gpu_codegen";
namespace ngraph using namespace ngraph;
runtime::gpu::CudaFunctionPool& runtime::gpu::CudaFunctionPool::instance()
{ {
namespace runtime static CudaFunctionPool pool;
{ return pool;
namespace gpu }
{
CudaFunctionPool& CudaFunctionPool::instance()
{
static CudaFunctionPool pool;
return pool;
}
void CudaFunctionPool::set(const std::string& name, const std::string& kernel) void runtime::gpu::CudaFunctionPool::set(const std::string& name, const std::string& kernel)
{ {
const char* opts[] = {"--gpu-architecture=compute_35", const char* opts[] = {"--gpu-architecture=compute_35", "--relocatable-device-code=true"};
"--relocatable-device-code=true"}; std::string filename =
std::string filename = file_util::path_join(s_output_dir, "cuda_kernel_" + name + "_codegen.cu");
file_util::path_join(s_output_dir, "cuda_kernel_" + name + "_codegen.cu"); std::ofstream out(filename);
std::ofstream out(filename); out << kernel;
out << kernel; out.close();
out.close(); m_function_map.insert({name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts)});
m_function_map.insert( }
{name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts)});
}
std::shared_ptr<CUfunction> CudaFunctionPool::get(const std::string& name) std::shared_ptr<CUfunction> runtime::gpu::CudaFunctionPool::get(const std::string& name)
{ {
auto it = m_function_map.find(name); auto it = m_function_map.find(name);
if (it != m_function_map.end()) if (it != m_function_map.end())
{ {
return (*it).second; return (*it).second;
}
return nullptr;
}
}
} }
return nullptr;
} }
...@@ -16,74 +16,67 @@ ...@@ -16,74 +16,67 @@
#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp" #include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
#include "ngraph/codegen/code_writer.hpp" #include "ngraph/codegen/code_writer.hpp"
namespace ngraph using namespace ngraph;
void runtime::gpu::CudaKernelBuilder::get_elementwise_op(codegen::CodeWriter& writer,
const std::string& name,
const std::string& data_type,
const std::string& op,
const size_t& num_inputs)
{ {
namespace runtime writer << "extern \"C\" __global__ void cuda_" << name << "(";
for (size_t i = 0; i < num_inputs; i++)
{
writer << data_type << "* in" << i << ", ";
}
writer << data_type << "* out,"
<< "size_t n)\n";
writer << "{\n";
writer.indent++;
{ {
namespace gpu writer << "size_t tid = blockIdx.x * blockDim.x + threadIdx.x; \n";
writer << "if (tid < n)\n";
writer << "{\n";
writer.indent++;
{ {
void CudaKernelBuilder::get_elementwise_op(codegen::CodeWriter& writer, writer << "out[tid] = " << op << "(";
const std::string& name, for (size_t i = 0; i < num_inputs - 1; i++)
const std::string& data_type,
const std::string& op,
const size_t& num_inputs)
{ {
writer << "extern \"C\" __global__ void cuda_" << name << "("; writer << "in" << i << "[tid], ";
for (size_t i = 0; i < num_inputs; i++)
{
writer << data_type << "* in" << i << ", ";
}
writer << data_type << "* out,"
<< "size_t n)\n";
writer << "{\n";
writer.indent++;
{
writer << "size_t tid = blockIdx.x * blockDim.x + threadIdx.x; \n";
writer << "if (tid < n)\n";
writer << "{\n";
writer.indent++;
{
writer << "out[tid] = " << op << "(";
for (size_t i = 0; i < num_inputs - 1; i++)
{
writer << "in" << i << "[tid], ";
}
writer << "in" << num_inputs - 1 << "[tid]);\n";
}
writer.indent--;
writer << "}\n";
}
writer.indent--;
writer << "}\n";
return;
} }
writer << "in" << num_inputs - 1 << "[tid]);\n";
}
writer.indent--;
writer << "}\n";
}
writer.indent--;
writer << "}\n";
void CudaKernelBuilder::get_device_helper(codegen::CodeWriter& writer, return;
const std::string& name, }
const std::string& data_type,
const std::string& math_kernel, void runtime::gpu::CudaKernelBuilder::get_device_helper(codegen::CodeWriter& writer,
const size_t& num_inputs) const std::string& name,
{ const std::string& data_type,
if (math_kernel.size()) const std::string& math_kernel,
{ const size_t& num_inputs)
writer << "__device__ " << data_type << " " << name << "("; {
for (size_t i = 0; i < num_inputs - 1; i++) if (math_kernel.size())
{ {
writer << data_type << " x" << i << ", "; writer << "__device__ " << data_type << " " << name << "(";
} for (size_t i = 0; i < num_inputs - 1; i++)
writer << data_type << " x" << num_inputs - 1; {
writer << ")\n"; writer << data_type << " x" << i << ", ";
writer << "{\n"; }
writer.indent++; writer << data_type << " x" << num_inputs - 1;
{ writer << ")\n";
writer << "return " + math_kernel << ";\n"; writer << "{\n";
} writer.indent++;
writer.indent--; {
writer << "}\n"; writer << "return " + math_kernel << ";\n";
}
return;
}
} }
writer.indent--;
writer << "}\n";
} }
return;
} }
...@@ -20,26 +20,22 @@ ...@@ -20,26 +20,22 @@
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp" #include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp" #include "ngraph/runtime/gpu/gpu_cuda_kernel_ops.hpp"
namespace ngraph using namespace ngraph;
void runtime::gpu::emit_broadcast(
void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count)
{ {
namespace runtime std::string name = "broadcast";
// Create an instance of nvrtcProgram with the code string.
if (CudaFunctionPool::instance().get(name) == nullptr)
{ {
namespace gpu std::string kernel;
{ std::string data_type("float");
void emit_broadcast(
void* in, void* out, size_t repeat_size, size_t repeat_times, size_t count)
{
std::string name = "broadcast";
// Create an instance of nvrtcProgram with the code string.
if (CudaFunctionPool::instance().get(name) == nullptr)
{
std::string kernel;
std::string data_type("float");
kernel = R"( kernel = R"(
extern "C" __global__ extern "C" __global__
void cuda_)" + name + "(" + data_type + void cuda_)" + name +
"* in, " + data_type + "* out, size_t m, size_t k, size_t n)\n" + R"( "(" + data_type + "* in, " + data_type + "* out, size_t m, size_t k, size_t n)\n" +
R"(
{ {
size_t tid = blockIdx.x * blockDim.x + threadIdx.x; size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid < n) if(tid < n)
...@@ -48,28 +44,25 @@ void cuda_)" + name + "(" + data_type + ...@@ -48,28 +44,25 @@ void cuda_)" + name + "(" + data_type +
out[tid] = in[idx]; out[tid] = in[idx];
} }
})"; })";
CudaFunctionPool::instance().set(name, kernel); CudaFunctionPool::instance().set(name, kernel);
} }
//convert runtime ptr to driver api ptr //convert runtime ptr to driver api ptr
CUdeviceptr d_ptr_in, d_ptr_out; CUdeviceptr d_ptr_in, d_ptr_out;
d_ptr_in = CUdeviceptr(in); d_ptr_in = CUdeviceptr(in);
d_ptr_out = CUdeviceptr(out); d_ptr_out = CUdeviceptr(out);
void* args_list[] = {&d_ptr_in, &d_ptr_out, &repeat_size, &repeat_times, &count}; void* args_list[] = {&d_ptr_in, &d_ptr_out, &repeat_size, &repeat_times, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(), CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(),
static_cast<unsigned int>(count), static_cast<unsigned int>(count),
1, 1,
1, // grid dim 1, // grid dim
1, 1,
1, 1,
1, // block dim 1, // block dim
0, 0,
NULL, // shared mem and stream NULL, // shared mem and stream
args_list, args_list,
0)); // arguments 0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output. CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
}
}
} }
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment