Commit ee220ffb authored by fenglei.tian

fix bugs and apply clang

parent a574bdaf
......@@ -14,8 +14,6 @@
* limitations under the License.
*******************************************************************************/
#pragma once
#include <memory>
#include <string>
......@@ -27,7 +25,7 @@ namespace ngraph
{
namespace gpu
{
static CudaContextManager::CudaContextManager& instance()
CudaContextManager& CudaContextManager::instance()
{
static CudaContextManager manager;
return manager;
......
......@@ -14,11 +14,10 @@
* limitations under the License.
*******************************************************************************/
#pragma once
#include <string>
#include "ngraph/runtime/gpu/gpu_cuda_context_manager.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_function_builder.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
namespace ngraph
......@@ -27,21 +26,20 @@ namespace ngraph
{
namespace gpu
{
static std::shared_ptr<CUfunction> CudaFunctionBuilder::get(const std::string& name,
const std::string& kernel,
int number_of_options,
const char** options)
std::shared_ptr<CUfunction> CudaFunctionBuilder::get(const std::string& name,
const std::string& kernel,
int number_of_options,
const char** options)
{
nvrtcProgram prog;
NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog,
kernel.c_str(),
"op.cu",
0, // numHeaders
NULL, // headers
NULL)); // includeNames
kernel.c_str(),
"op.cu",
0, // numHeaders
NULL, // headers
NULL)); // includeNames
nvrtcResult compile_result =
nvrtcCompileProgram(prog, number_of_options, options);
nvrtcResult compile_result = nvrtcCompileProgram(prog, number_of_options, options);
if (compile_result != NVRTC_SUCCESS)
{
......
......@@ -33,38 +33,7 @@ namespace ngraph
static std::shared_ptr<CUfunction> get(const std::string& name,
const std::string& kernel,
int number_of_options,
const char** options)
{
nvrtcProgram prog;
NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog,
kernel.c_str(),
"op.cu",
0, // numHeaders
NULL, // headers
NULL)); // includeNames
nvrtcResult compile_result =
nvrtcCompileProgram(prog, number_of_options, options);
if (compile_result != NVRTC_SUCCESS)
{
throw std::runtime_error("compile error: \n" + kernel + "\n options");
}
size_t ptx_size;
NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptx_size));
char* ptx = new char[ptx_size];
NVRTC_SAFE_CALL(nvrtcGetPTX(
prog,
ptx)); // Load the generated PTX and get a handle to the parent kernel.
NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog)); // Destroy the program.
CUmodule module;
CUfunction function;
CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, 0, 0, 0));
CUDA_SAFE_CALL(cuModuleGetFunction(&function, module, name.c_str()));
return std::make_shared<CUfunction>(function);
}
const char** options);
};
}
}
......
......@@ -14,8 +14,6 @@
* limitations under the License.
*******************************************************************************/
#pragma once
#include <string>
#include <unordered_map>
......@@ -27,7 +25,7 @@ namespace ngraph
{
namespace gpu
{
static CudaFunctionPool::CudaFunctionPool& instance()
CudaFunctionPool& CudaFunctionPool::instance()
{
static CudaFunctionPool pool;
return pool;
......
......@@ -38,6 +38,7 @@ namespace ngraph
void set(std::string& name, std::shared_ptr<CUfunction> function);
std::shared_ptr<CUfunction> get(std::string& name);
protected:
CudaFunctionPool() {}
~CudaFunctionPool() {}
......
......@@ -14,8 +14,6 @@
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp"
namespace ngraph
......@@ -24,47 +22,48 @@ namespace ngraph
{
namespace gpu
{
static void CudaKernelBuilder::get_1_element_op(const std::string& name,
const std::string& data_type,
const std::string& op,
std::string& kernel)
void CudaKernelBuilder::get_1_element_op(const std::string& name,
const std::string& data_type,
const std::string& op,
std::string& kernel)
{
kernel = R"(
extern "C" __global__
void cuda_)" + name + "(" + data_type +
"* in, " + data_type + "* out, size_t n)\n" + R"({
void cuda_)" + name + "(" +
data_type + "* in, " + data_type + "* out, size_t n)\n" + R"({
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid < n)
{
out[tid] =)" + op + "(in[tid]);\n" +
R"(}
R"(}
})";
return;
}
static void CudaKernelBuilder::get_2_element_op(const std::string& name,
const std::string& data_type,
const std::string& op,
std::string& kernel)
void CudaKernelBuilder::get_2_element_op(const std::string& name,
const std::string& data_type,
const std::string& op,
std::string& kernel)
{
kernel = R"(
extern "C" __global__
void )" + name + "(" + data_type +
"* in1, " + data_type + "* in2, " + data_type + "* out, size_t n)\n" +
R"({
"* in1, " + data_type + "* in2, " + data_type + "* out, size_t n)\n" +
R"({
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid < n)
{
out[tid] = in1[tid] )" + op + "in2[tid]\n" +
R"(}
out[tid] = in1[tid] )" + op +
"in2[tid]\n" +
R"(}
})";
return;
}
static void CudaKernelBuilder::get_n_element_op(const std::string& name,
const std::string& data_type,
const std::vector<std::string>& ops,
std::string& kernel)
void CudaKernelBuilder::get_n_element_op(const std::string& name,
const std::string& data_type,
const std::vector<std::string>& ops,
std::string& kernel)
{
kernel = "";
return;
......
......@@ -17,6 +17,7 @@
#pragma once
#include <string>
#include <vector>
namespace ngraph
{
......@@ -41,6 +42,7 @@ namespace ngraph
const std::string& data_type,
const std::vector<std::string>& ops,
std::string& kernel);
};
}
}
}
......@@ -28,44 +28,38 @@ namespace ngraph
{
namespace gpu
{
namespace cuda
void emit_abs(void* in, void* out, size_t count)
{
namespace kernel
std::string name = "abs";
// Create an instance of nvrtcProgram with the code string.
if (CudaFunctionPool::instance().get(name) == nullptr)
{
void emit_abs(void* in, void* out, size_t count)
{
std::string name = "abs";
// Create an instance of nvrtcProgram with the code string.
if (CudaFunctionPool::instance().get(name) == nullptr)
{
const char* opts[] = {"--gpu-architecture=compute_35",
"--relocatable-device-code=true"};
std::string kernel;
CudaKernelBuilder::get_1_element_op(name, "float", "fabsf", kernel);
CudaFunctionPool::instance().set(
name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts));
}
const char* opts[] = {"--gpu-architecture=compute_35",
"--relocatable-device-code=true"};
std::string kernel;
CudaKernelBuilder::get_1_element_op(name, "float", "fabsf", kernel);
CudaFunctionPool::instance().set(
name, CudaFunctionBuilder::get("cuda_" + name, kernel, 2, opts));
}
//convert runtime ptr to driver api ptr
CUdeviceptr d_ptr_in, d_ptr_out;
d_ptr_in = (CUdeviceptr)in;
d_ptr_out = (CUdeviceptr)out;
//convert runtime ptr to driver api ptr
CUdeviceptr d_ptr_in, d_ptr_out;
d_ptr_in = (CUdeviceptr)in;
d_ptr_out = (CUdeviceptr)out;
void* args_list[] = {&d_ptr_in, &d_ptr_out, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(),
count,
1,
1, // grid dim
1,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list,
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
}
void* args_list[] = {&d_ptr_in, &d_ptr_out, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*CudaFunctionPool::instance().get(name).get(),
count,
1,
1, // grid dim
1,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list,
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
}
}
......
......@@ -25,13 +25,7 @@ namespace ngraph
{
namespace gpu
{
namespace cuda
{
namespace kernel
{
void emit_abs(void* in, void* out, size_t count);
}
}
void emit_abs(void* in, void* out, size_t count);
}
}
}
......@@ -90,8 +90,8 @@ void runtime::gpu::GPU_Emitter::EmitAbs(codegen::CodeWriter& writer,
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n";
writer << "if(count == 0) return;\n";
writer << "ngraph::runtime::gpu::cuda::kernel::emit_abs((void*) " << args[0].get_name()
<< ", (void*) " << out[0].get_name() << ", count);\n";
writer << "ngraph::runtime::gpu::emit_abs((void*) " << args[0].get_name() << ", (void*) "
<< out[0].get_name() << ", count);\n";
writer.indent--;
writer << "}\n";
}
......
......@@ -16,6 +16,11 @@
#pragma once
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment