Commit 89a56b08 authored by Fenglei Tian's avatar Fenglei Tian

cuda kernel tests

parent b6c34bd3
......@@ -203,6 +203,7 @@ if (NGRAPH_CPU_ENABLE AND LLVM_INCLUDE_DIR AND
runtime/gpu/gpu_tensor_view.cpp
runtime/gpu/gpu_tensor_view_wrapper.cpp
runtime/gpu/gpu_util.cpp
runtime/gpu/gpu_cuda_kernel_emitter.cpp
)
set_property(SOURCE codegen/compiler.cpp APPEND_STRING PROPERTY COMPILE_DEFINITIONS
"CUDA_HEADER_PATHS=\"${CUDA_INCLUDE_DIRS}\";")
......
......@@ -77,73 +77,14 @@ namespace ngraph
void emit_abs(void* in, void* out, size_t count)
{
const char *op_abs = R"(
extern "C" __global__
void cuda_op_abs(float* in, float* out, size_t n)
{
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid < n)
{
out[tid] = fabsf(in[tid]);
}
})";
size_t numBlocks = 4; size_t numThreads = 4;
// Create an instance of nvrtcProgram with the code string.
nvrtcProgram prog;
NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog, // prog i
op_abs, // buffer
"op_abs.cu", // name
0, // numHeaders
NULL, // headers
NULL)); // includeNames
const char *opts[] = {"--gpu-architecture=compute_35",
"--relocatable-device-code=true"};
nvrtcResult compileResult = nvrtcCompileProgram(prog, // prog
2, // numOptions
opts); // options
// Obtain compilation log from the program.
size_t logSize;
NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
char *log = new char[logSize];
NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log));
std::cout << log << '\n';
delete[] log;
if (compileResult != NVRTC_SUCCESS) {
exit(1);
}
size_t ptxSize;
NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptxSize));
char *ptx = new char[ptxSize];
NVRTC_SAFE_CALL(nvrtcGetPTX(prog, ptx)); // Destroy the program.
NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog)); // Load the generated PTX and get a handle to the parent kernel.
CUdevice cuDevice;
CUcontext context;
CUmodule module;
CUfunction cuda_op_abs_kernel;
CUDA_SAFE_CALL( cuInit(0));
CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0));
CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice));
CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, 0, 0, 0));
CUDA_SAFE_CALL(cuModuleGetFunction(&cuda_op_abs_kernel, module, "cuda_op_abs"));
void *argsList[] = {In, Out, &count};
CUDA_SAFE_CALL(
cuLaunchKernel(cuda_op_abs_kernel,
count , 1, 1, // grid dim
1, 1, 1, // block dim
0, NULL, // shared mem and stream
argsList, 0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
void *argsList[] = {In, Out, &count};
CUDA_SAFE_CALL(
cuLaunchKernel(cuda_op_abs_kernel,
count , 1, 1, // grid dim
1, 1, 1, // block dim
0, NULL, // shared mem and stream
argsList, 0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
void emit_broadcast(codegen::CodeWriter& writer,
const std::string& element_type,
......
......@@ -249,6 +249,7 @@ void runtime::gpu::GPU_ExternalFunction::compile()
#include "ngraph/runtime/aligned_buffer.hpp"
#include "ngraph/runtime/gpu/gpu_util.hpp"
#include "ngraph/util.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
)";
string pch_header_source = writer.get_code();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment