Commit e63322d9 authored by fenglei.tian

add runtime cuda kernel compile

parent ae010b42
./build/test/unit-test --gtest_filter=GPU.ab
./build/test/unit-test --gtest_filter=GPU.abc
./build/test/unit-test --gtest_filter=GPU.maximum
./build/test/unit-test --gtest_filter=GPU.minimum
./build/test/unit-test --gtest_filter=GPU.multiple*
./build/test/unit-test --gtest_filter=GPU.sqrt
./build/test/unit-test --gtest_filter=GPU.nagtive
./build/test/unit-test --gtest_filter=GPU.abs
#./build/test/unit-test --gtest_filter=GPU.ab
#./build/test/unit-test --gtest_filter=GPU.abc
#./build/test/unit-test --gtest_filter=GPU.maximum
#./build/test/unit-test --gtest_filter=GPU.minimum
#./build/test/unit-test --gtest_filter=GPU.multiple*
#./build/test/unit-test --gtest_filter=GPU.sqrt
#./build/test/unit-test --gtest_filter=GPU.nagtive
#./build/test/unit-test --gtest_filter=GPU.abs
#./build/test/unit-test --gtest_filter=GPU.dot*
@@ -271,7 +271,9 @@ endif()
# Nvidia
if(NGRAPH_GPU_ENABLE AND CUDA_LIBRARIES)
target_link_libraries(ngraph PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDNN_LIBRARIES})
find_library(CUDA_nvrtc_LIBRARY nvrtc /usr/local/cuda/lib64)
find_library(CUDA_cuda_LIBRARY cuda /usr/local/cuda/lib64)
target_link_libraries(ngraph PUBLIC ${CUDA_cuda_LIBRARY} ${CUDA_nvrtc_LIBRARY} ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDNN_LIBRARIES})
endif()
# Argon
......
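Since the emitter now JIT-compiles kernels, ngraph links directly against libnvrtc and the CUDA driver library in addition to the runtime, cuBLAS, and cuDNN libraries it already used. A minimal sanity check that both newly linked libraries resolve and load correctly is sketched below; it is illustrative only and not part of this commit.

// Illustrative check (not part of this commit): confirms that libnvrtc and
// the CUDA driver library link and load correctly.
#include <cstdio>
#include <cuda.h>
#include <nvrtc.h>

int main()
{
    int nvrtc_major = 0, nvrtc_minor = 0, driver_version = 0;
    if (nvrtcVersion(&nvrtc_major, &nvrtc_minor) != NVRTC_SUCCESS ||
        cuDriverGetVersion(&driver_version) != CUDA_SUCCESS)
    {
        std::fprintf(stderr, "NVRTC or CUDA driver library not usable\n");
        return 1;
    }
    std::printf("NVRTC %d.%d, CUDA driver %d\n", nvrtc_major, nvrtc_minor, driver_version);
    return 0;
}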
@@ -20,6 +20,11 @@
#include <typeindex>
#include <unordered_map>
#include <vector>
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cudnn_v7.h>
#include "ngraph/node.hpp"
#include "ngraph/ops/broadcast.hpp"
@@ -44,6 +49,30 @@
using namespace std;
using namespace ngraph;
#define NVRTC_SAFE_CALL(x) \
do { \
nvrtcResult result = x; \
if (result != NVRTC_SUCCESS) { \
std::cerr << "\nerror: " #x " failed with error " \
<< nvrtcGetErrorString(result) << '\n'; \
exit(1); \
} \
} while(0)
#define CUDA_SAFE_CALL(x) \
do { \
CUresult result = x; \
if (result != CUDA_SUCCESS) { \
const char *msg; \
cuGetErrorName(result, &msg); \
std::cerr << "\nerror: " #x " failed with error " \
<< msg << '\n'; \
exit(1); \
} \
} while(0)
void runtime::gpu::GPU_Emitter::EmitNop(codegen::CodeWriter& writer,
const ngraph::Node* n,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
@@ -56,7 +85,104 @@ void runtime::gpu::GPU_Emitter::EmitAbs(codegen::CodeWriter& writer,
const vector<runtime::gpu::GPU_TensorViewWrapper>& args,
const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
writer << " // " << n->get_name() << "\n return;\n";
const char *op_abs = R"(
extern "C" __global__
void cuda_op_abs(float* in, float* out, size_t n)
{
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid < n)
{
out[tid] = fabsf(in[tid]);
}
})";
size_t numBlocks = 4;
size_t numThreads = 4;
// Create an instance of nvrtcProgram with the code string.
nvrtcProgram prog;
NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog, // prog
op_abs, // buffer
"op_abs.cu", // name
0, // numHeaders
NULL, // headers
NULL)); // includeNames
const char *opts[] = {"--gpu-architecture=compute_35",
"--relocatable-device-code=true"};
nvrtcResult compileResult = nvrtcCompileProgram(prog, // prog
2, // numOptions
opts); // options
// Obtain compilation log from the program.
size_t logSize;
NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
char *log = new char[logSize];
NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log));
std::cout << log << '\n';
delete[] log;
if (compileResult != NVRTC_SUCCESS) {
exit(1);
}
size_t ptxSize;
NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptxSize));
char *ptx = new char[ptxSize];
NVRTC_SAFE_CALL(nvrtcGetPTX(prog, ptx));
// Destroy the program now that the PTX has been extracted.
NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));
// Load the generated PTX and get a handle to the kernel.
CUdevice cuDevice;
CUcontext context;
CUmodule module;
CUfunction cuda_op_abs_kernel;
CUDA_SAFE_CALL( cuInit(0));
CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0));
CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice));
// CUDA_SAFE_CALL(cuLinkCreate(0, 0 , 0, &linkState));
//CUDA_SAFE_CALL(cuLinkAddFile(linkState, CU_JIT_INPUT_LIBRARY, ' ', 0, 0, 0));
//CUDA_SAFE_CALL(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void *)ptx, ptxSize, "dynamic_parallelism.ptx", 0, 0, 0));
//size_t cubinSize;
//void *cubin;
//CUDA_SAFE_CALL(cuLinkComplete(linkState, &cubin, &cubinSize));
CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, 0, 0, 0));
CUDA_SAFE_CALL(cuModuleGetFunction(&cuda_op_abs_kernel, module, "cuda_op_abs"));
writer << "{ // " << n->get_name() << "\n";
writer.indent++;
writer << "int count = " << out[0].get_size() << ";\n";
writer << "if(count == 0) return;\n";
writer << "void *argsList[] = {(void *)" << args[0].get_name() << ", (void *)" << out[0].get_name() << ", &count};\n";
writer << "//cuLaunchKernel(cuda_op_abs_kernel, count, 1, 1, 1, 1, 1, 0, NULL, argsList, 0);\n";
writer << "}\n";
// Generate input for execution, and create output buffers.
//size_t nt = numBlocks * numThreads;
//size_t bufferSize = nt * sizeof(float);
//float *hOut = new float[nt];
//float *hIn = new float[nt];
//for(int i = 0; i< nt; i++) hIn[i] = -i;
//
//CUdeviceptr dOut, dIn;
//cuMemAlloc(&dOut, bufferSize); // Execute parent kernel.
//cuMemAlloc(&dIn, bufferSize); // Execute parent kernel.
//cuMemcpyHtoD(dIn, hIn, bufferSize);
//
//void *argst[] = {&dIn, &dOut, &nt};
// CUDA_SAFE_CALL(
// cuLaunchKernel(kernel,
// numBlocks , 1, 1, // grid dim
// numThreads, 1, 1, // block dim
// 0, NULL, // shared mem and stream
// argst, 0)); // arguments
//CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
//cuMemcpyDtoH(hOut, dOut, bufferSize);
//for (size_t i = 0; i < nt; ++i) { std::cout << hOut[i] << '\n'; } // Release resources.
//cuMemFree(dOut);
//cuModuleUnload(module);
}
void runtime::gpu::GPU_Emitter::EmitAdd(codegen::CodeWriter& writer,
......
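For reference, the compile-and-launch path that EmitAbs only sketches (the cuLaunchKernel call in the emitted code is still commented out) can be exercised end to end with a small standalone program. The sketch below is illustrative only and not part of this commit: it compiles the same cuda_op_abs kernel with NVRTC, loads the PTX through the driver API, and runs it on a hypothetical 16-element buffer. Note that cuLaunchKernel takes an array of pointers to each kernel argument, so the launcher passes &d_in / &d_out / &n rather than the raw pointer values.

// Standalone sketch (not part of this commit): NVRTC compile + driver-API launch
// of the cuda_op_abs kernel shown above. Buffer size and launch shape are illustrative.
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <vector>
#include <cuda.h>
#include <nvrtc.h>

// Same error-checking idea as the NVRTC_SAFE_CALL / CUDA_SAFE_CALL macros above.
#define CHECK_NVRTC(x) { nvrtcResult r = (x); if (r != NVRTC_SUCCESS) { std::cerr << #x << ": " << nvrtcGetErrorString(r) << "\n"; std::exit(1); } }
#define CHECK_CU(x) { CUresult r = (x); if (r != CUDA_SUCCESS) { const char* m; cuGetErrorName(r, &m); std::cerr << #x << ": " << m << "\n"; std::exit(1); } }

static const char* op_abs_src = R"(
extern "C" __global__ void cuda_op_abs(float* in, float* out, size_t n)
{
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) { out[tid] = fabsf(in[tid]); }
})";

int main()
{
    // Compile CUDA C to PTX at runtime.
    nvrtcProgram prog;
    CHECK_NVRTC(nvrtcCreateProgram(&prog, op_abs_src, "op_abs.cu", 0, nullptr, nullptr));
    const char* opts[] = {"--gpu-architecture=compute_35"};
    CHECK_NVRTC(nvrtcCompileProgram(prog, 1, opts));
    size_t ptx_size;
    CHECK_NVRTC(nvrtcGetPTXSize(prog, &ptx_size));
    std::vector<char> ptx(ptx_size);
    CHECK_NVRTC(nvrtcGetPTX(prog, ptx.data()));
    CHECK_NVRTC(nvrtcDestroyProgram(&prog));

    // Load the PTX and look up the kernel through the driver API.
    CUdevice dev;
    CUcontext ctx;
    CUmodule mod;
    CUfunction kernel;
    CHECK_CU(cuInit(0));
    CHECK_CU(cuDeviceGet(&dev, 0));
    CHECK_CU(cuCtxCreate(&ctx, 0, dev));
    CHECK_CU(cuModuleLoadDataEx(&mod, ptx.data(), 0, nullptr, nullptr));
    CHECK_CU(cuModuleGetFunction(&kernel, mod, "cuda_op_abs"));

    // Illustrative data: 16 negative floats in, their absolute values out.
    size_t n = 16;
    std::vector<float> h_in(n), h_out(n);
    for (size_t i = 0; i < n; i++) h_in[i] = -static_cast<float>(i);
    CUdeviceptr d_in, d_out;
    CHECK_CU(cuMemAlloc(&d_in, n * sizeof(float)));
    CHECK_CU(cuMemAlloc(&d_out, n * sizeof(float)));
    CHECK_CU(cuMemcpyHtoD(d_in, h_in.data(), n * sizeof(float)));

    // cuLaunchKernel expects pointers to each argument value.
    void* args[] = {&d_in, &d_out, &n};
    unsigned block = 4;
    unsigned grid = static_cast<unsigned>((n + block - 1) / block);
    CHECK_CU(cuLaunchKernel(kernel, grid, 1, 1, block, 1, 1, 0, nullptr, args, nullptr));
    CHECK_CU(cuCtxSynchronize());

    CHECK_CU(cuMemcpyDtoH(h_out.data(), d_out, n * sizeof(float)));
    for (size_t i = 0; i < n; i++) std::cout << h_out[i] << "\n";

    // Release resources.
    CHECK_CU(cuMemFree(d_in));
    CHECK_CU(cuMemFree(d_out));
    CHECK_CU(cuModuleUnload(mod));
    CHECK_CU(cuCtxDestroy(ctx));
    return 0;
}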
@@ -76,7 +76,7 @@ if(NGRAPH_CPU_ENABLE AND LLVM_INCLUDE_DIR)
endif()
if(NGRAPH_GPU_ENABLE AND LLVM_INCLUDE_DIR)
include_directories(SYSTEM ${LLVM_INCLUDE_DIR})
include_directories(SYSTEM ${LLVM_INCLUDE_DIR} ${CUDA_INCLUDE_DIRS} ${CUDNN_INCLUDE_DIR})
link_directories(${LLVM_LIB_DIR})
link_directories(${CUDA_LIBRARIES})
link_directories(${CUDA_CUBLAS_LIBRARIES})
@@ -130,7 +130,8 @@ if(LLVM_INCLUDE_DIR)
endif()
if(CUDA_INCLUDE_DIRS)
target_link_libraries(unit-test ${CUDA_LIBRARIES} ${CUDNN_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES})
find_library(CUDA_nvrtc_LIBRARY nvrtc /usr/local/cuda/lib64)
target_link_libraries(unit-test ${CUDA_nvrtc_LIBRARY} ${CUDA_LIBRARIES} ${CUDNN_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES})
endif()
target_link_libraries(unit-test ngraph libgtest pthread)
......