Commit 144d790b authored by fenglei.tian

add cuda kernel

parent e63322d9
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------------------------------------------------------------------
#include <algorithm>
#include <map>

#include "ngraph/runtime/gpu/gpu_kernel_emitters.hpp"

namespace ngraph
{
    namespace runtime
    {
        namespace gpu
        {
            namespace cuda
            {
                namespace kernel
                {
                    void emit_abs(float* in, float* out, size_t count)
                    {
                    }

                    void emit_broadcast(codegen::CodeWriter& writer,
                                        const std::string& element_type,
                                        const std::string& arg0, // replacement context
                                        const std::string& out,
                                        const Shape& arg0_shape,
                                        const Shape& out_shape,
                                        const AxisSet& broadcast_axes)
                    {
                    }

                    //
                    // For the reference kernel this is based on, see
                    // ngraph/runtime/kernel/concat.hpp.
                    //
                    void emit_concat(codegen::CodeWriter& writer,
                                     const std::string& element_type,
                                     const std::vector<std::string>& args,
                                     const std::string& out,
                                     const std::vector<Shape>& in_shapes,
                                     const Shape& out_shape,
                                     size_t concatenation_axis)
                    {
                    }

                    void emit_replace_slice(codegen::CodeWriter& writer,
                                            const std::string& element_type,
                                            const std::string& arg0, // replacement context
                                            const std::string& arg1, // replacement value
                                            const std::string& out,
                                            const Shape& arg1_shape,
                                            const Shape& out_shape,
                                            const Coordinate& lower_bounds,
                                            const Coordinate& upper_bounds,
                                            const Strides& strides)
                    {
                    }

                    void emit_slice(codegen::CodeWriter& writer,
                                    const std::string& element_type,
                                    const std::string& arg0, // replacement context
                                    const std::string& out,
                                    const Shape& arg0_shape,
                                    const Shape& out_shape,
                                    const Coordinate& lower_bounds,
                                    const Coordinate& upper_bounds,
                                    const Strides& strides)
                    {
                    }

                    void emit_reshape(codegen::CodeWriter& writer,
                                      const std::string& element_type,
                                      const std::string& arg0, // replacement context
                                      const std::string& out,
                                      const Shape& arg0_shape,
                                      const Shape& out_shape,
                                      const AxisVector& arg0_axis_order)
                    {
                    }

                    void emit_sum(codegen::CodeWriter& writer,
                                  const std::string& element_type,
                                  const std::string& arg0, // replacement context
                                  const std::string& out,
                                  const Shape& arg0_shape,
                                  const Shape& out_shape,
                                  const AxisSet& reduction_axes)
                    {
                    }
                }
            }
        }
    }
}
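The emitters above are empty stubs in this commit. For orientation, a minimal sketch of how such an emitter could print generated code through codegen::CodeWriter, using only the writer operations that appear later in this commit (operator<< and indent); the function name and loop body are illustrative assumptions, not the eventual CUDA implementation:

// Hypothetical sketch, not part of this commit: an emitter that writes
// a plain element-wise loop into the generated source.
void emit_unary_sketch(codegen::CodeWriter& writer,
                       const std::string& arg0,
                       const std::string& out,
                       size_t count)
{
    writer << "for (size_t i = 0; i < " << count << "; i++)\n";
    writer << "{\n";
    writer.indent++;
    writer << out << "[i] = fabsf(" << arg0 << "[i]);\n";
    writer.indent--;
    writer << "}\n";
}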
// ----------------------------------------------------------------------------
// Copyright 2017 Nervana Systems Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------------------------------------------------------------------
#pragma once

#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/common.hpp"

namespace ngraph
{
    namespace runtime
    {
        namespace gpu
        {
            namespace kernel
            {
                void emit_broadcast(codegen::CodeWriter& writer,
                                    const std::string& element_type,
                                    const std::string& arg0, // replacement context
                                    const std::string& out,
                                    const Shape& arg0_shape,
                                    const Shape& out_shape,
                                    const AxisSet& broadcast_axes);

                void emit_concat(codegen::CodeWriter& writer,
                                 const std::string& element_type,
                                 const std::vector<std::string>& args,
                                 const std::string& out,
                                 const std::vector<Shape>& in_shapes,
                                 const Shape& out_shape,
                                 const size_t concatenation_axis);

                void emit_replace_slice(codegen::CodeWriter& writer,
                                        const std::string& element_type,
                                        const std::string& arg0, // replacement context
                                        const std::string& arg1, // replacement value
                                        const std::string& out,
                                        const Shape& arg1_shape,
                                        const Shape& out_shape,
                                        const Coordinate& lower_bounds,
                                        const Coordinate& upper_bounds,
                                        const Strides& strides);

                void emit_slice(codegen::CodeWriter& writer,
                                const std::string& element_type,
                                const std::string& arg0, // replacement context
                                const std::string& out,
                                const Shape& arg0_shape,
                                const Shape& out_shape,
                                const Coordinate& lower_bounds,
                                const Coordinate& upper_bounds,
                                const Strides& strides);

                void emit_reshape(codegen::CodeWriter& writer,
                                  const std::string& element_type,
                                  const std::string& arg0, // replacement context
                                  const std::string& out,
                                  const Shape& arg0_shape,
                                  const Shape& out_shape,
                                  const AxisVector& arg0_axis_order);

                void emit_sum(codegen::CodeWriter& writer,
                              const std::string& element_type,
                              const std::string& arg0, // replacement context
                              const std::string& out,
                              const Shape& arg0_shape,
                              const Shape& out_shape,
                              const AxisSet& reduction_axes);
            }
        }
    }
}
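The diff that follows (gpu_emitter.cpp) JIT-compiles a raw-string kernel named op_abs with NVRTC and then looks up the symbol cuda_op_abs in the loaded module; the diff context cuts off all but the closing braces of that string. A hypothetical body consistent with the kernel name and with the (in, out, count) argument list assembled further down; the commit's actual kernel text may differ:

// Hypothetical reconstruction -- only the closing braces of this
// string survive in the diff below. The signature matches the
// argsList {in, out, &count} built by the emitted code.
const char* op_abs = R"(
extern "C" __global__ void cuda_op_abs(float* in, float* out, size_t n)
{
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)
    {
        out[tid] = fabsf(in[tid]);
    }
})";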
@@ -52,24 +52,24 @@ using namespace ngraph;

#define NVRTC_SAFE_CALL(x)                                            \
    do {                                                              \
        nvrtcResult result = x;                                       \
        if (result != NVRTC_SUCCESS) {                                \
            std::cerr << "\nerror: " #x " failed with error "         \
                      << nvrtcGetErrorString(result) << '\n';         \
            exit(1);                                                  \
        }                                                             \
    } while(0)

#define CUDA_SAFE_CALL(x)                                             \
    do {                                                              \
        CUresult result = x;                                          \
        if (result != CUDA_SUCCESS) {                                 \
            const char *msg;                                          \
            cuGetErrorName(result, &msg);                             \
            std::cerr << "\nerror: " #x " failed with error "         \
                      << msg << '\n';                                 \
            exit(1);                                                  \
        }                                                             \
    } while(0)
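Both macros abort on the first failing call and print the stringified call site. A minimal standalone sketch of their use (device index 0 and the attribute query are illustrative assumptions, not part of the commit):

#include <cuda.h>
#include <nvrtc.h>
#include <iostream>
// ... NVRTC_SAFE_CALL and CUDA_SAFE_CALL defined as above ...

int main()
{
    CUDA_SAFE_CALL(cuInit(0));            // must precede any other driver call
    CUdevice dev;
    CUDA_SAFE_CALL(cuDeviceGet(&dev, 0)); // first visible GPU
    int major = 0;
    CUDA_SAFE_CALL(cuDeviceGetAttribute(
        &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev));
    std::cout << "compute capability: " << major << ".x\n";
    return 0;
}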
@@ -96,59 +96,59 @@ void runtime::gpu::GPU_Emitter::EmitAbs(codegen::CodeWriter& writer,
    }
})";

    size_t numBlocks = 4;
    size_t numThreads = 4;

    // Create an instance of nvrtcProgram with the code string.
    nvrtcProgram prog;
    NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog,        // prog
                                       op_abs,       // buffer
                                       "op_abs.cu",  // name
                                       0,            // numHeaders
                                       NULL,         // headers
                                       NULL));       // includeNames

    const char* opts[] = {"--gpu-architecture=compute_35",
                          "--relocatable-device-code=true"};
    nvrtcResult compileResult = nvrtcCompileProgram(prog,  // prog
                                                    2,     // numOptions
                                                    opts); // options

    // Obtain compilation log from the program.
    size_t logSize;
    NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
    char* log = new char[logSize];
    NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log));
    std::cout << log << '\n';
    delete[] log;
    if (compileResult != NVRTC_SUCCESS) {
        exit(1);
    }

    // Obtain the PTX from the program, then destroy the program.
    size_t ptxSize;
    NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptxSize));
    char* ptx = new char[ptxSize];
    NVRTC_SAFE_CALL(nvrtcGetPTX(prog, ptx));
    NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));

    // Load the generated PTX and get a handle to the kernel.
    CUdevice cuDevice;
    CUcontext context;
    CUmodule module;
    CUfunction cuda_op_abs_kernel;
    CUDA_SAFE_CALL(cuInit(0));
    CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0));
    CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice));
    //CUDA_SAFE_CALL(cuLinkCreate(0, 0, 0, &linkState));
    //CUDA_SAFE_CALL(cuLinkAddFile(linkState, CU_JIT_INPUT_LIBRARY, ' ', 0, 0, 0));
    //CUDA_SAFE_CALL(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void *)ptx, ptxSize, "dynamic_parallelism.ptx", 0, 0, 0));
    //size_t cubinSize;
    //void *cubin;
    //CUDA_SAFE_CALL(cuLinkComplete(linkState, &cubin, &cubinSize));
    CUDA_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, 0, 0, 0));
    CUDA_SAFE_CALL(cuModuleGetFunction(&cuda_op_abs_kernel, module, "cuda_op_abs"));

    writer << "{ // " << n->get_name() << "\n";
@@ -157,32 +157,33 @@ CUDA_SAFE_CALL(cuModuleGetFunction(&cuda_op_abs_kernel, module, "cuda_op_abs"));
    writer << "if(count == 0) return;\n";
    writer << "void *argsList[] = {(void *)" << args[0].get_name() << ", (void *)"
           << out[0].get_name() << ", &count};\n";
    writer << "//cuLaunchKernel(cuda_op_abs_kernel, count, 1, 1, 1, 1, 1, 0, NULL, argsList, 0);\n";
    writer.indent--;
    writer << "}\n";

    // Generate input for execution, and create output buffers.
    //size_t nt = numBlocks * numThreads;
    //size_t bufferSize = nt * sizeof(float);
    //float *hOut = new float[nt];
    //float *hIn = new float[nt];
    //for(int i = 0; i < nt; i++) hIn[i] = -i;
    //
    //CUdeviceptr dOut, dIn;
    //cuMemAlloc(&dOut, bufferSize);
    //cuMemAlloc(&dIn, bufferSize);
    //cuMemcpyHtoD(dIn, hIn, bufferSize);
    //
    // Execute the kernel.
    //void *argst[] = {&dIn, &dOut, &nt};
    //CUDA_SAFE_CALL(
    //    cuLaunchKernel(kernel,
    //                   numBlocks, 1, 1,  // grid dim
    //                   numThreads, 1, 1, // block dim
    //                   0, NULL,          // shared mem and stream
    //                   argst, 0));       // arguments
    //CUDA_SAFE_CALL(cuCtxSynchronize());
    //
    // Retrieve and print output.
    //cuMemcpyDtoH(hOut, dOut, bufferSize);
    //for (size_t i = 0; i < nt; ++i) { std::cout << hOut[i] << '\n'; }
    //
    // Release resources.
    //cuMemFree(dOut);
    //cuModuleUnload(module);
}
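The launch and verification code above is kept commented out in the commit. Uncommented and pieced together, the intended end-to-end flow looks roughly like this; a fragment, assuming the numBlocks/numThreads values and the module and cuda_op_abs_kernel handles from the hunk above, plus <vector>, are available:

// Sketch: drive the JIT-compiled cuda_op_abs kernel end to end.
size_t nt = numBlocks * numThreads;
size_t bufferSize = nt * sizeof(float);
std::vector<float> hIn(nt), hOut(nt);
for (size_t i = 0; i < nt; i++) hIn[i] = -static_cast<float>(i);

CUdeviceptr dIn, dOut;
CUDA_SAFE_CALL(cuMemAlloc(&dIn, bufferSize));
CUDA_SAFE_CALL(cuMemAlloc(&dOut, bufferSize));
CUDA_SAFE_CALL(cuMemcpyHtoD(dIn, hIn.data(), bufferSize));

// Each entry points at one kernel argument: (float* in, float* out, size_t n).
void* argsList[] = {&dIn, &dOut, &nt};
CUDA_SAFE_CALL(cuLaunchKernel(cuda_op_abs_kernel,
                              numBlocks, 1, 1,  // grid dim
                              numThreads, 1, 1, // block dim
                              0, NULL,          // shared mem and stream
                              argsList, 0));    // arguments
CUDA_SAFE_CALL(cuCtxSynchronize());

// Retrieve and print output, then release resources.
CUDA_SAFE_CALL(cuMemcpyDtoH(hOut.data(), dOut, bufferSize));
for (size_t i = 0; i < nt; ++i) { std::cout << hOut[i] << '\n'; }
CUDA_SAFE_CALL(cuMemFree(dIn));
CUDA_SAFE_CALL(cuMemFree(dOut));
CUDA_SAFE_CALL(cuModuleUnload(module));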
void runtime::gpu::GPU_Emitter::EmitAdd(codegen::CodeWriter& writer,

@@ -242,7 +243,7 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
                                        const vector<runtime::gpu::GPU_TensorViewWrapper>& out)
{
    writer << " // " << n->get_name() << "\n return;\n";
    /*
    const Shape& arg0_shape = args[0].get_shape();
    const Shape& arg1_shape = args[1].get_shape();
    if (arg0_shape.empty() || arg1_shape.empty())

@@ -342,11 +343,11 @@ void runtime::gpu::GPU_Emitter::EmitDot(codegen::CodeWriter& writer,
        // clang-format on
        writer.indent--;
        writer << "}\n";
    }
    else
    {
        // General ND Call?
    }
    */
}
...